1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/Analysis/TargetTransformInfo.h"
12 #include "llvm/CodeGen/BasicTTIImpl.h"
13 #include "llvm/CodeGen/CostTable.h"
14 #include "llvm/CodeGen/TargetLowering.h"
15 #include <cmath>
16 #include <optional>
17 using namespace llvm;
18
19 #define DEBUG_TYPE "riscvtti"
20
21 static cl::opt<unsigned> RVVRegisterWidthLMUL(
22 "riscv-v-register-bit-width-lmul",
23 cl::desc(
24 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
25 "by autovectorized code. Fractional LMULs are not supported."),
26 cl::init(1), cl::Hidden);
27
28 static cl::opt<unsigned> SLPMaxVF(
29 "riscv-v-slp-max-vf",
30 cl::desc(
31 "Result used for getMaximumVF query which is used exclusively by "
32 "SLP vectorizer. Defaults to 1 which disables SLP."),
33 cl::init(1), cl::Hidden);
34
35 InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
36 // TODO: Here we assume the reciprocal throughput is 1 for LMUL_1; it is
37 // implementation-defined.
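// Example (illustrative): with RealMinVLen == 128, a fixed 256-bit vector
// maps to a cost of 2 (an LMUL-2 worth of work), fractional LMULs cost 1,
// and sub-VLEN fixed vectors are clamped to 1 by the std::max below.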
38 if (!VT.isVector())
39 return InstructionCost::getInvalid();
40 unsigned Cost;
41 if (VT.isScalableVector()) {
42 unsigned LMul;
43 bool Fractional;
44 std::tie(LMul, Fractional) =
45 RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
46 if (Fractional)
47 Cost = 1;
48 else
49 Cost = LMul;
50 } else {
51 Cost = VT.getSizeInBits() / ST->getRealMinVLen();
52 }
53 return std::max<unsigned>(Cost, 1);
54 }
55
getIntImmCost(const APInt & Imm,Type * Ty,TTI::TargetCostKind CostKind)56 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
57 TTI::TargetCostKind CostKind) {
58 assert(Ty->isIntegerTy() &&
59 "getIntImmCost can only estimate cost of materialising integers");
60
61 // We have a Zero register, so 0 is always free.
62 if (Imm == 0)
63 return TTI::TCC_Free;
64
65 // Otherwise, we check how many instructions it will take to materialise.
66 const DataLayout &DL = getDataLayout();
67 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
68 getST()->getFeatureBits());
69 }
70
71 // Look for patterns of shift followed by AND that can be turned into a pair of
72 // shifts. We won't need to materialize an immediate for the AND so these can
73 // be considered free.
74 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
75 uint64_t Mask = Imm.getZExtValue();
76 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
77 if (!BO || !BO->hasOneUse())
78 return false;
79
80 if (BO->getOpcode() != Instruction::Shl)
81 return false;
82
83 if (!isa<ConstantInt>(BO->getOperand(1)))
84 return false;
85
86 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
87 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
88 // is a mask shifted by c2 bits with c3 leading zeros.
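// Illustrative example: (and (shl x, 4), 0xff0) has Mask == 0xff0, a shifted
// mask with 4 trailing zeros, so ShAmt == Trailing and the AND folds into an
// slli/srli pair (here slli by 56 then srli by 52 on RV64) with no
// materialized immediate.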
89 if (isShiftedMask_64(Mask)) {
90 unsigned Trailing = countTrailingZeros(Mask);
91 if (ShAmt == Trailing)
92 return true;
93 }
94
95 return false;
96 }
97
98 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
99 const APInt &Imm, Type *Ty,
100 TTI::TargetCostKind CostKind,
101 Instruction *Inst) {
102 assert(Ty->isIntegerTy() &&
103 "getIntImmCost can only estimate cost of materialising integers");
104
105 // We have a Zero register, so 0 is always free.
106 if (Imm == 0)
107 return TTI::TCC_Free;
108
109 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
110 // commutative; in others the immediate must come from a specific argument index.
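// For reference, a 12-bit I-type immediate covers the signed range
// [-2048, 2047]; anything outside that range has to be materialized separately.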
111 bool Takes12BitImm = false;
112 unsigned ImmArgIdx = ~0U;
113
114 switch (Opcode) {
115 case Instruction::GetElementPtr:
116 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
117 // split up large offsets in GEP into better parts than ConstantHoisting
118 // can.
119 return TTI::TCC_Free;
120 case Instruction::And:
121 // zext.h
122 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
123 return TTI::TCC_Free;
124 // zext.w
125 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
126 return TTI::TCC_Free;
127 // bclri
128 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
129 return TTI::TCC_Free;
130 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
131 canUseShiftPair(Inst, Imm))
132 return TTI::TCC_Free;
133 Takes12BitImm = true;
134 break;
135 case Instruction::Add:
136 Takes12BitImm = true;
137 break;
138 case Instruction::Or:
139 case Instruction::Xor:
140 // bseti/binvi
141 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
142 return TTI::TCC_Free;
143 Takes12BitImm = true;
144 break;
145 case Instruction::Mul:
146 // A negated power of 2 can be lowered as a shift and a negate.
147 if (Imm.isNegatedPowerOf2())
148 return TTI::TCC_Free;
149 // FIXME: There is no MULI instruction.
150 Takes12BitImm = true;
151 break;
152 case Instruction::Sub:
153 case Instruction::Shl:
154 case Instruction::LShr:
155 case Instruction::AShr:
156 Takes12BitImm = true;
157 ImmArgIdx = 1;
158 break;
159 default:
160 break;
161 }
162
163 if (Takes12BitImm) {
164 // Check immediate is the correct argument...
165 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
166 // ... and fits into the 12-bit immediate.
167 if (Imm.getMinSignedBits() <= 64 &&
168 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
169 return TTI::TCC_Free;
170 }
171 }
172
173 // Otherwise, use the full materialisation cost.
174 return getIntImmCost(Imm, Ty, CostKind);
175 }
176
177 // By default, prevent hoisting.
178 return TTI::TCC_Free;
179 }
180
181 InstructionCost
182 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
183 const APInt &Imm, Type *Ty,
184 TTI::TargetCostKind CostKind) {
185 // Prevent hoisting in unknown cases.
186 return TTI::TCC_Free;
187 }
188
189 TargetTransformInfo::PopcntSupportKind
190 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
191 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
192 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
193 }
194
195 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
196 // Currently, the ExpandReductions pass can't expand scalable-vector
197 // reductions, but we still request expansion as RVV doesn't support certain
198 // reductions and the SelectionDAG can't legalize them either.
199 switch (II->getIntrinsicID()) {
200 default:
201 return false;
202 // These reductions have no equivalent in RVV
203 case Intrinsic::vector_reduce_mul:
204 case Intrinsic::vector_reduce_fmul:
205 return true;
206 }
207 }
208
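// A rough sketch of the relationship below: the maximum vscale is the largest
// supported VLEN divided by the 64-bit RVV block size, e.g. a 512-bit
// RealMaxVLen yields a maximum vscale of 8.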
209 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
210 if (ST->hasVInstructions())
211 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
212 return BaseT::getMaxVScale();
213 }
214
215 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
216 if (ST->hasVInstructions())
217 if (unsigned MinVLen = ST->getRealMinVLen();
218 MinVLen >= RISCV::RVVBitsPerBlock)
219 return MinVLen / RISCV::RVVBitsPerBlock;
220 return BaseT::getVScaleForTuning();
221 }
222
223 TypeSize
224 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
225 unsigned LMUL = PowerOf2Floor(
226 std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
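// E.g. -riscv-v-register-bit-width-lmul=3 is rounded down to LMUL=2 here, and
// out-of-range values are clamped into [1, 8] before the rounding.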
227 switch (K) {
228 case TargetTransformInfo::RGK_Scalar:
229 return TypeSize::getFixed(ST->getXLen());
230 case TargetTransformInfo::RGK_FixedWidthVector:
231 return TypeSize::getFixed(
232 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
233 case TargetTransformInfo::RGK_ScalableVector:
234 return TypeSize::getScalable(
235 (ST->hasVInstructions() &&
236 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
237 ? LMUL * RISCV::RVVBitsPerBlock
238 : 0);
239 }
240
241 llvm_unreachable("Unsupported register kind");
242 }
243
244 InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
245 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
246
247 unsigned Cost = 2; // vslidedown+vslideup.
248 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
249 // of similar code, but I think we expand through memory.
250 return Cost * LT.first * getLMULCost(LT.second);
251 }
252
253 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
254 VectorType *Tp, ArrayRef<int> Mask,
255 TTI::TargetCostKind CostKind,
256 int Index, VectorType *SubTp,
257 ArrayRef<const Value *> Args) {
258 if (isa<ScalableVectorType>(Tp)) {
259 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
260 switch (Kind) {
261 default:
262 // Fallthrough to generic handling.
263 // TODO: Most of these cases will return getInvalid in generic code, and
264 // must be implemented here.
265 break;
266 case TTI::SK_Broadcast: {
267 return LT.first * 1;
268 }
269 case TTI::SK_Splice:
270 return getSpliceCost(Tp, Index);
271 case TTI::SK_Reverse:
272 // Most of the cost here is producing the vrgather index register
273 // Example sequence:
274 // csrr a0, vlenb
275 // srli a0, a0, 3
276 // addi a0, a0, -1
277 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
278 // vid.v v9
279 // vrsub.vx v10, v9, a0
280 // vrgather.vv v9, v8, v10
281 if (Tp->getElementType()->isIntegerTy(1))
282 // Mask operations additionally require an extend and a truncate
283 return LT.first * 9;
284 return LT.first * 6;
285 }
286 }
287
288 if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
289 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
290 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
291 Instruction::InsertElement);
292 if (LT.second.getScalarSizeInBits() == 1) {
293 if (HasScalar) {
294 // Example sequence:
295 // andi a0, a0, 1
296 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
297 // vmv.v.x v8, a0
298 // vmsne.vi v0, v8, 0
299 return LT.first * getLMULCost(LT.second) * 3;
300 }
301 // Example sequence:
302 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
303 // vmv.v.i v8, 0
304 // vmerge.vim v8, v8, 1, v0
305 // vmv.x.s a0, v8
306 // andi a0, a0, 1
307 // vmv.v.x v8, a0
308 // vmsne.vi v0, v8, 0
309
310 return LT.first * getLMULCost(LT.second) * 6;
311 }
312
313 if (HasScalar) {
314 // Example sequence:
315 // vmv.v.x v8, a0
316 return LT.first * getLMULCost(LT.second);
317 }
318
319 // Example sequence:
320 // vrgather.vi v9, v8, 0
321 // TODO: vrgather could be slower than vmv.v.x. It is
322 // implementation-dependent.
323 return LT.first * getLMULCost(LT.second);
324 }
325
326 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
327 }
328
329 InstructionCost
330 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
331 unsigned AddressSpace,
332 TTI::TargetCostKind CostKind) {
333 if (!isLegalMaskedLoadStore(Src, Alignment) ||
334 CostKind != TTI::TCK_RecipThroughput)
335 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
336 CostKind);
337
338 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
339 }
340
341 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
342 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
343 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
344 if (CostKind != TTI::TCK_RecipThroughput)
345 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
346 Alignment, CostKind, I);
347
348 if ((Opcode == Instruction::Load &&
349 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
350 (Opcode == Instruction::Store &&
351 !isLegalMaskedScatter(DataTy, Align(Alignment))))
352 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
353 Alignment, CostKind, I);
354
355 // Cost is proportional to the number of memory operations implied. For
356 // scalable vectors, we use an estimate on that number since we don't
357 // know exactly what VL will be.
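// Illustrative: a gather of <4 x i32> is costed as 4 independent element
// loads; for scalable vectors the element count is the VLMAX estimate
// returned by getEstimatedVLFor.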
358 auto &VTy = *cast<VectorType>(DataTy);
359 InstructionCost MemOpCost =
360 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
361 {TTI::OK_AnyValue, TTI::OP_None}, I);
362 unsigned NumLoads = getEstimatedVLFor(&VTy);
363 return NumLoads * MemOpCost;
364 }
365
366 // Currently, these represent both throughput and codesize costs
367 // for the respective intrinsics. The costs in this table are simply
368 // instruction counts with the following adjustments made:
369 // * One vsetvli is considered free.
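// Entries are keyed on the legalized MVT, so e.g. llvm.floor on <2 x float>
// and on <16 x float> both look up a cost of 9 here; the LT.first factor at
// the lookup site accounts for any type splitting.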
370 static const CostTblEntry VectorIntrinsicCostTable[]{
371 {Intrinsic::floor, MVT::v2f32, 9},
372 {Intrinsic::floor, MVT::v4f32, 9},
373 {Intrinsic::floor, MVT::v8f32, 9},
374 {Intrinsic::floor, MVT::v16f32, 9},
375 {Intrinsic::floor, MVT::nxv1f32, 9},
376 {Intrinsic::floor, MVT::nxv2f32, 9},
377 {Intrinsic::floor, MVT::nxv4f32, 9},
378 {Intrinsic::floor, MVT::nxv8f32, 9},
379 {Intrinsic::floor, MVT::nxv16f32, 9},
380 {Intrinsic::floor, MVT::v2f64, 9},
381 {Intrinsic::floor, MVT::v4f64, 9},
382 {Intrinsic::floor, MVT::v8f64, 9},
383 {Intrinsic::floor, MVT::v16f64, 9},
384 {Intrinsic::floor, MVT::nxv1f64, 9},
385 {Intrinsic::floor, MVT::nxv2f64, 9},
386 {Intrinsic::floor, MVT::nxv4f64, 9},
387 {Intrinsic::floor, MVT::nxv8f64, 9},
388 {Intrinsic::ceil, MVT::v2f32, 9},
389 {Intrinsic::ceil, MVT::v4f32, 9},
390 {Intrinsic::ceil, MVT::v8f32, 9},
391 {Intrinsic::ceil, MVT::v16f32, 9},
392 {Intrinsic::ceil, MVT::nxv1f32, 9},
393 {Intrinsic::ceil, MVT::nxv2f32, 9},
394 {Intrinsic::ceil, MVT::nxv4f32, 9},
395 {Intrinsic::ceil, MVT::nxv8f32, 9},
396 {Intrinsic::ceil, MVT::nxv16f32, 9},
397 {Intrinsic::ceil, MVT::v2f64, 9},
398 {Intrinsic::ceil, MVT::v4f64, 9},
399 {Intrinsic::ceil, MVT::v8f64, 9},
400 {Intrinsic::ceil, MVT::v16f64, 9},
401 {Intrinsic::ceil, MVT::nxv1f64, 9},
402 {Intrinsic::ceil, MVT::nxv2f64, 9},
403 {Intrinsic::ceil, MVT::nxv4f64, 9},
404 {Intrinsic::ceil, MVT::nxv8f64, 9},
405 {Intrinsic::trunc, MVT::v2f32, 7},
406 {Intrinsic::trunc, MVT::v4f32, 7},
407 {Intrinsic::trunc, MVT::v8f32, 7},
408 {Intrinsic::trunc, MVT::v16f32, 7},
409 {Intrinsic::trunc, MVT::nxv1f32, 7},
410 {Intrinsic::trunc, MVT::nxv2f32, 7},
411 {Intrinsic::trunc, MVT::nxv4f32, 7},
412 {Intrinsic::trunc, MVT::nxv8f32, 7},
413 {Intrinsic::trunc, MVT::nxv16f32, 7},
414 {Intrinsic::trunc, MVT::v2f64, 7},
415 {Intrinsic::trunc, MVT::v4f64, 7},
416 {Intrinsic::trunc, MVT::v8f64, 7},
417 {Intrinsic::trunc, MVT::v16f64, 7},
418 {Intrinsic::trunc, MVT::nxv1f64, 7},
419 {Intrinsic::trunc, MVT::nxv2f64, 7},
420 {Intrinsic::trunc, MVT::nxv4f64, 7},
421 {Intrinsic::trunc, MVT::nxv8f64, 7},
422 {Intrinsic::round, MVT::v2f32, 9},
423 {Intrinsic::round, MVT::v4f32, 9},
424 {Intrinsic::round, MVT::v8f32, 9},
425 {Intrinsic::round, MVT::v16f32, 9},
426 {Intrinsic::round, MVT::nxv1f32, 9},
427 {Intrinsic::round, MVT::nxv2f32, 9},
428 {Intrinsic::round, MVT::nxv4f32, 9},
429 {Intrinsic::round, MVT::nxv8f32, 9},
430 {Intrinsic::round, MVT::nxv16f32, 9},
431 {Intrinsic::round, MVT::v2f64, 9},
432 {Intrinsic::round, MVT::v4f64, 9},
433 {Intrinsic::round, MVT::v8f64, 9},
434 {Intrinsic::round, MVT::v16f64, 9},
435 {Intrinsic::round, MVT::nxv1f64, 9},
436 {Intrinsic::round, MVT::nxv2f64, 9},
437 {Intrinsic::round, MVT::nxv4f64, 9},
438 {Intrinsic::round, MVT::nxv8f64, 9},
439 {Intrinsic::roundeven, MVT::v2f32, 9},
440 {Intrinsic::roundeven, MVT::v4f32, 9},
441 {Intrinsic::roundeven, MVT::v8f32, 9},
442 {Intrinsic::roundeven, MVT::v16f32, 9},
443 {Intrinsic::roundeven, MVT::nxv1f32, 9},
444 {Intrinsic::roundeven, MVT::nxv2f32, 9},
445 {Intrinsic::roundeven, MVT::nxv4f32, 9},
446 {Intrinsic::roundeven, MVT::nxv8f32, 9},
447 {Intrinsic::roundeven, MVT::nxv16f32, 9},
448 {Intrinsic::roundeven, MVT::v2f64, 9},
449 {Intrinsic::roundeven, MVT::v4f64, 9},
450 {Intrinsic::roundeven, MVT::v8f64, 9},
451 {Intrinsic::roundeven, MVT::v16f64, 9},
452 {Intrinsic::roundeven, MVT::nxv1f64, 9},
453 {Intrinsic::roundeven, MVT::nxv2f64, 9},
454 {Intrinsic::roundeven, MVT::nxv4f64, 9},
455 {Intrinsic::roundeven, MVT::nxv8f64, 9},
456 {Intrinsic::bswap, MVT::v2i16, 3},
457 {Intrinsic::bswap, MVT::v4i16, 3},
458 {Intrinsic::bswap, MVT::v8i16, 3},
459 {Intrinsic::bswap, MVT::v16i16, 3},
460 {Intrinsic::bswap, MVT::nxv1i16, 3},
461 {Intrinsic::bswap, MVT::nxv2i16, 3},
462 {Intrinsic::bswap, MVT::nxv4i16, 3},
463 {Intrinsic::bswap, MVT::nxv8i16, 3},
464 {Intrinsic::bswap, MVT::nxv16i16, 3},
465 {Intrinsic::bswap, MVT::v2i32, 12},
466 {Intrinsic::bswap, MVT::v4i32, 12},
467 {Intrinsic::bswap, MVT::v8i32, 12},
468 {Intrinsic::bswap, MVT::v16i32, 12},
469 {Intrinsic::bswap, MVT::nxv1i32, 12},
470 {Intrinsic::bswap, MVT::nxv2i32, 12},
471 {Intrinsic::bswap, MVT::nxv4i32, 12},
472 {Intrinsic::bswap, MVT::nxv8i32, 12},
473 {Intrinsic::bswap, MVT::nxv16i32, 12},
474 {Intrinsic::bswap, MVT::v2i64, 31},
475 {Intrinsic::bswap, MVT::v4i64, 31},
476 {Intrinsic::bswap, MVT::v8i64, 31},
477 {Intrinsic::bswap, MVT::v16i64, 31},
478 {Intrinsic::bswap, MVT::nxv1i64, 31},
479 {Intrinsic::bswap, MVT::nxv2i64, 31},
480 {Intrinsic::bswap, MVT::nxv4i64, 31},
481 {Intrinsic::bswap, MVT::nxv8i64, 31},
482 {Intrinsic::vp_bswap, MVT::v2i16, 3},
483 {Intrinsic::vp_bswap, MVT::v4i16, 3},
484 {Intrinsic::vp_bswap, MVT::v8i16, 3},
485 {Intrinsic::vp_bswap, MVT::v16i16, 3},
486 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
487 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
488 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
489 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
490 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
491 {Intrinsic::vp_bswap, MVT::v2i32, 12},
492 {Intrinsic::vp_bswap, MVT::v4i32, 12},
493 {Intrinsic::vp_bswap, MVT::v8i32, 12},
494 {Intrinsic::vp_bswap, MVT::v16i32, 12},
495 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
496 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
497 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
498 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
499 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
500 {Intrinsic::vp_bswap, MVT::v2i64, 31},
501 {Intrinsic::vp_bswap, MVT::v4i64, 31},
502 {Intrinsic::vp_bswap, MVT::v8i64, 31},
503 {Intrinsic::vp_bswap, MVT::v16i64, 31},
504 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
505 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
506 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
507 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
508 {Intrinsic::vp_fshl, MVT::v2i8, 7},
509 {Intrinsic::vp_fshl, MVT::v4i8, 7},
510 {Intrinsic::vp_fshl, MVT::v8i8, 7},
511 {Intrinsic::vp_fshl, MVT::v16i8, 7},
512 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
513 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
514 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
515 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
516 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
517 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
518 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
519 {Intrinsic::vp_fshl, MVT::v2i16, 7},
520 {Intrinsic::vp_fshl, MVT::v4i16, 7},
521 {Intrinsic::vp_fshl, MVT::v8i16, 7},
522 {Intrinsic::vp_fshl, MVT::v16i16, 7},
523 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
524 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
525 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
526 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
527 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
528 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
529 {Intrinsic::vp_fshl, MVT::v2i32, 7},
530 {Intrinsic::vp_fshl, MVT::v4i32, 7},
531 {Intrinsic::vp_fshl, MVT::v8i32, 7},
532 {Intrinsic::vp_fshl, MVT::v16i32, 7},
533 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
534 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
535 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
536 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
537 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
538 {Intrinsic::vp_fshl, MVT::v2i64, 7},
539 {Intrinsic::vp_fshl, MVT::v4i64, 7},
540 {Intrinsic::vp_fshl, MVT::v8i64, 7},
541 {Intrinsic::vp_fshl, MVT::v16i64, 7},
542 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
543 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
544 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
545 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
546 {Intrinsic::vp_fshr, MVT::v2i8, 7},
547 {Intrinsic::vp_fshr, MVT::v4i8, 7},
548 {Intrinsic::vp_fshr, MVT::v8i8, 7},
549 {Intrinsic::vp_fshr, MVT::v16i8, 7},
550 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
551 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
552 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
553 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
554 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
555 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
556 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
557 {Intrinsic::vp_fshr, MVT::v2i16, 7},
558 {Intrinsic::vp_fshr, MVT::v4i16, 7},
559 {Intrinsic::vp_fshr, MVT::v8i16, 7},
560 {Intrinsic::vp_fshr, MVT::v16i16, 7},
561 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
562 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
563 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
564 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
565 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
566 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
567 {Intrinsic::vp_fshr, MVT::v2i32, 7},
568 {Intrinsic::vp_fshr, MVT::v4i32, 7},
569 {Intrinsic::vp_fshr, MVT::v8i32, 7},
570 {Intrinsic::vp_fshr, MVT::v16i32, 7},
571 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
572 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
573 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
574 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
575 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
576 {Intrinsic::vp_fshr, MVT::v2i64, 7},
577 {Intrinsic::vp_fshr, MVT::v4i64, 7},
578 {Intrinsic::vp_fshr, MVT::v8i64, 7},
579 {Intrinsic::vp_fshr, MVT::v16i64, 7},
580 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
581 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
582 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
583 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
584 {Intrinsic::bitreverse, MVT::v2i8, 17},
585 {Intrinsic::bitreverse, MVT::v4i8, 17},
586 {Intrinsic::bitreverse, MVT::v8i8, 17},
587 {Intrinsic::bitreverse, MVT::v16i8, 17},
588 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
589 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
590 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
591 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
592 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
593 {Intrinsic::bitreverse, MVT::v2i16, 24},
594 {Intrinsic::bitreverse, MVT::v4i16, 24},
595 {Intrinsic::bitreverse, MVT::v8i16, 24},
596 {Intrinsic::bitreverse, MVT::v16i16, 24},
597 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
598 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
599 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
600 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
601 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
602 {Intrinsic::bitreverse, MVT::v2i32, 33},
603 {Intrinsic::bitreverse, MVT::v4i32, 33},
604 {Intrinsic::bitreverse, MVT::v8i32, 33},
605 {Intrinsic::bitreverse, MVT::v16i32, 33},
606 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
607 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
608 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
609 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
610 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
611 {Intrinsic::bitreverse, MVT::v2i64, 52},
612 {Intrinsic::bitreverse, MVT::v4i64, 52},
613 {Intrinsic::bitreverse, MVT::v8i64, 52},
614 {Intrinsic::bitreverse, MVT::v16i64, 52},
615 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
616 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
617 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
618 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
619 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
620 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
621 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
622 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
623 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
624 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
625 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
626 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
627 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
628 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
629 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
630 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
631 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
632 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
633 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
634 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
635 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
636 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
637 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
638 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
639 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
640 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
641 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
642 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
643 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
644 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
645 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
646 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
647 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
648 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
649 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
650 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
651 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
652 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
653 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
654 {Intrinsic::ctpop, MVT::v2i8, 12},
655 {Intrinsic::ctpop, MVT::v4i8, 12},
656 {Intrinsic::ctpop, MVT::v8i8, 12},
657 {Intrinsic::ctpop, MVT::v16i8, 12},
658 {Intrinsic::ctpop, MVT::nxv1i8, 12},
659 {Intrinsic::ctpop, MVT::nxv2i8, 12},
660 {Intrinsic::ctpop, MVT::nxv4i8, 12},
661 {Intrinsic::ctpop, MVT::nxv8i8, 12},
662 {Intrinsic::ctpop, MVT::nxv16i8, 12},
663 {Intrinsic::ctpop, MVT::v2i16, 19},
664 {Intrinsic::ctpop, MVT::v4i16, 19},
665 {Intrinsic::ctpop, MVT::v8i16, 19},
666 {Intrinsic::ctpop, MVT::v16i16, 19},
667 {Intrinsic::ctpop, MVT::nxv1i16, 19},
668 {Intrinsic::ctpop, MVT::nxv2i16, 19},
669 {Intrinsic::ctpop, MVT::nxv4i16, 19},
670 {Intrinsic::ctpop, MVT::nxv8i16, 19},
671 {Intrinsic::ctpop, MVT::nxv16i16, 19},
672 {Intrinsic::ctpop, MVT::v2i32, 20},
673 {Intrinsic::ctpop, MVT::v4i32, 20},
674 {Intrinsic::ctpop, MVT::v8i32, 20},
675 {Intrinsic::ctpop, MVT::v16i32, 20},
676 {Intrinsic::ctpop, MVT::nxv1i32, 20},
677 {Intrinsic::ctpop, MVT::nxv2i32, 20},
678 {Intrinsic::ctpop, MVT::nxv4i32, 20},
679 {Intrinsic::ctpop, MVT::nxv8i32, 20},
680 {Intrinsic::ctpop, MVT::nxv16i32, 20},
681 {Intrinsic::ctpop, MVT::v2i64, 21},
682 {Intrinsic::ctpop, MVT::v4i64, 21},
683 {Intrinsic::ctpop, MVT::v8i64, 21},
684 {Intrinsic::ctpop, MVT::v16i64, 21},
685 {Intrinsic::ctpop, MVT::nxv1i64, 21},
686 {Intrinsic::ctpop, MVT::nxv2i64, 21},
687 {Intrinsic::ctpop, MVT::nxv4i64, 21},
688 {Intrinsic::ctpop, MVT::nxv8i64, 21},
689 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
690 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
691 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
692 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
693 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
694 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
695 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
696 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
697 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
698 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
699 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
700 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
701 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
702 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
703 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
704 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
705 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
706 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
707 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
708 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
709 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
710 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
711 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
712 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
713 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
714 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
715 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
716 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
717 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
718 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
719 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
720 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
721 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
722 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
723 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
724 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
725 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
726 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
727 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
728 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
729 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
730 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
731 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
732 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
733 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
734 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
735 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
736 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
737 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
738 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
739 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
740 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
741 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
742 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
743 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
744 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
745 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
746 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
747 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
748 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
749 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
750 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
751 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
752 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
753 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
754 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
755 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
756 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
757 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
758 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
759 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
760 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
761 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
762 {Intrinsic::vp_cttz, MVT::v2i8, 16},
763 {Intrinsic::vp_cttz, MVT::v4i8, 16},
764 {Intrinsic::vp_cttz, MVT::v8i8, 16},
765 {Intrinsic::vp_cttz, MVT::v16i8, 16},
766 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
767 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
768 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
769 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
770 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
771 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
772 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
773 {Intrinsic::vp_cttz, MVT::v2i16, 23},
774 {Intrinsic::vp_cttz, MVT::v4i16, 23},
775 {Intrinsic::vp_cttz, MVT::v8i16, 23},
776 {Intrinsic::vp_cttz, MVT::v16i16, 23},
777 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
778 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
779 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
780 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
781 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
782 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
783 {Intrinsic::vp_cttz, MVT::v2i32, 24},
784 {Intrinsic::vp_cttz, MVT::v4i32, 24},
785 {Intrinsic::vp_cttz, MVT::v8i32, 24},
786 {Intrinsic::vp_cttz, MVT::v16i32, 24},
787 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
788 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
789 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
790 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
791 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
792 {Intrinsic::vp_cttz, MVT::v2i64, 25},
793 {Intrinsic::vp_cttz, MVT::v4i64, 25},
794 {Intrinsic::vp_cttz, MVT::v8i64, 25},
795 {Intrinsic::vp_cttz, MVT::v16i64, 25},
796 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
797 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
798 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
799 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
800 };
801
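// Map a VP intrinsic ID to the corresponding VP SelectionDAG opcode via the
// table in VPIntrinsics.def; non-VP intrinsics fall through to
// ISD::DELETED_NODE.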
802 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
803 switch (ID) {
804 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
805 case Intrinsic::VPID: \
806 return ISD::VPSD;
807 #include "llvm/IR/VPIntrinsics.def"
808 #undef HELPER_MAP_VPID_TO_VPSD
809 }
810 return ISD::DELETED_NODE;
811 }
812
813 InstructionCost
814 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
815 TTI::TargetCostKind CostKind) {
816 auto *RetTy = ICA.getReturnType();
817 switch (ICA.getID()) {
818 case Intrinsic::ceil:
819 case Intrinsic::floor:
820 case Intrinsic::trunc:
821 case Intrinsic::rint:
822 case Intrinsic::round:
823 case Intrinsic::roundeven: {
824 // These all use the same code.
825 auto LT = getTypeLegalizationCost(RetTy);
826 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
827 return LT.first * 8;
828 break;
829 }
830 case Intrinsic::umin:
831 case Intrinsic::umax:
832 case Intrinsic::smin:
833 case Intrinsic::smax: {
834 auto LT = getTypeLegalizationCost(RetTy);
835 if ((ST->hasVInstructions() && LT.second.isVector()) ||
836 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
837 return LT.first;
838 break;
839 }
840 case Intrinsic::sadd_sat:
841 case Intrinsic::ssub_sat:
842 case Intrinsic::uadd_sat:
843 case Intrinsic::usub_sat: {
844 auto LT = getTypeLegalizationCost(RetTy);
845 if (ST->hasVInstructions() && LT.second.isVector())
846 return LT.first;
847 break;
848 }
849 case Intrinsic::abs: {
850 auto LT = getTypeLegalizationCost(RetTy);
851 if (ST->hasVInstructions() && LT.second.isVector()) {
852 // vrsub.vi v10, v8, 0
853 // vmax.vv v8, v8, v10
854 return LT.first * 2;
855 }
856 break;
857 }
858 case Intrinsic::fabs:
859 case Intrinsic::sqrt: {
860 auto LT = getTypeLegalizationCost(RetTy);
861 if (ST->hasVInstructions() && LT.second.isVector())
862 return LT.first;
863 break;
864 }
865 // TODO: add more intrinsics
866 case Intrinsic::experimental_stepvector: {
867 unsigned Cost = 1; // vid
868 auto LT = getTypeLegalizationCost(RetTy);
869 return Cost + (LT.first - 1);
870 }
871 case Intrinsic::vp_rint: {
872 // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
873 unsigned Cost = 5;
874 auto LT = getTypeLegalizationCost(RetTy);
875 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
876 return Cost * LT.first;
877 break;
878 }
879 case Intrinsic::vp_nearbyint: {
880 // One more read and one more write of fflags than vp_rint.
881 unsigned Cost = 7;
882 auto LT = getTypeLegalizationCost(RetTy);
883 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
884 return Cost * LT.first;
885 break;
886 }
887 case Intrinsic::vp_ceil:
888 case Intrinsic::vp_floor:
889 case Intrinsic::vp_round:
890 case Intrinsic::vp_roundeven:
891 case Intrinsic::vp_roundtozero: {
892 // Rounding with a static rounding mode needs two more instructions than
893 // vp_rint to swap/write FRM.
894 unsigned Cost = 7;
895 auto LT = getTypeLegalizationCost(RetTy);
896 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
897 if (TLI->isOperationCustom(VPISD, LT.second))
898 return Cost * LT.first;
899 break;
900 }
901 }
902
903 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
904 auto LT = getTypeLegalizationCost(RetTy);
905 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
906 ICA.getID(), LT.second))
907 return LT.first * Entry->Cost;
908 }
909
910 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
911 }
912
913 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
914 Type *Src,
915 TTI::CastContextHint CCH,
916 TTI::TargetCostKind CostKind,
917 const Instruction *I) {
918 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
919 // FIXME: Need to compute legalizing cost for illegal types.
920 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
921 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
922
923 // Skip if element size of Dst or Src is bigger than ELEN.
924 if (Src->getScalarSizeInBits() > ST->getELEN() ||
925 Dst->getScalarSizeInBits() > ST->getELEN())
926 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
927
928 int ISD = TLI->InstructionOpcodeToISD(Opcode);
929 assert(ISD && "Invalid opcode");
930
931 // FIXME: Need to consider vsetvli and lmul.
932 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
933 (int)Log2_32(Src->getScalarSizeInBits());
934 switch (ISD) {
935 case ISD::SIGN_EXTEND:
936 case ISD::ZERO_EXTEND:
937 if (Src->getScalarSizeInBits() == 1) {
938 // We do not use vsext/vzext to extend from a mask vector; instead we use
939 // the following instructions to extend from a mask vector:
940 // vmv.v.i v8, 0
941 // vmerge.vim v8, v8, -1, v0
942 return 2;
943 }
944 return 1;
945 case ISD::TRUNCATE:
946 if (Dst->getScalarSizeInBits() == 1) {
947 // We cannot use a series of vncvt instructions to truncate to a mask vector,
948 // so PowDiff does not apply here.
949 // Instead we use the following instructions to truncate to a mask vector:
950 // vand.vi v8, v8, 1
951 // vmsne.vi v0, v8, 0
952 return 2;
953 }
954 [[fallthrough]];
955 case ISD::FP_EXTEND:
956 case ISD::FP_ROUND:
957 // Counts of narrow/widen instructions.
958 return std::abs(PowDiff);
959 case ISD::FP_TO_SINT:
960 case ISD::FP_TO_UINT:
961 case ISD::SINT_TO_FP:
962 case ISD::UINT_TO_FP:
963 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
964 // The cost of converting from or to a mask vector is different from the
965 // other cases, so we cannot use PowDiff to calculate it.
966 // For mask vector to fp, we use the following instructions:
967 // vmv.v.i v8, 0
968 // vmerge.vim v8, v8, -1, v0
969 // vfcvt.f.x.v v8, v8
970
971 // And for fp vector to mask, we use:
972 // vfncvt.rtz.x.f.w v9, v8
973 // vand.vi v8, v9, 1
974 // vmsne.vi v0, v8, 0
975 return 3;
976 }
977 if (std::abs(PowDiff) <= 1)
978 return 1;
979 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
980 // so only two conversions are needed.
981 if (Src->isIntOrIntVectorTy())
982 return 2;
983 // Counts of narrow/widen instructions.
984 return std::abs(PowDiff);
985 }
986 }
987 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
988 }
989
990 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
991 if (isa<ScalableVectorType>(Ty)) {
992 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
993 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
994 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
995 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
996 }
997 return cast<FixedVectorType>(Ty)->getNumElements();
998 }
999
1000 InstructionCost
1001 RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1002 bool IsUnsigned,
1003 TTI::TargetCostKind CostKind) {
1004 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1005 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1006
1007 // Skip if scalar size of Ty is bigger than ELEN.
1008 if (Ty->getScalarSizeInBits() > ST->getELEN())
1009 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1010
1011 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1012 if (Ty->getElementType()->isIntegerTy(1))
1013 // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
1014 // cost 2, but we don't have enough info here, so we slightly overestimate.
1015 return (LT.first - 1) + 3;
1016
1017 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
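// The Log2_32_Ceil(VL) term below is a rough, assumed model of the reduction
// cost growing with the estimated vector length (akin to a log-depth
// reduction tree); it is an approximation, not a measured figure.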
1018 InstructionCost BaseCost = 2;
1019 unsigned VL = getEstimatedVLFor(Ty);
1020 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1021 }
1022
1023 InstructionCost
1024 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1025 std::optional<FastMathFlags> FMF,
1026 TTI::TargetCostKind CostKind) {
1027 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1028 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1029
1030 // Skip if scalar size of Ty is bigger than ELEN.
1031 if (Ty->getScalarSizeInBits() > ST->getELEN())
1032 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1033
1034 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1035 assert(ISD && "Invalid opcode");
1036
1037 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1038 ISD != ISD::FADD)
1039 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1040
1041 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1042 if (Ty->getElementType()->isIntegerTy(1))
1043 // vcpop sequences, see vreduction-mask.ll
1044 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1045
1046 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1047 InstructionCost BaseCost = 2;
1048 unsigned VL = getEstimatedVLFor(Ty);
1049 if (TTI::requiresOrderedReduction(FMF))
1050 return (LT.first - 1) + BaseCost + VL;
1051 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1052 }
1053
1054 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1055 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1056 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1057 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1058 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1059 FMF, CostKind);
1060
1061 // Skip if scalar size of ResTy is bigger than ELEN.
1062 if (ResTy->getScalarSizeInBits() > ST->getELEN())
1063 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1064 FMF, CostKind);
1065
1066 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1067 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1068 FMF, CostKind);
1069
1070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1071
1072 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1073 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1074 FMF, CostKind);
1075
1076 return (LT.first - 1) +
1077 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1078 }
1079
1080 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1081 TTI::OperandValueInfo OpInfo,
1082 TTI::TargetCostKind CostKind) {
1083 assert(OpInfo.isConstant() && "non constant operand?");
1084 if (!isa<VectorType>(Ty))
1085 // FIXME: We need to account for immediate materialization here, but doing
1086 // a decent job requires more knowledge about the immediate than we
1087 // currently have here.
1088 return 0;
1089
1090 if (OpInfo.isUniform())
1091 // vmv.x.i, vmv.v.x, or vfmv.v.f
1092 // We ignore the cost of the scalar constant materialization to be consistent
1093 // with how we treat scalar constants themselves just above.
1094 return 1;
1095
1096 // Add a cost of address generation + the cost of the vector load. The
1097 // address is expected to be a PC relative offset to a constant pool entry
1098 // using auipc/addi.
1099 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1100 /*AddressSpace=*/0, CostKind);
1101 }
1102
1103
1104 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1105 MaybeAlign Alignment,
1106 unsigned AddressSpace,
1107 TTI::TargetCostKind CostKind,
1108 TTI::OperandValueInfo OpInfo,
1109 const Instruction *I) {
1110 InstructionCost Cost = 0;
1111 if (Opcode == Instruction::Store && OpInfo.isConstant())
1112 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1113 return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1114 CostKind, OpInfo, I);
1115 }
1116
1117 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1118 Type *CondTy,
1119 CmpInst::Predicate VecPred,
1120 TTI::TargetCostKind CostKind,
1121 const Instruction *I) {
1122 if (CostKind != TTI::TCK_RecipThroughput)
1123 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1124 I);
1125
1126 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1127 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1128 I);
1129
1130 // Skip if scalar size of ValTy is bigger than ELEN.
1131 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1132 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1133 I);
1134
1135 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1136 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1137 if (CondTy->isVectorTy()) {
1138 if (ValTy->getScalarSizeInBits() == 1) {
1139 // vmandn.mm v8, v8, v9
1140 // vmand.mm v9, v0, v9
1141 // vmor.mm v0, v9, v8
1142 return LT.first * 3;
1143 }
1144 // vselect and max/min are supported natively.
1145 return LT.first * 1;
1146 }
1147
1148 if (ValTy->getScalarSizeInBits() == 1) {
1149 // vmv.v.x v9, a0
1150 // vmsne.vi v9, v9, 0
1151 // vmandn.mm v8, v8, v9
1152 // vmand.mm v9, v0, v9
1153 // vmor.mm v0, v9, v8
1154 return LT.first * 5;
1155 }
1156
1157 // vmv.v.x v10, a0
1158 // vmsne.vi v0, v10, 0
1159 // vmerge.vvm v8, v9, v8, v0
1160 return LT.first * 3;
1161 }
1162
1163 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1164 ValTy->isVectorTy()) {
1165 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1166
1167 // Supported natively.
1168 if (CmpInst::isIntPredicate(VecPred))
1169 return LT.first * 1;
1170
1171 // If we do not support the input floating-point vector type, use the base
1172 // implementation, which calculates the cost as:
1173 // ScalarizeCost + Num * Cost for fixed vectors,
1174 // InvalidCost for scalable vectors.
1175 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1176 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1177 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1178 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1179 I);
1180 switch (VecPred) {
1181 // Supported natively.
1182 case CmpInst::FCMP_OEQ:
1183 case CmpInst::FCMP_OGT:
1184 case CmpInst::FCMP_OGE:
1185 case CmpInst::FCMP_OLT:
1186 case CmpInst::FCMP_OLE:
1187 case CmpInst::FCMP_UNE:
1188 return LT.first * 1;
1189 // TODO: Other comparisons?
1190 default:
1191 break;
1192 }
1193 }
1194
1195 // TODO: Add cost for scalar type.
1196
1197 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1198 }
1199
1200 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1201 TTI::TargetCostKind CostKind,
1202 unsigned Index, Value *Op0,
1203 Value *Op1) {
1204 assert(Val->isVectorTy() && "This must be a vector type");
1205
1206 if (Opcode != Instruction::ExtractElement &&
1207 Opcode != Instruction::InsertElement)
1208 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1209
1210 // Legalize the type.
1211 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1212
1213 // This type is legalized to a scalar type.
1214 if (!LT.second.isVector())
1215 return 0;
1216
1217 // For unsupported scalable vector.
1218 if (LT.second.isScalableVector() && !LT.first.isValid())
1219 return LT.first;
1220
1221 if (!isTypeLegal(Val))
1222 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1223
1224 // In RVV, we can use vslidedown + vmv.x.s to extract an element from a vector
1225 // and vslideup + vmv.s.x to insert an element into a vector.
1226 unsigned BaseCost = 1;
1227 // For insertelement we also need an addi to compute index + 1, which feeds the vslideup.
1228 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1229
1230 if (Index != -1U) {
1231 // The type may be split. For fixed-width vectors we can normalize the
1232 // index to the new type.
1233 if (LT.second.isFixedLengthVector()) {
1234 unsigned Width = LT.second.getVectorNumElements();
1235 Index = Index % Width;
1236 }
1237
1238 // We could extract/insert the first element without vslidedown/vslideup.
1239 if (Index == 0)
1240 SlideCost = 0;
1241 else if (Opcode == Instruction::InsertElement)
1242 SlideCost = 1; // With a constant index, we do not need to use addi.
1243 }
1244
1245 // Mask vector extract/insert element is different from normal case.
1246 if (Val->getScalarSizeInBits() == 1) {
1247 // For extractelement, we need the following instructions:
1248 // vmv.v.i v8, 0
1249 // vmerge.vim v8, v8, 1, v0
1250 // vsetivli zero, 1, e8, m2, ta, mu (not count)
1251 // vslidedown.vx v8, v8, a0
1252 // vmv.x.s a0, v8
1253
1254 // For insertelement, we need the following instructions:
1255 // vsetvli a2, zero, e8, m1, ta, mu (not count)
1256 // vmv.s.x v8, a0
1257 // vmv.v.i v9, 0
1258 // vmerge.vim v9, v9, 1, v0
1259 // addi a0, a1, 1
1260 // vsetvli zero, a0, e8, m1, tu, mu (not count)
1261 // vslideup.vx v9, v8, a1
1262 // vsetvli a0, zero, e8, m1, ta, mu (not count)
1263 // vand.vi v8, v9, 1
1264 // vmsne.vi v0, v8, 0
1265
1266 // TODO: should we count these special vsetvlis?
1267 BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
1268 }
1269 // Extracting or inserting an i64 element on a target with XLEN=32 needs more instructions.
1270 if (Val->getScalarType()->isIntegerTy() &&
1271 ST->getXLen() < Val->getScalarSizeInBits()) {
1272 // For extractelement, we need the following instructions:
1273 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1274 // vslidedown.vx v8, v8, a0
1275 // vmv.x.s a0, v8
1276 // li a1, 32
1277 // vsrl.vx v8, v8, a1
1278 // vmv.x.s a1, v8
1279
1280 // For insertelement, we need the following instructions:
1281 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1282 // vmv.v.i v12, 0
1283 // vslide1up.vx v16, v12, a1
1284 // vslide1up.vx v12, v16, a0
1285 // addi a0, a2, 1
1286 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1287 // vslideup.vx v8, v12, a2
1288
1289 // TODO: should we count these special vsetvlis?
1290 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1291 }
1292 return BaseCost + SlideCost;
1293 }
1294
1295 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1296 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1297 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1298 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1299
1300 // TODO: Handle more cost kinds.
1301 if (CostKind != TTI::TCK_RecipThroughput)
1302 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1303 Args, CxtI);
1304
1305 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1306 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1307 Args, CxtI);
1308
1309 // Skip if scalar size of Ty is bigger than ELEN.
1310 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1311 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1312 Args, CxtI);
1313
1314 // Legalize the type.
1315 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1316
1317 // TODO: Handle scalar type.
1318 if (!LT.second.isVector())
1319 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1320 Args, CxtI);
1321
1322
1323 auto getConstantMatCost =
1324 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1325 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1326 // Two sub-cases:
1327 // * Has a 5-bit immediate operand which can be splatted.
1328 // * Has a larger immediate which must be materialized in a scalar register.
1329 // We return 0 for both as we currently ignore the cost of materializing
1330 // scalar constants in GPRs.
1331 return 0;
1332
1333 // Add a cost of address generation + the cost of the vector load. The
1334 // address is expected to be a PC relative offset to a constant pool entry
1335 // using auipc/addi.
1336 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1337 /*AddressSpace=*/0, CostKind);
1338 };
1339
1340 // Add the cost of materializing any constant vectors required.
1341 InstructionCost ConstantMatCost = 0;
1342 if (Op1Info.isConstant())
1343 ConstantMatCost += getConstantMatCost(0, Op1Info);
1344 if (Op2Info.isConstant())
1345 ConstantMatCost += getConstantMatCost(1, Op2Info);
1346
1347 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1348 case ISD::ADD:
1349 case ISD::SUB:
1350 case ISD::AND:
1351 case ISD::OR:
1352 case ISD::XOR:
1353 case ISD::SHL:
1354 case ISD::SRL:
1355 case ISD::SRA:
1356 case ISD::MUL:
1357 case ISD::MULHS:
1358 case ISD::MULHU:
1359 case ISD::FADD:
1360 case ISD::FSUB:
1361 case ISD::FMUL:
1362 case ISD::FNEG: {
1363 return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1364 }
1365 default:
1366 return ConstantMatCost +
1367 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1368 Args, CxtI);
1369 }
1370 }
1371
1372 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1373 TTI::UnrollingPreferences &UP,
1374 OptimizationRemarkEmitter *ORE) {
1375 // TODO: All of the settings below would benefit from more tuning on
1376 // benchmarks and metrics, with changes applied as needed to improve performance.
1377
1378
1379 if (ST->enableDefaultUnroll())
1380 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1381
1382 // Enable upper-bound unrolling universally, not dependent upon the conditions
1383 // below.
1384 UP.UpperBound = true;
1385
1386 // Disable loop unrolling for Oz and Os.
1387 UP.OptSizeThreshold = 0;
1388 UP.PartialOptSizeThreshold = 0;
1389 if (L->getHeader()->getParent()->hasOptSize())
1390 return;
1391
1392 SmallVector<BasicBlock *, 4> ExitingBlocks;
1393 L->getExitingBlocks(ExitingBlocks);
1394 LLVM_DEBUG(dbgs() << "Loop has:\n"
1395 << "Blocks: " << L->getNumBlocks() << "\n"
1396 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1397
1398 // Only allow at most one exit other than the latch. This acts as an early
1399 // exit, as it mirrors the profitability calculation of the runtime unroller.
1400 if (ExitingBlocks.size() > 2)
1401 return;
1402
1403 // Limit the CFG of the loop body for targets with a branch predictor.
1404 // Allowing 4 blocks permits if-then-else diamonds in the body.
1405 if (L->getNumBlocks() > 4)
1406 return;
1407
1408 // Don't unroll vectorized loops, including the remainder loop
1409 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1410 return;
1411
1412 // Scan the loop: don't unroll loops with calls as this could prevent
1413 // inlining.
1414 InstructionCost Cost = 0;
1415 for (auto *BB : L->getBlocks()) {
1416 for (auto &I : *BB) {
1417 // Initial setting - Don't unroll loops containing vectorized
1418 // instructions.
1419 if (I.getType()->isVectorTy())
1420 return;
1421
1422 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1423 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1424 if (!isLoweredToCall(F))
1425 continue;
1426 }
1427 return;
1428 }
1429
1430 SmallVector<const Value *> Operands(I.operand_values());
1431 Cost += getInstructionCost(&I, Operands,
1432 TargetTransformInfo::TCK_SizeAndLatency);
1433 }
1434 }
1435
1436 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1437
1438 UP.Partial = true;
1439 UP.Runtime = true;
1440 UP.UnrollRemainder = true;
1441 UP.UnrollAndJam = true;
1442 UP.UnrollAndJamInnerLoopThreshold = 60;
1443
1444 // Forcing unrolling of small loops can be very useful because of the
1445 // taken-branch cost of the backedge.
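// The threshold of 12 is in TCK_SizeAndLatency units, matching the
// per-instruction costs accumulated over the loop body above.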
1446 if (Cost < 12)
1447 UP.Force = true;
1448 }
1449
1450 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1451 TTI::PeelingPreferences &PP) {
1452 BaseT::getPeelingPreferences(L, SE, PP);
1453 }
1454
1455 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1456 TypeSize Size = DL.getTypeSizeInBits(Ty);
1457 if (Ty->isVectorTy()) {
1458 if (Size.isScalable() && ST->hasVInstructions())
1459 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
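// (Illustrative: a <vscale x 8 x i64> value has 512 known-min bits and thus
// occupies 8 RVV register blocks.)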
1460
1461 if (ST->useRVVForFixedLengthVectors())
1462 return divideCeil(Size, ST->getRealMinVLen());
1463 }
1464
1465 return BaseT::getRegUsageForType(Ty);
1466 }
1467
1468 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1469 // This interface is currently only used by SLP. Returning 1 (which is the
1470 // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
1471 // problem with constant materialization which causes SLP to perform highly
1472 // unprofitable transformations.
1473 // TODO: Figure out constant materialization cost modeling and remove this workaround.
1474 return SLPMaxVF;
1475 }
1476
1477 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1478 const TargetTransformInfo::LSRCost &C2) {
1479 // RISC-V specific: the instruction count has first priority; the remaining fields break ties in order.
1480 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1481 C1.NumIVMuls, C1.NumBaseAdds,
1482 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1483 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1484 C2.NumIVMuls, C2.NumBaseAdds,
1485 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1486 }
1487