1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "AArch64PerfectShuffle.h"
12 #include "MCTargetDesc/AArch64AddressingModes.h"
13 #include "Utils/AArch64SMEAttributes.h"
14 #include "llvm/ADT/DenseMap.h"
15 #include "llvm/Analysis/IVDescriptors.h"
16 #include "llvm/Analysis/LoopInfo.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "llvm/CodeGen/BasicTTIImpl.h"
19 #include "llvm/CodeGen/CostTable.h"
20 #include "llvm/CodeGen/TargetLowering.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/IR/Intrinsics.h"
23 #include "llvm/IR/IntrinsicsAArch64.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Support/Debug.h"
26 #include "llvm/TargetParser/AArch64TargetParser.h"
27 #include "llvm/Transforms/InstCombine/InstCombiner.h"
28 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
29 #include <algorithm>
30 #include <optional>
31 using namespace llvm;
32 using namespace llvm::PatternMatch;
33 
34 #define DEBUG_TYPE "aarch64tti"
35 
36 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37                                                cl::init(true), cl::Hidden);
38 
39 static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
40     "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41 
42 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
43                                            cl::Hidden);
44 
45 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46                                             cl::init(10), cl::Hidden);
47 
48 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49                                                   cl::init(15), cl::Hidden);
50 
51 static cl::opt<unsigned>
52     NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
53                                cl::Hidden);
54 
55 static cl::opt<unsigned> CallPenaltyChangeSM(
56     "call-penalty-sm-change", cl::init(5), cl::Hidden,
57     cl::desc(
58         "Penalty of calling a function that requires a change to PSTATE.SM"));
59 
60 static cl::opt<unsigned> InlineCallPenaltyChangeSM(
61     "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
62     cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63 
64 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65                                            cl::init(true), cl::Hidden);
66 
67 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68                                       cl::init(true), cl::Hidden);
69 
70 // A complete guess as to a reasonable cost.
71 static cl::opt<unsigned>
72     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
73                     cl::desc("The cost of a histcnt instruction"));
74 
75 static cl::opt<unsigned> DMBLookaheadThreshold(
76     "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
77     cl::desc("The number of instructions to search for a redundant dmb"));
78 
79 namespace {
80 class TailFoldingOption {
81   // These bitfields will only ever be set to something non-zero in operator=,
82   // when setting the -sve-tail-folding option. This option should always be of
83   // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
84   // InitialBits is one of (disabled|all|simple). EnableBits represents
85   // additional flags we're enabling, and DisableBits for those flags we're
86   // disabling. The default flag is tracked in the variable NeedsDefault, since
87   // at the time of setting the option we may not know what the default value
88   // for the CPU is.
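  // For example (illustrative): "-sve-tail-folding=all+noreverse" sets
  // InitialBits to All and DisableBits to Reverse, while
  // "-sve-tail-folding=default+reductions" sets NeedsDefault and adds
  // Reductions to EnableBits.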
89   TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90   TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91   TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92 
93   // This value needs to be initialised to true in case the user does not
94   // explicitly set the -sve-tail-folding option.
95   bool NeedsDefault = true;
96 
97   void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98 
99   void setNeedsDefault(bool V) { NeedsDefault = V; }
100 
101   void setEnableBit(TailFoldingOpts Bit) {
102     EnableBits |= Bit;
103     DisableBits &= ~Bit;
104   }
105 
106   void setDisableBit(TailFoldingOpts Bit) {
107     EnableBits &= ~Bit;
108     DisableBits |= Bit;
109   }
110 
111   TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112     TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113 
114     assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115            "Initial bits should only include one of "
116            "(disabled|all|simple|default)");
117     Bits = NeedsDefault ? DefaultBits : InitialBits;
118     Bits |= EnableBits;
119     Bits &= ~DisableBits;
120 
121     return Bits;
122   }
123 
124   void reportError(std::string Opt) {
125     errs() << "invalid argument '" << Opt
126            << "' to -sve-tail-folding=; the option should be of the form\n"
127               "  (disabled|all|default|simple)[+(reductions|recurrences"
128               "|reverse|noreductions|norecurrences|noreverse)]\n";
129     report_fatal_error("Unrecognised tail-folding option");
130   }
131 
132 public:
133 
134   void operator=(const std::string &Val) {
135     // If the user explicitly sets -sve-tail-folding= then treat as an error.
136     if (Val.empty()) {
137       reportError("");
138       return;
139     }
140 
141     // Since the user is explicitly setting the option we don't automatically
142     // need the default unless they require it.
143     setNeedsDefault(false);
144 
145     SmallVector<StringRef, 4> TailFoldTypes;
146     StringRef(Val).split(TailFoldTypes, '+', -1, false);
147 
148     unsigned StartIdx = 1;
149     if (TailFoldTypes[0] == "disabled")
150       setInitialBits(TailFoldingOpts::Disabled);
151     else if (TailFoldTypes[0] == "all")
152       setInitialBits(TailFoldingOpts::All);
153     else if (TailFoldTypes[0] == "default")
154       setNeedsDefault(true);
155     else if (TailFoldTypes[0] == "simple")
156       setInitialBits(TailFoldingOpts::Simple);
157     else {
158       StartIdx = 0;
159       setInitialBits(TailFoldingOpts::Disabled);
160     }
161 
162     for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163       if (TailFoldTypes[I] == "reductions")
164         setEnableBit(TailFoldingOpts::Reductions);
165       else if (TailFoldTypes[I] == "recurrences")
166         setEnableBit(TailFoldingOpts::Recurrences);
167       else if (TailFoldTypes[I] == "reverse")
168         setEnableBit(TailFoldingOpts::Reverse);
169       else if (TailFoldTypes[I] == "noreductions")
170         setDisableBit(TailFoldingOpts::Reductions);
171       else if (TailFoldTypes[I] == "norecurrences")
172         setDisableBit(TailFoldingOpts::Recurrences);
173       else if (TailFoldTypes[I] == "noreverse")
174         setDisableBit(TailFoldingOpts::Reverse);
175       else
176         reportError(Val);
177     }
178   }
179 
180   bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181     return (getBits(DefaultBits) & Required) == Required;
182   }
183 };
184 } // namespace
185 
186 TailFoldingOption TailFoldingOptionLoc;
187 
188 cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
189     "sve-tail-folding",
190     cl::desc(
191         "Control the use of vectorisation using tail-folding for SVE where the"
192         " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193         "\ndisabled      (Initial) No loop types will vectorize using "
194         "tail-folding"
195         "\ndefault       (Initial) Uses the default tail-folding settings for "
196         "the target CPU"
197         "\nall           (Initial) All legal loop types will vectorize using "
198         "tail-folding"
199         "\nsimple        (Initial) Use tail-folding for simple loops (not "
200         "reductions or recurrences)"
201         "\nreductions    Use tail-folding for loops containing reductions"
202         "\nnoreductions  Inverse of above"
203         "\nrecurrences   Use tail-folding for loops containing fixed order "
204         "recurrences"
205         "\nnorecurrences Inverse of above"
206         "\nreverse       Use tail-folding for loops requiring reversed "
207         "predicates"
208         "\nnoreverse     Inverse of above"),
209     cl::location(TailFoldingOptionLoc));
210 
211 // Experimental option that will only be fully functional when the
212 // code-generator is changed to use SVE instead of NEON for all fixed-width
213 // operations.
214 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
215     "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
216 
217 // Experimental option that will only be fully functional when the cost-model
218 // and code-generator have been changed to avoid using scalable vector
219 // instructions that are not legal in streaming SVE mode.
220 static cl::opt<bool> EnableScalableAutovecInStreamingMode(
221     "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
222 
223 static bool isSMEABIRoutineCall(const CallInst &CI) {
224   const auto *F = CI.getCalledFunction();
225   return F && StringSwitch<bool>(F->getName())
226                   .Case("__arm_sme_state", true)
227                   .Case("__arm_tpidr2_save", true)
228                   .Case("__arm_tpidr2_restore", true)
229                   .Case("__arm_za_disable", true)
230                   .Default(false);
231 }
232 
233 /// Returns true if the function has explicit operations that can only be
234 /// lowered using incompatible instructions for the selected mode. This also
235 /// returns true if the function F may use or modify ZA state.
236 static bool hasPossibleIncompatibleOps(const Function *F) {
237   for (const BasicBlock &BB : *F) {
238     for (const Instruction &I : BB) {
239       // Be conservative for now and assume that any call to inline asm or to
240       // intrinsics could result in non-streaming ops (e.g. calls to
241       // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242       // all native LLVM instructions can be lowered to compatible instructions.
243       if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
244           (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
245            isSMEABIRoutineCall(cast<CallInst>(I))))
246         return true;
247     }
248   }
249   return false;
250 }
251 
252 uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
253   StringRef AttributeStr =
254       isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255   StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
256   SmallVector<StringRef, 8> Features;
257   FeatureStr.split(Features, ",");
258   return AArch64::getFMVPriority(Features);
259 }
260 
261 bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
262   return F.hasFnAttribute("fmv-features");
263 }
264 
265 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
266                                          const Function *Callee) const {
267   SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
268 
269   // When inlining, we should consider the body of the function, not the
270   // interface.
271   if (CalleeAttrs.hasStreamingBody()) {
272     CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
273     CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
274   }
275 
276   if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
277     return false;
278 
279   if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
280       CallerAttrs.requiresSMChange(CalleeAttrs) ||
281       CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
282       CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
283     if (hasPossibleIncompatibleOps(Callee))
284       return false;
285   }
286 
287   return BaseT::areInlineCompatible(Caller, Callee);
288 }
289 
290 bool AArch64TTIImpl::areTypesABICompatible(
291     const Function *Caller, const Function *Callee,
292     const ArrayRef<Type *> &Types) const {
293   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
294     return false;
295 
296   // We need to ensure that argument promotion does not attempt to promote
297   // pointers to fixed-length vector types larger than 128 bits like
298   // <8 x float> (and pointers to aggregate types which have such fixed-length
299   // vector type members) into the values of the pointees. Such vector types
300   // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
301   // backend cannot lower such value arguments. The 128-bit fixed-length SVE
302   // types can be safely treated as 128-bit NEON types and they cannot be
303   // distinguished in IR.
304   if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
305         auto FVTy = dyn_cast<FixedVectorType>(Ty);
306         return FVTy &&
307                FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
308       }))
309     return false;
310 
311   return true;
312 }
313 
314 unsigned
315 AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
316                                      unsigned DefaultCallPenalty) const {
317   // This function calculates a penalty for executing Call in F.
318   //
319   // There are two ways this function can be called:
320   // (1)  F:
321   //       call from F -> G (the call here is Call)
322   //
323   // For (1), Call.getCaller() == F, so it will always return a high cost if
324   // a streaming-mode change is required (thus promoting the need to inline the
325   // function)
326   //
327   // (2)  F:
328   //       call from F -> G (the call here is not Call)
329   //      G:
330   //       call from G -> H (the call here is Call)
331   //
332   // For (2), if after inlining the body of G into F the call to H requires a
333   // streaming-mode change, and the call to G from F would also require a
334   // streaming-mode change, then there is benefit to do the streaming-mode
335   // change only once and avoid inlining of G into F.
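  // As a rough worked example with the default option values above
  // (call-penalty-sm-change=5, inline-call-penalty-sm-change=10), case (1)
  // returns 5 * DefaultCallPenalty and case (2) returns 10 * DefaultCallPenalty.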
336   SMEAttrs FAttrs(*F);
337   SMEAttrs CalleeAttrs(Call);
338   if (FAttrs.requiresSMChange(CalleeAttrs)) {
339     if (F == Call.getCaller()) // (1)
340       return CallPenaltyChangeSM * DefaultCallPenalty;
341     if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
342       return InlineCallPenaltyChangeSM * DefaultCallPenalty;
343   }
344 
345   return DefaultCallPenalty;
346 }
347 
348 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
349     TargetTransformInfo::RegisterKind K) const {
350   assert(K != TargetTransformInfo::RGK_Scalar);
351   return (K == TargetTransformInfo::RGK_FixedWidthVector &&
352           ST->isNeonAvailable());
353 }
354 
355 /// Calculate the cost of materializing a 64-bit value. This helper
356 /// method might only calculate a fraction of a larger immediate. Therefore it
357 /// is valid to return a cost of ZERO.
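/// For example (illustrative): a value such as 0x0000123400005678 expands to a
/// MOVZ plus one MOVK and so costs 2, whereas any 64-bit logical immediate
/// costs 0 here because it can be encoded directly in the using instruction.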
358 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
359   // Check if the immediate can be encoded within an instruction.
360   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
361     return 0;
362 
363   if (Val < 0)
364     Val = ~Val;
365 
366   // Calculate how many moves we will need to materialize this constant.
367   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
368   AArch64_IMM::expandMOVImm(Val, 64, Insn);
369   return Insn.size();
370 }
371 
372 /// Calculate the cost of materializing the given constant.
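/// Illustrative example: an i128 constant whose two 64-bit halves each need a
/// single MOVZ costs 2, since the per-chunk costs below are summed.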
373 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
374                                               TTI::TargetCostKind CostKind) {
375   assert(Ty->isIntegerTy());
376 
377   unsigned BitSize = Ty->getPrimitiveSizeInBits();
378   if (BitSize == 0)
379     return ~0U;
380 
381   // Sign-extend all constants to a multiple of 64 bits.
382   APInt ImmVal = Imm;
383   if (BitSize & 0x3f)
384     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
385 
386   // Split the constant into 64-bit chunks and calculate the cost for each
387   // chunk.
388   InstructionCost Cost = 0;
389   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
390     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
391     int64_t Val = Tmp.getSExtValue();
392     Cost += getIntImmCost(Val);
393   }
394   // We need at least one instruction to materialize the constant.
395   return std::max<InstructionCost>(1, Cost);
396 }
397 
398 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
399                                                   const APInt &Imm, Type *Ty,
400                                                   TTI::TargetCostKind CostKind,
401                                                   Instruction *Inst) {
402   assert(Ty->isIntegerTy());
403 
404   unsigned BitSize = Ty->getPrimitiveSizeInBits();
405   // There is no cost model for constants with a bit size of 0. Return TCC_Free
406   // here, so that constant hoisting will ignore this constant.
407   if (BitSize == 0)
408     return TTI::TCC_Free;
409 
410   unsigned ImmIdx = ~0U;
411   switch (Opcode) {
412   default:
413     return TTI::TCC_Free;
414   case Instruction::GetElementPtr:
415     // Always hoist the base address of a GetElementPtr.
416     if (Idx == 0)
417       return 2 * TTI::TCC_Basic;
418     return TTI::TCC_Free;
419   case Instruction::Store:
420     ImmIdx = 0;
421     break;
422   case Instruction::Add:
423   case Instruction::Sub:
424   case Instruction::Mul:
425   case Instruction::UDiv:
426   case Instruction::SDiv:
427   case Instruction::URem:
428   case Instruction::SRem:
429   case Instruction::And:
430   case Instruction::Or:
431   case Instruction::Xor:
432   case Instruction::ICmp:
433     ImmIdx = 1;
434     break;
435   // Always return TCC_Free for the shift value of a shift instruction.
436   case Instruction::Shl:
437   case Instruction::LShr:
438   case Instruction::AShr:
439     if (Idx == 1)
440       return TTI::TCC_Free;
441     break;
442   case Instruction::Trunc:
443   case Instruction::ZExt:
444   case Instruction::SExt:
445   case Instruction::IntToPtr:
446   case Instruction::PtrToInt:
447   case Instruction::BitCast:
448   case Instruction::PHI:
449   case Instruction::Call:
450   case Instruction::Select:
451   case Instruction::Ret:
452   case Instruction::Load:
453     break;
454   }
455 
456   if (Idx == ImmIdx) {
457     int NumConstants = (BitSize + 63) / 64;
458     InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
459     return (Cost <= NumConstants * TTI::TCC_Basic)
460                ? static_cast<int>(TTI::TCC_Free)
461                : Cost;
462   }
463   return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
464 }
465 
466 InstructionCost
467 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
468                                     const APInt &Imm, Type *Ty,
469                                     TTI::TargetCostKind CostKind) {
470   assert(Ty->isIntegerTy());
471 
472   unsigned BitSize = Ty->getPrimitiveSizeInBits();
473   // There is no cost model for constants with a bit size of 0. Return TCC_Free
474   // here, so that constant hoisting will ignore this constant.
475   if (BitSize == 0)
476     return TTI::TCC_Free;
477 
478   // Most (all?) AArch64 intrinsics do not support folding immediates into the
479   // selected instruction, so we compute the materialization cost for the
480   // immediate directly.
481   if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
482     return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
483 
484   switch (IID) {
485   default:
486     return TTI::TCC_Free;
487   case Intrinsic::sadd_with_overflow:
488   case Intrinsic::uadd_with_overflow:
489   case Intrinsic::ssub_with_overflow:
490   case Intrinsic::usub_with_overflow:
491   case Intrinsic::smul_with_overflow:
492   case Intrinsic::umul_with_overflow:
493     if (Idx == 1) {
494       int NumConstants = (BitSize + 63) / 64;
495       InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
496       return (Cost <= NumConstants * TTI::TCC_Basic)
497                  ? static_cast<int>(TTI::TCC_Free)
498                  : Cost;
499     }
500     break;
501   case Intrinsic::experimental_stackmap:
502     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
503       return TTI::TCC_Free;
504     break;
505   case Intrinsic::experimental_patchpoint_void:
506   case Intrinsic::experimental_patchpoint:
507     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
508       return TTI::TCC_Free;
509     break;
510   case Intrinsic::experimental_gc_statepoint:
511     if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
512       return TTI::TCC_Free;
513     break;
514   }
515   return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
516 }
517 
518 TargetTransformInfo::PopcntSupportKind
519 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
520   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
521   if (TyWidth == 32 || TyWidth == 64)
522     return TTI::PSK_FastHardware;
523   // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
524   return TTI::PSK_Software;
525 }
526 
527 static bool isUnpackedVectorVT(EVT VecVT) {
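  // An "unpacked" type fills only part of a 128-bit SVE register, e.g.
  // (illustrative) <vscale x 2 x i32>, whose known-minimum size is 64 bits.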
528   return VecVT.isScalableVector() &&
529          VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
530 }
531 
532 static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
533   Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
534   Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
535   unsigned TotalHistCnts = 1;
536 
537   unsigned EltSize = EltTy->getScalarSizeInBits();
538   // Only allow (up to 64b) integers or pointers
539   if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
540     return InstructionCost::getInvalid();
541 
542   // FIXME: We should be able to generate histcnt for fixed-length vectors
543   //        using ptrue with a specific VL.
544   if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
545     unsigned EC = VTy->getElementCount().getKnownMinValue();
546     if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
547       return InstructionCost::getInvalid();
548 
549     // HistCnt only supports 32b and 64b element types
550     unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
551 
552     if (EC == 2 || (LegalEltSize == 32 && EC == 4))
553       return InstructionCost(BaseHistCntCost);
554 
555     unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
556     TotalHistCnts = EC / NaturalVectorWidth;
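    // Illustrative example: <vscale x 8 x ptr> bucket pointers with i32
    // elements give LegalEltSize = 32, NaturalVectorWidth = 4 and
    // TotalHistCnts = 2, i.e. a total cost of 2 * BaseHistCntCost.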
557   }
558 
559   return InstructionCost(BaseHistCntCost * TotalHistCnts);
560 }
561 
562 InstructionCost
563 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
564                                       TTI::TargetCostKind CostKind) {
565   // The code-generator is currently not able to handle scalable vectors
566   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
567   // it. This change will be removed when code-generation for these types is
568   // sufficiently reliable.
569   auto *RetTy = ICA.getReturnType();
570   if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
571     if (VTy->getElementCount() == ElementCount::getScalable(1))
572       return InstructionCost::getInvalid();
573 
574   switch (ICA.getID()) {
575   case Intrinsic::experimental_vector_histogram_add:
576     if (!ST->hasSVE2())
577       return InstructionCost::getInvalid();
578     return getHistogramCost(ICA);
579   case Intrinsic::umin:
580   case Intrinsic::umax:
581   case Intrinsic::smin:
582   case Intrinsic::smax: {
583     static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
584                                         MVT::v8i16, MVT::v2i32, MVT::v4i32,
585                                         MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
586                                         MVT::nxv2i64};
587     auto LT = getTypeLegalizationCost(RetTy);
588     // v2i64 types get converted to cmp+bif hence the cost of 2
589     if (LT.second == MVT::v2i64)
590       return LT.first * 2;
591     if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
592       return LT.first;
593     break;
594   }
595   case Intrinsic::sadd_sat:
596   case Intrinsic::ssub_sat:
597   case Intrinsic::uadd_sat:
598   case Intrinsic::usub_sat: {
599     static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
600                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
601                                      MVT::v2i64};
602     auto LT = getTypeLegalizationCost(RetTy);
603     // This is a base cost of 1 for the vadd, plus 3 extra shifts if we
604     // need to extend the type, as it uses shr(qadd(shl, shl)).
605     unsigned Instrs =
606         LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
607     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
608       return LT.first * Instrs;
609     break;
610   }
611   case Intrinsic::abs: {
612     static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
613                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
614                                      MVT::v2i64};
615     auto LT = getTypeLegalizationCost(RetTy);
616     if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
617       return LT.first;
618     break;
619   }
620   case Intrinsic::bswap: {
621     static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
622                                      MVT::v4i32, MVT::v2i64};
623     auto LT = getTypeLegalizationCost(RetTy);
624     if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
625         LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
626       return LT.first;
627     break;
628   }
629   case Intrinsic::stepvector: {
630     InstructionCost Cost = 1; // Cost of the `index' instruction
631     auto LT = getTypeLegalizationCost(RetTy);
632     // Legalisation of illegal vectors involves an `index' instruction plus
633     // (LT.first - 1) vector adds.
634     if (LT.first > 1) {
635       Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
636       InstructionCost AddCost =
637           getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
638       Cost += AddCost * (LT.first - 1);
639     }
640     return Cost;
641   }
642   case Intrinsic::vector_extract:
643   case Intrinsic::vector_insert: {
644     // If both the vector and subvector types are legal types and the index
645     // is 0, then this should be a no-op or simple operation; return a
646     // relatively low cost.
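    // Illustrative example (with SVE available): extracting a legal <2 x i64>
    // subvector from a legal <vscale x 2 x i64> vector at index 0 is free.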
647 
648     // If arguments aren't actually supplied, then we cannot determine the
649     // value of the index. We also want to skip predicate types.
650     if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
651         ICA.getReturnType()->getScalarType()->isIntegerTy(1))
652       break;
653 
654     LLVMContext &C = RetTy->getContext();
655     EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
656     bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
657     EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
658                              : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
659     // Skip this if either the vector or subvector types are unpacked
660     // SVE types; they may get lowered to stack stores and loads.
661     if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
662       break;
663 
664     TargetLoweringBase::LegalizeKind SubVecLK =
665         getTLI()->getTypeConversion(C, SubVecVT);
666     TargetLoweringBase::LegalizeKind VecLK =
667         getTLI()->getTypeConversion(C, VecVT);
668     const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
669     const ConstantInt *CIdx = cast<ConstantInt>(Idx);
670     if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
671         VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
672       return TTI::TCC_Free;
673     break;
674   }
675   case Intrinsic::bitreverse: {
676     static const CostTblEntry BitreverseTbl[] = {
677         {Intrinsic::bitreverse, MVT::i32, 1},
678         {Intrinsic::bitreverse, MVT::i64, 1},
679         {Intrinsic::bitreverse, MVT::v8i8, 1},
680         {Intrinsic::bitreverse, MVT::v16i8, 1},
681         {Intrinsic::bitreverse, MVT::v4i16, 2},
682         {Intrinsic::bitreverse, MVT::v8i16, 2},
683         {Intrinsic::bitreverse, MVT::v2i32, 2},
684         {Intrinsic::bitreverse, MVT::v4i32, 2},
685         {Intrinsic::bitreverse, MVT::v1i64, 2},
686         {Intrinsic::bitreverse, MVT::v2i64, 2},
687     };
688     const auto LegalisationCost = getTypeLegalizationCost(RetTy);
689     const auto *Entry =
690         CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
691     if (Entry) {
692       // The cost model uses the legal type (i32) that i8 and i16 are
693       // promoted to, plus 1 so that we match the actual lowering cost.
694       if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
695           TLI->getValueType(DL, RetTy, true) == MVT::i16)
696         return LegalisationCost.first * Entry->Cost + 1;
697 
698       return LegalisationCost.first * Entry->Cost;
699     }
700     break;
701   }
702   case Intrinsic::ctpop: {
703     if (!ST->hasNEON()) {
704       // 32-bit or 64-bit ctpop without NEON is 12 instructions.
705       return getTypeLegalizationCost(RetTy).first * 12;
706     }
707     static const CostTblEntry CtpopCostTbl[] = {
708         {ISD::CTPOP, MVT::v2i64, 4},
709         {ISD::CTPOP, MVT::v4i32, 3},
710         {ISD::CTPOP, MVT::v8i16, 2},
711         {ISD::CTPOP, MVT::v16i8, 1},
712         {ISD::CTPOP, MVT::i64,   4},
713         {ISD::CTPOP, MVT::v2i32, 3},
714         {ISD::CTPOP, MVT::v4i16, 2},
715         {ISD::CTPOP, MVT::v8i8,  1},
716         {ISD::CTPOP, MVT::i32,   5},
717     };
718     auto LT = getTypeLegalizationCost(RetTy);
719     MVT MTy = LT.second;
720     if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
721       // Extra cost of +1 when illegal vector types are legalized by promoting
722       // the integer type.
723       int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
724                                             RetTy->getScalarSizeInBits()
725                           ? 1
726                           : 0;
727       return LT.first * Entry->Cost + ExtraCost;
728     }
729     break;
730   }
731   case Intrinsic::sadd_with_overflow:
732   case Intrinsic::uadd_with_overflow:
733   case Intrinsic::ssub_with_overflow:
734   case Intrinsic::usub_with_overflow:
735   case Intrinsic::smul_with_overflow:
736   case Intrinsic::umul_with_overflow: {
737     static const CostTblEntry WithOverflowCostTbl[] = {
738         {Intrinsic::sadd_with_overflow, MVT::i8, 3},
739         {Intrinsic::uadd_with_overflow, MVT::i8, 3},
740         {Intrinsic::sadd_with_overflow, MVT::i16, 3},
741         {Intrinsic::uadd_with_overflow, MVT::i16, 3},
742         {Intrinsic::sadd_with_overflow, MVT::i32, 1},
743         {Intrinsic::uadd_with_overflow, MVT::i32, 1},
744         {Intrinsic::sadd_with_overflow, MVT::i64, 1},
745         {Intrinsic::uadd_with_overflow, MVT::i64, 1},
746         {Intrinsic::ssub_with_overflow, MVT::i8, 3},
747         {Intrinsic::usub_with_overflow, MVT::i8, 3},
748         {Intrinsic::ssub_with_overflow, MVT::i16, 3},
749         {Intrinsic::usub_with_overflow, MVT::i16, 3},
750         {Intrinsic::ssub_with_overflow, MVT::i32, 1},
751         {Intrinsic::usub_with_overflow, MVT::i32, 1},
752         {Intrinsic::ssub_with_overflow, MVT::i64, 1},
753         {Intrinsic::usub_with_overflow, MVT::i64, 1},
754         {Intrinsic::smul_with_overflow, MVT::i8, 5},
755         {Intrinsic::umul_with_overflow, MVT::i8, 4},
756         {Intrinsic::smul_with_overflow, MVT::i16, 5},
757         {Intrinsic::umul_with_overflow, MVT::i16, 4},
758         {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
759         {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
760         {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
761         {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
762     };
763     EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
764     if (MTy.isSimple())
765       if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
766                                               MTy.getSimpleVT()))
767         return Entry->Cost;
768     break;
769   }
770   case Intrinsic::fptosi_sat:
771   case Intrinsic::fptoui_sat: {
772     if (ICA.getArgTypes().empty())
773       break;
774     bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
775     auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
776     EVT MTy = TLI->getValueType(DL, RetTy);
777     // Check for the legal types, which are where the size of the input and the
778     // output are the same, or we are using cvt f64->i32 or f32->i64.
779     if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
780          LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
781          LT.second == MVT::v2f64)) {
782       if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
783            (LT.second == MVT::f64 && MTy == MVT::i32) ||
784            (LT.second == MVT::f32 && MTy == MVT::i64)))
785         return LT.first;
786       // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
787       if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
788           MTy.getScalarSizeInBits() == 64)
789         return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
790     }
791     // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
792     // f32.
793     if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
794       return LT.first + getIntrinsicInstrCost(
795                             {ICA.getID(),
796                              RetTy,
797                              {ICA.getArgTypes()[0]->getWithNewType(
798                                  Type::getFloatTy(RetTy->getContext()))}},
799                             CostKind);
800     if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
801         (LT.second == MVT::f16 && MTy == MVT::i64) ||
802         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
803          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
804       return LT.first;
805     // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
806     if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
807         MTy.getScalarSizeInBits() == 32)
808       return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
809     // Extending vector types from f16 to 64-bit integers, e.g. v4f16->v4i64.
810     // These currently scalarize but the codegen could be better.
811     if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
812         MTy.getScalarSizeInBits() == 64)
813       return MTy.getVectorNumElements() * 3;
814 
815     // If we can, we use a legal convert followed by a min+max.
816     if ((LT.second.getScalarType() == MVT::f32 ||
817          LT.second.getScalarType() == MVT::f64 ||
818          LT.second.getScalarType() == MVT::f16) &&
819         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
820       Type *LegalTy =
821           Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
822       if (LT.second.isVector())
823         LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
824       InstructionCost Cost = 1;
825       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
826                                     LegalTy, {LegalTy, LegalTy});
827       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
828       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
829                                     LegalTy, {LegalTy, LegalTy});
830       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
831       return LT.first * Cost +
832              ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
833                                                                            : 1);
834     }
835     // Otherwise we need to follow the default expansion that clamps the value
836     // using a float min/max with a fcmp+sel for nan handling when signed.
837     Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
838     RetTy = RetTy->getScalarType();
839     if (LT.second.isVector()) {
840       FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
841       RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
842     }
843     IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
844     InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
845     IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
846     Cost += getIntrinsicInstrCost(Attrs2, CostKind);
847     Cost +=
848         getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
849                          RetTy, FPTy, TTI::CastContextHint::None, CostKind);
850     if (IsSigned) {
851       Type *CondTy = RetTy->getWithNewBitWidth(1);
852       Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
853                                  CmpInst::FCMP_UNO, CostKind);
854       Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
855                                  CmpInst::FCMP_UNO, CostKind);
856     }
857     return LT.first * Cost;
858   }
859   case Intrinsic::fshl:
860   case Intrinsic::fshr: {
861     if (ICA.getArgs().empty())
862       break;
863 
864     // TODO: Add handling for fshl where third argument is not a constant.
865     const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
866     if (!OpInfoZ.isConstant())
867       break;
868 
869     const auto LegalisationCost = getTypeLegalizationCost(RetTy);
870     if (OpInfoZ.isUniform()) {
871       // FIXME: The costs could be lower if the codegen is better.
872       static const CostTblEntry FshlTbl[] = {
873           {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
874           {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
875           {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
876           {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
877       // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
878       // to avoid having to duplicate the costs.
879       const auto *Entry =
880           CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
881       if (Entry)
882         return LegalisationCost.first * Entry->Cost;
883     }
884 
885     auto TyL = getTypeLegalizationCost(RetTy);
886     if (!RetTy->isIntegerTy())
887       break;
888 
889     // Estimate cost manually, as types like i8 and i16 will get promoted to
890     // i32 and CostTableLookup will ignore the extra conversion cost.
891     bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
892                        RetTy->getScalarSizeInBits() < 64) ||
893                       (RetTy->getScalarSizeInBits() % 64 != 0);
894     unsigned ExtraCost = HigherCost ? 1 : 0;
895     if (RetTy->getScalarSizeInBits() == 32 ||
896         RetTy->getScalarSizeInBits() == 64)
897       ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
898                      // extr instruction.
899     else if (HigherCost)
900       ExtraCost = 1;
901     else
902       break;
903     return TyL.first + ExtraCost;
904   }
905   case Intrinsic::get_active_lane_mask: {
906     auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
907     if (RetTy) {
908       EVT RetVT = getTLI()->getValueType(DL, RetTy);
909       EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
910       if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
911           !getTLI()->isTypeLegal(RetVT)) {
912         // We don't have enough context at this point to determine if the mask
913         // is going to be kept live after the block, which will force the vXi1
914         // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
915         // For now, we just assume the vectorizer created this intrinsic and
916         // the result will be the input for a PHI. In this case the cost will
917         // be extremely high for fixed-width vectors.
918         // NOTE: getScalarizationOverhead returns a cost that's far too
919         // pessimistic for the actual generated codegen. In reality there are
920         // two instructions generated per lane.
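        // Illustrative example: an expanded v8i1 mask is costed as 8 * 2 = 16.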
921         return RetTy->getNumElements() * 2;
922       }
923     }
924     break;
925   }
926   case Intrinsic::experimental_vector_match: {
927     auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
928     EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
929     unsigned SearchSize = NeedleTy->getNumElements();
930     if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
931       // Base cost for MATCH instructions. At least on the Neoverse V2 and
932       // Neoverse V3, these are cheap operations with the same latency as a
933       // vector ADD. In most cases, however, we also need to do an extra DUP.
934       // For fixed-length vectors we currently need an extra five to six
935       // instructions besides the MATCH.
936       InstructionCost Cost = 4;
937       if (isa<FixedVectorType>(RetTy))
938         Cost += 10;
939       return Cost;
940     }
941     break;
942   }
943   default:
944     break;
945   }
946   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
947 }
948 
949 /// Removes redundant reinterpret (convert to/from svbool) casts in the
950 /// presence of control flow.
951 static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
952                                                    IntrinsicInst &II) {
953   SmallVector<Instruction *, 32> Worklist;
954   auto RequiredType = II.getType();
955 
956   auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
957   assert(PN && "Expected Phi Node!");
958 
959   // Don't create a new Phi unless we can remove the old one.
960   if (!PN->hasOneUse())
961     return std::nullopt;
962 
963   for (Value *IncValPhi : PN->incoming_values()) {
964     auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
965     if (!Reinterpret ||
966         Reinterpret->getIntrinsicID() !=
967             Intrinsic::aarch64_sve_convert_to_svbool ||
968         RequiredType != Reinterpret->getArgOperand(0)->getType())
969       return std::nullopt;
970   }
971 
972   // Create the new Phi
973   IC.Builder.SetInsertPoint(PN);
974   PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
975   Worklist.push_back(PN);
976 
977   for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
978     auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
979     NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
980     Worklist.push_back(Reinterpret);
981   }
982 
983   // Cleanup Phi Node and reinterprets
984   return IC.replaceInstUsesWith(II, NPN);
985 }
986 
987 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
988 // => (binop (pred) (from_svbool _) (from_svbool _))
989 //
990 // The above transformation eliminates a `to_svbool` in the predicate
991 // operand of bitwise operation `binop` by narrowing the vector width of
992 // the operation. For example, it would convert a `<vscale x 16 x i1>
993 // and` into a `<vscale x 4 x i1> and`. This is profitable because
994 // to_svbool must zero the new lanes during widening, whereas
995 // from_svbool is free.
996 static std::optional<Instruction *>
997 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
998   auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
999   if (!BinOp)
1000     return std::nullopt;
1001 
1002   auto IntrinsicID = BinOp->getIntrinsicID();
1003   switch (IntrinsicID) {
1004   case Intrinsic::aarch64_sve_and_z:
1005   case Intrinsic::aarch64_sve_bic_z:
1006   case Intrinsic::aarch64_sve_eor_z:
1007   case Intrinsic::aarch64_sve_nand_z:
1008   case Intrinsic::aarch64_sve_nor_z:
1009   case Intrinsic::aarch64_sve_orn_z:
1010   case Intrinsic::aarch64_sve_orr_z:
1011     break;
1012   default:
1013     return std::nullopt;
1014   }
1015 
1016   auto BinOpPred = BinOp->getOperand(0);
1017   auto BinOpOp1 = BinOp->getOperand(1);
1018   auto BinOpOp2 = BinOp->getOperand(2);
1019 
1020   auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1021   if (!PredIntr ||
1022       PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1023     return std::nullopt;
1024 
1025   auto PredOp = PredIntr->getOperand(0);
1026   auto PredOpTy = cast<VectorType>(PredOp->getType());
1027   if (PredOpTy != II.getType())
1028     return std::nullopt;
1029 
1030   SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1031   auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1032       Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1033   NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1034   if (BinOpOp1 == BinOpOp2)
1035     NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1036   else
1037     NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1038         Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1039 
1040   auto NarrowedBinOp =
1041       IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1042   return IC.replaceInstUsesWith(II, NarrowedBinOp);
1043 }
1044 
1045 static std::optional<Instruction *>
1046 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1047   // If the reinterpret instruction operand is a PHI Node
1048   if (isa<PHINode>(II.getArgOperand(0)))
1049     return processPhiNode(IC, II);
1050 
1051   if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1052     return BinOpCombine;
1053 
1054   // Ignore converts to/from svcount_t.
1055   if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1056       isa<TargetExtType>(II.getType()))
1057     return std::nullopt;
1058 
1059   SmallVector<Instruction *, 32> CandidatesForRemoval;
1060   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1061 
1062   const auto *IVTy = cast<VectorType>(II.getType());
1063 
1064   // Walk the chain of conversions.
1065   while (Cursor) {
1066     // If the type of the cursor has fewer lanes than the final result, zeroing
1067     // must take place, which breaks the equivalence chain.
1068     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1069     if (CursorVTy->getElementCount().getKnownMinValue() <
1070         IVTy->getElementCount().getKnownMinValue())
1071       break;
1072 
1073     // If the cursor has the same type as II, it is a viable replacement.
1074     if (Cursor->getType() == IVTy)
1075       EarliestReplacement = Cursor;
1076 
1077     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1078 
1079     // If this is not an SVE conversion intrinsic, this is the end of the chain.
1080     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1081                                   Intrinsic::aarch64_sve_convert_to_svbool ||
1082                               IntrinsicCursor->getIntrinsicID() ==
1083                                   Intrinsic::aarch64_sve_convert_from_svbool))
1084       break;
1085 
1086     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1087     Cursor = IntrinsicCursor->getOperand(0);
1088   }
1089 
1090   // If no viable replacement in the conversion chain was found, there is
1091   // nothing to do.
1092   if (!EarliestReplacement)
1093     return std::nullopt;
1094 
1095   return IC.replaceInstUsesWith(II, EarliestReplacement);
1096 }
1097 
1098 static bool isAllActivePredicate(Value *Pred) {
1099   // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1100   Value *UncastedPred;
1101   if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1102                       m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1103                           m_Value(UncastedPred)))))
1104     // If the predicate has the same number of lanes as the uncasted
1105     // predicate or fewer, then we know the casting has no effect.
1106     if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1107         cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1108       Pred = UncastedPred;
1109 
1110   return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1111                          m_ConstantInt<AArch64SVEPredPattern::all>()));
1112 }
1113 
1114 // Simplify unary operation where predicate has all inactive lanes by replacing
1115 // instruction with its operand
1116 static std::optional<Instruction *>
1117 instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
1118                               bool hasInactiveVector) {
1119   int PredOperand = hasInactiveVector ? 1 : 0;
1120   int ReplaceOperand = hasInactiveVector ? 0 : 1;
1121   if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1122     IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1123     return IC.eraseInstFromFunction(II);
1124   }
1125   return std::nullopt;
1126 }
1127 
1128 // Simplify unary operation where predicate has all inactive lanes or
1129 // replace unused first operand with undef when all lanes are active
1130 static std::optional<Instruction *>
1131 instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
1132   if (isAllActivePredicate(II.getOperand(1)) &&
1133       !isa<llvm::UndefValue>(II.getOperand(0)) &&
1134       !isa<llvm::PoisonValue>(II.getOperand(0))) {
1135     Value *Undef = llvm::UndefValue::get(II.getType());
1136     return IC.replaceOperand(II, 0, Undef);
1137   }
1138   return instCombineSVENoActiveReplace(IC, II, true);
1139 }
1140 
1141 // Erase unary operation where predicate has all inactive lanes
1142 static std::optional<Instruction *>
1143 instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
1144                                  int PredPos) {
1145   if (match(II.getOperand(PredPos), m_ZeroInt())) {
1146     return IC.eraseInstFromFunction(II);
1147   }
1148   return std::nullopt;
1149 }
1150 
1151 // Simplify operation where predicate has all inactive lanes by replacing
1152 // instruction with zeroed object
1153 static std::optional<Instruction *>
1154 instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {
1155   if (match(II.getOperand(0), m_ZeroInt())) {
1156     Constant *Node;
1157     Type *RetTy = II.getType();
1158     if (RetTy->isStructTy()) {
1159       auto StructT = cast<StructType>(RetTy);
1160       auto VecT = StructT->getElementType(0);
1161       SmallVector<llvm::Constant *, 4> ZerVec;
1162       for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1163         ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1164                                                   : ConstantInt::get(VecT, 0));
1165       }
1166       Node = ConstantStruct::get(StructT, ZerVec);
1167     } else
1168       Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1169                                        : ConstantInt::get(II.getType(), 0);
1170 
1171     IC.replaceInstUsesWith(II, Node);
1172     return IC.eraseInstFromFunction(II);
1173   }
1174   return std::nullopt;
1175 }
1176 
1177 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1178                                                       IntrinsicInst &II) {
1179   // svsel(ptrue, x, y) => x
1180   auto *OpPredicate = II.getOperand(0);
1181   if (isAllActivePredicate(OpPredicate))
1182     return IC.replaceInstUsesWith(II, II.getOperand(1));
1183 
1184   auto Select =
1185       IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1186   return IC.replaceInstUsesWith(II, Select);
1187 }
1188 
1189 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1190                                                       IntrinsicInst &II) {
1191   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1192   if (!Pg)
1193     return std::nullopt;
1194 
1195   if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1196     return std::nullopt;
1197 
1198   const auto PTruePattern =
1199       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1200   if (PTruePattern != AArch64SVEPredPattern::vl1)
1201     return std::nullopt;
1202 
1203   // The intrinsic is inserting into lane zero so use an insert instead.
1204   auto *IdxTy = Type::getInt64Ty(II.getContext());
1205   auto *Insert = InsertElementInst::Create(
1206       II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1207   Insert->insertBefore(II.getIterator());
1208   Insert->takeName(&II);
1209 
1210   return IC.replaceInstUsesWith(II, Insert);
1211 }
1212 
1213 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1214                                                        IntrinsicInst &II) {
1215   // Replace DupX with a regular IR splat.
1216   auto *RetTy = cast<ScalableVectorType>(II.getType());
1217   Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1218                                               II.getArgOperand(0));
1219   Splat->takeName(&II);
1220   return IC.replaceInstUsesWith(II, Splat);
1221 }
1222 
1223 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1224                                                         IntrinsicInst &II) {
1225   LLVMContext &Ctx = II.getContext();
1226 
1227   // Replace by zero constant when all lanes are inactive
1228   if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1229     return II_NA;
1230 
1231   // Check that the predicate is all active
1232   auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1233   if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1234     return std::nullopt;
1235 
1236   const auto PTruePattern =
1237       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1238   if (PTruePattern != AArch64SVEPredPattern::all)
1239     return std::nullopt;
1240 
1241   // Check that we have a compare of zero..
1242   auto *SplatValue =
1243       dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1244   if (!SplatValue || !SplatValue->isZero())
1245     return std::nullopt;
1246 
1247   // ..against a dupq
1248   auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1249   if (!DupQLane ||
1250       DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1251     return std::nullopt;
1252 
1253   // Where the dupq is a lane 0 replicate of a vector insert
1254   auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1255   if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1256     return std::nullopt;
1257 
1258   auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1259   if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1260     return std::nullopt;
1261 
1262   // Where the vector insert is a fixed constant vector insert into undef at
1263   // index zero
1264   if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1265     return std::nullopt;
1266 
1267   if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1268     return std::nullopt;
1269 
1270   auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1271   if (!ConstVec)
1272     return std::nullopt;
1273 
1274   auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1275   auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1276   if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1277     return std::nullopt;
1278 
1279   unsigned NumElts = VecTy->getNumElements();
1280   unsigned PredicateBits = 0;
1281 
1282   // Expand intrinsic operands to a 16-bit byte level predicate
1283   for (unsigned I = 0; I < NumElts; ++I) {
1284     auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1285     if (!Arg)
1286       return std::nullopt;
1287     if (!Arg->isZero())
1288       PredicateBits |= 1 << (I * (16 / NumElts));
1289   }
1290 
1291   // If all bits are zero, bail early with an empty predicate
1292   if (PredicateBits == 0) {
1293     auto *PFalse = Constant::getNullValue(II.getType());
1294     PFalse->takeName(&II);
1295     return IC.replaceInstUsesWith(II, PFalse);
1296   }
1297 
1298   // Calculate largest predicate type used (where byte predicate is largest)
1299   unsigned Mask = 8;
1300   for (unsigned I = 0; I < 16; ++I)
1301     if ((PredicateBits & (1 << I)) != 0)
1302       Mask |= (I % 8);
1303 
1304   unsigned PredSize = Mask & -Mask;
1305   auto *PredType = ScalableVectorType::get(
1306       Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1307 
1308   // Ensure all relevant bits are set
1309   for (unsigned I = 0; I < 16; I += PredSize)
1310     if ((PredicateBits & (1 << I)) == 0)
1311       return std::nullopt;
1312 
1313   auto *PTruePat =
1314       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1315   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1316                                            {PredType}, {PTruePat});
1317   auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1318       Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1319   auto *ConvertFromSVBool =
1320       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1321                                  {II.getType()}, {ConvertToSVBool});
1322 
1323   ConvertFromSVBool->takeName(&II);
1324   return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1325 }
1326 
1327 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1328                                                        IntrinsicInst &II) {
1329   Value *Pg = II.getArgOperand(0);
1330   Value *Vec = II.getArgOperand(1);
1331   auto IntrinsicID = II.getIntrinsicID();
1332   bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1333 
1334   // lastX(splat(X)) --> X
1335   if (auto *SplatVal = getSplatValue(Vec))
1336     return IC.replaceInstUsesWith(II, SplatVal);
1337 
1338   // If x and/or y is a splat value then:
1339   // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1340   Value *LHS, *RHS;
1341   if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1342     if (isSplatValue(LHS) || isSplatValue(RHS)) {
1343       auto *OldBinOp = cast<BinaryOperator>(Vec);
1344       auto OpC = OldBinOp->getOpcode();
1345       auto *NewLHS =
1346           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1347       auto *NewRHS =
1348           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1349       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1350           OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1351       return IC.replaceInstUsesWith(II, NewBinOp);
1352     }
1353   }
1354 
1355   auto *C = dyn_cast<Constant>(Pg);
1356   if (IsAfter && C && C->isNullValue()) {
1357     // The intrinsic is extracting lane 0 so use an extract instead.
1358     auto *IdxTy = Type::getInt64Ty(II.getContext());
1359     auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1360     Extract->insertBefore(II.getIterator());
1361     Extract->takeName(&II);
1362     return IC.replaceInstUsesWith(II, Extract);
1363   }
1364 
1365   auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1366   if (!IntrPG)
1367     return std::nullopt;
1368 
1369   if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1370     return std::nullopt;
1371 
1372   const auto PTruePattern =
1373       cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1374 
1375   // Can the intrinsic's predicate be converted to a known constant index?
1376   unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1377   if (!MinNumElts)
1378     return std::nullopt;
1379 
1380   unsigned Idx = MinNumElts - 1;
1381   // Increment the index if extracting the element after the last active
1382   // predicate element.
1383   if (IsAfter)
1384     ++Idx;
1385 
1386   // Ignore extracts whose index is larger than the known minimum vector
1387   // length. NOTE: This is an artificial constraint where we prefer to
1388   // maintain what the user asked for until an alternative is proven faster.
1389   auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1390   if (Idx >= PgVTy->getMinNumElements())
1391     return std::nullopt;
1392 
1393   // The intrinsic is extracting a fixed lane so use an extract instead.
1394   auto *IdxTy = Type::getInt64Ty(II.getContext());
1395   auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1396   Extract->insertBefore(II.getIterator());
1397   Extract->takeName(&II);
1398   return IC.replaceInstUsesWith(II, Extract);
1399 }
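
// For example, assuming a ptrue with the VL4 pattern and an nxv4i32 input
// (illustrative only):
//   %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 4) ; vl4
//   %r  = call i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1> %pg,
//                                                  <vscale x 4 x i32> %v)
// becomes
//   %r  = extractelement <vscale x 4 x i32> %v, i64 3
// lasta would extract lane 4 instead, provided that index is still below the
// known minimum vector length.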
1400 
1401 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1402                                                            IntrinsicInst &II) {
1403   // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1404   // integer variant across a variety of micro-architectures. Replace scalar
1405   // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1406   // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1407   // depending on the micro-architecture, but has been observed as generally
1408   // being faster, particularly when the CLAST[AB] op is a loop-carried
1409   // dependency.
1410   Value *Pg = II.getArgOperand(0);
1411   Value *Fallback = II.getArgOperand(1);
1412   Value *Vec = II.getArgOperand(2);
1413   Type *Ty = II.getType();
1414 
1415   if (!Ty->isIntegerTy())
1416     return std::nullopt;
1417 
1418   Type *FPTy;
1419   switch (cast<IntegerType>(Ty)->getBitWidth()) {
1420   default:
1421     return std::nullopt;
1422   case 16:
1423     FPTy = IC.Builder.getHalfTy();
1424     break;
1425   case 32:
1426     FPTy = IC.Builder.getFloatTy();
1427     break;
1428   case 64:
1429     FPTy = IC.Builder.getDoubleTy();
1430     break;
1431   }
1432 
1433   Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1434   auto *FPVTy = VectorType::get(
1435       FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1436   Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1437   auto *FPII = IC.Builder.CreateIntrinsic(
1438       II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1439   Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1440   return IC.replaceInstUsesWith(II, FPIItoInt);
1441 }
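
// For example, with an i32 element type chosen for illustration:
//   %r = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> %pg,
//                                                    i32 %fb,
//                                                    <vscale x 4 x i32> %v)
// becomes
//   %fbf = bitcast i32 %fb to float
//   %vf  = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
//   %rf  = call float @llvm.aarch64.sve.clastb.n.nxv4f32(
//              <vscale x 4 x i1> %pg, float %fbf, <vscale x 4 x float> %vf)
//   %r   = bitcast float %rf to i32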
1442 
1443 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1444                                                      IntrinsicInst &II) {
1445   LLVMContext &Ctx = II.getContext();
1446   // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1447   // can work with RDFFR_PP for ptest elimination.
1448   auto *AllPat =
1449       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1450   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1451                                            {II.getType()}, {AllPat});
1452   auto *RDFFR =
1453       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1454   RDFFR->takeName(&II);
1455   return IC.replaceInstUsesWith(II, RDFFR);
1456 }
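
// For example (illustrative only):
//   %ffr = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()
// becomes
//   %pg  = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) ; all
//   %ffr = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> %pg)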
1457 
1458 static std::optional<Instruction *>
1459 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1460   const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1461 
1462   if (Pattern == AArch64SVEPredPattern::all) {
1463     Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1464     auto *VScale = IC.Builder.CreateVScale(StepVal);
1465     VScale->takeName(&II);
1466     return IC.replaceInstUsesWith(II, VScale);
1467   }
1468 
1469   unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1470 
1471   return MinNumElts && NumElts >= MinNumElts
1472              ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1473                    II, ConstantInt::get(II.getType(), MinNumElts)))
1474              : std::nullopt;
1475 }
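
// For example, for cntw (NumElts == 4), illustrative only:
//   %n = call i64 @llvm.aarch64.sve.cntw(i32 31) ; all
// becomes roughly
//   %vs = call i64 @llvm.vscale.i64()
//   %n  = mul i64 %vs, 4
// whereas a fixed pattern such as vl2 folds directly to the constant 2.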
1476 
1477 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1478                                                         IntrinsicInst &II) {
1479   Value *PgVal = II.getArgOperand(0);
1480   Value *OpVal = II.getArgOperand(1);
1481 
1482   // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1483   // Later optimizations prefer this form.
1484   if (PgVal == OpVal &&
1485       (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1486        II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1487     Value *Ops[] = {PgVal, OpVal};
1488     Type *Tys[] = {PgVal->getType()};
1489 
1490     auto *PTest =
1491         IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1492     PTest->takeName(&II);
1493 
1494     return IC.replaceInstUsesWith(II, PTest);
1495   }
1496 
1497   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1498   IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1499 
1500   if (!Pg || !Op)
1501     return std::nullopt;
1502 
1503   Intrinsic::ID OpIID = Op->getIntrinsicID();
1504 
1505   if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1506       OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1507       Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1508     Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1509     Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1510 
1511     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1512 
1513     PTest->takeName(&II);
1514     return IC.replaceInstUsesWith(II, PTest);
1515   }
1516 
1517   // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1518   // Later optimizations may rewrite sequence to use the flag-setting variant
1519   // of instruction X to remove PTEST.
1520   if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1521       ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1522        (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1523        (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1524        (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1525        (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1526        (OpIID == Intrinsic::aarch64_sve_and_z) ||
1527        (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1528        (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1529        (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1530        (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1531        (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1532        (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1533     Value *Ops[] = {Pg->getArgOperand(0), Pg};
1534     Type *Tys[] = {Pg->getType()};
1535 
1536     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1537     PTest->takeName(&II);
1538 
1539     return IC.replaceInstUsesWith(II, PTest);
1540   }
1541 
1542   return std::nullopt;
1543 }
1544 
1545 template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1546 static std::optional<Instruction *>
1547 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1548                                   bool MergeIntoAddendOp) {
1549   Value *P = II.getOperand(0);
1550   Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1551   if (MergeIntoAddendOp) {
1552     AddendOp = II.getOperand(1);
1553     Mul = II.getOperand(2);
1554   } else {
1555     AddendOp = II.getOperand(2);
1556     Mul = II.getOperand(1);
1557   }
1558 
1559   if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1560                                       m_Value(MulOp1))))
1561     return std::nullopt;
1562 
1563   if (!Mul->hasOneUse())
1564     return std::nullopt;
1565 
1566   Instruction *FMFSource = nullptr;
1567   if (II.getType()->isFPOrFPVectorTy()) {
1568     llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1569     // Stop the combine when the flags on the inputs differ in case dropping
1570     // flags would lead to us missing out on more beneficial optimizations.
1571     if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1572       return std::nullopt;
1573     if (!FAddFlags.allowContract())
1574       return std::nullopt;
1575     FMFSource = &II;
1576   }
1577 
1578   CallInst *Res;
1579   if (MergeIntoAddendOp)
1580     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1581                                      {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1582   else
1583     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1584                                      {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1585 
1586   return IC.replaceInstUsesWith(II, Res);
1587 }
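
// For example, the fadd+fmul -> fmla case, assuming %m has a single use and
// both calls carry identical fast-math flags that allow contraction
// (illustrative only):
//   %m = call fast <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x float> %b,
//            <vscale x 4 x float> %c)
//   %r = call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x float> %a,
//            <vscale x 4 x float> %m)
// becomes
//   %r = call fast <vscale x 4 x float> @llvm.aarch64.sve.fmla.nxv4f32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x float> %a,
//            <vscale x 4 x float> %b, <vscale x 4 x float> %c)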
1588 
1589 static std::optional<Instruction *>
1590 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1591   Value *Pred = II.getOperand(0);
1592   Value *PtrOp = II.getOperand(1);
1593   Type *VecTy = II.getType();
1594 
1595   // Replace by zero constant when all lanes are inactive
1596   if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1597     return II_NA;
1598 
1599   if (isAllActivePredicate(Pred)) {
1600     LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1601     Load->copyMetadata(II);
1602     return IC.replaceInstUsesWith(II, Load);
1603   }
1604 
1605   CallInst *MaskedLoad =
1606       IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1607                                   Pred, ConstantAggregateZero::get(VecTy));
1608   MaskedLoad->copyMetadata(II);
1609   return IC.replaceInstUsesWith(II, MaskedLoad);
1610 }
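
// For example, assuming no alignment is known for %p beyond the default
// (illustrative only):
//   %v = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(
//            <vscale x 4 x i1> %pg, ptr %p)
// becomes, when %pg is a ptrue(all):
//   %v = load <vscale x 4 x i32>, ptr %p
// and otherwise:
//   %v = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %p, i32 1,
//            <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)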
1611 
1612 static std::optional<Instruction *>
1613 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1614   Value *VecOp = II.getOperand(0);
1615   Value *Pred = II.getOperand(1);
1616   Value *PtrOp = II.getOperand(2);
1617 
1618   if (isAllActivePredicate(Pred)) {
1619     StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1620     Store->copyMetadata(II);
1621     return IC.eraseInstFromFunction(II);
1622   }
1623 
1624   CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1625       VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1626   MaskedStore->copyMetadata(II);
1627   return IC.eraseInstFromFunction(II);
1628 }
1629 
1630 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1631   switch (Intrinsic) {
1632   case Intrinsic::aarch64_sve_fmul_u:
1633     return Instruction::BinaryOps::FMul;
1634   case Intrinsic::aarch64_sve_fadd_u:
1635     return Instruction::BinaryOps::FAdd;
1636   case Intrinsic::aarch64_sve_fsub_u:
1637     return Instruction::BinaryOps::FSub;
1638   default:
1639     return Instruction::BinaryOpsEnd;
1640   }
1641 }
1642 
1643 static std::optional<Instruction *>
1644 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1645   // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1646   if (II.isStrictFP())
1647     return std::nullopt;
1648 
1649   auto *OpPredicate = II.getOperand(0);
1650   auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1651   if (BinOpCode == Instruction::BinaryOpsEnd ||
1652       !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1653                               m_ConstantInt<AArch64SVEPredPattern::all>())))
1654     return std::nullopt;
1655   auto BinOp = IC.Builder.CreateBinOpFMF(
1656       BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
1657   return IC.replaceInstUsesWith(II, BinOp);
1658 }
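
// For example (illustrative only), when %pg is a ptrue with the 'all' pattern:
//   %r = call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(
//            <vscale x 2 x i1> %pg, <vscale x 2 x double> %a,
//            <vscale x 2 x double> %b)
// becomes a plain IR instruction carrying the call's fast-math flags:
//   %r = fadd fast <vscale x 2 x double> %a, %b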
1659 
1660 // Canonicalise operations that take an all active predicate (e.g. sve.add ->
1661 // sve.add_u).
1662 static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1663                                                             Intrinsic::ID IID) {
1664   auto *OpPredicate = II.getOperand(0);
1665   if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1666                               m_ConstantInt<AArch64SVEPredPattern::all>())))
1667     return std::nullopt;
1668 
1669   auto *Mod = II.getModule();
1670   auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1671   II.setCalledFunction(NewDecl);
1672 
1673   return &II;
1674 }
1675 
1676 // Simplify operations where predicate has all inactive lanes or try to replace
1677 // with _u form when all lanes are active
1678 static std::optional<Instruction *>
1679 instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1680                             Intrinsic::ID IID) {
1681   if (match(II.getOperand(0), m_ZeroInt())) {
1682     // llvm_ir: pred(0), op1, op2 - the spec for the merging (sv[func]_m)
1683     // forms says to return op1 when all lanes are inactive.
1684     return IC.replaceInstUsesWith(II, II.getOperand(1));
1685   }
1686   return instCombineSVEAllActive(II, IID);
1687 }
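
// For example (illustrative only), with an all-false governing predicate a
// merging intrinsic simply returns its first data operand:
//   %r = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(
//            <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a,
//            <vscale x 4 x float> %b)
// folds to %a, while the same call with a ptrue(all) predicate is retargeted
// in place to @llvm.aarch64.sve.fadd.u.nxv4f32.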
1688 
1689 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1690                                                             IntrinsicInst &II) {
1691   if (auto II_U =
1692           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1693     return II_U;
1694   if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1695                                                    Intrinsic::aarch64_sve_mla>(
1696           IC, II, true))
1697     return MLA;
1698   if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1699                                                    Intrinsic::aarch64_sve_mad>(
1700           IC, II, false))
1701     return MAD;
1702   return std::nullopt;
1703 }
1704 
1705 static std::optional<Instruction *>
1706 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1707   if (auto II_U =
1708           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1709     return II_U;
1710   if (auto FMLA =
1711           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1712                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1713                                                                          true))
1714     return FMLA;
1715   if (auto FMAD =
1716           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1717                                             Intrinsic::aarch64_sve_fmad>(IC, II,
1718                                                                          false))
1719     return FMAD;
1720   if (auto FMLA =
1721           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1722                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1723                                                                          true))
1724     return FMLA;
1725   return std::nullopt;
1726 }
1727 
1728 static std::optional<Instruction *>
1729 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1730   if (auto FMLA =
1731           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1732                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1733                                                                          true))
1734     return FMLA;
1735   if (auto FMAD =
1736           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1737                                             Intrinsic::aarch64_sve_fmad>(IC, II,
1738                                                                          false))
1739     return FMAD;
1740   if (auto FMLA_U =
1741           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1742                                             Intrinsic::aarch64_sve_fmla_u>(
1743               IC, II, true))
1744     return FMLA_U;
1745   return instCombineSVEVectorBinOp(IC, II);
1746 }
1747 
1748 static std::optional<Instruction *>
1749 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1750   if (auto II_U =
1751           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1752     return II_U;
1753   if (auto FMLS =
1754           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1755                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1756                                                                          true))
1757     return FMLS;
1758   if (auto FMSB =
1759           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1760                                             Intrinsic::aarch64_sve_fnmsb>(
1761               IC, II, false))
1762     return FMSB;
1763   if (auto FMLS =
1764           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1765                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1766                                                                          true))
1767     return FMLS;
1768   return std::nullopt;
1769 }
1770 
1771 static std::optional<Instruction *>
1772 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1773   if (auto FMLS =
1774           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1775                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1776                                                                          true))
1777     return FMLS;
1778   if (auto FMSB =
1779           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1780                                             Intrinsic::aarch64_sve_fnmsb>(
1781               IC, II, false))
1782     return FMSB;
1783   if (auto FMLS_U =
1784           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1785                                             Intrinsic::aarch64_sve_fmls_u>(
1786               IC, II, true))
1787     return FMLS_U;
1788   return instCombineSVEVectorBinOp(IC, II);
1789 }
1790 
1791 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1792                                                             IntrinsicInst &II) {
1793   if (auto II_U =
1794           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1795     return II_U;
1796   if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1797                                                    Intrinsic::aarch64_sve_mls>(
1798           IC, II, true))
1799     return MLS;
1800   return std::nullopt;
1801 }
1802 
1803 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1804                                                             IntrinsicInst &II,
1805                                                             Intrinsic::ID IID) {
1806   auto *OpPredicate = II.getOperand(0);
1807   auto *OpMultiplicand = II.getOperand(1);
1808   auto *OpMultiplier = II.getOperand(2);
1809 
1810   // Return true if a given value is a unit splat, false otherwise.
1811   auto IsUnitSplat = [](auto *I) {
1812     auto *SplatValue = getSplatValue(I);
1813     if (!SplatValue)
1814       return false;
1815     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1816   };
1817 
1818   // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1819   // with a unit splat value, false otherwise.
1820   auto IsUnitDup = [](auto *I) {
1821     auto *IntrI = dyn_cast<IntrinsicInst>(I);
1822     if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1823       return false;
1824 
1825     auto *SplatValue = IntrI->getOperand(2);
1826     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1827   };
1828 
1829   if (IsUnitSplat(OpMultiplier)) {
1830     // [f]mul pg %n, (dupx 1) => %n
1831     OpMultiplicand->takeName(&II);
1832     return IC.replaceInstUsesWith(II, OpMultiplicand);
1833   } else if (IsUnitDup(OpMultiplier)) {
1834     // [f]mul pg %n, (dup pg 1) => %n
1835     auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1836     auto *DupPg = DupInst->getOperand(1);
1837     // TODO: this is naive. The optimization is still valid if DupPg
1838     // 'encompasses' OpPredicate, not only if they're the same predicate.
1839     if (OpPredicate == DupPg) {
1840       OpMultiplicand->takeName(&II);
1841       return IC.replaceInstUsesWith(II, OpMultiplicand);
1842     }
1843   }
1844 
1845   return instCombineSVEVectorBinOp(IC, II);
1846 }
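
// For example (illustrative only), where %ones is an IR splat of 1 (e.g. the
// result of the DupX combine above):
//   %r = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(
//            <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a,
//            <vscale x 8 x i16> %ones)
// is replaced by %a.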
1847 
1848 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1849                                                          IntrinsicInst &II) {
1850   Value *UnpackArg = II.getArgOperand(0);
1851   auto *RetTy = cast<ScalableVectorType>(II.getType());
1852   bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1853                   II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1854 
1855   // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1856   // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
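  // For example, uunpklo of an nxv8i16 splat of i16 %x becomes an nxv4i32
  // splat of (zext i16 %x to i32).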
1857   if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1858     ScalarArg =
1859         IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1860     Value *NewVal =
1861         IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1862     NewVal->takeName(&II);
1863     return IC.replaceInstUsesWith(II, NewVal);
1864   }
1865 
1866   return std::nullopt;
1867 }
1868 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1869                                                       IntrinsicInst &II) {
1870   auto *OpVal = II.getOperand(0);
1871   auto *OpIndices = II.getOperand(1);
1872   VectorType *VTy = cast<VectorType>(II.getType());
1873 
1874   // Check whether OpIndices is a constant splat value < minimal element count
1875   // of result.
1876   auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1877   if (!SplatValue ||
1878       SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1879     return std::nullopt;
1880 
1881   // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1882   // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1883   auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1884   auto *VectorSplat =
1885       IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1886 
1887   VectorSplat->takeName(&II);
1888   return IC.replaceInstUsesWith(II, VectorSplat);
1889 }
1890 
1891 static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1892                                                        IntrinsicInst &II) {
1893   Value *A, *B;
1894   Type *RetTy = II.getType();
1895   constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1896   constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1897 
1898   // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1899   // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1900   if ((match(II.getArgOperand(0),
1901              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1902        match(II.getArgOperand(1),
1903              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1904       (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1905        match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1906     auto *TyA = cast<ScalableVectorType>(A->getType());
1907     if (TyA == B->getType() &&
1908         RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1909       auto *SubVec = IC.Builder.CreateInsertVector(
1910           RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1911       auto *ConcatVec = IC.Builder.CreateInsertVector(
1912           RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1913       ConcatVec->takeName(&II);
1914       return IC.replaceInstUsesWith(II, ConcatVec);
1915     }
1916   }
1917 
1918   return std::nullopt;
1919 }
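
// For example, with A and B of type <vscale x 8 x i1> (illustrative only):
//   %pa = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(
//             <vscale x 8 x i1> %a)
//   %pb = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(
//             <vscale x 8 x i1> %b)
//   %r  = call <vscale x 16 x i1> @llvm.aarch64.sve.uzp1.nxv16i1(
//             <vscale x 16 x i1> %pa, <vscale x 16 x i1> %pb)
// becomes
//   %t = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(
//            <vscale x 16 x i1> poison, <vscale x 8 x i1> %a, i64 0)
//   %r = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(
//            <vscale x 16 x i1> %t, <vscale x 8 x i1> %b, i64 8)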
1920 
1921 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1922                                                       IntrinsicInst &II) {
1923   // zip1(uzp1(A, B), uzp2(A, B)) --> A
1924   // zip2(uzp1(A, B), uzp2(A, B)) --> B
1925   Value *A, *B;
1926   if (match(II.getArgOperand(0),
1927             m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1928       match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1929                                      m_Specific(A), m_Specific(B))))
1930     return IC.replaceInstUsesWith(
1931         II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1932 
1933   return std::nullopt;
1934 }
1935 
1936 static std::optional<Instruction *>
1937 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1938   Value *Mask = II.getOperand(0);
1939   Value *BasePtr = II.getOperand(1);
1940   Value *Index = II.getOperand(2);
1941   Type *Ty = II.getType();
1942   Value *PassThru = ConstantAggregateZero::get(Ty);
1943 
1944   // Replace by zero constant when all lanes are inactive
1945   if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1946     return II_NA;
1947 
1948   // Contiguous gather => masked load.
1949   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1950   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1951   Value *IndexBase;
1952   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1953                        m_Value(IndexBase), m_SpecificInt(1)))) {
1954     Align Alignment =
1955         BasePtr->getPointerAlignment(II.getDataLayout());
1956 
1957     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1958                                       BasePtr, IndexBase);
1959     CallInst *MaskedLoad =
1960         IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1961     MaskedLoad->takeName(&II);
1962     return IC.replaceInstUsesWith(II, MaskedLoad);
1963   }
1964 
1965   return std::nullopt;
1966 }
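
// For example, assuming no alignment is known for %p beyond the default
// (illustrative only):
//   %idx = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base,
//                                                                  i64 1)
//   %v   = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(
//              <vscale x 2 x i1> %pg, ptr %p, <vscale x 2 x i64> %idx)
// becomes
//   %gep = getelementptr i64, ptr %p, i64 %base
//   %v   = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %gep,
//              i32 1, <vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)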
1967 
1968 static std::optional<Instruction *>
1969 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1970   Value *Val = II.getOperand(0);
1971   Value *Mask = II.getOperand(1);
1972   Value *BasePtr = II.getOperand(2);
1973   Value *Index = II.getOperand(3);
1974   Type *Ty = Val->getType();
1975 
1976   // Contiguous scatter => masked store.
1977   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1978   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1979   Value *IndexBase;
1980   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1981                        m_Value(IndexBase), m_SpecificInt(1)))) {
1982     Align Alignment =
1983         BasePtr->getPointerAlignment(II.getDataLayout());
1984 
1985     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1986                                       BasePtr, IndexBase);
1987     (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1988 
1989     return IC.eraseInstFromFunction(II);
1990   }
1991 
1992   return std::nullopt;
1993 }
1994 
1995 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1996                                                        IntrinsicInst &II) {
1997   Type *Int32Ty = IC.Builder.getInt32Ty();
1998   Value *Pred = II.getOperand(0);
1999   Value *Vec = II.getOperand(1);
2000   Value *DivVec = II.getOperand(2);
2001 
2002   Value *SplatValue = getSplatValue(DivVec);
2003   ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2004   if (!SplatConstantInt)
2005     return std::nullopt;
2006 
2007   APInt Divisor = SplatConstantInt->getValue();
2008   const int64_t DivisorValue = Divisor.getSExtValue();
2009   if (DivisorValue == -1)
2010     return std::nullopt;
2011   if (DivisorValue == 1)
2012     IC.replaceInstUsesWith(II, Vec);
2013 
2014   if (Divisor.isPowerOf2()) {
2015     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2016     auto ASRD = IC.Builder.CreateIntrinsic(
2017         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2018     return IC.replaceInstUsesWith(II, ASRD);
2019   }
2020   if (Divisor.isNegatedPowerOf2()) {
2021     Divisor.negate();
2022     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2023     auto ASRD = IC.Builder.CreateIntrinsic(
2024         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2025     auto NEG = IC.Builder.CreateIntrinsic(
2026         Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2027     return IC.replaceInstUsesWith(II, NEG);
2028   }
2029 
2030   return std::nullopt;
2031 }
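
// For example (illustrative only), where %splat8 is a splat of 8:
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
//            <vscale x 4 x i32> %splat8)
// becomes
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.asrd.nxv4i32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, i32 3)
// and a splat of -8 additionally negates the ASRD result via sve.neg.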
2032 
2033 bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2034   size_t VecSize = Vec.size();
2035   if (VecSize == 1)
2036     return true;
2037   if (!isPowerOf2_64(VecSize))
2038     return false;
2039   size_t HalfVecSize = VecSize / 2;
2040 
2041   for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2042        RHS != Vec.end(); LHS++, RHS++) {
2043     if (*LHS != nullptr && *RHS != nullptr) {
2044       if (*LHS == *RHS)
2045         continue;
2046       else
2047         return false;
2048     }
2049     if (!AllowPoison)
2050       return false;
2051     if (*LHS == nullptr && *RHS != nullptr)
2052       *LHS = *RHS;
2053   }
2054 
2055   Vec.resize(HalfVecSize);
2056   SimplifyValuePattern(Vec, AllowPoison);
2057   return true;
2058 }
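
// For example, {a, b, a, b} simplifies to {a, b}; with AllowPoison,
// {a, nullptr, a, b} also simplifies to {a, b} because the missing (poison)
// lane may take any value; {a, b, b, a} does not simplify.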
2059 
2060 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2061 // to dupqlane(f64(C)) where C is A concatenated with B
2062 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2063                                                            IntrinsicInst &II) {
2064   Value *CurrentInsertElt = nullptr, *Default = nullptr;
2065   if (!match(II.getOperand(0),
2066              m_Intrinsic<Intrinsic::vector_insert>(
2067                  m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2068       !isa<FixedVectorType>(CurrentInsertElt->getType()))
2069     return std::nullopt;
2070   auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2071 
2072   // Insert the scalars into a container ordered by InsertElement index
2073   SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2074   while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2075     auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2076     Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2077     CurrentInsertElt = InsertElt->getOperand(0);
2078   }
2079 
2080   bool AllowPoison =
2081       isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2082   if (!SimplifyValuePattern(Elts, AllowPoison))
2083     return std::nullopt;
2084 
2085   // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2086   Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2087   for (size_t I = 0; I < Elts.size(); I++) {
2088     if (Elts[I] == nullptr)
2089       continue;
2090     InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2091                                                     IC.Builder.getInt64(I));
2092   }
2093   if (InsertEltChain == nullptr)
2094     return std::nullopt;
2095 
2096   // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2097   // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2098   // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2099   // be narrowed back to the original type.
2100   unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2101   unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2102                                  IIScalableTy->getMinNumElements() /
2103                                  PatternWidth;
2104 
2105   IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2106   auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2107   auto *WideShuffleMaskTy =
2108       ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2109 
2110   auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2111   auto InsertSubvector = IC.Builder.CreateInsertVector(
2112       II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2113   auto WideBitcast =
2114       IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2115   auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2116   auto WideShuffle = IC.Builder.CreateShuffleVector(
2117       WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2118   auto NarrowBitcast =
2119       IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2120 
2121   return IC.replaceInstUsesWith(II, NarrowBitcast);
2122 }
2123 
2124 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2125                                                         IntrinsicInst &II) {
2126   Value *A = II.getArgOperand(0);
2127   Value *B = II.getArgOperand(1);
2128   if (A == B)
2129     return IC.replaceInstUsesWith(II, A);
2130 
2131   return std::nullopt;
2132 }
2133 
2134 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2135                                                         IntrinsicInst &II) {
2136   Value *Pred = II.getOperand(0);
2137   Value *Vec = II.getOperand(1);
2138   Value *Shift = II.getOperand(2);
2139 
2140   // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2141   Value *AbsPred, *MergedValue;
2142   if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2143                       m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2144       !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2145                       m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2147     return std::nullopt;
2148 
2149   // Transform is valid if any of the following are true:
2150   // * The ABS merge value is an undef or non-negative
2151   // * The ABS predicate is all active
2152   // * The ABS predicate and the SRSHL predicates are the same
2153   if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2154       AbsPred != Pred && !isAllActivePredicate(AbsPred))
2155     return std::nullopt;
2156 
2157   // Only valid when the shift amount is non-negative, otherwise the rounding
2158   // behaviour of SRSHL cannot be ignored.
2159   if (!match(Shift, m_NonNegative()))
2160     return std::nullopt;
2161 
2162   auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2163                                         {II.getType()}, {Pred, Vec, Shift});
2164 
2165   return IC.replaceInstUsesWith(II, LSL);
2166 }
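
// For example (illustrative only), where %shift is a splat of a non-negative
// constant:
//   %abs = call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(
//              <vscale x 4 x i32> undef, <vscale x 4 x i1> %pg,
//              <vscale x 4 x i32> %x)
//   %r   = call <vscale x 4 x i32> @llvm.aarch64.sve.srshl.nxv4i32(
//              <vscale x 4 x i1> %pg, <vscale x 4 x i32> %abs,
//              <vscale x 4 x i32> %shift)
// becomes
//   %r   = call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(
//              <vscale x 4 x i1> %pg, <vscale x 4 x i32> %abs,
//              <vscale x 4 x i32> %shift)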
2167 
2168 static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2169                                                        IntrinsicInst &II) {
2170   Value *Vec = II.getOperand(0);
2171 
2172   if (getSplatValue(Vec) == II.getOperand(1))
2173     return IC.replaceInstUsesWith(II, Vec);
2174 
2175   return std::nullopt;
2176 }
2177 
2178 static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2179                                                    IntrinsicInst &II) {
2180   // If this barrier is post-dominated by an identical one, we can remove it.
2181   auto *NI = II.getNextNonDebugInstruction();
2182   unsigned LookaheadThreshold = DMBLookaheadThreshold;
2183   auto CanSkipOver = [](Instruction *I) {
2184     return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2185   };
2186   while (LookaheadThreshold-- && CanSkipOver(NI)) {
2187     auto *NIBB = NI->getParent();
2188     NI = NI->getNextNonDebugInstruction();
2189     if (!NI) {
2190       if (auto *SuccBB = NIBB->getUniqueSuccessor())
2191         NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2192       else
2193         break;
2194     }
2195   }
2196   auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2197   if (NextII && II.isIdenticalTo(NextII))
2198     return IC.eraseInstFromFunction(II);
2199 
2200   return std::nullopt;
2201 }
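
// For example (illustrative only), given
//   call void @llvm.aarch64.dmb(i32 11) ; ish
//   ... a handful of instructions with no memory effects ...
//   call void @llvm.aarch64.dmb(i32 11) ; ish
// the first barrier is redundant and is erased.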
2202 
2203 std::optional<Instruction *>
2204 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2205                                      IntrinsicInst &II) const {
2206   Intrinsic::ID IID = II.getIntrinsicID();
2207   switch (IID) {
2208   default:
2209     break;
2210   case Intrinsic::aarch64_dmb:
2211     return instCombineDMB(IC, II);
2212   case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
2213   case Intrinsic::aarch64_sve_fcvt_f16f32:
2214   case Intrinsic::aarch64_sve_fcvt_f16f64:
2215   case Intrinsic::aarch64_sve_fcvt_f32f16:
2216   case Intrinsic::aarch64_sve_fcvt_f32f64:
2217   case Intrinsic::aarch64_sve_fcvt_f64f16:
2218   case Intrinsic::aarch64_sve_fcvt_f64f32:
2219   case Intrinsic::aarch64_sve_fcvtlt_f32f16:
2220   case Intrinsic::aarch64_sve_fcvtlt_f64f32:
2221   case Intrinsic::aarch64_sve_fcvtx_f32f64:
2222   case Intrinsic::aarch64_sve_fcvtzs:
2223   case Intrinsic::aarch64_sve_fcvtzs_i32f16:
2224   case Intrinsic::aarch64_sve_fcvtzs_i32f64:
2225   case Intrinsic::aarch64_sve_fcvtzs_i64f16:
2226   case Intrinsic::aarch64_sve_fcvtzs_i64f32:
2227   case Intrinsic::aarch64_sve_fcvtzu:
2228   case Intrinsic::aarch64_sve_fcvtzu_i32f16:
2229   case Intrinsic::aarch64_sve_fcvtzu_i32f64:
2230   case Intrinsic::aarch64_sve_fcvtzu_i64f16:
2231   case Intrinsic::aarch64_sve_fcvtzu_i64f32:
2232   case Intrinsic::aarch64_sve_scvtf:
2233   case Intrinsic::aarch64_sve_scvtf_f16i32:
2234   case Intrinsic::aarch64_sve_scvtf_f16i64:
2235   case Intrinsic::aarch64_sve_scvtf_f32i64:
2236   case Intrinsic::aarch64_sve_scvtf_f64i32:
2237   case Intrinsic::aarch64_sve_ucvtf:
2238   case Intrinsic::aarch64_sve_ucvtf_f16i32:
2239   case Intrinsic::aarch64_sve_ucvtf_f16i64:
2240   case Intrinsic::aarch64_sve_ucvtf_f32i64:
2241   case Intrinsic::aarch64_sve_ucvtf_f64i32:
2242     return instCombineSVEAllOrNoActiveUnary(IC, II);
2243   case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
2244   case Intrinsic::aarch64_sve_fcvtnt_f16f32:
2245   case Intrinsic::aarch64_sve_fcvtnt_f32f64:
2246   case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
2247     return instCombineSVENoActiveReplace(IC, II, true);
2248   case Intrinsic::aarch64_sve_st1_scatter:
2249   case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2250   case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2251   case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2252   case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2253   case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2254   case Intrinsic::aarch64_sve_st1dq:
2255   case Intrinsic::aarch64_sve_st1q_scatter_index:
2256   case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2257   case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2258   case Intrinsic::aarch64_sve_st1wq:
2259   case Intrinsic::aarch64_sve_stnt1:
2260   case Intrinsic::aarch64_sve_stnt1_scatter:
2261   case Intrinsic::aarch64_sve_stnt1_scatter_index:
2262   case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2263   case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2264     return instCombineSVENoActiveUnaryErase(IC, II, 1);
2265   case Intrinsic::aarch64_sve_st2:
2266   case Intrinsic::aarch64_sve_st2q:
2267     return instCombineSVENoActiveUnaryErase(IC, II, 2);
2268   case Intrinsic::aarch64_sve_st3:
2269   case Intrinsic::aarch64_sve_st3q:
2270     return instCombineSVENoActiveUnaryErase(IC, II, 3);
2271   case Intrinsic::aarch64_sve_st4:
2272   case Intrinsic::aarch64_sve_st4q:
2273     return instCombineSVENoActiveUnaryErase(IC, II, 4);
2274   case Intrinsic::aarch64_sve_addqv:
2275   case Intrinsic::aarch64_sve_and_z:
2276   case Intrinsic::aarch64_sve_bic_z:
2277   case Intrinsic::aarch64_sve_brka_z:
2278   case Intrinsic::aarch64_sve_brkb_z:
2279   case Intrinsic::aarch64_sve_brkn_z:
2280   case Intrinsic::aarch64_sve_brkpa_z:
2281   case Intrinsic::aarch64_sve_brkpb_z:
2282   case Intrinsic::aarch64_sve_cntp:
2283   case Intrinsic::aarch64_sve_compact:
2284   case Intrinsic::aarch64_sve_eor_z:
2285   case Intrinsic::aarch64_sve_eorv:
2286   case Intrinsic::aarch64_sve_eorqv:
2287   case Intrinsic::aarch64_sve_nand_z:
2288   case Intrinsic::aarch64_sve_nor_z:
2289   case Intrinsic::aarch64_sve_orn_z:
2290   case Intrinsic::aarch64_sve_orr_z:
2291   case Intrinsic::aarch64_sve_orv:
2292   case Intrinsic::aarch64_sve_orqv:
2293   case Intrinsic::aarch64_sve_pnext:
2294   case Intrinsic::aarch64_sve_rdffr_z:
2295   case Intrinsic::aarch64_sve_saddv:
2296   case Intrinsic::aarch64_sve_uaddv:
2297   case Intrinsic::aarch64_sve_umaxv:
2298   case Intrinsic::aarch64_sve_umaxqv:
2299   case Intrinsic::aarch64_sve_cmpeq:
2300   case Intrinsic::aarch64_sve_cmpeq_wide:
2301   case Intrinsic::aarch64_sve_cmpge:
2302   case Intrinsic::aarch64_sve_cmpge_wide:
2303   case Intrinsic::aarch64_sve_cmpgt:
2304   case Intrinsic::aarch64_sve_cmpgt_wide:
2305   case Intrinsic::aarch64_sve_cmphi:
2306   case Intrinsic::aarch64_sve_cmphi_wide:
2307   case Intrinsic::aarch64_sve_cmphs:
2308   case Intrinsic::aarch64_sve_cmphs_wide:
2309   case Intrinsic::aarch64_sve_cmple_wide:
2310   case Intrinsic::aarch64_sve_cmplo_wide:
2311   case Intrinsic::aarch64_sve_cmpls_wide:
2312   case Intrinsic::aarch64_sve_cmplt_wide:
2313   case Intrinsic::aarch64_sve_facge:
2314   case Intrinsic::aarch64_sve_facgt:
2315   case Intrinsic::aarch64_sve_fcmpeq:
2316   case Intrinsic::aarch64_sve_fcmpge:
2317   case Intrinsic::aarch64_sve_fcmpgt:
2318   case Intrinsic::aarch64_sve_fcmpne:
2319   case Intrinsic::aarch64_sve_fcmpuo:
2320   case Intrinsic::aarch64_sve_ld1_gather:
2321   case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2322   case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2323   case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2324   case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2325   case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2326   case Intrinsic::aarch64_sve_ld1q_gather_index:
2327   case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2328   case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2329   case Intrinsic::aarch64_sve_ld1ro:
2330   case Intrinsic::aarch64_sve_ld1rq:
2331   case Intrinsic::aarch64_sve_ld1udq:
2332   case Intrinsic::aarch64_sve_ld1uwq:
2333   case Intrinsic::aarch64_sve_ld2_sret:
2334   case Intrinsic::aarch64_sve_ld2q_sret:
2335   case Intrinsic::aarch64_sve_ld3_sret:
2336   case Intrinsic::aarch64_sve_ld3q_sret:
2337   case Intrinsic::aarch64_sve_ld4_sret:
2338   case Intrinsic::aarch64_sve_ld4q_sret:
2339   case Intrinsic::aarch64_sve_ldff1:
2340   case Intrinsic::aarch64_sve_ldff1_gather:
2341   case Intrinsic::aarch64_sve_ldff1_gather_index:
2342   case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2343   case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2344   case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2345   case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2346   case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2347   case Intrinsic::aarch64_sve_ldnf1:
2348   case Intrinsic::aarch64_sve_ldnt1:
2349   case Intrinsic::aarch64_sve_ldnt1_gather:
2350   case Intrinsic::aarch64_sve_ldnt1_gather_index:
2351   case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2352   case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2353     return instCombineSVENoActiveZero(IC, II);
2354   case Intrinsic::aarch64_sve_prf:
2355   case Intrinsic::aarch64_sve_prfb_gather_index:
2356   case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2357   case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2358   case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2359   case Intrinsic::aarch64_sve_prfd_gather_index:
2360   case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2361   case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2362   case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2363   case Intrinsic::aarch64_sve_prfh_gather_index:
2364   case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2365   case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2366   case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2367   case Intrinsic::aarch64_sve_prfw_gather_index:
2368   case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2369   case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2370   case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2371     return instCombineSVENoActiveUnaryErase(IC, II, 0);
2372   case Intrinsic::aarch64_neon_fmaxnm:
2373   case Intrinsic::aarch64_neon_fminnm:
2374     return instCombineMaxMinNM(IC, II);
2375   case Intrinsic::aarch64_sve_convert_from_svbool:
2376     return instCombineConvertFromSVBool(IC, II);
2377   case Intrinsic::aarch64_sve_dup:
2378     return instCombineSVEDup(IC, II);
2379   case Intrinsic::aarch64_sve_dup_x:
2380     return instCombineSVEDupX(IC, II);
2381   case Intrinsic::aarch64_sve_cmpne:
2382   case Intrinsic::aarch64_sve_cmpne_wide:
2383     return instCombineSVECmpNE(IC, II);
2384   case Intrinsic::aarch64_sve_rdffr:
2385     return instCombineRDFFR(IC, II);
2386   case Intrinsic::aarch64_sve_lasta:
2387   case Intrinsic::aarch64_sve_lastb:
2388     return instCombineSVELast(IC, II);
2389   case Intrinsic::aarch64_sve_clasta_n:
2390   case Intrinsic::aarch64_sve_clastb_n:
2391     return instCombineSVECondLast(IC, II);
2392   case Intrinsic::aarch64_sve_cntd:
2393     return instCombineSVECntElts(IC, II, 2);
2394   case Intrinsic::aarch64_sve_cntw:
2395     return instCombineSVECntElts(IC, II, 4);
2396   case Intrinsic::aarch64_sve_cnth:
2397     return instCombineSVECntElts(IC, II, 8);
2398   case Intrinsic::aarch64_sve_cntb:
2399     return instCombineSVECntElts(IC, II, 16);
2400   case Intrinsic::aarch64_sve_ptest_any:
2401   case Intrinsic::aarch64_sve_ptest_first:
2402   case Intrinsic::aarch64_sve_ptest_last:
2403     return instCombineSVEPTest(IC, II);
2404   case Intrinsic::aarch64_sve_fabd:
2405     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2406   case Intrinsic::aarch64_sve_fadd:
2407     return instCombineSVEVectorFAdd(IC, II);
2408   case Intrinsic::aarch64_sve_fadd_u:
2409     return instCombineSVEVectorFAddU(IC, II);
2410   case Intrinsic::aarch64_sve_fdiv:
2411     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2412   case Intrinsic::aarch64_sve_fmax:
2413     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2414   case Intrinsic::aarch64_sve_fmaxnm:
2415     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2416   case Intrinsic::aarch64_sve_fmin:
2417     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2418   case Intrinsic::aarch64_sve_fminnm:
2419     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2420   case Intrinsic::aarch64_sve_fmla:
2421     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2422   case Intrinsic::aarch64_sve_fmls:
2423     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2424   case Intrinsic::aarch64_sve_fmul:
2425     if (auto II_U =
2426             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2427       return II_U;
2428     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2429   case Intrinsic::aarch64_sve_fmul_u:
2430     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2431   case Intrinsic::aarch64_sve_fmulx:
2432     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2433   case Intrinsic::aarch64_sve_fnmla:
2434     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2435   case Intrinsic::aarch64_sve_fnmls:
2436     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2437   case Intrinsic::aarch64_sve_fsub:
2438     return instCombineSVEVectorFSub(IC, II);
2439   case Intrinsic::aarch64_sve_fsub_u:
2440     return instCombineSVEVectorFSubU(IC, II);
2441   case Intrinsic::aarch64_sve_add:
2442     return instCombineSVEVectorAdd(IC, II);
2443   case Intrinsic::aarch64_sve_add_u:
2444     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2445                                              Intrinsic::aarch64_sve_mla_u>(
2446         IC, II, true);
2447   case Intrinsic::aarch64_sve_mla:
2448     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2449   case Intrinsic::aarch64_sve_mls:
2450     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2451   case Intrinsic::aarch64_sve_mul:
2452     if (auto II_U =
2453             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2454       return II_U;
2455     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2456   case Intrinsic::aarch64_sve_mul_u:
2457     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2458   case Intrinsic::aarch64_sve_sabd:
2459     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2460   case Intrinsic::aarch64_sve_smax:
2461     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2462   case Intrinsic::aarch64_sve_smin:
2463     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2464   case Intrinsic::aarch64_sve_smulh:
2465     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2466   case Intrinsic::aarch64_sve_sub:
2467     return instCombineSVEVectorSub(IC, II);
2468   case Intrinsic::aarch64_sve_sub_u:
2469     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2470                                              Intrinsic::aarch64_sve_mls_u>(
2471         IC, II, true);
2472   case Intrinsic::aarch64_sve_uabd:
2473     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2474   case Intrinsic::aarch64_sve_umax:
2475     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2476   case Intrinsic::aarch64_sve_umin:
2477     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2478   case Intrinsic::aarch64_sve_umulh:
2479     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2480   case Intrinsic::aarch64_sve_asr:
2481     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2482   case Intrinsic::aarch64_sve_lsl:
2483     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2484   case Intrinsic::aarch64_sve_lsr:
2485     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2486   case Intrinsic::aarch64_sve_and:
2487     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2488   case Intrinsic::aarch64_sve_bic:
2489     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2490   case Intrinsic::aarch64_sve_eor:
2491     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2492   case Intrinsic::aarch64_sve_orr:
2493     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2494   case Intrinsic::aarch64_sve_sqsub:
2495     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2496   case Intrinsic::aarch64_sve_uqsub:
2497     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2498   case Intrinsic::aarch64_sve_tbl:
2499     return instCombineSVETBL(IC, II);
2500   case Intrinsic::aarch64_sve_uunpkhi:
2501   case Intrinsic::aarch64_sve_uunpklo:
2502   case Intrinsic::aarch64_sve_sunpkhi:
2503   case Intrinsic::aarch64_sve_sunpklo:
2504     return instCombineSVEUnpack(IC, II);
2505   case Intrinsic::aarch64_sve_uzp1:
2506     return instCombineSVEUzp1(IC, II);
2507   case Intrinsic::aarch64_sve_zip1:
2508   case Intrinsic::aarch64_sve_zip2:
2509     return instCombineSVEZip(IC, II);
2510   case Intrinsic::aarch64_sve_ld1_gather_index:
2511     return instCombineLD1GatherIndex(IC, II);
2512   case Intrinsic::aarch64_sve_st1_scatter_index:
2513     return instCombineST1ScatterIndex(IC, II);
2514   case Intrinsic::aarch64_sve_ld1:
2515     return instCombineSVELD1(IC, II, DL);
2516   case Intrinsic::aarch64_sve_st1:
2517     return instCombineSVEST1(IC, II, DL);
2518   case Intrinsic::aarch64_sve_sdiv:
2519     return instCombineSVESDIV(IC, II);
2520   case Intrinsic::aarch64_sve_sel:
2521     return instCombineSVESel(IC, II);
2522   case Intrinsic::aarch64_sve_srshl:
2523     return instCombineSVESrshl(IC, II);
2524   case Intrinsic::aarch64_sve_dupq_lane:
2525     return instCombineSVEDupqLane(IC, II);
2526   case Intrinsic::aarch64_sve_insr:
2527     return instCombineSVEInsr(IC, II);
2528   }
2529 
2530   return std::nullopt;
2531 }
2532 
2533 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2534     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2535     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2536     std::function<void(Instruction *, unsigned, APInt, APInt &)>
2537         SimplifyAndSetOp) const {
2538   switch (II.getIntrinsicID()) {
2539   default:
2540     break;
2541   case Intrinsic::aarch64_neon_fcvtxn:
2542   case Intrinsic::aarch64_neon_rshrn:
2543   case Intrinsic::aarch64_neon_sqrshrn:
2544   case Intrinsic::aarch64_neon_sqrshrun:
2545   case Intrinsic::aarch64_neon_sqshrn:
2546   case Intrinsic::aarch64_neon_sqshrun:
2547   case Intrinsic::aarch64_neon_sqxtn:
2548   case Intrinsic::aarch64_neon_sqxtun:
2549   case Intrinsic::aarch64_neon_uqrshrn:
2550   case Intrinsic::aarch64_neon_uqshrn:
2551   case Intrinsic::aarch64_neon_uqxtn:
2552     SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2553     break;
2554   }
2555 
2556   return std::nullopt;
2557 }
2558 
2559 bool AArch64TTIImpl::enableScalableVectorization() const {
2560   return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2561                                   EnableScalableAutovecInStreamingMode);
2562 }
2563 
2564 TypeSize
2565 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2566   switch (K) {
2567   case TargetTransformInfo::RGK_Scalar:
2568     return TypeSize::getFixed(64);
2569   case TargetTransformInfo::RGK_FixedWidthVector:
2570     if (ST->useSVEForFixedLengthVectors() &&
2571         (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2572       return TypeSize::getFixed(
2573           std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2574     else if (ST->isNeonAvailable())
2575       return TypeSize::getFixed(128);
2576     else
2577       return TypeSize::getFixed(0);
2578   case TargetTransformInfo::RGK_ScalableVector:
2579     if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2580                                  EnableScalableAutovecInStreamingMode))
2581       return TypeSize::getScalable(128);
2582     else
2583       return TypeSize::getScalable(0);
2584   }
2585   llvm_unreachable("Unsupported register kind");
2586 }
2587 
2588 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2589                                            ArrayRef<const Value *> Args,
2590                                            Type *SrcOverrideTy) {
2591   // A helper that returns a vector type with the scalar type of ArgTy and the
2592   // element count of DstTy, which determines the vector width.
2593   auto toVectorTy = [&](Type *ArgTy) {
2594     return VectorType::get(ArgTy->getScalarType(),
2595                            cast<VectorType>(DstTy)->getElementCount());
2596   };
2597 
2598   // Exit early if DstTy is not a vector type whose elements are one of [i16,
2599   // i32, i64]. SVE doesn't generally have the same set of instructions to
2600   // perform an extend with the add/sub/mul. There are SMULLB style
2601   // instructions, but they operate on top/bottom, requiring some sort of lane
2602   // interleaving to be used with zext/sext.
2603   unsigned DstEltSize = DstTy->getScalarSizeInBits();
2604   if (!useNeonVector(DstTy) || Args.size() != 2 ||
2605       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2606     return false;
2607 
2608   // Determine if the operation has a widening variant. We consider both the
2609   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2610   // instructions.
2611   //
2612   // TODO: Add additional widening operations (e.g., shl, etc.) once we
2613   //       verify that their extending operands are eliminated during code
2614   //       generation.
2615   Type *SrcTy = SrcOverrideTy;
2616   switch (Opcode) {
2617   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2618   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2619     // The second operand needs to be an extend
2620     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2621       if (!SrcTy)
2622         SrcTy =
2623             toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2624     } else
2625       return false;
2626     break;
2627   case Instruction::Mul: { // SMULL(2), UMULL(2)
2628     // Both operands need to be extends of the same type.
2629     if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2630         (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2631       if (!SrcTy)
2632         SrcTy =
2633             toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2634     } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2635       // If one of the operands is a Zext and the other has enough zero bits to
2636       // be treated as unsigned, we can still generate a umull, meaning the zext
2637       // is free.
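      // For instance (a rough sketch), in a v8i16 mul where Args[0] is a
      // zext from <8 x i8> and Args[1] is known to have its top 8 bits clear
      // (say it came through an 'and' with 255), both inputs fit in 8 unsigned
      // bits, so a umull of the i8 halves can still be formed.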
2638       KnownBits Known =
2639           computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2640       if (Args[0]->getType()->getScalarSizeInBits() -
2641               Known.Zero.countLeadingOnes() >
2642           DstTy->getScalarSizeInBits() / 2)
2643         return false;
2644       if (!SrcTy)
2645         SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2646                                            DstTy->getScalarSizeInBits() / 2));
2647     } else
2648       return false;
2649     break;
2650   }
2651   default:
2652     return false;
2653   }
2654 
2655   // Legalize the destination type and ensure it can be used in a widening
2656   // operation.
2657   auto DstTyL = getTypeLegalizationCost(DstTy);
2658   if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2659     return false;
2660 
2661   // Legalize the source type and ensure it can be used in a widening
2662   // operation.
2663   assert(SrcTy && "Expected some SrcTy");
2664   auto SrcTyL = getTypeLegalizationCost(SrcTy);
2665   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2666   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2667     return false;
2668 
2669   // Get the total number of vector elements in the legalized types.
2670   InstructionCost NumDstEls =
2671       DstTyL.first * DstTyL.second.getVectorMinNumElements();
2672   InstructionCost NumSrcEls =
2673       SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2674 
2675   // Return true if the legalized types have the same number of vector elements
2676   // and the destination element type size is twice that of the source type.
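  // As a rough worked example, for add(<8 x i16> %b, zext(<8 x i8> %a)) the
  // legalized types are v8i16 and v8i8, both with 8 elements, and
  // 2 * 8 == 16, so this returns true (the add can be selected as a uaddw).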
2677   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2678 }
2679 
2680 // s/urhadd instructions implement the following pattern, making the
2681 // extends free:
2682 //   %x = add ((zext i8 -> i16), 1)
2683 //   %y = (zext i8 -> i16)
2684 //   trunc i16 (lshr (add %x, %y), 1) -> i8
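// A minimal IR sketch of the shape being matched (operand order may vary):
//   %ax = zext <8 x i8> %a to <8 x i16>
//   %bx = zext <8 x i8> %b to <8 x i16>
//   %t  = add <8 x i16> %ax, (splat 1)
//   %s  = add <8 x i16> %t, %bx
//   %h  = lshr <8 x i16> %s, (splat 1)
//   %r  = trunc <8 x i16> %h to <8 x i8>
// which can be selected as a single urhadd, making both zexts free.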
2685 //
2686 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2687                                         Type *Src) {
2688   // The source should be a legal vector type.
2689   if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2690       (Src->isScalableTy() && !ST->hasSVE2()))
2691     return false;
2692 
2693   if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2694     return false;
2695 
2696   // Look for trunc/shl/add before trying to match the pattern.
2697   const Instruction *Add = ExtUser;
2698   auto *AddUser =
2699       dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2700   if (AddUser && AddUser->getOpcode() == Instruction::Add)
2701     Add = AddUser;
2702 
2703   auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2704   if (!Shr || Shr->getOpcode() != Instruction::LShr)
2705     return false;
2706 
2707   auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2708   if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2709       Src->getScalarSizeInBits() !=
2710           cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2711     return false;
2712 
2713   // Try to match the whole pattern. Ext could be either the first or second
2714   // m_ZExtOrSExt matched.
2715   Instruction *Ex1, *Ex2;
2716   if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2717                            m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2718     return false;
2719 
2720   // Ensure both extends are of the same type
2721   if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2722       Ex1->getOpcode() == Ex2->getOpcode())
2723     return true;
2724 
2725   return false;
2726 }
2727 
2728 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2729                                                  Type *Src,
2730                                                  TTI::CastContextHint CCH,
2731                                                  TTI::TargetCostKind CostKind,
2732                                                  const Instruction *I) {
2733   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2734   assert(ISD && "Invalid opcode");
2735   // If the cast is observable, and it is used by a widening instruction (e.g.,
2736   // uaddl, saddw, etc.), it may be free.
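  // For instance, given
  //   %e = zext <8 x i8> %a to <8 x i16>
  //   %r = add <8 x i16> %b, %e
  // the add can typically be selected as uaddw (.8h = .8h + zext(.8b)), so
  // the zext is not charged separately.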
2737   if (I && I->hasOneUser()) {
2738     auto *SingleUser = cast<Instruction>(*I->user_begin());
2739     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2740     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2741       // For adds, the second operand is always free. The first operand is only
2742       // free when both operands are extends of the same kind, so in
2743       // add(sext, zext) just one of the two operands is counted as free.
2744       if (SingleUser->getOpcode() == Instruction::Add) {
2745         if (I == SingleUser->getOperand(1) ||
2746             (isa<CastInst>(SingleUser->getOperand(1)) &&
2747              cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2748           return 0;
2749       } else // Others are free so long as isWideningInstruction returned true.
2750         return 0;
2751     }
2752 
2753     // The cast will be free for the s/urhadd instructions
2754     if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2755         isExtPartOfAvgExpr(SingleUser, Dst, Src))
2756       return 0;
2757   }
2758 
2759   // TODO: Allow non-throughput costs that aren't binary.
2760   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2761     if (CostKind != TTI::TCK_RecipThroughput)
2762       return Cost == 0 ? 0 : 1;
2763     return Cost;
2764   };
2765 
2766   EVT SrcTy = TLI->getValueType(DL, Src);
2767   EVT DstTy = TLI->getValueType(DL, Dst);
2768 
2769   if (!SrcTy.isSimple() || !DstTy.isSimple())
2770     return AdjustCost(
2771         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2772 
2773   static const TypeConversionCostTblEntry BF16Tbl[] = {
2774       {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1},     // bfcvt
2775       {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1},     // bfcvt
2776       {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
2777       {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
2778       {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
2779       {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
2780       {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
2781   };
2782 
2783   if (ST->hasBF16())
2784     if (const auto *Entry = ConvertCostTableLookup(
2785             BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2786       return AdjustCost(Entry->Cost);
2787 
2788   static const TypeConversionCostTblEntry ConversionTbl[] = {
2789       {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1},    // xtn
2790       {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1},   // xtn
2791       {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1},   // xtn
2792       {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1},    // xtn
2793       {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3},    // 2 xtn + 1 uzp1
2794       {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1},   // xtn
2795       {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2},   // 1 uzp1 + 1 xtn
2796       {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1},   // 1 uzp1
2797       {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1},    // 1 xtn
2798       {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2},    // 1 uzp1 + 1 xtn
2799       {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4},    // 3 x uzp1 + xtn
2800       {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1},   // 1 uzp1
2801       {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3},   // 3 x uzp1
2802       {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2},   // 2 x uzp1
2803       {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1},  // uzp1
2804       {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3},  // (2 + 1) x uzp1
2805       {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
2806       {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2807       {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2808       {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2809 
2810       // Truncations on nxvmiN
2811       {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2812       {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2813       {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2814       {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2815       {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2816       {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2817       {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2818       {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2819       {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2820       {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2821       {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2822       {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2823       {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2824       {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2825       {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2826       {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2827       {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2828       {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2829       {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2830       {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2831       {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2832       {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2833       {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2834       {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2835       {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2836       {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2837       {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2838       {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2839       {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2840       {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2841       {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2842       {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2843       {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2844 
2845       // The number of shll instructions for the extension.
2846       {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2847       {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2848       {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2849       {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2850       {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2851       {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2852       {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2853       {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2854       {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2855       {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2856       {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2857       {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2858       {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2859       {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2860       {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2861       {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2862 
2863       // FP Ext and trunc
2864       {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1},     // fcvt
2865       {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2866       {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2867       //   FP16
2868       {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1},     // fcvt
2869       {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1},     // fcvt
2870       {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2871       {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2872       {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2873       {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2874       {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2875       //   BF16 (uses shift)
2876       {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1},     // shl
2877       {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2},     // shl+fcvt
2878       {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
2879       {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
2880       {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
2881       {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
2882       {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
2883       // FP Ext and trunc
2884       {ISD::FP_ROUND, MVT::f32, MVT::f64, 1},     // fcvt
2885       {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2886       {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2887       //   FP16
2888       {ISD::FP_ROUND, MVT::f16, MVT::f32, 1},     // fcvt
2889       {ISD::FP_ROUND, MVT::f16, MVT::f64, 1},     // fcvt
2890       {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2891       {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2892       {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2893       {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2894       {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2895       //   BF16 (more complex; the +bf16 case is handled above)
2896       {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
2897       {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
2898       {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
2899       {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
2900       {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
2901       {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
2902       {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
2903       {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
2904 
2905       // LowerVectorINT_TO_FP:
2906       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2907       {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2908       {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2909       {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2910       {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2911       {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2912 
2913       // Complex: to v2f32
2914       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2915       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2916       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2917       {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2918       {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2919       {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2920 
2921       // Complex: to v4f32
2922       {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2923       {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2924       {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2925       {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2926 
2927       // Complex: to v8f32
2928       {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2929       {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2930       {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2931       {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2932 
2933       // Complex: to v16f32
2934       {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2935       {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2936 
2937       // Complex: to v2f64
2938       {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2939       {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2940       {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2941       {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2942       {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2943       {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2944 
2945       // Complex: to v4f64
2946       {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2947       {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2948 
2949       // LowerVectorFP_TO_INT
2950       {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2951       {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2952       {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2953       {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2954       {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2955       {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2956 
2957       // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2958       {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2959       {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2960       {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2961       {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2962       {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2963       {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2964 
2965       // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2966       {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2967       {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2968       {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2969       {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2970 
2971       // Complex, from nxv2f32.
2972       {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2973       {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2974       {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2975       {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2976       {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2977       {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2978       {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2979       {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2980 
2981       // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2982       {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2983       {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2984       {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2985       {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2986       {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2987       {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2988 
2989       // Complex, from nxv2f64.
2990       {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2991       {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2992       {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2993       {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2994       {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2995       {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2996       {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2997       {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2998 
2999       // Complex, from nxv4f32.
3000       {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3001       {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3002       {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3003       {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3004       {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3005       {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3006       {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3007       {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3008 
3009       // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3010       {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3011       {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3012       {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3013       {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3014 
3015       // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3016       {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3017       {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3018       {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3019       {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3020       {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3021       {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3022 
3023       // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3024       {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3025       {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3026       {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3027       {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3028 
3029       // Complex, from nxv8f16.
3030       {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3031       {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3032       {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3033       {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3034       {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3035       {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3036       {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3037       {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3038 
3039       // Complex, from nxv4f16.
3040       {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3041       {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3042       {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3043       {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3044       {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3045       {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3046       {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3047       {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3048 
3049       // Complex, from nxv2f16.
3050       {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3051       {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3052       {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3053       {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3054       {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3055       {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3056       {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3057       {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3058 
3059       // Truncate from nxvmf32 to nxvmf16.
3060       {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3061       {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3062       {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3063 
3064       // Truncate from nxvmf64 to nxvmf16.
3065       {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3066       {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3067       {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3068 
3069       // Truncate from nxvmf64 to nxvmf32.
3070       {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3071       {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3072       {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3073 
3074       // Extend from nxvmf16 to nxvmf32.
3075       {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3076       {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3077       {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3078 
3079       // Extend from nxvmf16 to nxvmf64.
3080       {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3081       {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3082       {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3083 
3084       // Extend from nxvmf32 to nxvmf64.
3085       {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3086       {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3087       {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3088 
3089       // Bitcasts from float to integer
3090       {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3091       {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3092       {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3093 
3094       // Bitcasts from integer to float
3095       {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3096       {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3097       {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3098 
3099       // Add cost for extending to illegal (too wide) scalable vectors.
3100       // Zero/sign extends are implemented by multiple unpack operations,
3101       // where each operation has a cost of 1.
3102       {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3103       {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3104       {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3105       {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3106       {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3107       {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3108 
3109       {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3110       {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3111       {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3112       {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3113       {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3114       {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3115   };
3116 
3117   // When fixed-length vectors are lowered via SVE registers, estimate the cost
3118   // of the fixed-length operation as the number of SVE registers needed to
3119   // hold the fixed type times the cost of the equivalent scalable operation.
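  // As a rough example, assuming a 128-bit SVE register and an i32 element
  // type after legalization, a fixed-length v16i32 -> v16i16 trunc is costed
  // as LT.first copies of the equivalent nxv4i32 -> nxv4i16 trunc
  // (NumElements = 128 / 32 = 4).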
3120   EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3121   if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3122       SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3123       ST->useSVEForFixedLengthVectors(WiderTy)) {
3124     std::pair<InstructionCost, MVT> LT =
3125         getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3126     unsigned NumElements =
3127         AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3128     return AdjustCost(
3129         LT.first *
3130         getCastInstrCost(
3131             Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3132             ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3133             CostKind, I));
3134   }
3135 
3136   if (const auto *Entry = ConvertCostTableLookup(
3137           ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3138     return AdjustCost(Entry->Cost);
3139 
3140   static const TypeConversionCostTblEntry FP16Tbl[] = {
3141       {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3142       {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3143       {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3144       {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3145       {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3146       {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3147       {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3148       {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3149       {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3150       {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3151       {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3152       {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3153       {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3154       {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3155       {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3156       {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3157       {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3158       {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3159       {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // ushll + ucvtf
3160       {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // sshll + scvtf
3161       {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3162       {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3163   };
3164 
3165   if (ST->hasFullFP16())
3166     if (const auto *Entry = ConvertCostTableLookup(
3167             FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3168       return AdjustCost(Entry->Cost);
3169 
3170   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3171       CCH == TTI::CastContextHint::Masked &&
3172       ST->isSVEorStreamingSVEAvailable() &&
3173       TLI->getTypeAction(Src->getContext(), SrcTy) ==
3174           TargetLowering::TypePromoteInteger &&
3175       TLI->getTypeAction(Dst->getContext(), DstTy) ==
3176           TargetLowering::TypeSplitVector) {
3177     // The standard behaviour in the backend for these cases is to split the
3178     // extend up into two parts:
3179     //  1. Perform an extending load or masked load up to the legal type.
3180     //  2. Extend the loaded data to the final type.
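    // As a sketch under these type-action assumptions, a zext via a masked
    // load from <vscale x 8 x i8> to <vscale x 8 x i64> is costed as the
    // extend up to the promoted legal type (nxv8i8 -> nxv8i16, foldable into
    // the extending masked load) plus the nxv8i16 -> nxv8i64 extend.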
3181     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3182     Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3183     InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3184         Opcode, LegalTy, Src, CCH, CostKind, I);
3185     InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3186         Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3187     return Part1 + Part2;
3188   }
3189 
3190   // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3191   // but we also want to include the TTI::CastContextHint::Masked case.
3192   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3193       CCH == TTI::CastContextHint::Masked &&
3194       ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3195     CCH = TTI::CastContextHint::Normal;
3196 
3197   return AdjustCost(
3198       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3199 }
3200 
3201 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
3202                                                          Type *Dst,
3203                                                          VectorType *VecTy,
3204                                                          unsigned Index) {
3205 
3206   // Make sure we were given a valid extend opcode.
3207   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3208          "Invalid opcode");
3209 
3210   // We are extending an element we extract from a vector, so the source type
3211   // of the extend is the element type of the vector.
3212   auto *Src = VecTy->getElementType();
3213 
3214   // Sign- and zero-extends are for integer types only.
3215   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3216 
3217   // Get the cost for the extract. We compute the cost (if any) for the extend
3218   // below.
3219   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3220   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3221                                             CostKind, Index, nullptr, nullptr);
3222 
3223   // Legalize the types.
3224   auto VecLT = getTypeLegalizationCost(VecTy);
3225   auto DstVT = TLI->getValueType(DL, Dst);
3226   auto SrcVT = TLI->getValueType(DL, Src);
3227 
3228   // If the resulting type is still a vector and the destination type is legal,
3229   // we may get the extension for free. If not, get the default cost for the
3230   // extend.
3231   if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3232     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3233                                    CostKind);
3234 
3235   // The destination type should be larger than the element type. If not, get
3236   // the default cost for the extend.
3237   if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3238     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3239                                    CostKind);
3240 
3241   switch (Opcode) {
3242   default:
3243     llvm_unreachable("Opcode should be either SExt or ZExt");
3244 
3245   // For sign-extends, we only need a smov, which performs the extension
3246   // automatically.
3247   case Instruction::SExt:
3248     return Cost;
3249 
3250   // For zero-extends, the extend is performed automatically by a umov unless
3251   // the destination type is i64 and the element type is i8 or i16.
3252   case Instruction::ZExt:
3253     if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3254       return Cost;
3255   }
3256 
3257   // If we are unable to perform the extend for free, get the default cost.
3258   return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3259                                  CostKind);
3260 }
3261 
3262 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3263                                                TTI::TargetCostKind CostKind,
3264                                                const Instruction *I) {
3265   if (CostKind != TTI::TCK_RecipThroughput)
3266     return Opcode == Instruction::PHI ? 0 : 1;
3267   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3268   // Branches are assumed to be predicted.
3269   return 0;
3270 }
3271 
3272 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3273     unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3274     const Instruction *I, Value *Scalar,
3275     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3276   assert(Val->isVectorTy() && "This must be a vector type");
3277 
3278   if (Index != -1U) {
3279     // Legalize the type.
3280     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3281 
3282     // This type is legalized to a scalar type.
3283     if (!LT.second.isVector())
3284       return 0;
3285 
3286     // The type may be split. For fixed-width vectors we can normalize the
3287     // index to the new type.
3288     if (LT.second.isFixedLengthVector()) {
3289       unsigned Width = LT.second.getVectorNumElements();
3290       Index = Index % Width;
3291     }
3292 
3293     // The element at index zero is already inside the vector.
3294     // - For a physical (HasRealUse==true) insert-element or extract-element
3295     // instruction that extracts integers, an explicit FPR -> GPR move is
3296     // needed. So it has non-zero cost.
3297     // - For the rest of cases (virtual instruction or element type is float),
3298     // consider the instruction free.
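    // For example, extracting an i64 from lane 0 for a real use still needs a
    // move such as fmov x0, d0, whereas extracting a double from lane 0 is
    // treated as free since the value already lives in the low lane of the
    // vector register.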
3299     if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3300       return 0;
3301 
3302     // This recognises an LD1 (single-element structure to one lane of one
3303     // register) instruction. I.e., if this is an `insertelement` instruction
3304     // and its second operand is a load, then we will generate an LD1, which
3305     // is an expensive instruction.
3306     if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3307       return ST->getVectorInsertExtractBaseCost() + 1;
3308 
3309     // i1 inserts and extracts will include an extra cset or cmp of the vector
3310     // value. Increase the cost by 1 to account for this.
3311     if (Val->getScalarSizeInBits() == 1)
3312       return ST->getVectorInsertExtractBaseCost() + 1;
3313 
3314     // FIXME:
3315     // If the extract-element and insert-element instructions could be
3316     // simplified away (e.g., could be combined into users by looking at use-def
3317     // context), they have no cost. This is not done in the first place for
3318     // compile-time considerations.
3319   }
3320 
3321   // For Neon, if there is an extractelement from lane != 0 such that
3322   // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
3323   // 2. the extractelement result feeds into an fmul, and
3324   // 3. the other operand of the fmul is an extractelement from lane 0 or a
3325   //    lane equivalent to 0,
3326   // then the extractelement can be merged with the fmul in the backend and
3327   // it incurs no cost.
3328   // e.g.
3329   // define double @foo(<2 x double> %a) {
3330   //   %1 = extractelement <2 x double> %a, i32 0
3331   //   %2 = extractelement <2 x double> %a, i32 1
3332   //   %res = fmul double %1, %2
3333   //   ret double %res
3334   // }
3335   // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3336   auto ExtractCanFuseWithFmul = [&]() {
3337     // We bail out if the extract is from lane 0.
3338     if (Index == 0)
3339       return false;
3340 
3341     // Check if the scalar element type of the vector operand of ExtractElement
3342     // instruction is one of the allowed types.
3343     auto IsAllowedScalarTy = [&](const Type *T) {
3344       return T->isFloatTy() || T->isDoubleTy() ||
3345              (T->isHalfTy() && ST->hasFullFP16());
3346     };
3347 
3348     // Check if the extractelement user is scalar fmul.
3349     auto IsUserFMulScalarTy = [](const Value *EEUser) {
3350       // Check if the user is scalar fmul.
3351       const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3352       return BO && BO->getOpcode() == BinaryOperator::FMul &&
3353              !BO->getType()->isVectorTy();
3354     };
3355 
3356     // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3357     // certain scalar type and a certain vector register width.
3358     auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3359       auto RegWidth =
3360           getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3361               .getFixedValue();
3362       return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3363     };
3364 
3365     // Check if the type constraints on input vector type and result scalar type
3366     // of extractelement instruction are satisfied.
3367     if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3368       return false;
3369 
3370     if (Scalar) {
3371       DenseMap<User *, unsigned> UserToExtractIdx;
3372       for (auto *U : Scalar->users()) {
3373         if (!IsUserFMulScalarTy(U))
3374           return false;
3375         // Recording entry for the user is important. Index value is not
3376         // important.
3377         UserToExtractIdx[U];
3378       }
3379       if (UserToExtractIdx.empty())
3380         return false;
3381       for (auto &[S, U, L] : ScalarUserAndIdx) {
3382         for (auto *U : S->users()) {
3383           if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3384             auto *FMul = cast<BinaryOperator>(U);
3385             auto *Op0 = FMul->getOperand(0);
3386             auto *Op1 = FMul->getOperand(1);
3387             if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3388               UserToExtractIdx[U] = L;
3389               break;
3390             }
3391           }
3392         }
3393       }
3394       for (auto &[U, L] : UserToExtractIdx) {
3395         if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3396             !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3397           return false;
3398       }
3399     } else {
3400       const auto *EE = cast<ExtractElementInst>(I);
3401 
3402       const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3403       if (!IdxOp)
3404         return false;
3405 
3406       return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3407         if (!IsUserFMulScalarTy(U))
3408           return false;
3409 
3410         // Check if the other operand of extractelement is also extractelement
3411         // from lane equivalent to 0.
3412         const auto *BO = cast<BinaryOperator>(U);
3413         const auto *OtherEE = dyn_cast<ExtractElementInst>(
3414             BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3415         if (OtherEE) {
3416           const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3417           if (!IdxOp)
3418             return false;
3419           return IsExtractLaneEquivalentToZero(
3420               cast<ConstantInt>(OtherEE->getIndexOperand())
3421                   ->getValue()
3422                   .getZExtValue(),
3423               OtherEE->getType()->getScalarSizeInBits());
3424         }
3425         return true;
3426       });
3427     }
3428     return true;
3429   };
3430 
3431   if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3432       ExtractCanFuseWithFmul())
3433     return 0;
3434 
3435   // All other insert/extracts cost this much.
3436   return ST->getVectorInsertExtractBaseCost();
3437 }
3438 
3439 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3440                                                    TTI::TargetCostKind CostKind,
3441                                                    unsigned Index, Value *Op0,
3442                                                    Value *Op1) {
3443   bool HasRealUse =
3444       Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3445   return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3446 }
3447 
3448 InstructionCost AArch64TTIImpl::getVectorInstrCost(
3449     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3450     Value *Scalar,
3451     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3452   return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3453                                   ScalarUserAndIdx);
3454 }
3455 
3456 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3457                                                    Type *Val,
3458                                                    TTI::TargetCostKind CostKind,
3459                                                    unsigned Index) {
3460   return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3461                                   true /* HasRealUse */, &I);
3462 }
3463 
3464 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3465     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3466     TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
3467   if (isa<ScalableVectorType>(Ty))
3468     return InstructionCost::getInvalid();
3469   if (Ty->getElementType()->isFloatingPointTy())
3470     return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3471                                            CostKind);
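  // Worked example, assuming a base insert/extract cost of 2: demanding three
  // lanes with Insert only gives 3 * (1 + 0) * 2 = 6.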
3472   return DemandedElts.popcount() * (Insert + Extract) *
3473          ST->getVectorInsertExtractBaseCost();
3474 }
3475 
3476 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3477     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3478     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3479     ArrayRef<const Value *> Args,
3480     const Instruction *CxtI) {
3481 
3482   // The code-generator is currently not able to handle scalable vectors
3483   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3484   // it. This change will be removed when code-generation for these types is
3485   // sufficiently reliable.
3486   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3487     if (VTy->getElementCount() == ElementCount::getScalable(1))
3488       return InstructionCost::getInvalid();
3489 
3490   // TODO: Handle more cost kinds.
3491   if (CostKind != TTI::TCK_RecipThroughput)
3492     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3493                                          Op2Info, Args, CxtI);
3494 
3495   // Legalize the type.
3496   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3497   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3498 
3499   switch (ISD) {
3500   default:
3501     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3502                                          Op2Info);
3503   case ISD::SDIV:
3504     if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3505       // On AArch64, scalar signed division by a power-of-two constant is
3506       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3507       // The OperandValue properties may not be the same as those of the
3508       // previous operation; conservatively assume OP_None.
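      // For illustration, sdiv i32 %x, 4 is typically expanded to something
      // like:
      //   add  w8, w0, #3
      //   cmp  w0, #0
      //   csel w8, w8, w0, lt
      //   asr  w0, w8, #2
      // i.e. the ADD + CMP + SELECT + SRA sequence costed below.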
3509       InstructionCost Cost = getArithmeticInstrCost(
3510           Instruction::Add, Ty, CostKind,
3511           Op1Info.getNoProps(), Op2Info.getNoProps());
3512       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3513                                      Op1Info.getNoProps(), Op2Info.getNoProps());
3514       Cost += getArithmeticInstrCost(
3515           Instruction::Select, Ty, CostKind,
3516           Op1Info.getNoProps(), Op2Info.getNoProps());
3517       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3518                                      Op1Info.getNoProps(), Op2Info.getNoProps());
3519       return Cost;
3520     }
3521     [[fallthrough]];
3522   case ISD::UDIV: {
3523     auto VT = TLI->getValueType(DL, Ty);
3524     if (Op2Info.isConstant() && Op2Info.isUniform()) {
3525       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3526         // Vector signed division by a constant is expanded to the
3527         // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3528         // to MULHS + SUB + SRL + ADD + SRL.
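        // With unit costs for the mul, add and shift parts, the formula below
        // works out to roughly 2 + 2 + 2 + 1 = 7.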
3529         InstructionCost MulCost = getArithmeticInstrCost(
3530             Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3531         InstructionCost AddCost = getArithmeticInstrCost(
3532             Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3533         InstructionCost ShrCost = getArithmeticInstrCost(
3534             Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3535         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3536       }
3537     }
3538 
3539     // div i128's are lowered as libcalls.  Pass nullptr as (u)divti3 calls are
3540     // emitted by the backend even when those functions are not declared in the
3541     // module.
3542     if (!VT.isVector() && VT.getSizeInBits() > 64)
3543       return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3544 
3545     InstructionCost Cost = BaseT::getArithmeticInstrCost(
3546         Opcode, Ty, CostKind, Op1Info, Op2Info);
3547     if (Ty->isVectorTy()) {
3548       if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3549         // If SDIV/UDIV operations are lowered using SVE, then the cost can
3550         // be lower.
3551         if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3552                                                 ->getPrimitiveSizeInBits()
3553                                                 .getFixedValue() < 128) {
3554           EVT VT = TLI->getValueType(DL, Ty);
3555           static const CostTblEntry DivTbl[]{
3556               {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
3557               {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
3558               {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3559               {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
3560               {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
3561               {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3562 
3563           const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3564           if (nullptr != Entry)
3565             return Entry->Cost;
3566         }
3567         // For 8/16-bit elements, the cost is higher because the type
3568         // requires promotion and possibly splitting:
3569         if (LT.second.getScalarType() == MVT::i8)
3570           Cost *= 8;
3571         else if (LT.second.getScalarType() == MVT::i16)
3572           Cost *= 4;
3573         return Cost;
3574       } else {
3575         // If one of the operands is a uniform constant then the cost for each
3576         // element is the cost of insertion, extraction and division.
3577         // Insertion cost = 2, extraction cost = 2, division = cost of the
3578         // operation with the scalar type.
3579         if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3580             (Op2Info.isConstant() && Op2Info.isUniform())) {
3581           if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3582             InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3583                 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3584             return (4 + DivCost) * VTy->getNumElements();
3585           }
3586         }
3587         // On AArch64, without SVE, vector divisions are expanded
3588         // into scalar divisions of each pair of elements.
3589         Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3590                                        CostKind, Op1Info, Op2Info);
3591         Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3592                                        Op1Info, Op2Info);
3593       }
3594 
3595       // TODO: if one of the arguments is scalar, then it's not necessary to
3596       // double the cost of handling the vector elements.
3597       Cost += Cost;
3598     }
3599     return Cost;
3600   }
3601   case ISD::MUL:
3602     // When SVE is available, then we can lower the v2i64 operation using
3603     // the SVE mul instruction, which has a lower cost.
3604     if (LT.second == MVT::v2i64 && ST->hasSVE())
3605       return LT.first;
3606 
3607     // When SVE is not available, there is no MUL.2d instruction,
3608     // which means mul <2 x i64> is expensive as elements are extracted
3609     // from the vectors and the muls scalarized.
3610     // As getScalarizationOverhead is a bit too pessimistic, we
3611     // estimate the cost for a i64 vector directly here, which is:
3612     // - four 2-cost i64 extracts,
3613     // - two 2-cost i64 inserts, and
3614     // - two 1-cost muls.
3615     // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3616     // LT.first = 2 the cost is 28. If both operands are extensions it will not
3617     // need to scalarize so the cost can be cheaper (smull or umull).
3619     if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3620       return LT.first;
3621     return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
3622            (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
3623             getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
3624                                nullptr, nullptr) *
3625                 2 +
3626             getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
3627                                nullptr, nullptr));
3628   case ISD::ADD:
3629   case ISD::XOR:
3630   case ISD::OR:
3631   case ISD::AND:
3632   case ISD::SRL:
3633   case ISD::SRA:
3634   case ISD::SHL:
3635     // These nodes are marked as 'custom' for combining purposes only.
3636     // We know that they are legal. See LowerAdd in ISelLowering.
3637     return LT.first;
3638 
3639   case ISD::FNEG:
3640     // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
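    // e.g. -(x * y) on doubles can usually be selected as a single
    // fnmul d0, d0, d1, so the fneg is modelled as free here.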
3641     if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3642          (Ty->isHalfTy() && ST->hasFullFP16())) &&
3643         CxtI &&
3644         ((CxtI->hasOneUse() &&
3645           match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3646          match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3647       return 0;
3648     [[fallthrough]];
3649   case ISD::FADD:
3650   case ISD::FSUB:
3651     // Increase the cost for half and bfloat types if not architecturally
3652     // supported.
3653     if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3654         (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3655       return 2 * LT.first;
3656     if (!Ty->getScalarType()->isFP128Ty())
3657       return LT.first;
3658     [[fallthrough]];
3659   case ISD::FMUL:
3660   case ISD::FDIV:
3661     // These nodes are marked as 'custom' just to lower them to SVE.
3662     // We know said lowering will incur no additional cost.
3663     if (!Ty->getScalarType()->isFP128Ty())
3664       return 2 * LT.first;
3665 
3666     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3667                                          Op2Info);
3668   case ISD::FREM:
3669     // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3670     // those functions are not declared in the module.
3671     if (!Ty->isVectorTy())
3672       return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3673     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3674                                          Op2Info);
3675   }
3676 }
3677 
3678 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3679                                                           ScalarEvolution *SE,
3680                                                           const SCEV *Ptr) {
3681   // Address computations in vectorized code with non-consecutive addresses will
3682   // likely result in more instructions compared to scalar code where the
3683   // computation can more often be merged into the index mode. The resulting
3684   // extra micro-ops can significantly decrease throughput.
3685   unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3686   int MaxMergeDistance = 64;
3687 
3688   if (Ty->isVectorTy() && SE &&
3689       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3690     return NumVectorInstToHideOverhead;
3691 
3692   // In many cases the address computation is not merged into the instruction
3693   // addressing mode.
3694   return 1;
3695 }
3696 
3697 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
3698     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3699     TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3700     TTI::OperandValueInfo Op2Info, const Instruction *I) {
3701   // TODO: Handle other cost kinds.
3702   if (CostKind != TTI::TCK_RecipThroughput)
3703     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3704                                      Op1Info, Op2Info, I);
3705 
3706   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3707   // We don't lower some vector selects well when they are wider than the
3708   // register width.
3709   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3710     // We would need this many instructions to hide the scalarization happening.
3711     const int AmortizationCost = 20;
3712 
3713     // If VecPred is not set, check if we can get a predicate from the context
3714     // instruction, if its type matches the requested ValTy.
3715     if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3716       CmpPredicate CurrentPred;
3717       if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3718                             m_Value())))
3719         VecPred = CurrentPred;
3720     }
3721     // Check if we have a compare/select chain that can be lowered using
3722     // a (F)CMxx & BFI pair.
3723     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3724         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3725         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3726         VecPred == CmpInst::FCMP_UNE) {
3727       static const auto ValidMinMaxTys = {
3728           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3729           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3730       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3731 
3732       auto LT = getTypeLegalizationCost(ValTy);
3733       if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3734           (ST->hasFullFP16() &&
3735            any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3736         return LT.first;
3737     }
3738 
3739     static const TypeConversionCostTblEntry
3740     VectorSelectTbl[] = {
3741       { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3742       { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3743       { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3744       { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3745       { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3746       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3747       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3748       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3749       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3750       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3751       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3752     };
3753 
3754     EVT SelCondTy = TLI->getValueType(DL, CondTy);
3755     EVT SelValTy = TLI->getValueType(DL, ValTy);
3756     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3757       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3758                                                      SelCondTy.getSimpleVT(),
3759                                                      SelValTy.getSimpleVT()))
3760         return Entry->Cost;
3761     }
3762   }
3763 
3764   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3765     auto LT = getTypeLegalizationCost(ValTy);
3766     // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3767     if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3768       return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3769   }
3770 
3771   // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3772   // FIXME: This can apply to more conditions and add/sub if it can be shown to
3773   // be profitable.
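       // A sketch of the pattern being matched here, e.g.
       //   %a = and i64 %x, %y
       //   %c = icmp eq i64 %a, 0
       // can use a flag-setting ands/tst, so the icmp itself is modelled as
       // free.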
3774   if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3775       ICmpInst::isEquality(VecPred) &&
3776       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3777       match(I->getOperand(1), m_Zero()) &&
3778       match(I->getOperand(0), m_And(m_Value(), m_Value())))
3779     return 0;
3780 
3781   // The base case handles scalable vectors fine for now, since it treats the
3782   // cost as 1 * legalization cost.
3783   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3784                                    Op1Info, Op2Info, I);
3785 }
3786 
3787 AArch64TTIImpl::TTI::MemCmpExpansionOptions
3788 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3789   TTI::MemCmpExpansionOptions Options;
3790   if (ST->requiresStrictAlign()) {
3791     // TODO: Add cost modeling for strict align. Misaligned loads expand to
3792     // a bunch of instructions when strict align is enabled.
3793     return Options;
3794   }
3795   Options.AllowOverlappingLoads = true;
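       // With overlapping loads, e.g. a 15-byte memcmp can be expanded as an
       // 8-byte load at offset 0 plus an 8-byte load at offset 7 instead of
       // falling back to a libcall.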
3796   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3797   Options.NumLoadsPerBlock = Options.MaxNumLoads;
3798   // TODO: Though vector loads usually perform well on AArch64, on some targets
3799   // they may wake up the FP unit, which raises the power consumption. Perhaps
3800   // they could be used with no holds barred (-O3).
3801   Options.LoadSizes = {8, 4, 2, 1};
3802   Options.AllowedTailExpansions = {3, 5, 6};
3803   return Options;
3804 }
3805 
3806 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3807   return ST->hasSVE();
3808 }
3809 
3810 InstructionCost
3811 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3812                                       Align Alignment, unsigned AddressSpace,
3813                                       TTI::TargetCostKind CostKind) {
3814   if (useNeonVector(Src))
3815     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3816                                         CostKind);
3817   auto LT = getTypeLegalizationCost(Src);
3818   if (!LT.first.isValid())
3819     return InstructionCost::getInvalid();
3820 
3821   // Return an invalid cost for element types that we are unable to lower.
3822   auto *VT = cast<VectorType>(Src);
3823   if (VT->getElementType()->isIntegerTy(1))
3824     return InstructionCost::getInvalid();
3825 
3826   // The code-generator is currently not able to handle scalable vectors
3827   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3828   // it. This change will be removed when code-generation for these types is
3829   // sufficiently reliable.
3830   if (VT->getElementCount() == ElementCount::getScalable(1))
3831     return InstructionCost::getInvalid();
3832 
3833   return LT.first;
3834 }
3835 
3836 // This function returns the gather/scatter overhead, either from the
3837 // user-provided value or from the per-target specialized value in \p ST.
3838 static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3839                                             const AArch64Subtarget *ST) {
3840   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3841          "Should only be called on loads or stores.");
3842   switch (Opcode) {
3843   case Instruction::Load:
3844     if (SVEGatherOverhead.getNumOccurrences() > 0)
3845       return SVEGatherOverhead;
3846     return ST->getGatherOverhead();
3847     break;
3848   case Instruction::Store:
3849     if (SVEScatterOverhead.getNumOccurrences() > 0)
3850       return SVEScatterOverhead;
3851     return ST->getScatterOverhead();
3852     break;
3853   default:
3854     llvm_unreachable("Shouldn't have reached here");
3855   }
3856 }
3857 
3858 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3859     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3860     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3861   if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3862     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3863                                          Alignment, CostKind, I);
3864   auto *VT = cast<VectorType>(DataTy);
3865   auto LT = getTypeLegalizationCost(DataTy);
3866   if (!LT.first.isValid())
3867     return InstructionCost::getInvalid();
3868 
3869   // Return an invalid cost for element types that we are unable to lower.
3870   if (!LT.second.isVector() ||
3871       !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3872       VT->getElementType()->isIntegerTy(1))
3873     return InstructionCost::getInvalid();
3874 
3875   // The code-generator is currently not able to handle scalable vectors
3876   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3877   // it. This change will be removed when code-generation for these types is
3878   // sufficiently reliable.
3879   if (VT->getElementCount() == ElementCount::getScalable(1))
3880     return InstructionCost::getInvalid();
3881 
3882   ElementCount LegalVF = LT.second.getVectorElementCount();
3883   InstructionCost MemOpCost =
3884       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3885                       {TTI::OK_AnyValue, TTI::OP_None}, I);
3886   // Add on an overhead cost for using gathers/scatters.
3887   MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
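       // The overall cost is therefore the scalar memory-op cost, scaled by
       // the gather/scatter overhead and by the maximum number of elements the
       // legalized vector type can hold, times LT.first legalization steps.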
3888   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3889 }
3890 
3891 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3892   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3893 }
3894 
3895 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3896                                                 MaybeAlign Alignment,
3897                                                 unsigned AddressSpace,
3898                                                 TTI::TargetCostKind CostKind,
3899                                                 TTI::OperandValueInfo OpInfo,
3900                                                 const Instruction *I) {
3901   EVT VT = TLI->getValueType(DL, Ty, true);
3902   // Type legalization can't handle structs
3903   if (VT == MVT::Other)
3904     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3905                                   CostKind);
3906 
3907   auto LT = getTypeLegalizationCost(Ty);
3908   if (!LT.first.isValid())
3909     return InstructionCost::getInvalid();
3910 
3911   // The code-generator is currently not able to handle scalable vectors
3912   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3913   // it. This change will be removed when code-generation for these types is
3914   // sufficiently reliable.
3915   // We also only support full register predicate loads and stores.
3916   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3917     if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3918         (VTy->getElementType()->isIntegerTy(1) &&
3919          !VTy->getElementCount().isKnownMultipleOf(
3920              ElementCount::getScalable(16))))
3921       return InstructionCost::getInvalid();
3922 
3923   // TODO: consider latency as well for TCK_SizeAndLatency.
3924   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3925     return LT.first;
3926 
3927   if (CostKind != TTI::TCK_RecipThroughput)
3928     return 1;
3929 
3930   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3931       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3932     // Unaligned stores are extremely inefficient. We don't split all
3933     // unaligned 128-bit stores because of the negative impact that has been
3934     // shown in practice on inlined block copy code.
3935     // We make such stores expensive so that we will only vectorize if there
3936     // are 6 other instructions getting vectorized.
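         // For example, an unaligned store of <4 x i32> (LT.first == 1) is
         // costed at 1 * 2 * 6 = 12 below.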
3937     const int AmortizationCost = 6;
3938 
3939     return LT.first * 2 * AmortizationCost;
3940   }
3941 
3942   // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3943   if (Ty->isPtrOrPtrVectorTy())
3944     return LT.first;
3945 
3946   if (useNeonVector(Ty)) {
3947     // Check truncating stores and extending loads.
3948     if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3949       // v4i8 types are lowered to a scalar load/store and sshll/xtn.
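           // E.g. an extending load of <4 x i8> is roughly a 32-bit scalar
           // load followed by an sshll/ushll to widen the lanes, and the
           // truncating store direction is an xtn followed by a 32-bit scalar
           // store, hence the cost of 2.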
3950       if (VT == MVT::v4i8)
3951         return 2;
3952       // Otherwise we need to scalarize.
3953       return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3954     }
3955     EVT EltVT = VT.getVectorElementType();
3956     unsigned EltSize = EltVT.getScalarSizeInBits();
3957     if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3958         VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3959         *Alignment != Align(1))
3960       return LT.first;
3961     // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3962     // widening to v4i8, which produces suboptimal results.
3963     if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3964       return LT.first;
3965 
3966     // Check non-power-of-2 loads/stores for legal vector element types with
3967     // NEON. Non-power-of-2 memory ops will get broken down to a set of
3968     // operations on smaller power-of-2 ops, including ld1/st1.
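         // For example, a v7i8 access is split into v4i8 + v3i8, and the v3i8
         // part is split again into v2i8 + v1i8, for a total cost of 3.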
3969     LLVMContext &C = Ty->getContext();
3970     InstructionCost Cost(0);
3971     SmallVector<EVT> TypeWorklist;
3972     TypeWorklist.push_back(VT);
3973     while (!TypeWorklist.empty()) {
3974       EVT CurrVT = TypeWorklist.pop_back_val();
3975       unsigned CurrNumElements = CurrVT.getVectorNumElements();
3976       if (isPowerOf2_32(CurrNumElements)) {
3977         Cost += 1;
3978         continue;
3979       }
3980 
3981       unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3982       TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3983       TypeWorklist.push_back(
3984           EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3985     }
3986     return Cost;
3987   }
3988 
3989   return LT.first;
3990 }
3991 
3992 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3993     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3994     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3995     bool UseMaskForCond, bool UseMaskForGaps) {
3996   assert(Factor >= 2 && "Invalid interleave factor");
3997   auto *VecVTy = cast<VectorType>(VecTy);
3998 
3999   if (VecTy->isScalableTy() && !ST->hasSVE())
4000     return InstructionCost::getInvalid();
4001 
4002   // Vectorization for masked interleaved accesses is only enabled for scalable
4003   // VF.
4004   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4005     return InstructionCost::getInvalid();
4006 
4007   if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4008     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4009     auto *SubVecTy =
4010         VectorType::get(VecVTy->getElementType(),
4011                         VecVTy->getElementCount().divideCoefficientBy(Factor));
4012 
4013     // ldN/stN only support legal vector types of size 64 or 128 in bits.
4014     // Accesses having vector types that are a multiple of 128 bits can be
4015     // matched to more than one ldN/stN instruction.
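         // For example, a factor-2 interleaved access of <8 x i32> uses a
         // <4 x i32> sub-vector type, which maps to a single ld2/st2, giving a
         // cost of 2 * 1 = 2 below.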
4016     bool UseScalable;
4017     if (MinElts % Factor == 0 &&
4018         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4019       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4020   }
4021 
4022   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4023                                            Alignment, AddressSpace, CostKind,
4024                                            UseMaskForCond, UseMaskForGaps);
4025 }
4026 
4027 InstructionCost
4028 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
4029   InstructionCost Cost = 0;
4030   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4031   for (auto *I : Tys) {
4032     if (!I->isVectorTy())
4033       continue;
4034     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4035         128)
4036       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4037               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4038   }
4039   return Cost;
4040 }
4041 
4042 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
4043   return ST->getMaxInterleaveFactor();
4044 }
4045 
4046 // For Falkor, we want to avoid having too many strided loads in a loop since
4047 // that can exhaust the HW prefetcher resources.  We adjust the unroller
4048 // MaxCount preference below to attempt to ensure unrolling doesn't create too
4049 // many strided loads.
4050 static void
4051 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4052                               TargetTransformInfo::UnrollingPreferences &UP) {
4053   enum { MaxStridedLoads = 7 };
4054   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4055     int StridedLoads = 0;
4056     // FIXME? We could make this more precise by looking at the CFG and
4057     // e.g. not counting loads in each side of an if-then-else diamond.
4058     for (const auto BB : L->blocks()) {
4059       for (auto &I : *BB) {
4060         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4061         if (!LMemI)
4062           continue;
4063 
4064         Value *PtrValue = LMemI->getPointerOperand();
4065         if (L->isLoopInvariant(PtrValue))
4066           continue;
4067 
4068         const SCEV *LSCEV = SE.getSCEV(PtrValue);
4069         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4070         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4071           continue;
4072 
4073         // FIXME? We could take pairing of unrolled load copies into account
4074         // by looking at the AddRec, but we would probably have to limit this
4075         // to loops with no stores or other memory optimization barriers.
4076         ++StridedLoads;
4077         // We've seen enough strided loads that seeing more won't make a
4078         // difference.
4079         if (StridedLoads > MaxStridedLoads / 2)
4080           return StridedLoads;
4081       }
4082     }
4083     return StridedLoads;
4084   };
4085 
4086   int StridedLoads = countStridedLoads(L, SE);
4087   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4088                     << " strided loads\n");
4089   // Pick the largest power of 2 unroll count that won't result in too many
4090   // strided loads.
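       // E.g. with MaxStridedLoads = 7, a loop with a single strided load gets
       // MaxCount = 1 << Log2_32(7) = 4, and one with two strided loads gets
       // MaxCount = 2.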
4091   if (StridedLoads) {
4092     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4093     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4094                       << UP.MaxCount << '\n');
4095   }
4096 }
4097 
4098 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4099 /// OOO engine's wide instruction window and various predictors.
4100 static void
4101 getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4102                                  TargetTransformInfo::UnrollingPreferences &UP,
4103                                  AArch64TTIImpl &TTI) {
4104   // Limit runtime unrolling to loops whose structure is highly likely to
4105   // benefit; that is, we exclude outer loops, loops with multiple exits and
4106   // many blocks (i.e. likely with complex control flow). Note that the
4107   // heuristics here may be overly conservative and we err on the side of
4108   // avoiding runtime unrolling rather than unrolling excessively. They are all
4109   // subject to further refinement.
4110   if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4111     return;
4112 
4113   const SCEV *BTC = SE.getBackedgeTakenCount(L);
4114   if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4115       (SE.getSmallConstantMaxTripCount(L) > 0 &&
4116        SE.getSmallConstantMaxTripCount(L) <= 32))
4117     return;
4118   if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4119     return;
4120 
4121   int64_t Size = 0;
4122   for (auto *BB : L->getBlocks()) {
4123     for (auto &I : *BB) {
4124       if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4125         return;
4126       SmallVector<const Value *, 4> Operands(I.operand_values());
4127       Size +=
4128           *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4129     }
4130   }
4131 
4132   // Limit to loops with trip counts that are cheap to expand.
4133   UP.SCEVExpansionBudget = 1;
4134 
4135   // Try to unroll small, single-block loops, if they have load/store
4136   // dependencies, to expose more parallel memory access streams.
4137   BasicBlock *Header = L->getHeader();
4138   if (Header == L->getLoopLatch()) {
4139     if (Size > 8)
4140       return;
4141 
4142     SmallPtrSet<Value *, 8> LoadedValues;
4143     SmallVector<StoreInst *> Stores;
4144     for (auto *BB : L->blocks()) {
4145       for (auto &I : *BB) {
4146         Value *Ptr = getLoadStorePointerOperand(&I);
4147         if (!Ptr)
4148           continue;
4149         const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4150         if (SE.isLoopInvariant(PtrSCEV, L))
4151           continue;
4152         if (isa<LoadInst>(&I))
4153           LoadedValues.insert(&I);
4154         else
4155           Stores.push_back(cast<StoreInst>(&I));
4156       }
4157     }
4158 
4159     // Try to find an unroll count that maximizes the use of the instruction
4160     // window, i.e. trying to fetch as many instructions per cycle as possible.
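         // For example, with a loop body of Size = 6 the search below settles
         // on BestUC = 8, since 8 * 6 = 48 is a multiple of MaxInstsPerLine.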
4161     unsigned MaxInstsPerLine = 16;
4162     unsigned UC = 1;
4163     unsigned BestUC = 1;
4164     unsigned SizeWithBestUC = BestUC * Size;
4165     while (UC <= 8) {
4166       unsigned SizeWithUC = UC * Size;
4167       if (SizeWithUC > 48)
4168         break;
4169       if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4170           (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4171         BestUC = UC;
4172         SizeWithBestUC = BestUC * Size;
4173       }
4174       UC++;
4175     }
4176 
4177     if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4178           return LoadedValues.contains(SI->getOperand(0));
4179         }))
4180       return;
4181 
4182     UP.Runtime = true;
4183     UP.DefaultUnrollRuntimeCount = BestUC;
4184     return;
4185   }
4186 
4187   // Try to runtime-unroll loops with early-continues depending on loop-varying
4188   // loads; this helps with branch-prediction for the early-continues.
4189   auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4190   auto *Latch = L->getLoopLatch();
4191   SmallVector<BasicBlock *> Preds(predecessors(Latch));
4192   if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4193       none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4194       none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
4195     return;
4196 
4197   std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4198       [&](Instruction *I, unsigned Depth) -> bool {
4199     if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4200       return false;
4201 
4202     if (isa<LoadInst>(I))
4203       return true;
4204 
4205     return any_of(I->operands(), [&](Value *V) {
4206       auto *I = dyn_cast<Instruction>(V);
4207       return I && DependsOnLoopLoad(I, Depth + 1);
4208     });
4209   };
4210   CmpPredicate Pred;
4211   Instruction *I;
4212   if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4213                        m_Value())) &&
4214       DependsOnLoopLoad(I, 0)) {
4215     UP.Runtime = true;
4216   }
4217 }
4218 
4219 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4220                                              TTI::UnrollingPreferences &UP,
4221                                              OptimizationRemarkEmitter *ORE) {
4222   // Enable partial unrolling and runtime unrolling.
4223   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4224 
4225   UP.UpperBound = true;
4226 
4227   // An inner loop is more likely to be hot, and the runtime check can be
4228   // hoisted out by the LICM pass, so the overhead is lower; try a larger
4229   // threshold to unroll more loops.
4230   if (L->getLoopDepth() > 1)
4231     UP.PartialThreshold *= 2;
4232 
4233   // Disable partial & runtime unrolling on -Os.
4234   UP.PartialOptSizeThreshold = 0;
4235 
4236   // Apply subtarget-specific unrolling preferences.
4237   switch (ST->getProcFamily()) {
4238   case AArch64Subtarget::AppleA14:
4239   case AArch64Subtarget::AppleA15:
4240   case AArch64Subtarget::AppleA16:
4241   case AArch64Subtarget::AppleM4:
4242     getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4243     break;
4244   case AArch64Subtarget::Falkor:
4245     if (EnableFalkorHWPFUnrollFix)
4246       getFalkorUnrollingPreferences(L, SE, UP);
4247     break;
4248   default:
4249     break;
4250   }
4251 
4252   // Scan the loop: don't unroll loops with calls as this could prevent
4253   // inlining. Don't unroll vector loops either, as they don't benefit much from
4254   // unrolling.
4255   for (auto *BB : L->getBlocks()) {
4256     for (auto &I : *BB) {
4257       // Don't unroll vectorized loops.
4258       if (I.getType()->isVectorTy())
4259         return;
4260 
4261       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4262         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4263           if (!isLoweredToCall(F))
4264             continue;
4265         }
4266         return;
4267       }
4268     }
4269   }
4270 
4271   // Enable runtime unrolling for in-order models.
4272   // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4273   // checking for that case, we can ensure that the default behaviour is
4274   // unchanged.
4275   if (ST->getProcFamily() != AArch64Subtarget::Others &&
4276       !ST->getSchedModel().isOutOfOrder()) {
4277     UP.Runtime = true;
4278     UP.Partial = true;
4279     UP.UnrollRemainder = true;
4280     UP.DefaultUnrollRuntimeCount = 4;
4281 
4282     UP.UnrollAndJam = true;
4283     UP.UnrollAndJamInnerLoopThreshold = 60;
4284   }
4285 }
4286 
4287 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
4288                                            TTI::PeelingPreferences &PP) {
4289   BaseT::getPeelingPreferences(L, SE, PP);
4290 }
4291 
4292 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4293                                                          Type *ExpectedType) {
4294   switch (Inst->getIntrinsicID()) {
4295   default:
4296     return nullptr;
4297   case Intrinsic::aarch64_neon_st2:
4298   case Intrinsic::aarch64_neon_st3:
4299   case Intrinsic::aarch64_neon_st4: {
4300     // Create a struct type
4301     StructType *ST = dyn_cast<StructType>(ExpectedType);
4302     if (!ST)
4303       return nullptr;
4304     unsigned NumElts = Inst->arg_size() - 1;
4305     if (ST->getNumElements() != NumElts)
4306       return nullptr;
4307     for (unsigned i = 0, e = NumElts; i != e; ++i) {
4308       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4309         return nullptr;
4310     }
4311     Value *Res = PoisonValue::get(ExpectedType);
4312     IRBuilder<> Builder(Inst);
4313     for (unsigned i = 0, e = NumElts; i != e; ++i) {
4314       Value *L = Inst->getArgOperand(i);
4315       Res = Builder.CreateInsertValue(Res, L, i);
4316     }
4317     return Res;
4318   }
4319   case Intrinsic::aarch64_neon_ld2:
4320   case Intrinsic::aarch64_neon_ld3:
4321   case Intrinsic::aarch64_neon_ld4:
4322     if (Inst->getType() == ExpectedType)
4323       return Inst;
4324     return nullptr;
4325   }
4326 }
4327 
4328 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
4329                                         MemIntrinsicInfo &Info) {
4330   switch (Inst->getIntrinsicID()) {
4331   default:
4332     break;
4333   case Intrinsic::aarch64_neon_ld2:
4334   case Intrinsic::aarch64_neon_ld3:
4335   case Intrinsic::aarch64_neon_ld4:
4336     Info.ReadMem = true;
4337     Info.WriteMem = false;
4338     Info.PtrVal = Inst->getArgOperand(0);
4339     break;
4340   case Intrinsic::aarch64_neon_st2:
4341   case Intrinsic::aarch64_neon_st3:
4342   case Intrinsic::aarch64_neon_st4:
4343     Info.ReadMem = false;
4344     Info.WriteMem = true;
4345     Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4346     break;
4347   }
4348 
4349   switch (Inst->getIntrinsicID()) {
4350   default:
4351     return false;
4352   case Intrinsic::aarch64_neon_ld2:
4353   case Intrinsic::aarch64_neon_st2:
4354     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4355     break;
4356   case Intrinsic::aarch64_neon_ld3:
4357   case Intrinsic::aarch64_neon_st3:
4358     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4359     break;
4360   case Intrinsic::aarch64_neon_ld4:
4361   case Intrinsic::aarch64_neon_st4:
4362     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4363     break;
4364   }
4365   return true;
4366 }
4367 
4368 /// See if \p I should be considered for address type promotion. We check if \p
4369 /// I is a sext with the right type and used in memory accesses. If it is used
4370 /// in a "complex" getelementptr, we allow it to be promoted without finding
4371 /// other sext instructions that sign extended the same initial value. A
4372 /// getelementptr is considered as "complex" if it has more than 2 operands.
4373 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
4374     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4375   bool Considerable = false;
4376   AllowPromotionWithoutCommonHeader = false;
4377   if (!isa<SExtInst>(&I))
4378     return false;
4379   Type *ConsideredSExtType =
4380       Type::getInt64Ty(I.getParent()->getParent()->getContext());
4381   if (I.getType() != ConsideredSExtType)
4382     return false;
4383   // See if the sext is the one with the right type and used in at least one
4384   // GetElementPtrInst.
4385   for (const User *U : I.users()) {
4386     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4387       Considerable = true;
4388       // A getelementptr is considered as "complex" if it has more than 2
4389       // operands. We will promote a SExt used in such a complex GEP as we
4390       // expect some computation to be merged if it is done on 64 bits.
4391       if (GEPInst->getNumOperands() > 2) {
4392         AllowPromotionWithoutCommonHeader = true;
4393         break;
4394       }
4395     }
4396   }
4397   return Considerable;
4398 }
4399 
4400 bool AArch64TTIImpl::isLegalToVectorizeReduction(
4401     const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4402   if (!VF.isScalable())
4403     return true;
4404 
4405   Type *Ty = RdxDesc.getRecurrenceType();
4406   if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
4407     return false;
4408 
4409   switch (RdxDesc.getRecurrenceKind()) {
4410   case RecurKind::Add:
4411   case RecurKind::FAdd:
4412   case RecurKind::And:
4413   case RecurKind::Or:
4414   case RecurKind::Xor:
4415   case RecurKind::SMin:
4416   case RecurKind::SMax:
4417   case RecurKind::UMin:
4418   case RecurKind::UMax:
4419   case RecurKind::FMin:
4420   case RecurKind::FMax:
4421   case RecurKind::FMulAdd:
4422   case RecurKind::IAnyOf:
4423   case RecurKind::FAnyOf:
4424     return true;
4425   default:
4426     return false;
4427   }
4428 }
4429 
4430 InstructionCost
4431 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
4432                                        FastMathFlags FMF,
4433                                        TTI::TargetCostKind CostKind) {
4434   // The code-generator is currently not able to handle scalable vectors
4435   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4436   // it. This change will be removed when code-generation for these types is
4437   // sufficiently reliable.
4438   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4439     if (VTy->getElementCount() == ElementCount::getScalable(1))
4440       return InstructionCost::getInvalid();
4441 
4442   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4443 
4444   if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4445     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4446 
4447   InstructionCost LegalizationCost = 0;
4448   if (LT.first > 1) {
4449     Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4450     IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4451     LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4452   }
4453 
4454   return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4455 }
4456 
4457 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
4458     unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4459   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4460   InstructionCost LegalizationCost = 0;
4461   if (LT.first > 1) {
4462     Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4463     LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4464     LegalizationCost *= LT.first - 1;
4465   }
4466 
4467   int ISD = TLI->InstructionOpcodeToISD(Opcode);
4468   assert(ISD && "Invalid opcode");
4469   // Add the final reduction cost for the legal horizontal reduction
4470   switch (ISD) {
4471   case ISD::ADD:
4472   case ISD::AND:
4473   case ISD::OR:
4474   case ISD::XOR:
4475   case ISD::FADD:
4476     return LegalizationCost + 2;
4477   default:
4478     return InstructionCost::getInvalid();
4479   }
4480 }
4481 
4482 InstructionCost
4483 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4484                                            std::optional<FastMathFlags> FMF,
4485                                            TTI::TargetCostKind CostKind) {
4486   // The code-generator is currently not able to handle scalable vectors
4487   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4488   // it. This change will be removed when code-generation for these types is
4489   // sufficiently reliable.
4490   if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4491     if (VTy->getElementCount() == ElementCount::getScalable(1))
4492       return InstructionCost::getInvalid();
4493 
4494   if (TTI::requiresOrderedReduction(FMF)) {
4495     if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4496       InstructionCost BaseCost =
4497           BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4498       // Add on extra cost to reflect the extra overhead on some CPUs. We still
4499       // end up vectorizing for more computationally intensive loops.
4500       return BaseCost + FixedVTy->getNumElements();
4501     }
4502 
4503     if (Opcode != Instruction::FAdd)
4504       return InstructionCost::getInvalid();
4505 
4506     auto *VTy = cast<ScalableVectorType>(ValTy);
4507     InstructionCost Cost =
4508         getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4509     Cost *= getMaxNumElements(VTy->getElementCount());
4510     return Cost;
4511   }
4512 
4513   if (isa<ScalableVectorType>(ValTy))
4514     return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4515 
4516   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4517   MVT MTy = LT.second;
4518   int ISD = TLI->InstructionOpcodeToISD(Opcode);
4519   assert(ISD && "Invalid opcode");
4520 
4521   // Horizontal adds can use the 'addv' instruction. We model the cost of these
4522   // instructions as twice a normal vector add, plus 1 for each legalization
4523   // step (LT.first). This is the only arithmetic vector reduction operation for
4524   // which we have an instruction.
4525   // OR, XOR and AND costs should match the codegen from:
4526   // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4527   // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4528   // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4529   static const CostTblEntry CostTblNoPairwise[]{
4530       {ISD::ADD, MVT::v8i8,   2},
4531       {ISD::ADD, MVT::v16i8,  2},
4532       {ISD::ADD, MVT::v4i16,  2},
4533       {ISD::ADD, MVT::v8i16,  2},
4534       {ISD::ADD, MVT::v4i32,  2},
4535       {ISD::ADD, MVT::v2i64,  2},
4536       {ISD::OR,  MVT::v8i8,  15},
4537       {ISD::OR,  MVT::v16i8, 17},
4538       {ISD::OR,  MVT::v4i16,  7},
4539       {ISD::OR,  MVT::v8i16,  9},
4540       {ISD::OR,  MVT::v2i32,  3},
4541       {ISD::OR,  MVT::v4i32,  5},
4542       {ISD::OR,  MVT::v2i64,  3},
4543       {ISD::XOR, MVT::v8i8,  15},
4544       {ISD::XOR, MVT::v16i8, 17},
4545       {ISD::XOR, MVT::v4i16,  7},
4546       {ISD::XOR, MVT::v8i16,  9},
4547       {ISD::XOR, MVT::v2i32,  3},
4548       {ISD::XOR, MVT::v4i32,  5},
4549       {ISD::XOR, MVT::v2i64,  3},
4550       {ISD::AND, MVT::v8i8,  15},
4551       {ISD::AND, MVT::v16i8, 17},
4552       {ISD::AND, MVT::v4i16,  7},
4553       {ISD::AND, MVT::v8i16,  9},
4554       {ISD::AND, MVT::v2i32,  3},
4555       {ISD::AND, MVT::v4i32,  5},
4556       {ISD::AND, MVT::v2i64,  3},
4557   };
4558   switch (ISD) {
4559   default:
4560     break;
4561   case ISD::FADD:
4562     if (Type *EltTy = ValTy->getScalarType();
4563         // FIXME: For half types without fullfp16 support, this could extend and
4564         // use an fp32 faddp reduction but current codegen unrolls.
4565         MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4566                            (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4567       const unsigned NElts = MTy.getVectorNumElements();
4568       if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4569           isPowerOf2_32(NElts))
4570         // Reduction corresponding to series of fadd instructions is lowered to
4571         // series of faddp instructions. faddp has latency/throughput that
4572         // matches fadd instruction and hence, every faddp instruction can be
4573         // considered to have a relative cost = 1 with
4574         // CostKind = TCK_RecipThroughput.
4575         // An faddp will pairwise add vector elements, so the size of input
4576         // vector reduces by half every time, requiring
4577         // #(faddp instructions) = log2_32(NElts).
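             // E.g. a v8f32 fadd reduction legalizes to two v4f32 halves
             // (LT.first == 2), giving (2 - 1) + Log2_32(4) = 3.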
4578         return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4579     }
4580     break;
4581   case ISD::ADD:
4582     if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4583       return (LT.first - 1) + Entry->Cost;
4584     break;
4585   case ISD::XOR:
4586   case ISD::AND:
4587   case ISD::OR:
4588     const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4589     if (!Entry)
4590       break;
4591     auto *ValVTy = cast<FixedVectorType>(ValTy);
4592     if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4593         isPowerOf2_32(ValVTy->getNumElements())) {
4594       InstructionCost ExtraCost = 0;
4595       if (LT.first != 1) {
4596         // Type needs to be split, so there is an extra cost of LT.first - 1
4597         // arithmetic ops.
4598         auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4599                                         MTy.getVectorNumElements());
4600         ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4601         ExtraCost *= LT.first - 1;
4602       }
4603       // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4604       auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4605       return Cost + ExtraCost;
4606     }
4607     break;
4608   }
4609   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4610 }
4611 
4612 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
4613   static const CostTblEntry ShuffleTbl[] = {
4614       { TTI::SK_Splice, MVT::nxv16i8,  1 },
4615       { TTI::SK_Splice, MVT::nxv8i16,  1 },
4616       { TTI::SK_Splice, MVT::nxv4i32,  1 },
4617       { TTI::SK_Splice, MVT::nxv2i64,  1 },
4618       { TTI::SK_Splice, MVT::nxv2f16,  1 },
4619       { TTI::SK_Splice, MVT::nxv4f16,  1 },
4620       { TTI::SK_Splice, MVT::nxv8f16,  1 },
4621       { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4622       { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4623       { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4624       { TTI::SK_Splice, MVT::nxv2f32,  1 },
4625       { TTI::SK_Splice, MVT::nxv4f32,  1 },
4626       { TTI::SK_Splice, MVT::nxv2f64,  1 },
4627   };
4628 
4629   // The code-generator is currently not able to handle scalable vectors
4630   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4631   // it. This change will be removed when code-generation for these types is
4632   // sufficiently reliable.
4633   if (Tp->getElementCount() == ElementCount::getScalable(1))
4634     return InstructionCost::getInvalid();
4635 
4636   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4637   Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4638   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4639   EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4640                        ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4641                        : LT.second;
4642   Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4643   InstructionCost LegalizationCost = 0;
4644   if (Index < 0) {
4645     LegalizationCost =
4646         getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4647                            CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4648         getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4649                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
4650   }
4651 
4652   // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
4653   // The cost is computed on the promoted type.
4654   if (LT.second.getScalarType() == MVT::i1) {
4655     LegalizationCost +=
4656         getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4657                          TTI::CastContextHint::None, CostKind) +
4658         getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4659                          TTI::CastContextHint::None, CostKind);
4660   }
4661   const auto *Entry =
4662       CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4663   assert(Entry && "Illegal Type for Splice");
4664   LegalizationCost += Entry->Cost;
4665   return LegalizationCost * LT.first;
4666 }
4667 
4668 InstructionCost AArch64TTIImpl::getPartialReductionCost(
4669     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
4670     ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
4671     TTI::PartialReductionExtendKind OpBExtend,
4672     std::optional<unsigned> BinOp) const {
4673   InstructionCost Invalid = InstructionCost::getInvalid();
4674   InstructionCost Cost(TTI::TCC_Basic);
4675 
4676   if (Opcode != Instruction::Add)
4677     return Invalid;
4678 
4679   if (InputTypeA != InputTypeB)
4680     return Invalid;
4681 
4682   EVT InputEVT = EVT::getEVT(InputTypeA);
4683   EVT AccumEVT = EVT::getEVT(AccumType);
4684 
4685   if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
4686     return Invalid;
4687   if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
4688     return Invalid;
4689 
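       // The combinations accepted below mirror the dot-product instructions:
       // e.g. an i8 input with VF 16 accumulating into i32 keeps the base
       // cost, while VF 8 into i32 (or VF 16 into i64) doubles it.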
4690   if (InputEVT == MVT::i8) {
4691     switch (VF.getKnownMinValue()) {
4692     default:
4693       return Invalid;
4694     case 8:
4695       if (AccumEVT == MVT::i32)
4696         Cost *= 2;
4697       else if (AccumEVT != MVT::i64)
4698         return Invalid;
4699       break;
4700     case 16:
4701       if (AccumEVT == MVT::i64)
4702         Cost *= 2;
4703       else if (AccumEVT != MVT::i32)
4704         return Invalid;
4705       break;
4706     }
4707   } else if (InputEVT == MVT::i16) {
4708     // FIXME: Allow i32 accumulator but increase cost, as we would extend
4709     //        it to i64.
4710     if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
4711       return Invalid;
4712   } else
4713     return Invalid;
4714 
4715   // AArch64 supports lowering mixed extensions to a usdot but only if the
4716   // i8mm or sve/streaming features are available.
4717   if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
4718       (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
4719        !ST->isSVEorStreamingSVEAvailable()))
4720     return Invalid;
4721 
4722   if (!BinOp || *BinOp != Instruction::Mul)
4723     return Invalid;
4724 
4725   return Cost;
4726 }
4727 
4728 InstructionCost AArch64TTIImpl::getShuffleCost(
4729     TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
4730     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4731     ArrayRef<const Value *> Args, const Instruction *CxtI) {
4732   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4733 
4734   // If we have a Mask, and the LT is being legalized somehow, split the Mask
4735   // into smaller vectors and sum the cost of each shuffle.
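       // For example, a 16-element shuffle of <16 x i32> (legalized to v4i32)
       // is costed as four v4i32 sub-shuffles; any sub-mask drawing on more
       // than two source sub-vectors is costed at one move per element.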
4736   if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4737       Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4738       Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4739 
4740     // Check for LD3/LD4 instructions, which are represented in llvm IR as
4741     // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4742     // but we model it with a small cost (LT.first / 4, but at least 1) so
4743     // that LD3/LD4 have a slightly higher cost than just the load.
4744     if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4745         (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
4746          ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
4747       return std::max<InstructionCost>(1, LT.first / 4);
4748 
4749     // Check for ST3/ST4 instructions, which are represented in llvm IR as
4750     // store(interleaving-shuffle). The shuffle cost could potentially be free,
4751     // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4752     // cost than just the store.
4753     if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4754         (ShuffleVectorInst::isInterleaveMask(
4755              Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4756          ShuffleVectorInst::isInterleaveMask(
4757              Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4758       return LT.first;
4759 
4760     unsigned TpNumElts = Mask.size();
4761     unsigned LTNumElts = LT.second.getVectorNumElements();
4762     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4763     VectorType *NTp =
4764         VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4765     InstructionCost Cost;
4766     for (unsigned N = 0; N < NumVecs; N++) {
4767       SmallVector<int> NMask;
4768       // Split the existing mask into chunks of size LTNumElts. Track the source
4769       // sub-vectors to ensure the result has at most 2 inputs.
4770       unsigned Source1, Source2;
4771       unsigned NumSources = 0;
4772       for (unsigned E = 0; E < LTNumElts; E++) {
4773         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4774                                                       : PoisonMaskElem;
4775         if (MaskElt < 0) {
4776           NMask.push_back(PoisonMaskElem);
4777           continue;
4778         }
4779 
4780         // Calculate which source from the input this comes from and whether it
4781         // is new to us.
4782         unsigned Source = MaskElt / LTNumElts;
4783         if (NumSources == 0) {
4784           Source1 = Source;
4785           NumSources = 1;
4786         } else if (NumSources == 1 && Source != Source1) {
4787           Source2 = Source;
4788           NumSources = 2;
4789         } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4790           NumSources++;
4791         }
4792 
4793         // Add to the new mask. For the NumSources>2 case these are not correct,
4794         // but are only used for the modular lane number.
4795         if (Source == Source1)
4796           NMask.push_back(MaskElt % LTNumElts);
4797         else if (Source == Source2)
4798           NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4799         else
4800           NMask.push_back(MaskElt % LTNumElts);
4801       }
4802       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4803       // getShuffleCost. If not then cost it using the worst case as the number
4804       // of element moves into a new vector.
4805       if (NumSources <= 2)
4806         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4807                                                : TTI::SK_PermuteTwoSrc,
4808                                NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4809       else
4810         Cost += LTNumElts;
4811     }
4812     return Cost;
4813   }
4814 
4815   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4816   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4817   // A subvector extract can be implemented with an ext (or trivial extract, if
4818   // from lane 0). This currently only handles low or high extracts to prevent
4819   // SLP vectorizer regressions.
4820   if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4821     if (LT.second.is128BitVector() &&
4822         cast<FixedVectorType>(SubTp)->getNumElements() ==
4823             LT.second.getVectorNumElements() / 2) {
4824       if (Index == 0)
4825         return 0;
4826       if (Index == (int)LT.second.getVectorNumElements() / 2)
4827         return 1;
4828     }
4829     Kind = TTI::SK_PermuteSingleSrc;
4830   }
4831 
4832   // Check for broadcast loads, which are supported by the LD1R instruction.
4833   // In terms of code-size, the shuffle vector is free when a load + dup get
4834   // folded into a LD1R. That's what we check and return here. For performance
4835   // and reciprocal throughput, a LD1R is not completely free. In this case, we
4836   // return the cost for the broadcast below (i.e. 1 for most/all types), so
4837   // that we model the load + dup sequence slightly higher because LD1R is a
4838   // high latency instruction.
4839   if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4840     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4841     if (IsLoad && LT.second.isVector() &&
4842         isLegalBroadcastLoad(Tp->getElementType(),
4843                              LT.second.getVectorElementCount()))
4844       return 0;
4845   }
4846 
4847   // If we have 4 elements for the shuffle and a Mask, get the cost straight
4848   // from the perfect shuffle tables.
4849   if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4850       (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4851       all_of(Mask, [](int E) { return E < 8; }))
4852     return getPerfectShuffleCost(Mask);
4853 
4854   // Check for identity masks, which we can treat as free.
4855   if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4856       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4857       all_of(enumerate(Mask), [](const auto &M) {
4858         return M.value() < 0 || M.value() == (int)M.index();
4859       }))
4860     return 0;
4861 
4862   // Check for other shuffles that are not SK_ kinds but we have native
4863   // instructions for, for example ZIP and UZP.
4864   unsigned Unused;
4865   if (LT.second.isFixedLengthVector() &&
4866       LT.second.getVectorNumElements() == Mask.size() &&
4867       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4868       (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4869        isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4870        // Check for non-zero lane splats
4871        all_of(drop_begin(Mask),
4872               [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4873     return 1;
4874 
4875   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4876       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4877       Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4878     static const CostTblEntry ShuffleTbl[] = {
4879         // Broadcast shuffle kinds can be performed with 'dup'.
4880         {TTI::SK_Broadcast, MVT::v8i8, 1},
4881         {TTI::SK_Broadcast, MVT::v16i8, 1},
4882         {TTI::SK_Broadcast, MVT::v4i16, 1},
4883         {TTI::SK_Broadcast, MVT::v8i16, 1},
4884         {TTI::SK_Broadcast, MVT::v2i32, 1},
4885         {TTI::SK_Broadcast, MVT::v4i32, 1},
4886         {TTI::SK_Broadcast, MVT::v2i64, 1},
4887         {TTI::SK_Broadcast, MVT::v4f16, 1},
4888         {TTI::SK_Broadcast, MVT::v8f16, 1},
4889         {TTI::SK_Broadcast, MVT::v2f32, 1},
4890         {TTI::SK_Broadcast, MVT::v4f32, 1},
4891         {TTI::SK_Broadcast, MVT::v2f64, 1},
4892         // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4893         // 'zip1/zip2' instructions.
4894         {TTI::SK_Transpose, MVT::v8i8, 1},
4895         {TTI::SK_Transpose, MVT::v16i8, 1},
4896         {TTI::SK_Transpose, MVT::v4i16, 1},
4897         {TTI::SK_Transpose, MVT::v8i16, 1},
4898         {TTI::SK_Transpose, MVT::v2i32, 1},
4899         {TTI::SK_Transpose, MVT::v4i32, 1},
4900         {TTI::SK_Transpose, MVT::v2i64, 1},
4901         {TTI::SK_Transpose, MVT::v4f16, 1},
4902         {TTI::SK_Transpose, MVT::v8f16, 1},
4903         {TTI::SK_Transpose, MVT::v2f32, 1},
4904         {TTI::SK_Transpose, MVT::v4f32, 1},
4905         {TTI::SK_Transpose, MVT::v2f64, 1},
4906         // Select shuffle kinds.
4907         // TODO: handle vXi8/vXi16.
4908         {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4909         {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4910         {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4911         {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4912         {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4913         {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4914         // PermuteSingleSrc shuffle kinds.
4915         {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4916         {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4917         {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4918         {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4919         {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4920         {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4921         {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4922         {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4923         {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4924         {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8},  // constpool + load + tbl
4925         {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8},  // constpool + load + tbl
4926         {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4927         {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8},   // constpool + load + tbl
4928         {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8},  // constpool + load + tbl
4929         // Reverse can be lowered with `rev`.
4930         {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4931         {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4932         {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4933         {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4934         {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4935         {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4936         {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4937         {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4938         {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4939         {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4940         {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4941         {TTI::SK_Reverse, MVT::v8i8, 1},  // REV64
4942         // Splices can all be lowered as `ext`.
4943         {TTI::SK_Splice, MVT::v2i32, 1},
4944         {TTI::SK_Splice, MVT::v4i32, 1},
4945         {TTI::SK_Splice, MVT::v2i64, 1},
4946         {TTI::SK_Splice, MVT::v2f32, 1},
4947         {TTI::SK_Splice, MVT::v4f32, 1},
4948         {TTI::SK_Splice, MVT::v2f64, 1},
4949         {TTI::SK_Splice, MVT::v8f16, 1},
4950         {TTI::SK_Splice, MVT::v8bf16, 1},
4951         {TTI::SK_Splice, MVT::v8i16, 1},
4952         {TTI::SK_Splice, MVT::v16i8, 1},
4953         {TTI::SK_Splice, MVT::v4bf16, 1},
4954         {TTI::SK_Splice, MVT::v4f16, 1},
4955         {TTI::SK_Splice, MVT::v4i16, 1},
4956         {TTI::SK_Splice, MVT::v8i8, 1},
4957         // Broadcast shuffle kinds for scalable vectors
4958         {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4959         {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4960         {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4961         {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4962         {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4963         {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4964         {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4965         {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4966         {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4967         {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4968         {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4969         {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4970         {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4971         {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4972         {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4973         {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4974         {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4975         // Handle the cases for vector.reverse with scalable vectors
4976         {TTI::SK_Reverse, MVT::nxv16i8, 1},
4977         {TTI::SK_Reverse, MVT::nxv8i16, 1},
4978         {TTI::SK_Reverse, MVT::nxv4i32, 1},
4979         {TTI::SK_Reverse, MVT::nxv2i64, 1},
4980         {TTI::SK_Reverse, MVT::nxv2f16, 1},
4981         {TTI::SK_Reverse, MVT::nxv4f16, 1},
4982         {TTI::SK_Reverse, MVT::nxv8f16, 1},
4983         {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4984         {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4985         {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4986         {TTI::SK_Reverse, MVT::nxv2f32, 1},
4987         {TTI::SK_Reverse, MVT::nxv4f32, 1},
4988         {TTI::SK_Reverse, MVT::nxv2f64, 1},
4989         {TTI::SK_Reverse, MVT::nxv16i1, 1},
4990         {TTI::SK_Reverse, MVT::nxv8i1, 1},
4991         {TTI::SK_Reverse, MVT::nxv4i1, 1},
4992         {TTI::SK_Reverse, MVT::nxv2i1, 1},
4993     };
4994     if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4995       return LT.first * Entry->Cost;
4996   }
4997 
4998   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4999     return getSpliceCost(Tp, Index);
5000 
5001   // Inserting a subvector can often be done with either a D, S or H register
5002   // move, so long as the inserted vector is "aligned".
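       // For example, inserting a <2 x float> subvector at element 0 or 2 of a
       // <4 x float> typically needs only a single 64-bit register move.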
5003   if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5004       LT.second.getSizeInBits() <= 128 && SubTp) {
5005     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
5006     if (SubLT.second.isVector()) {
5007       int NumElts = LT.second.getVectorNumElements();
5008       int NumSubElts = SubLT.second.getVectorNumElements();
5009       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5010         return SubLT.first;
5011     }
5012   }
5013 
5014   // Restore optimal kind.
5015   if (IsExtractSubvector)
5016     Kind = TTI::SK_ExtractSubvector;
5017   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5018                                CxtI);
5019 }
5020 
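     /// Return true if the loop contains any load or store whose pointer
     /// stride, as computed by SCEV, is negative (i.e. the accesses walk down
     /// through memory).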
5021 static bool containsDecreasingPointers(Loop *TheLoop,
5022                                        PredicatedScalarEvolution *PSE) {
5023   const auto &Strides = DenseMap<Value *, const SCEV *>();
5024   for (BasicBlock *BB : TheLoop->blocks()) {
5025     // Scan the instructions in the block and look for addresses that are
5026     // consecutive and decreasing.
5027     for (Instruction &I : *BB) {
5028       if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
5029         Value *Ptr = getLoadStorePointerOperand(&I);
5030         Type *AccessTy = getLoadStoreType(&I);
5031         if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5032                          /*ShouldCheckWrap=*/false)
5033                 .value_or(0) < 0)
5034           return true;
5035       }
5036     }
5037   }
5038   return false;
5039 }
5040 
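     // Prefer an explicit command-line setting when one is given; otherwise
     // fall back to the subtarget's tuning default.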
5041 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5042   if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5043     return SVEPreferFixedOverScalableIfEqualCost;
5044   return ST->useFixedOverScalableIfEqualCost();
5045 }
5046 
5047 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5048   return ST->getEpilogueVectorizationMinVF();
5049 }
5050 
5051 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
5052   if (!ST->hasSVE())
5053     return false;
5054 
5055   // We don't currently support vectorisation with interleaving for SVE, so
5056   // with such loops we're better off not using tail-folding. This gives us a
5057   // chance to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5058   if (TFI->IAI->hasGroups())
5059     return false;
5060 
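       // Work out which tail-folding features this loop requires so that they
       // can be checked against the enabled tail-folding options below.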
5061   TailFoldingOpts Required = TailFoldingOpts::Disabled;
5062   if (TFI->LVL->getReductionVars().size())
5063     Required |= TailFoldingOpts::Reductions;
5064   if (TFI->LVL->getFixedOrderRecurrences().size())
5065     Required |= TailFoldingOpts::Recurrences;
5066 
5067   // We call this to discover whether any load/store pointers in the loop have
5068   // negative strides. This will require extra work to reverse the loop
5069   // predicate, which may be expensive.
5070   if (containsDecreasingPointers(TFI->LVL->getLoop(),
5071                                  TFI->LVL->getPredicatedScalarEvolution()))
5072     Required |= TailFoldingOpts::Reverse;
5073   if (Required == TailFoldingOpts::Disabled)
5074     Required |= TailFoldingOpts::Simple;
5075 
5076   if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
5077                                       Required))
5078     return false;
5079 
5080   // Don't tail-fold for tight loops where we would be better off interleaving
5081   // with an unpredicated loop.
5082   unsigned NumInsns = 0;
5083   for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5084     NumInsns += BB->sizeWithoutDebug();
5085   }
5086 
5087   // We expect 4 of these to be the IV PHI, IV add, IV compare and branch.
5088   return NumInsns >= SVETailFoldInsnThreshold;
5089 }
5090 
5091 InstructionCost
5092 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
5093                                      StackOffset BaseOffset, bool HasBaseReg,
5094                                      int64_t Scale, unsigned AddrSpace) const {
5095   // Scaling factors are not free at all.
5096   // Operands                     | Rt Latency
5097   // -------------------------------------------
5098   // Rt, [Xn, Xm]                 | 4
5099   // -------------------------------------------
5100   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
5101   // Rt, [Xn, Wm, <extend> #imm]  |
5102   TargetLoweringBase::AddrMode AM;
5103   AM.BaseGV = BaseGV;
5104   AM.BaseOffs = BaseOffset.getFixed();
5105   AM.HasBaseReg = HasBaseReg;
5106   AM.Scale = Scale;
5107   AM.ScalableOffset = BaseOffset.getScalable();
5108   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5109     // Scale represents reg2 * scale, thus charge a cost of 1 if
5110     // it is not equal to 0 or 1.
5111     return AM.Scale != 0 && AM.Scale != 1;
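       // Otherwise the addressing mode is not supported; returning a negative
       // value reports that to the caller.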
5112   return -1;
5113 }
5114 
5115 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
5116   if (EnableOrLikeSelectOpt) {
5117     // For the binary operators (e.g. or) we need to be more careful than
5118     // with selects; here we only transform them if they are already at a
5119     // natural break point in the code, i.e. the end of a block with an
5120     // unconditional terminator.
5121     if (I->getOpcode() == Instruction::Or &&
5122         isa<BranchInst>(I->getNextNode()) &&
5123         cast<BranchInst>(I->getNextNode())->isUnconditional())
5124       return true;
5125 
5126     if (I->getOpcode() == Instruction::Add ||
5127         I->getOpcode() == Instruction::Sub)
5128       return true;
5129   }
5130   return BaseT::shouldTreatInstructionLikeSelect(I);
5131 }
5132 
5133 bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5134                                    const TargetTransformInfo::LSRCost &C2) {
5135   // The AArch64-specific part here is adding the number of instructions to
5136   // the comparison (though not as the first consideration, as some targets
5137   // do), along with changing the priority of the base additions.
5138   // TODO: Maybe a more nuanced tradeoff between instruction count
5139   // and number of registers? To be investigated at a later date.
5140   if (EnableLSRCostOpt)
5141     return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5142                     C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5143            std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5144                     C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5145 
5146   return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
5147 }
5148 
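     /// Return true if V is a shufflevector whose mask elements are all
     /// identical, i.e. it splats a single input lane.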
5149 static bool isSplatShuffle(Value *V) {
5150   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5151     return all_equal(Shuf->getShuffleMask());
5152   return false;
5153 }
5154 
5155 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5156 /// or upper half of the vector elements.
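     /// For example, two operands of the form
     ///   shufflevector <8 x i16> %x, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     /// (an extract of the upper half of %x) would both qualify.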
5157 static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5158                                      bool AllowSplat = false) {
5159   // Scalable types can't be extract shuffle vectors.
5160   if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5161     return false;
5162 
5163   auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5164     auto *FullTy = FullV->getType();
5165     auto *HalfTy = HalfV->getType();
5166     return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5167            2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5168   };
5169 
5170   auto extractHalf = [](Value *FullV, Value *HalfV) {
5171     auto *FullVT = cast<FixedVectorType>(FullV->getType());
5172     auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5173     return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5174   };
5175 
5176   ArrayRef<int> M1, M2;
5177   Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5178   if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5179       !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5180     return false;
5181 
5182   // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so
5183   // that it is not checked as an extract below.
5184   if (AllowSplat && isSplatShuffle(Op1))
5185     S1Op1 = nullptr;
5186   if (AllowSplat && isSplatShuffle(Op2))
5187     S2Op1 = nullptr;
5188 
5189   // Check that the operands are half as wide as the result and we extract
5190   // half of the elements of the input vectors.
5191   if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5192       (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5193     return false;
5194 
5195   // Check the mask extracts either the lower or upper half of vector
5196   // elements.
5197   int M1Start = 0;
5198   int M2Start = 0;
5199   int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5200   if ((S1Op1 &&
5201        !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5202       (S2Op1 &&
5203        !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5204     return false;
5205 
5206   if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5207       (M2Start != 0 && M2Start != (NumElements / 2)))
5208     return false;
5209   if (S1Op1 && S2Op1 && M1Start != M2Start)
5210     return false;
5211 
5212   return true;
5213 }
5214 
5215 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5216 /// of the vector elements.
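     /// For example, sext <8 x i8> %x to <8 x i16> doubles each element from 8
     /// to 16 bits.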
5217 static bool areExtractExts(Value *Ext1, Value *Ext2) {
5218   auto areExtDoubled = [](Instruction *Ext) {
5219     return Ext->getType()->getScalarSizeInBits() ==
5220            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5221   };
5222 
5223   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5224       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5225       !areExtDoubled(cast<Instruction>(Ext1)) ||
5226       !areExtDoubled(cast<Instruction>(Ext2)))
5227     return false;
5228 
5229   return true;
5230 }
5231 
5232 /// Check if Op could be used with the vmull_high_p64 intrinsic.
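     /// That is, Op should be an extractelement of lane 1 of a two-element
     /// vector, which PMULL2 reads directly from the upper half of its source
     /// register.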
5233 static bool isOperandOfVmullHighP64(Value *Op) {
5234   Value *VectorOperand = nullptr;
5235   ConstantInt *ElementIndex = nullptr;
5236   return match(Op, m_ExtractElt(m_Value(VectorOperand),
5237                                 m_ConstantInt(ElementIndex))) &&
5238          ElementIndex->getValue() == 1 &&
5239          isa<FixedVectorType>(VectorOperand->getType()) &&
5240          cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5241 }
5242 
5243 /// Check if Op1 and Op2 could be used with the vmull_high_p64 intrinsic.
5244 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5245   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5246 }
5247 
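     /// Decide whether the vector-of-pointers operand of a masked
     /// gather/scatter should be sunk, recording any extends of the offsets
     /// that should be sunk alongside the GEP.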
5248 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5249   // Restrict ourselves to the form CodeGenPrepare typically constructs.
5250   auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5251   if (!GEP || GEP->getNumOperands() != 2)
5252     return false;
5253 
5254   Value *Base = GEP->getOperand(0);
5255   Value *Offsets = GEP->getOperand(1);
5256 
5257   // We only care about scalar_base+vector_offsets.
5258   if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5259     return false;
5260 
5261   // Sink extends that would allow us to use 32-bit offset vectors.
5262   if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5263     auto *OffsetsInst = cast<Instruction>(Offsets);
5264     if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5265         OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5266       Ops.push_back(&GEP->getOperandUse(1));
5267   }
5268 
5269   // Sink the GEP.
5270   return true;
5271 }
5272 
5273 /// We want to sink the following cases:
5274 /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5275 /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
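     /// Keeping the vscale computation next to its (add|sub|gep) user gives
     /// isel a better chance of folding it, e.g. into ADDVL/INC-style forms.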
5276 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5277   if (match(Op, m_VScale()))
5278     return true;
5279   if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5280       match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5281     Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5282     return true;
5283   }
5284   if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5285       match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5286     Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5287     Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5288     Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5289     return true;
5290   }
5291   return false;
5292 }
5293 
5294 /// Check if sinking \p I's operands to I's basic block is profitable, because
5295 /// the operands can be folded into a target instruction, e.g.
5296 /// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
5297 bool AArch64TTIImpl::isProfitableToSinkOperands(
5298     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5299   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5300     switch (II->getIntrinsicID()) {
5301     case Intrinsic::aarch64_neon_smull:
5302     case Intrinsic::aarch64_neon_umull:
5303       if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
5304                                    /*AllowSplat=*/true)) {
5305         Ops.push_back(&II->getOperandUse(0));
5306         Ops.push_back(&II->getOperandUse(1));
5307         return true;
5308       }
5309       [[fallthrough]];
5310 
5311     case Intrinsic::fma:
5312     case Intrinsic::fmuladd:
5313       if (isa<VectorType>(I->getType()) &&
5314           cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5315           !ST->hasFullFP16())
5316         return false;
5317       [[fallthrough]];
5318     case Intrinsic::aarch64_neon_sqdmull:
5319     case Intrinsic::aarch64_neon_sqdmulh:
5320     case Intrinsic::aarch64_neon_sqrdmulh:
5321       // Sink splats for index lane variants
5322       if (isSplatShuffle(II->getOperand(0)))
5323         Ops.push_back(&II->getOperandUse(0));
5324       if (isSplatShuffle(II->getOperand(1)))
5325         Ops.push_back(&II->getOperandUse(1));
5326       return !Ops.empty();
5327     case Intrinsic::aarch64_neon_fmlal:
5328     case Intrinsic::aarch64_neon_fmlal2:
5329     case Intrinsic::aarch64_neon_fmlsl:
5330     case Intrinsic::aarch64_neon_fmlsl2:
5331       // Sink splats for index lane variants
5332       if (isSplatShuffle(II->getOperand(1)))
5333         Ops.push_back(&II->getOperandUse(1));
5334       if (isSplatShuffle(II->getOperand(2)))
5335         Ops.push_back(&II->getOperandUse(2));
5336       return !Ops.empty();
5337     case Intrinsic::aarch64_sve_ptest_first:
5338     case Intrinsic::aarch64_sve_ptest_last:
5339       if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
5340         if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
5341           Ops.push_back(&II->getOperandUse(0));
5342       return !Ops.empty();
5343     case Intrinsic::aarch64_sme_write_horiz:
5344     case Intrinsic::aarch64_sme_write_vert:
5345     case Intrinsic::aarch64_sme_writeq_horiz:
5346     case Intrinsic::aarch64_sme_writeq_vert: {
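           // The tile-slice index operand is a (base + offset) expression;
           // sinking the add gives isel a chance to fold the offset into the
           // instruction's slice specifier.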
5347       auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
5348       if (!Idx || Idx->getOpcode() != Instruction::Add)
5349         return false;
5350       Ops.push_back(&II->getOperandUse(1));
5351       return true;
5352     }
5353     case Intrinsic::aarch64_sme_read_horiz:
5354     case Intrinsic::aarch64_sme_read_vert:
5355     case Intrinsic::aarch64_sme_readq_horiz:
5356     case Intrinsic::aarch64_sme_readq_vert:
5357     case Intrinsic::aarch64_sme_ld1b_vert:
5358     case Intrinsic::aarch64_sme_ld1h_vert:
5359     case Intrinsic::aarch64_sme_ld1w_vert:
5360     case Intrinsic::aarch64_sme_ld1d_vert:
5361     case Intrinsic::aarch64_sme_ld1q_vert:
5362     case Intrinsic::aarch64_sme_st1b_vert:
5363     case Intrinsic::aarch64_sme_st1h_vert:
5364     case Intrinsic::aarch64_sme_st1w_vert:
5365     case Intrinsic::aarch64_sme_st1d_vert:
5366     case Intrinsic::aarch64_sme_st1q_vert:
5367     case Intrinsic::aarch64_sme_ld1b_horiz:
5368     case Intrinsic::aarch64_sme_ld1h_horiz:
5369     case Intrinsic::aarch64_sme_ld1w_horiz:
5370     case Intrinsic::aarch64_sme_ld1d_horiz:
5371     case Intrinsic::aarch64_sme_ld1q_horiz:
5372     case Intrinsic::aarch64_sme_st1b_horiz:
5373     case Intrinsic::aarch64_sme_st1h_horiz:
5374     case Intrinsic::aarch64_sme_st1w_horiz:
5375     case Intrinsic::aarch64_sme_st1d_horiz:
5376     case Intrinsic::aarch64_sme_st1q_horiz: {
5377       auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
5378       if (!Idx || Idx->getOpcode() != Instruction::Add)
5379         return false;
5380       Ops.push_back(&II->getOperandUse(3));
5381       return true;
5382     }
5383     case Intrinsic::aarch64_neon_pmull:
5384       if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
5385         return false;
5386       Ops.push_back(&II->getOperandUse(0));
5387       Ops.push_back(&II->getOperandUse(1));
5388       return true;
5389     case Intrinsic::aarch64_neon_pmull64:
5390       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
5391                                      II->getArgOperand(1)))
5392         return false;
5393       Ops.push_back(&II->getArgOperandUse(0));
5394       Ops.push_back(&II->getArgOperandUse(1));
5395       return true;
5396     case Intrinsic::masked_gather:
5397       if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
5398         return false;
5399       Ops.push_back(&II->getArgOperandUse(0));
5400       return true;
5401     case Intrinsic::masked_scatter:
5402       if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
5403         return false;
5404       Ops.push_back(&II->getArgOperandUse(1));
5405       return true;
5406     default:
5407       return false;
5408     }
5409   }
5410 
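       // A vector_reduce_or of a scalable vector used as a branch/select
       // condition is worth sinking so codegen can combine it with its user,
       // e.g. lowering a predicate or-reduction to a PTEST that sets the flags
       // directly.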
5411   auto ShouldSinkCondition = [](Value *Cond) -> bool {
5412     auto *II = dyn_cast<IntrinsicInst>(Cond);
5413     return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5414            isa<ScalableVectorType>(II->getOperand(0)->getType());
5415   };
5416 
5417   switch (I->getOpcode()) {
5418   case Instruction::GetElementPtr:
5419   case Instruction::Add:
5420   case Instruction::Sub:
5421     // Sink vscales closer to uses for better isel
5422     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
5423       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
5424         Ops.push_back(&I->getOperandUse(Op));
5425         return true;
5426       }
5427     }
5428     break;
5429   case Instruction::Select: {
5430     if (!ShouldSinkCondition(I->getOperand(0)))
5431       return false;
5432 
5433     Ops.push_back(&I->getOperandUse(0));
5434     return true;
5435   }
5436   case Instruction::Br: {
5437     if (cast<BranchInst>(I)->isUnconditional())
5438       return false;
5439 
5440     if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5441       return false;
5442 
5443     Ops.push_back(&I->getOperandUse(0));
5444     return true;
5445   }
5446   default:
5447     break;
5448   }
5449 
5450   if (!I->getType()->isVectorTy())
5451     return false;
5452 
5453   switch (I->getOpcode()) {
5454   case Instruction::Sub:
5455   case Instruction::Add: {
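         // An add/sub of two widening extends of half-vector extracts can often
         // be selected as [us]addl(2)/[us]subl(2), so sink the extends (and any
         // extracts feeding them) next to the add/sub.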
5456     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
5457       return false;
5458 
5459     // If the exts' operands extract either the lower or upper elements, we
5460     // can sink them too.
5461     auto Ext1 = cast<Instruction>(I->getOperand(0));
5462     auto Ext2 = cast<Instruction>(I->getOperand(1));
5463     if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
5464       Ops.push_back(&Ext1->getOperandUse(0));
5465       Ops.push_back(&Ext2->getOperandUse(0));
5466     }
5467 
5468     Ops.push_back(&I->getOperandUse(0));
5469     Ops.push_back(&I->getOperandUse(1));
5470 
5471     return true;
5472   }
5473   case Instruction::Or: {
5474     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
5475     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
5476     if (ST->hasNEON()) {
5477       Instruction *OtherAnd, *IA, *IB;
5478       Value *MaskValue;
5479       // MainAnd refers to the And instruction that has 'Not' as one of its operands.
5480       if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
5481                           m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
5482                                            m_Instruction(IA)))))) {
5483         if (match(OtherAnd,
5484                   m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
5485           Instruction *MainAnd = I->getOperand(0) == OtherAnd
5486                                      ? cast<Instruction>(I->getOperand(1))
5487                                      : cast<Instruction>(I->getOperand(0));
5488 
5489           // Both Ands should be in the same basic block as the Or.
5490           if (I->getParent() != MainAnd->getParent() ||
5491               I->getParent() != OtherAnd->getParent())
5492             return false;
5493 
5494           // Non-mask operands of both Ands should also be in the same basic block.
5495           if (I->getParent() != IA->getParent() ||
5496               I->getParent() != IB->getParent())
5497             return false;
5498 
5499           Ops.push_back(
5500               &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
5501           Ops.push_back(&I->getOperandUse(0));
5502           Ops.push_back(&I->getOperandUse(1));
5503 
5504           return true;
5505         }
5506       }
5507     }
5508 
5509     return false;
5510   }
5511   case Instruction::Mul: {
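         // For mul, look for operands that would let the backend form widening
         // ([us]mull) or indexed multiplies: sext/zext operands, splat
         // shuffles, and scalars extended and inserted into lane 0 before being
         // splatted.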
5512     auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5513       auto *Ty = cast<VectorType>(V->getType());
5514       // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5515       if (Ty->isScalableTy())
5516         return false;
5517 
5518       // Indexed variants of Mul exist for i16 and i32 element types only.
5519       return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5520     };
5521 
5522     int NumZExts = 0, NumSExts = 0;
5523     for (auto &Op : I->operands()) {
5524       // Make sure we are not already sinking this operand
5525       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
5526         continue;
5527 
5528       if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5529         auto *Ext = cast<Instruction>(Op);
5530         auto *ExtOp = Ext->getOperand(0);
5531         if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5532           Ops.push_back(&Ext->getOperandUse(0));
5533         Ops.push_back(&Op);
5534 
5535         if (isa<SExtInst>(Ext))
5536           NumSExts++;
5537         else
5538           NumZExts++;
5539 
5540         continue;
5541       }
5542 
5543       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5544       if (!Shuffle)
5545         continue;
5546 
5547       // If the Shuffle is a splat and the operand is a zext/sext, sinking the
5548       // operand and the s/zext can help create indexed s/umull. This is
5549       // especially useful to prevent an i64 mul from being scalarized.
5550       if (isSplatShuffle(Shuffle) &&
5551           match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
5552         Ops.push_back(&Shuffle->getOperandUse(0));
5553         Ops.push_back(&Op);
5554         if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
5555           NumSExts++;
5556         else
5557           NumZExts++;
5558         continue;
5559       }
5560 
5561       Value *ShuffleOperand = Shuffle->getOperand(0);
5562       InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
5563       if (!Insert)
5564         continue;
5565 
5566       Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
5567       if (!OperandInstr)
5568         continue;
5569 
5570       ConstantInt *ElementConstant =
5571           dyn_cast<ConstantInt>(Insert->getOperand(2));
5572       // Check that the insertelement is inserting into element 0
5573       if (!ElementConstant || !ElementConstant->isZero())
5574         continue;
5575 
5576       unsigned Opcode = OperandInstr->getOpcode();
5577       if (Opcode == Instruction::SExt)
5578         NumSExts++;
5579       else if (Opcode == Instruction::ZExt)
5580         NumZExts++;
5581       else {
5582         // If we find that the top bits are known 0, then we can sink and allow
5583         // the backend to generate a umull.
5584         unsigned Bitwidth = I->getType()->getScalarSizeInBits();
5585         APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
5586         const DataLayout &DL = I->getDataLayout();
5587         if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
5588           continue;
5589         NumZExts++;
5590       }
5591 
5592       // And(Load) is excluded to prevent CGP from getting stuck in a loop of
5593       // sinking the And, just to hoist it back again to the load.
5594       if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
5595         Ops.push_back(&Insert->getOperandUse(1));
5596       Ops.push_back(&Shuffle->getOperandUse(0));
5597       Ops.push_back(&Op);
5598     }
5599 
5600     // It is profitable to sink if we found two of the same type of extends.
5601     if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5602       return true;
5603 
5604     // Otherwise, see if we should sink splats for indexed variants.
5605     if (!ShouldSinkSplatForIndexedVariant(I))
5606       return false;
5607 
5608     Ops.clear();
5609     if (isSplatShuffle(I->getOperand(0)))
5610       Ops.push_back(&I->getOperandUse(0));
5611     if (isSplatShuffle(I->getOperand(1)))
5612       Ops.push_back(&I->getOperandUse(1));
5613 
5614     return !Ops.empty();
5615   }
5616   case Instruction::FMul: {
5617     // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5618     if (I->getType()->isScalableTy())
5619       return false;
5620 
5621     if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5622         !ST->hasFullFP16())
5623       return false;
5624 
5625     // Sink splats for index lane variants
5626     if (isSplatShuffle(I->getOperand(0)))
5627       Ops.push_back(&I->getOperandUse(0));
5628     if (isSplatShuffle(I->getOperand(1)))
5629       Ops.push_back(&I->getOperandUse(1));
5630     return !Ops.empty();
5631   }
5632   default:
5633     return false;
5634   }
5635   return false;
5636 }
5637