//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));
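
// For example, -sve-tail-folding=default+reductions+noreverse leaves
// NeedsDefault set, so getBits() starts from the target CPU's default mask,
// then enables TailFoldingOpts::Reductions and clears
// TailFoldingOpts::Reverse. A misspelt flag such as "reduction" falls
// through to reportError() and aborts compilation.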

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}
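
// For example, a function that calls the llvm.aarch64.neon.saddv intrinsic
// or contains an inline-asm call is conservatively flagged by the check
// above, whereas a function built purely from native IR instructions
// (loads, stores, arithmetic, branches) is not.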

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}
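
// For example, a callee compiled for "+neon" may be inlined into a caller
// compiled for "+neon,+sve" because the callee's feature bits are a subset
// of the caller's, but not vice versa.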

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}
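
// For example, when compiling with SVE vector-length-specific (VLS) types,
// a pointer argument to <8 x float> (256 bits) must not be promoted to a
// by-value vector argument, whereas <4 x float> (128 bits) is acceptable
// because it is indistinguishable from a NEON type.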

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1)  F:
  //       call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus encouraging the inliner to
  // inline the function).
  //
  // (2)  F:
  //       call from F -> G (the call here is not Call)
  //      G:
  //       call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is a benefit in making the
  // streaming-mode change only once, so we avoid inlining G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}
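
// With the default option values above this returns 5 * DefaultCallPenalty
// for case (1) and 10 * DefaultCallPenalty for case (2), biasing the
// inliner towards eliminating streaming-mode changes rather than
// multiplying them.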

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}
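
// For example, 0x5678 is a single MOVZ (cost 1) and 0x12345678 is
// MOVZ+MOVK (cost 2), while a replicated bit pattern such as
// 0x00FF00FF00FF00FF is a valid logical immediate and costs 0 because it
// can be encoded directly in an instruction like ORR.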

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
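
// For example, a 128-bit constant is costed as two independent 64-bit
// chunks; if each half needs MOVZ+MOVK, the total reported cost is 4.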

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
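
// For example, for "add i64 %x, 42" with Idx == 1 the materialization cost
// is one MOVZ, which does not exceed NumConstants * TCC_Basic, so TCC_Free
// is returned and the constant is left in place. A wider constant such as
// 0x123456789 needs MOVZ plus two MOVKs (cost 3), which is reported so that
// constant hoisting can consider hoisting it.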

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}
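
// For example, ctpop on i32 or i64 lowers to a short NEON CNT-based
// sequence (roughly) and is reported as fast hardware support; any other
// width, e.g. i16 or i128, currently falls back to PSK_Software.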

static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
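
// For example, <vscale x 2 x i32> occupies only 64 known-min bits of an SVE
// register and is considered unpacked, while <vscale x 4 x i32> fills the
// full 128-bit granule and is packed.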

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements

  // Only allow (32b and 64b) integers or pointers for now...
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
      (EltTy->getScalarSizeInBits() != 32 &&
       EltTy->getScalarSizeInBits() != 64))
    return InstructionCost::getInvalid();

  // FIXME: Hacky check for legal vector types. We can promote smaller types
  //        but we cannot legalize vectors via splitting for histcnt.
  // FIXME: We should be able to generate histcnt for fixed-length vectors
  //        using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())
      return InstructionCost::getInvalid();

  return InstructionCost(BaseHistCntCost);
}
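
// For example, a histogram with <vscale x 2 x ptr> bucket pointers and i64
// bucket elements passes the checks above and returns BaseHistCntCost,
// whereas a fixed-length <4 x ptr> form is rejected as invalid until the
// FIXME above is addressed.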

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are
      // promoted to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                    LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                    LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
83106c3fb27SDimitry Andric                      // extr instruction.
83206c3fb27SDimitry Andric     else if (HigherCost)
83306c3fb27SDimitry Andric       ExtraCost = 1;
83406c3fb27SDimitry Andric     else
83506c3fb27SDimitry Andric       break;
83606c3fb27SDimitry Andric     return TyL.first + ExtraCost;
83706c3fb27SDimitry Andric   }
8380fca6ea1SDimitry Andric   case Intrinsic::get_active_lane_mask: {
8390fca6ea1SDimitry Andric     auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
8400fca6ea1SDimitry Andric     if (RetTy) {
8410fca6ea1SDimitry Andric       EVT RetVT = getTLI()->getValueType(DL, RetTy);
8420fca6ea1SDimitry Andric       EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
8430fca6ea1SDimitry Andric       if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
8440fca6ea1SDimitry Andric           !getTLI()->isTypeLegal(RetVT)) {
8450fca6ea1SDimitry Andric         // We don't have enough context at this point to determine if the mask
8460fca6ea1SDimitry Andric         // is going to be kept live after the block, which will force the vXi1
8470fca6ea1SDimitry Andric         // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
8480fca6ea1SDimitry Andric         // For now, we just assume the vectorizer created this intrinsic and
8490fca6ea1SDimitry Andric         // the result will be the input for a PHI. In this case the cost will
8500fca6ea1SDimitry Andric         // be extremely high for fixed-width vectors.
8510fca6ea1SDimitry Andric         // NOTE: getScalarizationOverhead returns a cost that's far too
8520fca6ea1SDimitry Andric         // pessimistic for the actual generated codegen. In reality there are
8530fca6ea1SDimitry Andric         // two instructions generated per lane.
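        // For example (illustrative): a v4i1 mask kept live into a PHI is
        // costed as 4 lanes * 2 instructions = 8.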
8540fca6ea1SDimitry Andric         return RetTy->getNumElements() * 2;
8550fca6ea1SDimitry Andric       }
8560fca6ea1SDimitry Andric     }
8570fca6ea1SDimitry Andric     break;
8580fca6ea1SDimitry Andric   }
859e8d8bef9SDimitry Andric   default:
860e8d8bef9SDimitry Andric     break;
861e8d8bef9SDimitry Andric   }
862e8d8bef9SDimitry Andric   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
863e8d8bef9SDimitry Andric }
864e8d8bef9SDimitry Andric 
865fe6060f1SDimitry Andric /// The function removes redundant reinterpret casts in the presence of
866fe6060f1SDimitry Andric /// control flow.
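/// A simplified sketch (names illustrative):
///   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb0 ], [ %b.sv, %bb1 ]
///   %res = from_svbool(%phi) : <vscale x 4 x i1>
/// where %a.sv and %b.sv are to_svbool casts of <vscale x 4 x i1> values %a
/// and %b, becomes:
///   %phi = phi <vscale x 4 x i1> [ %a, %bb0 ], [ %b, %bb1 ]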
867bdd1243dSDimitry Andric static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
868fe6060f1SDimitry Andric                                                    IntrinsicInst &II) {
869fe6060f1SDimitry Andric   SmallVector<Instruction *, 32> Worklist;
870fe6060f1SDimitry Andric   auto RequiredType = II.getType();
871fe6060f1SDimitry Andric 
872fe6060f1SDimitry Andric   auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
873fe6060f1SDimitry Andric   assert(PN && "Expected Phi Node!");
874fe6060f1SDimitry Andric 
875fe6060f1SDimitry Andric   // Don't create a new Phi unless we can remove the old one.
876fe6060f1SDimitry Andric   if (!PN->hasOneUse())
877bdd1243dSDimitry Andric     return std::nullopt;
878fe6060f1SDimitry Andric 
879fe6060f1SDimitry Andric   for (Value *IncValPhi : PN->incoming_values()) {
880fe6060f1SDimitry Andric     auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
881fe6060f1SDimitry Andric     if (!Reinterpret ||
882fe6060f1SDimitry Andric         Reinterpret->getIntrinsicID() !=
883fe6060f1SDimitry Andric             Intrinsic::aarch64_sve_convert_to_svbool ||
884fe6060f1SDimitry Andric         RequiredType != Reinterpret->getArgOperand(0)->getType())
885bdd1243dSDimitry Andric       return std::nullopt;
886fe6060f1SDimitry Andric   }
887fe6060f1SDimitry Andric 
888fe6060f1SDimitry Andric   // Create the new Phi
88906c3fb27SDimitry Andric   IC.Builder.SetInsertPoint(PN);
89006c3fb27SDimitry Andric   PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
891fe6060f1SDimitry Andric   Worklist.push_back(PN);
892fe6060f1SDimitry Andric 
893fe6060f1SDimitry Andric   for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
894fe6060f1SDimitry Andric     auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
895fe6060f1SDimitry Andric     NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
896fe6060f1SDimitry Andric     Worklist.push_back(Reinterpret);
897fe6060f1SDimitry Andric   }
898fe6060f1SDimitry Andric 
899fe6060f1SDimitry Andric   // Cleanup Phi Node and reinterprets
900fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, NPN);
901fe6060f1SDimitry Andric }
902fe6060f1SDimitry Andric 
90304eeddc0SDimitry Andric // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
90404eeddc0SDimitry Andric // => (binop (pred) (from_svbool _) (from_svbool _))
90504eeddc0SDimitry Andric //
90604eeddc0SDimitry Andric // The above transformation eliminates a `to_svbool` in the predicate
90704eeddc0SDimitry Andric // operand of bitwise operation `binop` by narrowing the vector width of
90804eeddc0SDimitry Andric // the operation. For example, it would convert a `<vscale x 16 x i1>
90904eeddc0SDimitry Andric // and` into a `<vscale x 4 x i1> and`. This is profitable because
91004eeddc0SDimitry Andric // to_svbool must zero the new lanes during widening, whereas
91104eeddc0SDimitry Andric // from_svbool is free.
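//
// A simplified example (types abbreviated):
//   from_svbool(and_z(to_svbool(%p : nxv4i1), %a, %b)) : nxv4i1
// becomes
//   and_z(%p, from_svbool(%a) : nxv4i1, from_svbool(%b) : nxv4i1)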
912bdd1243dSDimitry Andric static std::optional<Instruction *>
913bdd1243dSDimitry Andric tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
91404eeddc0SDimitry Andric   auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
91504eeddc0SDimitry Andric   if (!BinOp)
916bdd1243dSDimitry Andric     return std::nullopt;
91704eeddc0SDimitry Andric 
91804eeddc0SDimitry Andric   auto IntrinsicID = BinOp->getIntrinsicID();
91904eeddc0SDimitry Andric   switch (IntrinsicID) {
92004eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_and_z:
92104eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_bic_z:
92204eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_eor_z:
92304eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_nand_z:
92404eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_nor_z:
92504eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_orn_z:
92604eeddc0SDimitry Andric   case Intrinsic::aarch64_sve_orr_z:
92704eeddc0SDimitry Andric     break;
92804eeddc0SDimitry Andric   default:
929bdd1243dSDimitry Andric     return std::nullopt;
93004eeddc0SDimitry Andric   }
93104eeddc0SDimitry Andric 
93204eeddc0SDimitry Andric   auto BinOpPred = BinOp->getOperand(0);
93304eeddc0SDimitry Andric   auto BinOpOp1 = BinOp->getOperand(1);
93404eeddc0SDimitry Andric   auto BinOpOp2 = BinOp->getOperand(2);
93504eeddc0SDimitry Andric 
93604eeddc0SDimitry Andric   auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
93704eeddc0SDimitry Andric   if (!PredIntr ||
93804eeddc0SDimitry Andric       PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
939bdd1243dSDimitry Andric     return std::nullopt;
94004eeddc0SDimitry Andric 
94104eeddc0SDimitry Andric   auto PredOp = PredIntr->getOperand(0);
94204eeddc0SDimitry Andric   auto PredOpTy = cast<VectorType>(PredOp->getType());
94304eeddc0SDimitry Andric   if (PredOpTy != II.getType())
944bdd1243dSDimitry Andric     return std::nullopt;
94504eeddc0SDimitry Andric 
94604eeddc0SDimitry Andric   SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
94706c3fb27SDimitry Andric   auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
94804eeddc0SDimitry Andric       Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
94904eeddc0SDimitry Andric   NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
95004eeddc0SDimitry Andric   if (BinOpOp1 == BinOpOp2)
95104eeddc0SDimitry Andric     NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
95204eeddc0SDimitry Andric   else
95306c3fb27SDimitry Andric     NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
95404eeddc0SDimitry Andric         Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
95504eeddc0SDimitry Andric 
95604eeddc0SDimitry Andric   auto NarrowedBinOp =
95706c3fb27SDimitry Andric       IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
95804eeddc0SDimitry Andric   return IC.replaceInstUsesWith(II, NarrowedBinOp);
95904eeddc0SDimitry Andric }
96004eeddc0SDimitry Andric 
961bdd1243dSDimitry Andric static std::optional<Instruction *>
962bdd1243dSDimitry Andric instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
963fe6060f1SDimitry Andric   // If the reinterpret instruction operand is a PHI Node
964fe6060f1SDimitry Andric   if (isa<PHINode>(II.getArgOperand(0)))
965fe6060f1SDimitry Andric     return processPhiNode(IC, II);
966fe6060f1SDimitry Andric 
96704eeddc0SDimitry Andric   if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
96804eeddc0SDimitry Andric     return BinOpCombine;
96904eeddc0SDimitry Andric 
97006c3fb27SDimitry Andric   // Ignore converts to/from svcount_t.
97106c3fb27SDimitry Andric   if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
97206c3fb27SDimitry Andric       isa<TargetExtType>(II.getType()))
97306c3fb27SDimitry Andric     return std::nullopt;
97406c3fb27SDimitry Andric 
975fe6060f1SDimitry Andric   SmallVector<Instruction *, 32> CandidatesForRemoval;
976fe6060f1SDimitry Andric   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
977fe6060f1SDimitry Andric 
978fe6060f1SDimitry Andric   const auto *IVTy = cast<VectorType>(II.getType());
979fe6060f1SDimitry Andric 
980fe6060f1SDimitry Andric   // Walk the chain of conversions.
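  // For example (illustrative):
  //   %widened = to_svbool(%a : <vscale x 4 x i1>)   ; new lanes zeroed
  //   %res     = from_svbool(%widened) : <vscale x 4 x i1>
  // Here %a is a viable replacement for %res. A cursor with fewer lanes than
  // the result (e.g. <vscale x 2 x i1>) would break the chain, since the
  // implied zeroing changes the value.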
981fe6060f1SDimitry Andric   while (Cursor) {
982fe6060f1SDimitry Andric     // If the type of the cursor has fewer lanes than the final result, zeroing
983fe6060f1SDimitry Andric     // must take place, which breaks the equivalence chain.
984fe6060f1SDimitry Andric     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
985fe6060f1SDimitry Andric     if (CursorVTy->getElementCount().getKnownMinValue() <
986fe6060f1SDimitry Andric         IVTy->getElementCount().getKnownMinValue())
987fe6060f1SDimitry Andric       break;
988fe6060f1SDimitry Andric 
989fe6060f1SDimitry Andric     // If the cursor has the same type as II, it is a viable replacement.
990fe6060f1SDimitry Andric     if (Cursor->getType() == IVTy)
991fe6060f1SDimitry Andric       EarliestReplacement = Cursor;
992fe6060f1SDimitry Andric 
993fe6060f1SDimitry Andric     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
994fe6060f1SDimitry Andric 
995fe6060f1SDimitry Andric     // If this is not an SVE conversion intrinsic, this is the end of the chain.
996fe6060f1SDimitry Andric     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
997fe6060f1SDimitry Andric                                   Intrinsic::aarch64_sve_convert_to_svbool ||
998fe6060f1SDimitry Andric                               IntrinsicCursor->getIntrinsicID() ==
999fe6060f1SDimitry Andric                                   Intrinsic::aarch64_sve_convert_from_svbool))
1000fe6060f1SDimitry Andric       break;
1001fe6060f1SDimitry Andric 
1002fe6060f1SDimitry Andric     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1003fe6060f1SDimitry Andric     Cursor = IntrinsicCursor->getOperand(0);
1004fe6060f1SDimitry Andric   }
1005fe6060f1SDimitry Andric 
1006fe6060f1SDimitry Andric   // If no viable replacement in the conversion chain was found, there is
1007fe6060f1SDimitry Andric   // nothing to do.
1008fe6060f1SDimitry Andric   if (!EarliestReplacement)
1009bdd1243dSDimitry Andric     return std::nullopt;
1010fe6060f1SDimitry Andric 
1011fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, EarliestReplacement);
1012fe6060f1SDimitry Andric }
1013fe6060f1SDimitry Andric 
10145f757f3fSDimitry Andric static bool isAllActivePredicate(Value *Pred) {
10155f757f3fSDimitry Andric   // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
10165f757f3fSDimitry Andric   Value *UncastedPred;
10175f757f3fSDimitry Andric   if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
10185f757f3fSDimitry Andric                       m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
10195f757f3fSDimitry Andric                           m_Value(UncastedPred)))))
10205f757f3fSDimitry Andric     // If the predicate has the same number of lanes or fewer than the
10215f757f3fSDimitry Andric     // uncasted predicate then we know the casting has no effect.
10225f757f3fSDimitry Andric     if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
10235f757f3fSDimitry Andric         cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
10245f757f3fSDimitry Andric       Pred = UncastedPred;
10255f757f3fSDimitry Andric 
10265f757f3fSDimitry Andric   return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
10275f757f3fSDimitry Andric                          m_ConstantInt<AArch64SVEPredPattern::all>()));
10285f757f3fSDimitry Andric }
10295f757f3fSDimitry Andric 
10300fca6ea1SDimitry Andric // Erase a unary operation whose predicate has all lanes inactive.
10310fca6ea1SDimitry Andric static std::optional<Instruction *>
10320fca6ea1SDimitry Andric instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
10330fca6ea1SDimitry Andric                                  int PredPos) {
10340fca6ea1SDimitry Andric   if (match(II.getOperand(PredPos), m_ZeroInt())) {
10350fca6ea1SDimitry Andric     return IC.eraseInstFromFunction(II);
10360fca6ea1SDimitry Andric   }
10370fca6ea1SDimitry Andric   return std::nullopt;
10380fca6ea1SDimitry Andric }
10390fca6ea1SDimitry Andric 
10400fca6ea1SDimitry Andric // Simplify a unary operation whose predicate has all lanes inactive by
10410fca6ea1SDimitry Andric // replacing the instruction with a zeroed object.
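// For example (illustrative): a load whose governing predicate is all-false
// folds to zeroinitializer, and a struct-returning variant folds to a struct
// of zeroed vectors.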
10420fca6ea1SDimitry Andric static std::optional<Instruction *>
10430fca6ea1SDimitry Andric instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
10440fca6ea1SDimitry Andric   if (match(II.getOperand(0), m_ZeroInt())) {
10450fca6ea1SDimitry Andric     Constant *Node;
10460fca6ea1SDimitry Andric     Type *RetTy = II.getType();
10470fca6ea1SDimitry Andric     if (RetTy->isStructTy()) {
10480fca6ea1SDimitry Andric       auto StructT = cast<StructType>(RetTy);
10490fca6ea1SDimitry Andric       auto VecT = StructT->getElementType(0);
10500fca6ea1SDimitry Andric       SmallVector<llvm::Constant *, 4> ZerVec;
10510fca6ea1SDimitry Andric       for (unsigned i = 0; i < StructT->getNumElements(); i++) {
10520fca6ea1SDimitry Andric         ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
10530fca6ea1SDimitry Andric                                                   : ConstantInt::get(VecT, 0));
10540fca6ea1SDimitry Andric       }
10550fca6ea1SDimitry Andric       Node = ConstantStruct::get(StructT, ZerVec);
10560fca6ea1SDimitry Andric     } else if (RetTy->isFPOrFPVectorTy())
10570fca6ea1SDimitry Andric       Node = ConstantFP::get(RetTy, 0.0);
10580fca6ea1SDimitry Andric     else
10590fca6ea1SDimitry Andric       Node = ConstantInt::get(II.getType(), 0);
10600fca6ea1SDimitry Andric 
10610fca6ea1SDimitry Andric     IC.replaceInstUsesWith(II, Node);
10620fca6ea1SDimitry Andric     return IC.eraseInstFromFunction(II);
10630fca6ea1SDimitry Andric   }
10640fca6ea1SDimitry Andric   return std::nullopt;
10650fca6ea1SDimitry Andric }
10660fca6ea1SDimitry Andric 
1067bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
106881ad6265SDimitry Andric                                                       IntrinsicInst &II) {
10695f757f3fSDimitry Andric   // svsel(ptrue, x, y) => x
10705f757f3fSDimitry Andric   auto *OpPredicate = II.getOperand(0);
10715f757f3fSDimitry Andric   if (isAllActivePredicate(OpPredicate))
10725f757f3fSDimitry Andric     return IC.replaceInstUsesWith(II, II.getOperand(1));
10735f757f3fSDimitry Andric 
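  // Otherwise lower to a plain IR select: svsel(p, x, y) => select p, x, y.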
10745f757f3fSDimitry Andric   auto Select =
10755f757f3fSDimitry Andric       IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
107681ad6265SDimitry Andric   return IC.replaceInstUsesWith(II, Select);
107781ad6265SDimitry Andric }
107881ad6265SDimitry Andric 
1079bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1080fe6060f1SDimitry Andric                                                       IntrinsicInst &II) {
1081fe6060f1SDimitry Andric   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1082fe6060f1SDimitry Andric   if (!Pg)
1083bdd1243dSDimitry Andric     return std::nullopt;
1084fe6060f1SDimitry Andric 
1085fe6060f1SDimitry Andric   if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1086bdd1243dSDimitry Andric     return std::nullopt;
1087fe6060f1SDimitry Andric 
1088fe6060f1SDimitry Andric   const auto PTruePattern =
1089fe6060f1SDimitry Andric       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1090fe6060f1SDimitry Andric   if (PTruePattern != AArch64SVEPredPattern::vl1)
1091bdd1243dSDimitry Andric     return std::nullopt;
1092fe6060f1SDimitry Andric 
1093fe6060f1SDimitry Andric   // The intrinsic is inserting into lane zero so use an insert instead.
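  // For example (illustrative):
  //   sve.dup(%vec, ptrue(vl1), %scalar) --> insertelement %vec, %scalar, 0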
1094fe6060f1SDimitry Andric   auto *IdxTy = Type::getInt64Ty(II.getContext());
1095fe6060f1SDimitry Andric   auto *Insert = InsertElementInst::Create(
1096fe6060f1SDimitry Andric       II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1097fe6060f1SDimitry Andric   Insert->insertBefore(&II);
1098fe6060f1SDimitry Andric   Insert->takeName(&II);
1099fe6060f1SDimitry Andric 
1100fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, Insert);
1101fe6060f1SDimitry Andric }
1102fe6060f1SDimitry Andric 
1103bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1104349cc55cSDimitry Andric                                                        IntrinsicInst &II) {
1105349cc55cSDimitry Andric   // Replace DupX with a regular IR splat.
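  // e.g. (illustrative) sve.dup.x(i32 %x) --> an IR splat of %x across the
  // scalable result type.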
1106349cc55cSDimitry Andric   auto *RetTy = cast<ScalableVectorType>(II.getType());
110706c3fb27SDimitry Andric   Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
110806c3fb27SDimitry Andric                                               II.getArgOperand(0));
1109349cc55cSDimitry Andric   Splat->takeName(&II);
1110349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, Splat);
1111349cc55cSDimitry Andric }
1112349cc55cSDimitry Andric 
1113bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1114fe6060f1SDimitry Andric                                                         IntrinsicInst &II) {
1115fe6060f1SDimitry Andric   LLVMContext &Ctx = II.getContext();
1116fe6060f1SDimitry Andric 
1117fe6060f1SDimitry Andric   // Check that the predicate is all active
1118fe6060f1SDimitry Andric   auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1119fe6060f1SDimitry Andric   if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1120bdd1243dSDimitry Andric     return std::nullopt;
1121fe6060f1SDimitry Andric 
1122fe6060f1SDimitry Andric   const auto PTruePattern =
1123fe6060f1SDimitry Andric       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1124fe6060f1SDimitry Andric   if (PTruePattern != AArch64SVEPredPattern::all)
1125bdd1243dSDimitry Andric     return std::nullopt;
1126fe6060f1SDimitry Andric 
1127fe6060f1SDimitry Andric   // Check that we have a compare of zero..
1128349cc55cSDimitry Andric   auto *SplatValue =
1129349cc55cSDimitry Andric       dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1130349cc55cSDimitry Andric   if (!SplatValue || !SplatValue->isZero())
1131bdd1243dSDimitry Andric     return std::nullopt;
1132fe6060f1SDimitry Andric 
1133fe6060f1SDimitry Andric   // ..against a dupq
1134fe6060f1SDimitry Andric   auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1135fe6060f1SDimitry Andric   if (!DupQLane ||
1136fe6060f1SDimitry Andric       DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1137bdd1243dSDimitry Andric     return std::nullopt;
1138fe6060f1SDimitry Andric 
1139fe6060f1SDimitry Andric   // Where the dupq is a lane 0 replicate of a vector insert
1140fe6060f1SDimitry Andric   if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1141bdd1243dSDimitry Andric     return std::nullopt;
1142fe6060f1SDimitry Andric 
1143fe6060f1SDimitry Andric   auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
114481ad6265SDimitry Andric   if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1145bdd1243dSDimitry Andric     return std::nullopt;
1146fe6060f1SDimitry Andric 
1147fe6060f1SDimitry Andric   // Where the vector insert is a fixed constant vector insert into undef at
1148fe6060f1SDimitry Andric   // index zero
1149fe6060f1SDimitry Andric   if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1150bdd1243dSDimitry Andric     return std::nullopt;
1151fe6060f1SDimitry Andric 
1152fe6060f1SDimitry Andric   if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1153bdd1243dSDimitry Andric     return std::nullopt;
1154fe6060f1SDimitry Andric 
1155fe6060f1SDimitry Andric   auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1156fe6060f1SDimitry Andric   if (!ConstVec)
1157bdd1243dSDimitry Andric     return std::nullopt;
1158fe6060f1SDimitry Andric 
1159fe6060f1SDimitry Andric   auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1160fe6060f1SDimitry Andric   auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1161fe6060f1SDimitry Andric   if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1162bdd1243dSDimitry Andric     return std::nullopt;
1163fe6060f1SDimitry Andric 
1164fe6060f1SDimitry Andric   unsigned NumElts = VecTy->getNumElements();
1165fe6060f1SDimitry Andric   unsigned PredicateBits = 0;
1166fe6060f1SDimitry Andric 
1167fe6060f1SDimitry Andric   // Expand the intrinsic operands to a 16-bit byte-level predicate mask
1168fe6060f1SDimitry Andric   for (unsigned I = 0; I < NumElts; ++I) {
1169fe6060f1SDimitry Andric     auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1170fe6060f1SDimitry Andric     if (!Arg)
1171bdd1243dSDimitry Andric       return std::nullopt;
1172fe6060f1SDimitry Andric     if (!Arg->isZero())
1173fe6060f1SDimitry Andric       PredicateBits |= 1 << (I * (16 / NumElts));
1174fe6060f1SDimitry Andric   }
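  // For example (illustrative): for a <4 x i32> constant <1, 1, 1, 1> the loop
  // above sets bits 0, 4, 8 and 12; the mask calculation below then gives
  // PredSize == 4, i.e. a <vscale x 4 x i1> ptrue converted via svbool.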
1175fe6060f1SDimitry Andric 
1176fe6060f1SDimitry Andric   // If all bits are zero, bail early with an empty predicate
1177fe6060f1SDimitry Andric   if (PredicateBits == 0) {
1178fe6060f1SDimitry Andric     auto *PFalse = Constant::getNullValue(II.getType());
1179fe6060f1SDimitry Andric     PFalse->takeName(&II);
1180fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, PFalse);
1181fe6060f1SDimitry Andric   }
1182fe6060f1SDimitry Andric 
1183fe6060f1SDimitry Andric   // Calculate largest predicate type used (where byte predicate is largest)
1184fe6060f1SDimitry Andric   unsigned Mask = 8;
1185fe6060f1SDimitry Andric   for (unsigned I = 0; I < 16; ++I)
1186fe6060f1SDimitry Andric     if ((PredicateBits & (1 << I)) != 0)
1187fe6060f1SDimitry Andric       Mask |= (I % 8);
1188fe6060f1SDimitry Andric 
1189fe6060f1SDimitry Andric   unsigned PredSize = Mask & -Mask;
1190fe6060f1SDimitry Andric   auto *PredType = ScalableVectorType::get(
1191fe6060f1SDimitry Andric       Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1192fe6060f1SDimitry Andric 
1193fe6060f1SDimitry Andric   // Ensure all relevant bits are set
1194fe6060f1SDimitry Andric   for (unsigned I = 0; I < 16; I += PredSize)
1195fe6060f1SDimitry Andric     if ((PredicateBits & (1 << I)) == 0)
1196bdd1243dSDimitry Andric       return std::nullopt;
1197fe6060f1SDimitry Andric 
1198fe6060f1SDimitry Andric   auto *PTruePat =
1199fe6060f1SDimitry Andric       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
120006c3fb27SDimitry Andric   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1201fe6060f1SDimitry Andric                                            {PredType}, {PTruePat});
120206c3fb27SDimitry Andric   auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1203fe6060f1SDimitry Andric       Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1204fe6060f1SDimitry Andric   auto *ConvertFromSVBool =
120506c3fb27SDimitry Andric       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1206fe6060f1SDimitry Andric                                  {II.getType()}, {ConvertToSVBool});
1207fe6060f1SDimitry Andric 
1208fe6060f1SDimitry Andric   ConvertFromSVBool->takeName(&II);
1209fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1210fe6060f1SDimitry Andric }
1211fe6060f1SDimitry Andric 
1212bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1213fe6060f1SDimitry Andric                                                        IntrinsicInst &II) {
1214fe6060f1SDimitry Andric   Value *Pg = II.getArgOperand(0);
1215fe6060f1SDimitry Andric   Value *Vec = II.getArgOperand(1);
1216349cc55cSDimitry Andric   auto IntrinsicID = II.getIntrinsicID();
1217349cc55cSDimitry Andric   bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1218fe6060f1SDimitry Andric 
1219fe6060f1SDimitry Andric   // lastX(splat(X)) --> X
1220fe6060f1SDimitry Andric   if (auto *SplatVal = getSplatValue(Vec))
1221fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, SplatVal);
1222fe6060f1SDimitry Andric 
1223349cc55cSDimitry Andric   // If x and/or y is a splat value then:
1224349cc55cSDimitry Andric   // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1225349cc55cSDimitry Andric   Value *LHS, *RHS;
1226349cc55cSDimitry Andric   if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1227349cc55cSDimitry Andric     if (isSplatValue(LHS) || isSplatValue(RHS)) {
1228349cc55cSDimitry Andric       auto *OldBinOp = cast<BinaryOperator>(Vec);
1229349cc55cSDimitry Andric       auto OpC = OldBinOp->getOpcode();
1230349cc55cSDimitry Andric       auto *NewLHS =
123106c3fb27SDimitry Andric           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1232349cc55cSDimitry Andric       auto *NewRHS =
123306c3fb27SDimitry Andric           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1234349cc55cSDimitry Andric       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
12350fca6ea1SDimitry Andric           OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1236349cc55cSDimitry Andric       return IC.replaceInstUsesWith(II, NewBinOp);
1237349cc55cSDimitry Andric     }
1238349cc55cSDimitry Andric   }
1239349cc55cSDimitry Andric 
1240fe6060f1SDimitry Andric   auto *C = dyn_cast<Constant>(Pg);
1241fe6060f1SDimitry Andric   if (IsAfter && C && C->isNullValue()) {
1242fe6060f1SDimitry Andric     // The intrinsic is extracting lane 0 so use an extract instead.
1243fe6060f1SDimitry Andric     auto *IdxTy = Type::getInt64Ty(II.getContext());
1244fe6060f1SDimitry Andric     auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1245fe6060f1SDimitry Andric     Extract->insertBefore(&II);
1246fe6060f1SDimitry Andric     Extract->takeName(&II);
1247fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, Extract);
1248fe6060f1SDimitry Andric   }
1249fe6060f1SDimitry Andric 
1250fe6060f1SDimitry Andric   auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1251fe6060f1SDimitry Andric   if (!IntrPG)
1252bdd1243dSDimitry Andric     return std::nullopt;
1253fe6060f1SDimitry Andric 
1254fe6060f1SDimitry Andric   if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1255bdd1243dSDimitry Andric     return std::nullopt;
1256fe6060f1SDimitry Andric 
1257fe6060f1SDimitry Andric   const auto PTruePattern =
1258fe6060f1SDimitry Andric       cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1259fe6060f1SDimitry Andric 
1260fe6060f1SDimitry Andric   // Can the intrinsic's predicate be converted to a known constant index?
1261349cc55cSDimitry Andric   unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1262349cc55cSDimitry Andric   if (!MinNumElts)
1263bdd1243dSDimitry Andric     return std::nullopt;
1264fe6060f1SDimitry Andric 
1265349cc55cSDimitry Andric   unsigned Idx = MinNumElts - 1;
1266fe6060f1SDimitry Andric   // Increment the index if extracting the element after the last active
1267fe6060f1SDimitry Andric   // predicate element.
1268fe6060f1SDimitry Andric   if (IsAfter)
1269fe6060f1SDimitry Andric     ++Idx;
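  // For example (illustrative): lastb(ptrue(vl4), %v) extracts lane 3, while
  // lasta(ptrue(vl4), %v) extracts lane 4.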
1270fe6060f1SDimitry Andric 
1271fe6060f1SDimitry Andric   // Ignore extracts whose index is larger than the known minimum vector
1272fe6060f1SDimitry Andric   // length. NOTE: This is an artificial constraint where we prefer to
1273fe6060f1SDimitry Andric   // maintain what the user asked for until an alternative is proven faster.
1274fe6060f1SDimitry Andric   auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1275fe6060f1SDimitry Andric   if (Idx >= PgVTy->getMinNumElements())
1276bdd1243dSDimitry Andric     return std::nullopt;
1277fe6060f1SDimitry Andric 
1278fe6060f1SDimitry Andric   // The intrinsic is extracting a fixed lane so use an extract instead.
1279fe6060f1SDimitry Andric   auto *IdxTy = Type::getInt64Ty(II.getContext());
1280fe6060f1SDimitry Andric   auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1281fe6060f1SDimitry Andric   Extract->insertBefore(&II);
1282fe6060f1SDimitry Andric   Extract->takeName(&II);
1283fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, Extract);
1284fe6060f1SDimitry Andric }
1285fe6060f1SDimitry Andric 
1286bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1287753f127fSDimitry Andric                                                            IntrinsicInst &II) {
1288753f127fSDimitry Andric   // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1289753f127fSDimitry Andric   // integer variant across a variety of micro-architectures. Replace scalar
1290753f127fSDimitry Andric   // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1291753f127fSDimitry Andric   // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1292753f127fSDimitry Andric   // depending on the micro-architecture, but has been observed as generally
1293753f127fSDimitry Andric   // being faster, particularly when the CLAST[AB] op is a loop-carried
1294753f127fSDimitry Andric   // dependency.
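  // A simplified sketch for the i64 case (illustrative):
  //   clastb(%pg, i64 %fb, <vscale x 2 x i64> %v)
  // becomes
  //   bitcast(clastb(%pg, bitcast(%fb to double),
  //                  bitcast(%v to <vscale x 2 x double>)) to i64)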
1295753f127fSDimitry Andric   Value *Pg = II.getArgOperand(0);
1296753f127fSDimitry Andric   Value *Fallback = II.getArgOperand(1);
1297753f127fSDimitry Andric   Value *Vec = II.getArgOperand(2);
1298753f127fSDimitry Andric   Type *Ty = II.getType();
1299753f127fSDimitry Andric 
1300753f127fSDimitry Andric   if (!Ty->isIntegerTy())
1301bdd1243dSDimitry Andric     return std::nullopt;
1302753f127fSDimitry Andric 
1303753f127fSDimitry Andric   Type *FPTy;
1304753f127fSDimitry Andric   switch (cast<IntegerType>(Ty)->getBitWidth()) {
1305753f127fSDimitry Andric   default:
1306bdd1243dSDimitry Andric     return std::nullopt;
1307753f127fSDimitry Andric   case 16:
130806c3fb27SDimitry Andric     FPTy = IC.Builder.getHalfTy();
1309753f127fSDimitry Andric     break;
1310753f127fSDimitry Andric   case 32:
131106c3fb27SDimitry Andric     FPTy = IC.Builder.getFloatTy();
1312753f127fSDimitry Andric     break;
1313753f127fSDimitry Andric   case 64:
131406c3fb27SDimitry Andric     FPTy = IC.Builder.getDoubleTy();
1315753f127fSDimitry Andric     break;
1316753f127fSDimitry Andric   }
1317753f127fSDimitry Andric 
131806c3fb27SDimitry Andric   Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1319753f127fSDimitry Andric   auto *FPVTy = VectorType::get(
1320753f127fSDimitry Andric       FPTy, cast<VectorType>(Vec->getType())->getElementCount());
132106c3fb27SDimitry Andric   Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
132206c3fb27SDimitry Andric   auto *FPII = IC.Builder.CreateIntrinsic(
132306c3fb27SDimitry Andric       II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
132406c3fb27SDimitry Andric   Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1325753f127fSDimitry Andric   return IC.replaceInstUsesWith(II, FPIItoInt);
1326753f127fSDimitry Andric }
1327753f127fSDimitry Andric 
1328bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1329fe6060f1SDimitry Andric                                                      IntrinsicInst &II) {
1330fe6060f1SDimitry Andric   LLVMContext &Ctx = II.getContext();
1331fe6060f1SDimitry Andric   // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1332fe6060f1SDimitry Andric   // can work with RDFFR_PP for ptest elimination.
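  // i.e. rdffr() --> rdffr.z(ptrue(all)).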
1333fe6060f1SDimitry Andric   auto *AllPat =
1334fe6060f1SDimitry Andric       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
133506c3fb27SDimitry Andric   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1336fe6060f1SDimitry Andric                                            {II.getType()}, {AllPat});
1337fe6060f1SDimitry Andric   auto *RDFFR =
133806c3fb27SDimitry Andric       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1339fe6060f1SDimitry Andric   RDFFR->takeName(&II);
1340fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, RDFFR);
1341fe6060f1SDimitry Andric }
1342fe6060f1SDimitry Andric 
1343bdd1243dSDimitry Andric static std::optional<Instruction *>
1344fe6060f1SDimitry Andric instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1345fe6060f1SDimitry Andric   const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1346fe6060f1SDimitry Andric 
1347fe6060f1SDimitry Andric   if (Pattern == AArch64SVEPredPattern::all) {
1348fe6060f1SDimitry Andric     Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
134906c3fb27SDimitry Andric     auto *VScale = IC.Builder.CreateVScale(StepVal);
1350fe6060f1SDimitry Andric     VScale->takeName(&II);
1351fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, VScale);
1352fe6060f1SDimitry Andric   }
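  // For example (illustrative, NumElts == 4 as for cntw):
  //   cntw(all) --> vscale * 4, and cntw(vl2) --> 2.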
1353fe6060f1SDimitry Andric 
1354349cc55cSDimitry Andric   unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1355fe6060f1SDimitry Andric 
1356349cc55cSDimitry Andric   return MinNumElts && NumElts >= MinNumElts
1357bdd1243dSDimitry Andric              ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1358fe6060f1SDimitry Andric                    II, ConstantInt::get(II.getType(), MinNumElts)))
1359bdd1243dSDimitry Andric              : std::nullopt;
1360fe6060f1SDimitry Andric }
1361fe6060f1SDimitry Andric 
1362bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1363fe6060f1SDimitry Andric                                                         IntrinsicInst &II) {
1364bdd1243dSDimitry Andric   Value *PgVal = II.getArgOperand(0);
1365bdd1243dSDimitry Andric   Value *OpVal = II.getArgOperand(1);
1366fe6060f1SDimitry Andric 
1367bdd1243dSDimitry Andric   // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1368bdd1243dSDimitry Andric   // Later optimizations prefer this form.
1369bdd1243dSDimitry Andric   if (PgVal == OpVal &&
1370bdd1243dSDimitry Andric       (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1371bdd1243dSDimitry Andric        II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1372bdd1243dSDimitry Andric     Value *Ops[] = {PgVal, OpVal};
1373bdd1243dSDimitry Andric     Type *Tys[] = {PgVal->getType()};
1374bdd1243dSDimitry Andric 
1375bdd1243dSDimitry Andric     auto *PTest =
137606c3fb27SDimitry Andric         IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1377bdd1243dSDimitry Andric     PTest->takeName(&II);
1378bdd1243dSDimitry Andric 
1379bdd1243dSDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1380bdd1243dSDimitry Andric   }
1381bdd1243dSDimitry Andric 
1382bdd1243dSDimitry Andric   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1383bdd1243dSDimitry Andric   IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1384bdd1243dSDimitry Andric 
1385bdd1243dSDimitry Andric   if (!Pg || !Op)
1386bdd1243dSDimitry Andric     return std::nullopt;
1387bdd1243dSDimitry Andric 
1388bdd1243dSDimitry Andric   Intrinsic::ID OpIID = Op->getIntrinsicID();
1389bdd1243dSDimitry Andric 
1390bdd1243dSDimitry Andric   if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1391bdd1243dSDimitry Andric       OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1392bdd1243dSDimitry Andric       Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1393bdd1243dSDimitry Andric     Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1394bdd1243dSDimitry Andric     Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1395fe6060f1SDimitry Andric 
139606c3fb27SDimitry Andric     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1397fe6060f1SDimitry Andric 
1398fe6060f1SDimitry Andric     PTest->takeName(&II);
1399fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1400fe6060f1SDimitry Andric   }
1401fe6060f1SDimitry Andric 
1402bdd1243dSDimitry Andric   // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
1403bdd1243dSDimitry Andric   // Later optimizations may rewrite the sequence to use the flag-setting
1404bdd1243dSDimitry Andric   // variant of instruction X to remove PTEST.
1405bdd1243dSDimitry Andric   if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1406bdd1243dSDimitry Andric       ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1407bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1408bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1409bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1410bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1411bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_and_z) ||
1412bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1413bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1414bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1415bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1416bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1417bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1418bdd1243dSDimitry Andric     Value *Ops[] = {Pg->getArgOperand(0), Pg};
1419bdd1243dSDimitry Andric     Type *Tys[] = {Pg->getType()};
1420bdd1243dSDimitry Andric 
142106c3fb27SDimitry Andric     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1422bdd1243dSDimitry Andric     PTest->takeName(&II);
1423bdd1243dSDimitry Andric 
1424bdd1243dSDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1425fe6060f1SDimitry Andric   }
1426fe6060f1SDimitry Andric 
1427bdd1243dSDimitry Andric   return std::nullopt;
1428bdd1243dSDimitry Andric }
1429bdd1243dSDimitry Andric 
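// Fuse a predicated multiply into the add/sub that consumes it, producing a
// single multiply-accumulate intrinsic, e.g. (illustrative)
//   fadd(pg, a, fmul(pg, b, c)) --> fmla(pg, a, b, c)
// provided the multiply has no other uses and, for FP types, the fast-math
// flags match and allow contraction.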
1430bdd1243dSDimitry Andric template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1431bdd1243dSDimitry Andric static std::optional<Instruction *>
1432bdd1243dSDimitry Andric instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1433bdd1243dSDimitry Andric                                   bool MergeIntoAddendOp) {
1434349cc55cSDimitry Andric   Value *P = II.getOperand(0);
1435bdd1243dSDimitry Andric   Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1436bdd1243dSDimitry Andric   if (MergeIntoAddendOp) {
1437bdd1243dSDimitry Andric     AddendOp = II.getOperand(1);
1438bdd1243dSDimitry Andric     Mul = II.getOperand(2);
1439bdd1243dSDimitry Andric   } else {
1440bdd1243dSDimitry Andric     AddendOp = II.getOperand(2);
1441bdd1243dSDimitry Andric     Mul = II.getOperand(1);
1442bdd1243dSDimitry Andric   }
1443349cc55cSDimitry Andric 
1444bdd1243dSDimitry Andric   if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1445bdd1243dSDimitry Andric                                       m_Value(MulOp1))))
1446bdd1243dSDimitry Andric     return std::nullopt;
1447349cc55cSDimitry Andric 
1448bdd1243dSDimitry Andric   if (!Mul->hasOneUse())
1449bdd1243dSDimitry Andric     return std::nullopt;
1450bdd1243dSDimitry Andric 
1451bdd1243dSDimitry Andric   Instruction *FMFSource = nullptr;
1452bdd1243dSDimitry Andric   if (II.getType()->isFPOrFPVectorTy()) {
1453349cc55cSDimitry Andric     llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1454bdd1243dSDimitry Andric     // Stop the combine when the flags on the inputs differ in case dropping
1455bdd1243dSDimitry Andric     // flags would lead to us missing out on more beneficial optimizations.
1456bdd1243dSDimitry Andric     if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1457bdd1243dSDimitry Andric       return std::nullopt;
1458349cc55cSDimitry Andric     if (!FAddFlags.allowContract())
1459bdd1243dSDimitry Andric       return std::nullopt;
1460bdd1243dSDimitry Andric     FMFSource = &II;
1461bdd1243dSDimitry Andric   }
1462349cc55cSDimitry Andric 
1463bdd1243dSDimitry Andric   CallInst *Res;
1464bdd1243dSDimitry Andric   if (MergeIntoAddendOp)
146506c3fb27SDimitry Andric     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1466bdd1243dSDimitry Andric                                      {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1467bdd1243dSDimitry Andric   else
146806c3fb27SDimitry Andric     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1469bdd1243dSDimitry Andric                                      {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1470bdd1243dSDimitry Andric 
1471bdd1243dSDimitry Andric   return IC.replaceInstUsesWith(II, Res);
1472349cc55cSDimitry Andric }
1473349cc55cSDimitry Andric 
1474bdd1243dSDimitry Andric static std::optional<Instruction *>
1475349cc55cSDimitry Andric instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1476349cc55cSDimitry Andric   Value *Pred = II.getOperand(0);
1477349cc55cSDimitry Andric   Value *PtrOp = II.getOperand(1);
1478349cc55cSDimitry Andric   Type *VecTy = II.getType();
1479349cc55cSDimitry Andric 
14800fca6ea1SDimitry Andric   // Replace with a zero constant when all lanes are inactive
14810fca6ea1SDimitry Andric   if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
14820fca6ea1SDimitry Andric     return II_NA;
14830fca6ea1SDimitry Andric 
14840eae32dcSDimitry Andric   if (isAllActivePredicate(Pred)) {
148506c3fb27SDimitry Andric     LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
148681ad6265SDimitry Andric     Load->copyMetadata(II);
1487349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, Load);
1488349cc55cSDimitry Andric   }
1489349cc55cSDimitry Andric 
1490349cc55cSDimitry Andric   CallInst *MaskedLoad =
149106c3fb27SDimitry Andric       IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1492349cc55cSDimitry Andric                                   Pred, ConstantAggregateZero::get(VecTy));
149381ad6265SDimitry Andric   MaskedLoad->copyMetadata(II);
1494349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, MaskedLoad);
1495349cc55cSDimitry Andric }
1496349cc55cSDimitry Andric 
1497bdd1243dSDimitry Andric static std::optional<Instruction *>
1498349cc55cSDimitry Andric instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1499349cc55cSDimitry Andric   Value *VecOp = II.getOperand(0);
1500349cc55cSDimitry Andric   Value *Pred = II.getOperand(1);
1501349cc55cSDimitry Andric   Value *PtrOp = II.getOperand(2);
1502349cc55cSDimitry Andric 
15030eae32dcSDimitry Andric   if (isAllActivePredicate(Pred)) {
150406c3fb27SDimitry Andric     StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
150581ad6265SDimitry Andric     Store->copyMetadata(II);
1506349cc55cSDimitry Andric     return IC.eraseInstFromFunction(II);
1507349cc55cSDimitry Andric   }
1508349cc55cSDimitry Andric 
150906c3fb27SDimitry Andric   CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
151006c3fb27SDimitry Andric       VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
151181ad6265SDimitry Andric   MaskedStore->copyMetadata(II);
1512349cc55cSDimitry Andric   return IC.eraseInstFromFunction(II);
1513349cc55cSDimitry Andric }
1514349cc55cSDimitry Andric 
1515349cc55cSDimitry Andric static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1516349cc55cSDimitry Andric   switch (Intrinsic) {
151706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul_u:
1518349cc55cSDimitry Andric     return Instruction::BinaryOps::FMul;
151906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fadd_u:
1520349cc55cSDimitry Andric     return Instruction::BinaryOps::FAdd;
152106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub_u:
1522349cc55cSDimitry Andric     return Instruction::BinaryOps::FSub;
1523349cc55cSDimitry Andric   default:
1524349cc55cSDimitry Andric     return Instruction::BinaryOpsEnd;
1525349cc55cSDimitry Andric   }
1526349cc55cSDimitry Andric }
1527349cc55cSDimitry Andric 
1528bdd1243dSDimitry Andric static std::optional<Instruction *>
1529bdd1243dSDimitry Andric instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
153006c3fb27SDimitry Andric   // Bail due to missing support for ISD::STRICT_ scalable vector operations.
153106c3fb27SDimitry Andric   if (II.isStrictFP())
153206c3fb27SDimitry Andric     return std::nullopt;
153306c3fb27SDimitry Andric 
1534349cc55cSDimitry Andric   auto *OpPredicate = II.getOperand(0);
1535349cc55cSDimitry Andric   auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1536349cc55cSDimitry Andric   if (BinOpCode == Instruction::BinaryOpsEnd ||
1537349cc55cSDimitry Andric       !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1538349cc55cSDimitry Andric                               m_ConstantInt<AArch64SVEPredPattern::all>())))
1539bdd1243dSDimitry Andric     return std::nullopt;
154006c3fb27SDimitry Andric   IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
154106c3fb27SDimitry Andric   IC.Builder.setFastMathFlags(II.getFastMathFlags());
1542349cc55cSDimitry Andric   auto BinOp =
154306c3fb27SDimitry Andric       IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1544349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, BinOp);
1545349cc55cSDimitry Andric }
1546349cc55cSDimitry Andric 
154706c3fb27SDimitry Andric // Canonicalise operations that take an all-active predicate (e.g. sve.add ->
154806c3fb27SDimitry Andric // sve.add_u).
154906c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
155006c3fb27SDimitry Andric                                                             Intrinsic::ID IID) {
155106c3fb27SDimitry Andric   auto *OpPredicate = II.getOperand(0);
155206c3fb27SDimitry Andric   if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
155306c3fb27SDimitry Andric                               m_ConstantInt<AArch64SVEPredPattern::all>())))
155406c3fb27SDimitry Andric     return std::nullopt;
155506c3fb27SDimitry Andric 
155606c3fb27SDimitry Andric   auto *Mod = II.getModule();
155706c3fb27SDimitry Andric   auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
155806c3fb27SDimitry Andric   II.setCalledFunction(NewDecl);
155906c3fb27SDimitry Andric 
156006c3fb27SDimitry Andric   return &II;
156106c3fb27SDimitry Andric }
156206c3fb27SDimitry Andric 
1563297eecfbSDimitry Andric // Simplify operations whose predicate has all lanes inactive, or try to
1564297eecfbSDimitry Andric // replace them with the _u form when all lanes are active.
1565297eecfbSDimitry Andric static std::optional<Instruction *>
1566297eecfbSDimitry Andric instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1567297eecfbSDimitry Andric                             Intrinsic::ID IID) {
1568297eecfbSDimitry Andric   if (match(II.getOperand(0), m_ZeroInt())) {
1569297eecfbSDimitry Andric     // llvm_ir, pred(0), op1, op2 - the spec says to return op1 when all lanes
1570297eecfbSDimitry Andric     // are inactive for sv[func]_m.
1571297eecfbSDimitry Andric     return IC.replaceInstUsesWith(II, II.getOperand(1));
1572297eecfbSDimitry Andric   }
1573297eecfbSDimitry Andric   return instCombineSVEAllActive(II, IID);
1574297eecfbSDimitry Andric }
1575297eecfbSDimitry Andric 
1576bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1577349cc55cSDimitry Andric                                                             IntrinsicInst &II) {
1578297eecfbSDimitry Andric   if (auto II_U =
1579297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
158006c3fb27SDimitry Andric     return II_U;
158106c3fb27SDimitry Andric   if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
158206c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mla>(
158306c3fb27SDimitry Andric           IC, II, true))
158406c3fb27SDimitry Andric     return MLA;
158506c3fb27SDimitry Andric   if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
158606c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mad>(
158706c3fb27SDimitry Andric           IC, II, false))
158806c3fb27SDimitry Andric     return MAD;
158906c3fb27SDimitry Andric   return std::nullopt;
159006c3fb27SDimitry Andric }
159106c3fb27SDimitry Andric 
159206c3fb27SDimitry Andric static std::optional<Instruction *>
159306c3fb27SDimitry Andric instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1594297eecfbSDimitry Andric   if (auto II_U =
1595297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
159606c3fb27SDimitry Andric     return II_U;
1597bdd1243dSDimitry Andric   if (auto FMLA =
1598bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1599bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1600bdd1243dSDimitry Andric                                                                          true))
1601349cc55cSDimitry Andric     return FMLA;
1602bdd1243dSDimitry Andric   if (auto FMAD =
1603bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1604bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmad>(IC, II,
1605bdd1243dSDimitry Andric                                                                          false))
1606bdd1243dSDimitry Andric     return FMAD;
160706c3fb27SDimitry Andric   if (auto FMLA =
160806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
160906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
161006c3fb27SDimitry Andric                                                                          true))
161106c3fb27SDimitry Andric     return FMLA;
161206c3fb27SDimitry Andric   return std::nullopt;
161306c3fb27SDimitry Andric }
161406c3fb27SDimitry Andric 
161506c3fb27SDimitry Andric static std::optional<Instruction *>
161606c3fb27SDimitry Andric instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
161706c3fb27SDimitry Andric   if (auto FMLA =
161806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
161906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
162006c3fb27SDimitry Andric                                                                          true))
162106c3fb27SDimitry Andric     return FMLA;
162206c3fb27SDimitry Andric   if (auto FMAD =
162306c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
162406c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmad>(IC, II,
162506c3fb27SDimitry Andric                                                                          false))
162606c3fb27SDimitry Andric     return FMAD;
162706c3fb27SDimitry Andric   if (auto FMLA_U =
162806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
162906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla_u>(
163006c3fb27SDimitry Andric               IC, II, true))
163106c3fb27SDimitry Andric     return FMLA_U;
1632349cc55cSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1633349cc55cSDimitry Andric }
1634349cc55cSDimitry Andric 
163506c3fb27SDimitry Andric static std::optional<Instruction *>
163606c3fb27SDimitry Andric instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1637297eecfbSDimitry Andric   if (auto II_U =
1638297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
163906c3fb27SDimitry Andric     return II_U;
1640bdd1243dSDimitry Andric   if (auto FMLS =
1641bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1642bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1643bdd1243dSDimitry Andric                                                                          true))
1644bdd1243dSDimitry Andric     return FMLS;
1645bdd1243dSDimitry Andric   if (auto FMSB =
1646bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1647bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fnmsb>(
1648bdd1243dSDimitry Andric               IC, II, false))
1649bdd1243dSDimitry Andric     return FMSB;
165006c3fb27SDimitry Andric   if (auto FMLS =
165106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
165206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
165306c3fb27SDimitry Andric                                                                          true))
165406c3fb27SDimitry Andric     return FMLS;
165506c3fb27SDimitry Andric   return std::nullopt;
165606c3fb27SDimitry Andric }
165706c3fb27SDimitry Andric 
165806c3fb27SDimitry Andric static std::optional<Instruction *>
165906c3fb27SDimitry Andric instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
166006c3fb27SDimitry Andric   if (auto FMLS =
166106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
166206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
166306c3fb27SDimitry Andric                                                                          true))
166406c3fb27SDimitry Andric     return FMLS;
166506c3fb27SDimitry Andric   if (auto FMSB =
166606c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
166706c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fnmsb>(
166806c3fb27SDimitry Andric               IC, II, false))
166906c3fb27SDimitry Andric     return FMSB;
167006c3fb27SDimitry Andric   if (auto FMLS_U =
167106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
167206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls_u>(
167306c3fb27SDimitry Andric               IC, II, true))
167406c3fb27SDimitry Andric     return FMLS_U;
1675bdd1243dSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1676bdd1243dSDimitry Andric }
1677bdd1243dSDimitry Andric 
167806c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1679fe6060f1SDimitry Andric                                                             IntrinsicInst &II) {
1680297eecfbSDimitry Andric   if (auto II_U =
1681297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
168206c3fb27SDimitry Andric     return II_U;
168306c3fb27SDimitry Andric   if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
168406c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mls>(
168506c3fb27SDimitry Andric           IC, II, true))
168606c3fb27SDimitry Andric     return MLS;
168706c3fb27SDimitry Andric   return std::nullopt;
168806c3fb27SDimitry Andric }
168906c3fb27SDimitry Andric 
169006c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
169106c3fb27SDimitry Andric                                                             IntrinsicInst &II,
169206c3fb27SDimitry Andric                                                             Intrinsic::ID IID) {
1693fe6060f1SDimitry Andric   auto *OpPredicate = II.getOperand(0);
1694fe6060f1SDimitry Andric   auto *OpMultiplicand = II.getOperand(1);
1695fe6060f1SDimitry Andric   auto *OpMultiplier = II.getOperand(2);
1696fe6060f1SDimitry Andric 
1697349cc55cSDimitry Andric   // Return true if a given instruction is a unit splat value, false otherwise.
1698349cc55cSDimitry Andric   auto IsUnitSplat = [](auto *I) {
1699349cc55cSDimitry Andric     auto *SplatValue = getSplatValue(I);
1700349cc55cSDimitry Andric     if (!SplatValue)
1701fe6060f1SDimitry Andric       return false;
1702fe6060f1SDimitry Andric     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1703fe6060f1SDimitry Andric   };
1704fe6060f1SDimitry Andric 
1705fe6060f1SDimitry Andric   // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1706fe6060f1SDimitry Andric   // with a unit splat value, false otherwise.
1707fe6060f1SDimitry Andric   auto IsUnitDup = [](auto *I) {
1708fe6060f1SDimitry Andric     auto *IntrI = dyn_cast<IntrinsicInst>(I);
1709fe6060f1SDimitry Andric     if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1710fe6060f1SDimitry Andric       return false;
1711fe6060f1SDimitry Andric 
1712fe6060f1SDimitry Andric     auto *SplatValue = IntrI->getOperand(2);
1713fe6060f1SDimitry Andric     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1714fe6060f1SDimitry Andric   };
1715fe6060f1SDimitry Andric 
1716349cc55cSDimitry Andric   if (IsUnitSplat(OpMultiplier)) {
17174824e7fdSDimitry Andric     // [f]mul pg %n, (dupx 1) => %n
1718fe6060f1SDimitry Andric     OpMultiplicand->takeName(&II);
1719fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, OpMultiplicand);
1720fe6060f1SDimitry Andric   } else if (IsUnitDup(OpMultiplier)) {
17214824e7fdSDimitry Andric     // [f]mul pg %n, (dup pg 1) => %n
1722fe6060f1SDimitry Andric     auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1723fe6060f1SDimitry Andric     auto *DupPg = DupInst->getOperand(1);
1724fe6060f1SDimitry Andric     // TODO: this is naive. The optimization is still valid if DupPg
1725fe6060f1SDimitry Andric     // 'encompasses' OpPredicate, not only if they're the same predicate.
1726fe6060f1SDimitry Andric     if (OpPredicate == DupPg) {
1727fe6060f1SDimitry Andric       OpMultiplicand->takeName(&II);
1728fe6060f1SDimitry Andric       return IC.replaceInstUsesWith(II, OpMultiplicand);
1729fe6060f1SDimitry Andric     }
1730fe6060f1SDimitry Andric   }
1731fe6060f1SDimitry Andric 
1732349cc55cSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1733fe6060f1SDimitry Andric }
1734fe6060f1SDimitry Andric 
1735bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1736349cc55cSDimitry Andric                                                          IntrinsicInst &II) {
1737349cc55cSDimitry Andric   Value *UnpackArg = II.getArgOperand(0);
1738349cc55cSDimitry Andric   auto *RetTy = cast<ScalableVectorType>(II.getType());
1739349cc55cSDimitry Andric   bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1740349cc55cSDimitry Andric                   II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1741349cc55cSDimitry Andric 
1742349cc55cSDimitry Andric   // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1743349cc55cSDimitry Andric   // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1744349cc55cSDimitry Andric   if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1745349cc55cSDimitry Andric     ScalarArg =
174606c3fb27SDimitry Andric         IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1747349cc55cSDimitry Andric     Value *NewVal =
174806c3fb27SDimitry Andric         IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1749349cc55cSDimitry Andric     NewVal->takeName(&II);
1750349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, NewVal);
1751349cc55cSDimitry Andric   }
1752349cc55cSDimitry Andric 
1753bdd1243dSDimitry Andric   return std::nullopt;
1754349cc55cSDimitry Andric }
1755bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1756fe6060f1SDimitry Andric                                                       IntrinsicInst &II) {
1757fe6060f1SDimitry Andric   auto *OpVal = II.getOperand(0);
1758fe6060f1SDimitry Andric   auto *OpIndices = II.getOperand(1);
1759fe6060f1SDimitry Andric   VectorType *VTy = cast<VectorType>(II.getType());
1760fe6060f1SDimitry Andric 
1761349cc55cSDimitry Andric   // Check whether OpIndices is a constant splat value < minimal element count
1762349cc55cSDimitry Andric   // of result.
1763349cc55cSDimitry Andric   auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1764fe6060f1SDimitry Andric   if (!SplatValue ||
1765fe6060f1SDimitry Andric       SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1766bdd1243dSDimitry Andric     return std::nullopt;
1767fe6060f1SDimitry Andric 
1768fe6060f1SDimitry Andric   // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1769fe6060f1SDimitry Andric   // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
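  // e.g. (illustrative, splat index 2):
  //   tbl(%v, dup_x(2))  ==>  splat_vector(extractelement(%v, 2))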
177006c3fb27SDimitry Andric   auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1771fe6060f1SDimitry Andric   auto *VectorSplat =
177206c3fb27SDimitry Andric       IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1773fe6060f1SDimitry Andric 
1774fe6060f1SDimitry Andric   VectorSplat->takeName(&II);
1775fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, VectorSplat);
1776fe6060f1SDimitry Andric }
1777fe6060f1SDimitry Andric 
17780fca6ea1SDimitry Andric static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
17790fca6ea1SDimitry Andric                                                        IntrinsicInst &II) {
17800fca6ea1SDimitry Andric   Value *A, *B;
17810fca6ea1SDimitry Andric   Type *RetTy = II.getType();
17820fca6ea1SDimitry Andric   constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
17830fca6ea1SDimitry Andric   constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
17840fca6ea1SDimitry Andric 
17850fca6ea1SDimitry Andric   // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
17860fca6ea1SDimitry Andric   // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
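  // e.g. (illustrative) for A, B : <vscale x 4 x i1> and RetTy
  // <vscale x 8 x i1>, the result is insert(insert(poison, A, 0), B, 4).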
17870fca6ea1SDimitry Andric   if ((match(II.getArgOperand(0),
17880fca6ea1SDimitry Andric              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
17890fca6ea1SDimitry Andric        match(II.getArgOperand(1),
17900fca6ea1SDimitry Andric              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
17910fca6ea1SDimitry Andric       (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
17920fca6ea1SDimitry Andric        match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
17930fca6ea1SDimitry Andric     auto *TyA = cast<ScalableVectorType>(A->getType());
17940fca6ea1SDimitry Andric     if (TyA == B->getType() &&
17950fca6ea1SDimitry Andric         RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
17960fca6ea1SDimitry Andric       auto *SubVec = IC.Builder.CreateInsertVector(
17970fca6ea1SDimitry Andric           RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
17980fca6ea1SDimitry Andric       auto *ConcatVec = IC.Builder.CreateInsertVector(
17990fca6ea1SDimitry Andric           RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
18000fca6ea1SDimitry Andric       ConcatVec->takeName(&II);
18010fca6ea1SDimitry Andric       return IC.replaceInstUsesWith(II, ConcatVec);
18020fca6ea1SDimitry Andric     }
18030fca6ea1SDimitry Andric   }
18040fca6ea1SDimitry Andric 
18050fca6ea1SDimitry Andric   return std::nullopt;
18060fca6ea1SDimitry Andric }
18070fca6ea1SDimitry Andric 
1808bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1809349cc55cSDimitry Andric                                                       IntrinsicInst &II) {
1810349cc55cSDimitry Andric   // zip1(uzp1(A, B), uzp2(A, B)) --> A
1811349cc55cSDimitry Andric   // zip2(uzp1(A, B), uzp2(A, B)) --> B
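  // uzp1/uzp2 de-interleave the even and odd elements of <A, B>; zip1/zip2
  // re-interleave their low and high halves, reconstructing A and B.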
1812349cc55cSDimitry Andric   Value *A, *B;
1813349cc55cSDimitry Andric   if (match(II.getArgOperand(0),
1814349cc55cSDimitry Andric             m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1815349cc55cSDimitry Andric       match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1816349cc55cSDimitry Andric                                      m_Specific(A), m_Specific(B))))
1817349cc55cSDimitry Andric     return IC.replaceInstUsesWith(
1818349cc55cSDimitry Andric         II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1819349cc55cSDimitry Andric 
1820bdd1243dSDimitry Andric   return std::nullopt;
1821349cc55cSDimitry Andric }
1822349cc55cSDimitry Andric 
1823bdd1243dSDimitry Andric static std::optional<Instruction *>
1824bdd1243dSDimitry Andric instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1825349cc55cSDimitry Andric   Value *Mask = II.getOperand(0);
1826349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(1);
1827349cc55cSDimitry Andric   Value *Index = II.getOperand(2);
1828349cc55cSDimitry Andric   Type *Ty = II.getType();
1829349cc55cSDimitry Andric   Value *PassThru = ConstantAggregateZero::get(Ty);
1830349cc55cSDimitry Andric 
18310fca6ea1SDimitry Andric   // Replace with a zero constant when all lanes are inactive.
18320fca6ea1SDimitry Andric   if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
18330fca6ea1SDimitry Andric     return II_NA;
18340fca6ea1SDimitry Andric 
1835349cc55cSDimitry Andric   // Contiguous gather => masked load.
1836349cc55cSDimitry Andric   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1837349cc55cSDimitry Andric   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
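  // With a step of one, lane i accesses BasePtr[IndexBase + i], i.e. a
  // contiguous block of memory starting at BasePtr + IndexBase.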
1838349cc55cSDimitry Andric   Value *IndexBase;
1839349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1840349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1841349cc55cSDimitry Andric     Align Alignment =
18420fca6ea1SDimitry Andric         BasePtr->getPointerAlignment(II.getDataLayout());
1843349cc55cSDimitry Andric 
1844349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
184506c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1846bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
184706c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1848349cc55cSDimitry Andric     CallInst *MaskedLoad =
184906c3fb27SDimitry Andric         IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1850349cc55cSDimitry Andric     MaskedLoad->takeName(&II);
1851349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, MaskedLoad);
1852349cc55cSDimitry Andric   }
1853349cc55cSDimitry Andric 
1854bdd1243dSDimitry Andric   return std::nullopt;
1855349cc55cSDimitry Andric }
1856349cc55cSDimitry Andric 
1857bdd1243dSDimitry Andric static std::optional<Instruction *>
1858bdd1243dSDimitry Andric instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1859349cc55cSDimitry Andric   Value *Val = II.getOperand(0);
1860349cc55cSDimitry Andric   Value *Mask = II.getOperand(1);
1861349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(2);
1862349cc55cSDimitry Andric   Value *Index = II.getOperand(3);
1863349cc55cSDimitry Andric   Type *Ty = Val->getType();
1864349cc55cSDimitry Andric 
1865349cc55cSDimitry Andric   // Contiguous scatter => masked store.
186681ad6265SDimitry Andric   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1867349cc55cSDimitry Andric   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1868349cc55cSDimitry Andric   Value *IndexBase;
1869349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1870349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1871349cc55cSDimitry Andric     Align Alignment =
18720fca6ea1SDimitry Andric         BasePtr->getPointerAlignment(II.getDataLayout());
1873349cc55cSDimitry Andric 
187406c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1875bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
1876349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
187706c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1878349cc55cSDimitry Andric 
187906c3fb27SDimitry Andric     (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1880349cc55cSDimitry Andric 
1881349cc55cSDimitry Andric     return IC.eraseInstFromFunction(II);
1882349cc55cSDimitry Andric   }
1883349cc55cSDimitry Andric 
1884bdd1243dSDimitry Andric   return std::nullopt;
1885349cc55cSDimitry Andric }
1886349cc55cSDimitry Andric 
1887bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
18880eae32dcSDimitry Andric                                                        IntrinsicInst &II) {
188906c3fb27SDimitry Andric   Type *Int32Ty = IC.Builder.getInt32Ty();
18900eae32dcSDimitry Andric   Value *Pred = II.getOperand(0);
18910eae32dcSDimitry Andric   Value *Vec = II.getOperand(1);
18920eae32dcSDimitry Andric   Value *DivVec = II.getOperand(2);
18930eae32dcSDimitry Andric 
18940eae32dcSDimitry Andric   Value *SplatValue = getSplatValue(DivVec);
18950eae32dcSDimitry Andric   ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
18960eae32dcSDimitry Andric   if (!SplatConstantInt)
1897bdd1243dSDimitry Andric     return std::nullopt;
18980eae32dcSDimitry Andric   APInt Divisor = SplatConstantInt->getValue();
18990eae32dcSDimitry Andric 
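  // Power-of-two divisors lower to the rounding shift ASRD, e.g.
  // (illustrative):
  //   sdiv(pg, x, splat(8))   ==>  asrd(pg, x, #3)
  //   sdiv(pg, x, splat(-8))  ==>  neg(pg, asrd(pg, x, #3))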
19000eae32dcSDimitry Andric   if (Divisor.isPowerOf2()) {
19010eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
190206c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
19030eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
19040eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, ASRD);
19050eae32dcSDimitry Andric   }
19060eae32dcSDimitry Andric   if (Divisor.isNegatedPowerOf2()) {
19070eae32dcSDimitry Andric     Divisor.negate();
19080eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
190906c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
19100eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
191106c3fb27SDimitry Andric     auto NEG = IC.Builder.CreateIntrinsic(
191206c3fb27SDimitry Andric         Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
19130eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, NEG);
19140eae32dcSDimitry Andric   }
19150eae32dcSDimitry Andric 
1916bdd1243dSDimitry Andric   return std::nullopt;
19170eae32dcSDimitry Andric }
19180eae32dcSDimitry Andric 
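// Simplify a vector of values down to the shortest repeating pattern of its
// elements, e.g. (a, b, a, b) to (a, b), shrinking Vec in place. A nullptr
// entry denotes a poison lane and, when AllowPoison is set, may be filled in
// from the lane it must mirror. Returns true if Vec holds a valid pattern.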
1919bdd1243dSDimitry Andric bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1920bdd1243dSDimitry Andric   size_t VecSize = Vec.size();
1921bdd1243dSDimitry Andric   if (VecSize == 1)
1922bdd1243dSDimitry Andric     return true;
1923bdd1243dSDimitry Andric   if (!isPowerOf2_64(VecSize))
1924bdd1243dSDimitry Andric     return false;
1925bdd1243dSDimitry Andric   size_t HalfVecSize = VecSize / 2;
1926bdd1243dSDimitry Andric 
1927bdd1243dSDimitry Andric   for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1928bdd1243dSDimitry Andric        RHS != Vec.end(); LHS++, RHS++) {
1929bdd1243dSDimitry Andric     if (*LHS != nullptr && *RHS != nullptr) {
1930bdd1243dSDimitry Andric       if (*LHS == *RHS)
1931bdd1243dSDimitry Andric         continue;
1932bdd1243dSDimitry Andric       else
1933bdd1243dSDimitry Andric         return false;
1934bdd1243dSDimitry Andric     }
1935bdd1243dSDimitry Andric     if (!AllowPoison)
1936bdd1243dSDimitry Andric       return false;
1937bdd1243dSDimitry Andric     if (*LHS == nullptr && *RHS != nullptr)
1938bdd1243dSDimitry Andric       *LHS = *RHS;
1939bdd1243dSDimitry Andric   }
1940bdd1243dSDimitry Andric 
1941bdd1243dSDimitry Andric   Vec.resize(HalfVecSize);
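  // The half-sized pattern is already valid at this point, so any further
  // recursive simplification is opportunistic and its result can be ignored.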
1942bdd1243dSDimitry Andric   SimplifyValuePattern(Vec, AllowPoison);
1943bdd1243dSDimitry Andric   return true;
1944bdd1243dSDimitry Andric }
1945bdd1243dSDimitry Andric 
1946bdd1243dSDimitry Andric // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1947bdd1243dSDimitry Andric // to dupqlane(f64(C)) where C is A concatenated with B
1948bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1949bdd1243dSDimitry Andric                                                            IntrinsicInst &II) {
1950bdd1243dSDimitry Andric   Value *CurrentInsertElt = nullptr, *Default = nullptr;
1951bdd1243dSDimitry Andric   if (!match(II.getOperand(0),
1952bdd1243dSDimitry Andric              m_Intrinsic<Intrinsic::vector_insert>(
1953bdd1243dSDimitry Andric                  m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1954bdd1243dSDimitry Andric       !isa<FixedVectorType>(CurrentInsertElt->getType()))
1955bdd1243dSDimitry Andric     return std::nullopt;
1956bdd1243dSDimitry Andric   auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1957bdd1243dSDimitry Andric 
1958bdd1243dSDimitry Andric   // Insert the scalars into a container ordered by InsertElement index
1959bdd1243dSDimitry Andric   SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1960bdd1243dSDimitry Andric   while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1961bdd1243dSDimitry Andric     auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1962bdd1243dSDimitry Andric     Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1963bdd1243dSDimitry Andric     CurrentInsertElt = InsertElt->getOperand(0);
1964bdd1243dSDimitry Andric   }
1965bdd1243dSDimitry Andric 
1966bdd1243dSDimitry Andric   bool AllowPoison =
1967bdd1243dSDimitry Andric       isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1968bdd1243dSDimitry Andric   if (!SimplifyValuePattern(Elts, AllowPoison))
1969bdd1243dSDimitry Andric     return std::nullopt;
1970bdd1243dSDimitry Andric 
1971bdd1243dSDimitry Andric   // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1972bdd1243dSDimitry Andric   Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1973bdd1243dSDimitry Andric   for (size_t I = 0; I < Elts.size(); I++) {
1974bdd1243dSDimitry Andric     if (Elts[I] == nullptr)
1975bdd1243dSDimitry Andric       continue;
197606c3fb27SDimitry Andric     InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
197706c3fb27SDimitry Andric                                                     IC.Builder.getInt64(I));
1978bdd1243dSDimitry Andric   }
1979bdd1243dSDimitry Andric   if (InsertEltChain == nullptr)
1980bdd1243dSDimitry Andric     return std::nullopt;
1981bdd1243dSDimitry Andric 
1982bdd1243dSDimitry Andric   // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1983bdd1243dSDimitry Andric   // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1984bdd1243dSDimitry Andric   // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1985bdd1243dSDimitry Andric   // be narrowed back to the original type.
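  // e.g. (illustrative) for <vscale x 8 x half> with pattern (a, b):
  // PatternWidth is 32, so (a, b) is viewed as a single i32, splatted as
  // <vscale x 4 x i32> via a zeroinitializer shuffle mask, and bitcast back
  // to <vscale x 8 x half>.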
1986bdd1243dSDimitry Andric   unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1987bdd1243dSDimitry Andric   unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1988bdd1243dSDimitry Andric                                  IIScalableTy->getMinNumElements() /
1989bdd1243dSDimitry Andric                                  PatternWidth;
1990bdd1243dSDimitry Andric 
199106c3fb27SDimitry Andric   IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1992bdd1243dSDimitry Andric   auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1993bdd1243dSDimitry Andric   auto *WideShuffleMaskTy =
199406c3fb27SDimitry Andric       ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1995bdd1243dSDimitry Andric 
199606c3fb27SDimitry Andric   auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
199706c3fb27SDimitry Andric   auto InsertSubvector = IC.Builder.CreateInsertVector(
1998bdd1243dSDimitry Andric       II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1999bdd1243dSDimitry Andric   auto WideBitcast =
200006c3fb27SDimitry Andric       IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2001bdd1243dSDimitry Andric   auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
200206c3fb27SDimitry Andric   auto WideShuffle = IC.Builder.CreateShuffleVector(
2003bdd1243dSDimitry Andric       WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2004bdd1243dSDimitry Andric   auto NarrowBitcast =
200506c3fb27SDimitry Andric       IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2006bdd1243dSDimitry Andric 
2007bdd1243dSDimitry Andric   return IC.replaceInstUsesWith(II, NarrowBitcast);
2008bdd1243dSDimitry Andric }
2009bdd1243dSDimitry Andric 
2010bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
201181ad6265SDimitry Andric                                                         IntrinsicInst &II) {
201281ad6265SDimitry Andric   Value *A = II.getArgOperand(0);
201381ad6265SDimitry Andric   Value *B = II.getArgOperand(1);
201481ad6265SDimitry Andric   if (A == B)
201581ad6265SDimitry Andric     return IC.replaceInstUsesWith(II, A);
201681ad6265SDimitry Andric 
2017bdd1243dSDimitry Andric   return std::nullopt;
201881ad6265SDimitry Andric }
201981ad6265SDimitry Andric 
2020bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
202181ad6265SDimitry Andric                                                         IntrinsicInst &II) {
202281ad6265SDimitry Andric   Value *Pred = II.getOperand(0);
202381ad6265SDimitry Andric   Value *Vec = II.getOperand(1);
202481ad6265SDimitry Andric   Value *Shift = II.getOperand(2);
202581ad6265SDimitry Andric 
202681ad6265SDimitry Andric   // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
202781ad6265SDimitry Andric   Value *AbsPred, *MergedValue;
202881ad6265SDimitry Andric   if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
202981ad6265SDimitry Andric                       m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
203081ad6265SDimitry Andric       !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
203181ad6265SDimitry Andric                       m_Value(MergedValue), m_Value(AbsPred), m_Value())))
203281ad6265SDimitry Andric 
2033bdd1243dSDimitry Andric     return std::nullopt;
203481ad6265SDimitry Andric 
203581ad6265SDimitry Andric   // Transform is valid if any of the following are true:
203681ad6265SDimitry Andric   // * The ABS merge value is an undef or non-negative
203781ad6265SDimitry Andric   // * The ABS predicate is all active
203881ad6265SDimitry Andric   // * The ABS predicate and the SRSHL predicates are the same
2039bdd1243dSDimitry Andric   if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
204081ad6265SDimitry Andric       AbsPred != Pred && !isAllActivePredicate(AbsPred))
2041bdd1243dSDimitry Andric     return std::nullopt;
204281ad6265SDimitry Andric 
204381ad6265SDimitry Andric   // Only valid when the shift amount is non-negative, otherwise the rounding
204481ad6265SDimitry Andric   // behaviour of SRSHL cannot be ignored.
204581ad6265SDimitry Andric   if (!match(Shift, m_NonNegative()))
2046bdd1243dSDimitry Andric     return std::nullopt;
204781ad6265SDimitry Andric 
204806c3fb27SDimitry Andric   auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
204906c3fb27SDimitry Andric                                         {II.getType()}, {Pred, Vec, Shift});
205081ad6265SDimitry Andric 
205181ad6265SDimitry Andric   return IC.replaceInstUsesWith(II, LSL);
205281ad6265SDimitry Andric }
205381ad6265SDimitry Andric 
2054bdd1243dSDimitry Andric std::optional<Instruction *>
2055fe6060f1SDimitry Andric AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2056fe6060f1SDimitry Andric                                      IntrinsicInst &II) const {
2057fe6060f1SDimitry Andric   Intrinsic::ID IID = II.getIntrinsicID();
2058fe6060f1SDimitry Andric   switch (IID) {
2059fe6060f1SDimitry Andric   default:
2060fe6060f1SDimitry Andric     break;
20610fca6ea1SDimitry Andric 
20620fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter:
20630fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
20640fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_sxtw:
20650fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
20660fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_uxtw:
20670fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
20680fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1dq:
20690fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1q_scatter_index:
20700fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
20710fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
20720fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st1wq:
20730fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_stnt1:
20740fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_stnt1_scatter:
20750fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_stnt1_scatter_index:
20760fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
20770fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
20780fca6ea1SDimitry Andric     return instCombineSVENoActiveUnaryErase(IC, II, 1);
20790fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st2:
20800fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st2q:
20810fca6ea1SDimitry Andric     return instCombineSVENoActiveUnaryErase(IC, II, 2);
20820fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st3:
20830fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st3q:
20840fca6ea1SDimitry Andric     return instCombineSVENoActiveUnaryErase(IC, II, 3);
20850fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st4:
20860fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_st4q:
20870fca6ea1SDimitry Andric     return instCombineSVENoActiveUnaryErase(IC, II, 4);
20880fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather:
20890fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
20900fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_sxtw:
20910fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
20920fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_uxtw:
20930fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
20940fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1q_gather_index:
20950fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
20960fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
20970fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1ro:
20980fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1rq:
20990fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1udq:
21000fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld1uwq:
21010fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld2_sret:
21020fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld2q_sret:
21030fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld3_sret:
21040fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld3q_sret:
21050fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld4_sret:
21060fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ld4q_sret:
21070fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1:
21080fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather:
21090fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_index:
21100fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
21110fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
21120fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
21130fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
21140fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
21150fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnf1:
21160fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnt1:
21170fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnt1_gather:
21180fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnt1_gather_index:
21190fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
21200fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
21210fca6ea1SDimitry Andric     return instCombineSVENoActiveUnaryZero(IC, II);
212281ad6265SDimitry Andric   case Intrinsic::aarch64_neon_fmaxnm:
212381ad6265SDimitry Andric   case Intrinsic::aarch64_neon_fminnm:
212481ad6265SDimitry Andric     return instCombineMaxMinNM(IC, II);
2125fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_convert_from_svbool:
2126fe6060f1SDimitry Andric     return instCombineConvertFromSVBool(IC, II);
2127fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_dup:
2128fe6060f1SDimitry Andric     return instCombineSVEDup(IC, II);
2129349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_dup_x:
2130349cc55cSDimitry Andric     return instCombineSVEDupX(IC, II);
2131fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cmpne:
2132fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cmpne_wide:
2133fe6060f1SDimitry Andric     return instCombineSVECmpNE(IC, II);
2134fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_rdffr:
2135fe6060f1SDimitry Andric     return instCombineRDFFR(IC, II);
2136fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_lasta:
2137fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_lastb:
2138fe6060f1SDimitry Andric     return instCombineSVELast(IC, II);
2139753f127fSDimitry Andric   case Intrinsic::aarch64_sve_clasta_n:
2140753f127fSDimitry Andric   case Intrinsic::aarch64_sve_clastb_n:
2141753f127fSDimitry Andric     return instCombineSVECondLast(IC, II);
2142fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntd:
2143fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 2);
2144fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntw:
2145fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 4);
2146fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cnth:
2147fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 8);
2148fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntb:
2149fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 16);
2150fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_any:
2151fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_first:
2152fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_last:
2153fe6060f1SDimitry Andric     return instCombineSVEPTest(IC, II);
215406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fabd:
2155297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2156349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_fadd:
215706c3fb27SDimitry Andric     return instCombineSVEVectorFAdd(IC, II);
215806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fadd_u:
215906c3fb27SDimitry Andric     return instCombineSVEVectorFAddU(IC, II);
216006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fdiv:
2161297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
216206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmax:
2163297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
216406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmaxnm:
2165297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
216606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmin:
2167297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
216806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fminnm:
2169297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
217006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmla:
2171297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
217206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmls:
2173297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
217406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul:
2175297eecfbSDimitry Andric     if (auto II_U =
2176297eecfbSDimitry Andric             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2177297eecfbSDimitry Andric       return II_U;
2178297eecfbSDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
217906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul_u:
218006c3fb27SDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
218106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmulx:
2182297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
218306c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fnmla:
2184297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
218506c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fnmls:
2186297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
218706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub:
218806c3fb27SDimitry Andric     return instCombineSVEVectorFSub(IC, II);
218906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub_u:
219006c3fb27SDimitry Andric     return instCombineSVEVectorFSubU(IC, II);
2191bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_add:
2192bdd1243dSDimitry Andric     return instCombineSVEVectorAdd(IC, II);
219306c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_add_u:
219406c3fb27SDimitry Andric     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
219506c3fb27SDimitry Andric                                              Intrinsic::aarch64_sve_mla_u>(
219606c3fb27SDimitry Andric         IC, II, true);
219706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mla:
2198297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
219906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mls:
2200297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
220106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mul:
2202297eecfbSDimitry Andric     if (auto II_U =
2203297eecfbSDimitry Andric             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2204297eecfbSDimitry Andric       return II_U;
2205297eecfbSDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
220606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mul_u:
220706c3fb27SDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
220806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sabd:
2209297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
221006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smax:
2211297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
221206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smin:
2213297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
221406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smulh:
2215297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2216bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_sub:
2217bdd1243dSDimitry Andric     return instCombineSVEVectorSub(IC, II);
221806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sub_u:
221906c3fb27SDimitry Andric     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
222006c3fb27SDimitry Andric                                              Intrinsic::aarch64_sve_mls_u>(
222106c3fb27SDimitry Andric         IC, II, true);
222206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_uabd:
2223297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
222406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umax:
2225297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
222606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umin:
2227297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
222806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umulh:
2229297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
223006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_asr:
2231297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
223206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_lsl:
2233297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
223406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_lsr:
2235297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
223606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_and:
2237297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
223806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_bic:
2239297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
224006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_eor:
2241297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
224206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_orr:
2243297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
224406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sqsub:
2245297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
224606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_uqsub:
2247297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2248fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_tbl:
2249fe6060f1SDimitry Andric     return instCombineSVETBL(IC, II);
2250349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_uunpkhi:
2251349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_uunpklo:
2252349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_sunpkhi:
2253349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_sunpklo:
2254349cc55cSDimitry Andric     return instCombineSVEUnpack(IC, II);
22550fca6ea1SDimitry Andric   case Intrinsic::aarch64_sve_uzp1:
22560fca6ea1SDimitry Andric     return instCombineSVEUzp1(IC, II);
2257349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_zip1:
2258349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_zip2:
2259349cc55cSDimitry Andric     return instCombineSVEZip(IC, II);
2260349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_index:
2261349cc55cSDimitry Andric     return instCombineLD1GatherIndex(IC, II);
2262349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_index:
2263349cc55cSDimitry Andric     return instCombineST1ScatterIndex(IC, II);
2264349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_ld1:
2265349cc55cSDimitry Andric     return instCombineSVELD1(IC, II, DL);
2266349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_st1:
2267349cc55cSDimitry Andric     return instCombineSVEST1(IC, II, DL);
22680eae32dcSDimitry Andric   case Intrinsic::aarch64_sve_sdiv:
22690eae32dcSDimitry Andric     return instCombineSVESDIV(IC, II);
227081ad6265SDimitry Andric   case Intrinsic::aarch64_sve_sel:
227181ad6265SDimitry Andric     return instCombineSVESel(IC, II);
227281ad6265SDimitry Andric   case Intrinsic::aarch64_sve_srshl:
227381ad6265SDimitry Andric     return instCombineSVESrshl(IC, II);
2274bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_dupq_lane:
2275bdd1243dSDimitry Andric     return instCombineSVEDupqLane(IC, II);
2276fe6060f1SDimitry Andric   }
2277fe6060f1SDimitry Andric 
2278bdd1243dSDimitry Andric   return std::nullopt;
2279fe6060f1SDimitry Andric }
2280fe6060f1SDimitry Andric 
2281bdd1243dSDimitry Andric std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
228204eeddc0SDimitry Andric     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
228304eeddc0SDimitry Andric     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
228404eeddc0SDimitry Andric     std::function<void(Instruction *, unsigned, APInt, APInt &)>
228504eeddc0SDimitry Andric         SimplifyAndSetOp) const {
228604eeddc0SDimitry Andric   switch (II.getIntrinsicID()) {
228704eeddc0SDimitry Andric   default:
228804eeddc0SDimitry Andric     break;
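  // Each result lane of these narrowing intrinsics depends only on the
  // corresponding lane of the first operand, so the demanded-element mask
  // can be forwarded to it.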
228904eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_fcvtxn:
229004eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_rshrn:
229104eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqrshrn:
229204eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqrshrun:
229304eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqshrn:
229404eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqshrun:
229504eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqxtn:
229604eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqxtun:
229704eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqrshrn:
229804eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqshrn:
229904eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqxtn:
230004eeddc0SDimitry Andric     SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
230104eeddc0SDimitry Andric     break;
230204eeddc0SDimitry Andric   }
230304eeddc0SDimitry Andric 
2304bdd1243dSDimitry Andric   return std::nullopt;
2305bdd1243dSDimitry Andric }
2306bdd1243dSDimitry Andric 
2307*62987288SDimitry Andric bool AArch64TTIImpl::enableScalableVectorization() const {
2308*62987288SDimitry Andric   return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2309*62987288SDimitry Andric                                   EnableScalableAutovecInStreamingMode);
2310*62987288SDimitry Andric }
2311*62987288SDimitry Andric 
2312bdd1243dSDimitry Andric TypeSize
2313bdd1243dSDimitry Andric AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2314bdd1243dSDimitry Andric   switch (K) {
2315bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_Scalar:
2316bdd1243dSDimitry Andric     return TypeSize::getFixed(64);
2317bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_FixedWidthVector:
23180fca6ea1SDimitry Andric     if (ST->useSVEForFixedLengthVectors() &&
23190fca6ea1SDimitry Andric         (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2320bdd1243dSDimitry Andric       return TypeSize::getFixed(
2321bdd1243dSDimitry Andric           std::max(ST->getMinSVEVectorSizeInBits(), 128u));
23220fca6ea1SDimitry Andric     else if (ST->isNeonAvailable())
23230fca6ea1SDimitry Andric       return TypeSize::getFixed(128);
23240fca6ea1SDimitry Andric     else
23250fca6ea1SDimitry Andric       return TypeSize::getFixed(0);
2326bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_ScalableVector:
23270fca6ea1SDimitry Andric     if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
23280fca6ea1SDimitry Andric                                  EnableScalableAutovecInStreamingMode))
23290fca6ea1SDimitry Andric       return TypeSize::getScalable(128);
23300fca6ea1SDimitry Andric     else
2331bdd1243dSDimitry Andric       return TypeSize::getScalable(0);
2332bdd1243dSDimitry Andric   }
2333bdd1243dSDimitry Andric   llvm_unreachable("Unsupported register kind");
233404eeddc0SDimitry Andric }
233504eeddc0SDimitry Andric 
23360b57cec5SDimitry Andric bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
233706c3fb27SDimitry Andric                                            ArrayRef<const Value *> Args,
233806c3fb27SDimitry Andric                                            Type *SrcOverrideTy) {
23390b57cec5SDimitry Andric   // A helper that returns a vector type from the given type: ArgTy supplies
234081ad6265SDimitry Andric   // the scalar element type and DstTy determines the element count.
23410b57cec5SDimitry Andric   auto toVectorTy = [&](Type *ArgTy) {
2342e8d8bef9SDimitry Andric     return VectorType::get(ArgTy->getScalarType(),
2343e8d8bef9SDimitry Andric                            cast<VectorType>(DstTy)->getElementCount());
23440b57cec5SDimitry Andric   };
23450b57cec5SDimitry Andric 
234606c3fb27SDimitry Andric   // Exit early if DstTy is not a vector type whose elements are one of [i16,
234706c3fb27SDimitry Andric   // i32, i64]. SVE doesn't generally have the same set of instructions to
2348bdd1243dSDimitry Andric   // perform an extend with the add/sub/mul. There are SMULLB style
2349bdd1243dSDimitry Andric   // instructions, but they operate on top/bottom, requiring some sort of lane
2350bdd1243dSDimitry Andric   // interleaving to be used with zext/sext.
235106c3fb27SDimitry Andric   unsigned DstEltSize = DstTy->getScalarSizeInBits();
235206c3fb27SDimitry Andric   if (!useNeonVector(DstTy) || Args.size() != 2 ||
235306c3fb27SDimitry Andric       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
23540b57cec5SDimitry Andric     return false;
23550b57cec5SDimitry Andric 
23560b57cec5SDimitry Andric   // Determine if the operation has a widening variant. We consider both the
23570b57cec5SDimitry Andric   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
23580b57cec5SDimitry Andric   // instructions.
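  // e.g. (illustrative, for DstTy = <8 x i16>):
  //   add(zext(<8 x i8> %a), zext(<8 x i8> %b))  --> uaddl
  //   add(<8 x i16> %a,      zext(<8 x i8> %b))  --> uaddw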
23590b57cec5SDimitry Andric   //
236081ad6265SDimitry Andric   // TODO: Add additional widening operations (e.g., shl, etc.) once we
23610b57cec5SDimitry Andric   //       verify that their extending operands are eliminated during code
23620b57cec5SDimitry Andric   //       generation.
236306c3fb27SDimitry Andric   Type *SrcTy = SrcOverrideTy;
23640b57cec5SDimitry Andric   switch (Opcode) {
23650b57cec5SDimitry Andric   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
23660b57cec5SDimitry Andric   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
236706c3fb27SDimitry Andric     // The second operand needs to be an extend
236806c3fb27SDimitry Andric     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
236906c3fb27SDimitry Andric       if (!SrcTy)
237006c3fb27SDimitry Andric         SrcTy =
237106c3fb27SDimitry Andric             toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
237206c3fb27SDimitry Andric     } else
237306c3fb27SDimitry Andric       return false;
23740b57cec5SDimitry Andric     break;
237506c3fb27SDimitry Andric   case Instruction::Mul: { // SMULL(2), UMULL(2)
237606c3fb27SDimitry Andric     // Both operands need to be extends of the same type.
237706c3fb27SDimitry Andric     if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
237806c3fb27SDimitry Andric         (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
237906c3fb27SDimitry Andric       if (!SrcTy)
238006c3fb27SDimitry Andric         SrcTy =
238106c3fb27SDimitry Andric             toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
238206c3fb27SDimitry Andric     } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
238306c3fb27SDimitry Andric       // If one of the operands is a Zext and the other has enough zero bits to
238406c3fb27SDimitry Andric       // be treated as unsigned, we can still generate a umull, meaning the zext
238506c3fb27SDimitry Andric       // is free.
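      // e.g. for a <8 x i16> multiply, the non-zext operand must have its
      // top 8 bits known zero for the pair to form a umull.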
238606c3fb27SDimitry Andric       KnownBits Known =
238706c3fb27SDimitry Andric           computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
238806c3fb27SDimitry Andric       if (Args[0]->getType()->getScalarSizeInBits() -
238906c3fb27SDimitry Andric               Known.Zero.countLeadingOnes() >
239006c3fb27SDimitry Andric           DstTy->getScalarSizeInBits() / 2)
239106c3fb27SDimitry Andric         return false;
239206c3fb27SDimitry Andric       if (!SrcTy)
239306c3fb27SDimitry Andric         SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
239406c3fb27SDimitry Andric                                            DstTy->getScalarSizeInBits() / 2));
239506c3fb27SDimitry Andric     } else
239606c3fb27SDimitry Andric       return false;
239706c3fb27SDimitry Andric     break;
239806c3fb27SDimitry Andric   }
23990b57cec5SDimitry Andric   default:
24000b57cec5SDimitry Andric     return false;
24010b57cec5SDimitry Andric   }
24020b57cec5SDimitry Andric 
24030b57cec5SDimitry Andric   // Legalize the destination type and ensure it can be used in a widening
24040b57cec5SDimitry Andric   // operation.
2405bdd1243dSDimitry Andric   auto DstTyL = getTypeLegalizationCost(DstTy);
240606c3fb27SDimitry Andric   if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
24070b57cec5SDimitry Andric     return false;
24080b57cec5SDimitry Andric 
24090b57cec5SDimitry Andric   // Legalize the source type and ensure it can be used in a widening
24100b57cec5SDimitry Andric   // operation.
241106c3fb27SDimitry Andric   assert(SrcTy && "Expected some SrcTy");
2412bdd1243dSDimitry Andric   auto SrcTyL = getTypeLegalizationCost(SrcTy);
24130b57cec5SDimitry Andric   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
24140b57cec5SDimitry Andric   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
24150b57cec5SDimitry Andric     return false;
24160b57cec5SDimitry Andric 
24170b57cec5SDimitry Andric   // Get the total number of vector elements in the legalized types.
2418fe6060f1SDimitry Andric   InstructionCost NumDstEls =
2419fe6060f1SDimitry Andric       DstTyL.first * DstTyL.second.getVectorMinNumElements();
2420fe6060f1SDimitry Andric   InstructionCost NumSrcEls =
2421fe6060f1SDimitry Andric       SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
24220b57cec5SDimitry Andric 
24230b57cec5SDimitry Andric   // Return true if the legalized types have the same number of vector elements
24240b57cec5SDimitry Andric   // and the destination element type size is twice that of the source type.
242506c3fb27SDimitry Andric   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
24260b57cec5SDimitry Andric }
24270b57cec5SDimitry Andric 
24285f757f3fSDimitry Andric // s/urhadd instructions implement the following pattern, making the
24295f757f3fSDimitry Andric // extends free:
24305f757f3fSDimitry Andric //   %x = add ((zext i8 -> i16), 1)
24315f757f3fSDimitry Andric //   %y = (zext i8 -> i16)
24325f757f3fSDimitry Andric //   trunc i16 (lshr (add %x, %y), 1) -> i8
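// i.e. the rounding-average idiom trunc((zext(a) + zext(b) + 1) >> 1), which
// lowers to s/urhadd without materializing the extends.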
24335f757f3fSDimitry Andric //
24345f757f3fSDimitry Andric bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
24355f757f3fSDimitry Andric                                         Type *Src) {
24365f757f3fSDimitry Andric   // The source should be a legal vector type.
24375f757f3fSDimitry Andric   if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
24385f757f3fSDimitry Andric       (Src->isScalableTy() && !ST->hasSVE2()))
24395f757f3fSDimitry Andric     return false;
24405f757f3fSDimitry Andric 
24415f757f3fSDimitry Andric   if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
24425f757f3fSDimitry Andric     return false;
24435f757f3fSDimitry Andric 
24445f757f3fSDimitry Andric   // Look for trunc/shl/add before trying to match the pattern.
24455f757f3fSDimitry Andric   const Instruction *Add = ExtUser;
24465f757f3fSDimitry Andric   auto *AddUser =
24475f757f3fSDimitry Andric       dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
24485f757f3fSDimitry Andric   if (AddUser && AddUser->getOpcode() == Instruction::Add)
24495f757f3fSDimitry Andric     Add = AddUser;
24505f757f3fSDimitry Andric 
24515f757f3fSDimitry Andric   auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
24525f757f3fSDimitry Andric   if (!Shr || Shr->getOpcode() != Instruction::LShr)
24535f757f3fSDimitry Andric     return false;
24545f757f3fSDimitry Andric 
24555f757f3fSDimitry Andric   auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
24565f757f3fSDimitry Andric   if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
24575f757f3fSDimitry Andric       Src->getScalarSizeInBits() !=
24585f757f3fSDimitry Andric           cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
24595f757f3fSDimitry Andric     return false;
24605f757f3fSDimitry Andric 
24615f757f3fSDimitry Andric   // Try to match the whole pattern. Ext could be either the first or second
24625f757f3fSDimitry Andric   // m_ZExtOrSExt matched.
24635f757f3fSDimitry Andric   Instruction *Ex1, *Ex2;
24645f757f3fSDimitry Andric   if (!(match(Add, m_c_Add(m_Instruction(Ex1),
24655f757f3fSDimitry Andric                            m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
24665f757f3fSDimitry Andric     return false;
24675f757f3fSDimitry Andric 
24685f757f3fSDimitry Andric   // Ensure both extends are of the same kind (both sext or both zext).
24695f757f3fSDimitry Andric   if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
24705f757f3fSDimitry Andric       Ex1->getOpcode() == Ex2->getOpcode())
24715f757f3fSDimitry Andric     return true;
24725f757f3fSDimitry Andric 
24735f757f3fSDimitry Andric   return false;
24745f757f3fSDimitry Andric }
24755f757f3fSDimitry Andric 
2476fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2477fe6060f1SDimitry Andric                                                  Type *Src,
2478e8d8bef9SDimitry Andric                                                  TTI::CastContextHint CCH,
24795ffd83dbSDimitry Andric                                                  TTI::TargetCostKind CostKind,
24800b57cec5SDimitry Andric                                                  const Instruction *I) {
24810b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
24820b57cec5SDimitry Andric   assert(ISD && "Invalid opcode");
24830b57cec5SDimitry Andric   // If the cast is observable, and it is used by a widening instruction (e.g.,
24840b57cec5SDimitry Andric   // uaddl, saddw, etc.), it may be free.
248581ad6265SDimitry Andric   if (I && I->hasOneUser()) {
24860b57cec5SDimitry Andric     auto *SingleUser = cast<Instruction>(*I->user_begin());
24870b57cec5SDimitry Andric     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
248806c3fb27SDimitry Andric     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
248906c3fb27SDimitry Andric       // For adds, the cast is free if it is the second operand, or if the
249006c3fb27SDimitry Andric       // second operand is an extend of the same kind (i.e. both operands are
249106c3fb27SDimitry Andric       // not free in add(sext, zext)).
249206c3fb27SDimitry Andric       if (SingleUser->getOpcode() == Instruction::Add) {
249306c3fb27SDimitry Andric         if (I == SingleUser->getOperand(1) ||
249406c3fb27SDimitry Andric             (isa<CastInst>(SingleUser->getOperand(1)) &&
249506c3fb27SDimitry Andric              cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
24960b57cec5SDimitry Andric           return 0;
249706c3fb27SDimitry Andric       } else // Others are free so long as isWideningInstruction returned true.
24980b57cec5SDimitry Andric         return 0;
24990b57cec5SDimitry Andric     }
25005f757f3fSDimitry Andric 
25015f757f3fSDimitry Andric     // The cast will be free for the s/urhadd instructions
25025f757f3fSDimitry Andric     if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
25035f757f3fSDimitry Andric         isExtPartOfAvgExpr(SingleUser, Dst, Src))
25045f757f3fSDimitry Andric       return 0;
25050b57cec5SDimitry Andric   }
25060b57cec5SDimitry Andric 
25075ffd83dbSDimitry Andric   // TODO: Allow non-throughput costs that aren't binary.
2508fe6060f1SDimitry Andric   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
25095ffd83dbSDimitry Andric     if (CostKind != TTI::TCK_RecipThroughput)
25105ffd83dbSDimitry Andric       return Cost == 0 ? 0 : 1;
25115ffd83dbSDimitry Andric     return Cost;
25125ffd83dbSDimitry Andric   };
25135ffd83dbSDimitry Andric 
25140b57cec5SDimitry Andric   EVT SrcTy = TLI->getValueType(DL, Src);
25150b57cec5SDimitry Andric   EVT DstTy = TLI->getValueType(DL, Dst);
25160b57cec5SDimitry Andric 
25170b57cec5SDimitry Andric   if (!SrcTy.isSimple() || !DstTy.isSimple())
2518e8d8bef9SDimitry Andric     return AdjustCost(
2519e8d8bef9SDimitry Andric         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
25200b57cec5SDimitry Andric 
25210b57cec5SDimitry Andric   static const TypeConversionCostTblEntry
25220b57cec5SDimitry Andric   ConversionTbl[] = {
2523bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i8,   MVT::v2i64,  1},  // xtn
2524bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i16,  MVT::v2i64,  1},  // xtn
2525bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i32,  MVT::v2i64,  1},  // xtn
2526bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i32,  1},  // xtn
2527bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i64,  3},  // 2 xtn + 1 uzp1
2528bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i32,  1},  // xtn
2529bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i64,  2},  // 1 uzp1 + 1 xtn
2530bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i32,  MVT::v4i64,  1},  // 1 uzp1
2531bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i16,  1},  // 1 xtn
2532bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i32,  2},  // 1 uzp1 + 1 xtn
2533bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i64,  4},  // 3 x uzp1 + xtn
2534bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i32,  1},  // 1 uzp1
2535bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i64,  3},  // 3 x uzp1
2536bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i32,  MVT::v8i64,  2},  // 2 x uzp1
2537bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i16, 1},  // uzp1
2538bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i32, 3},  // (2 + 1) x uzp1
2539bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
2540bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2},  // 2 x uzp1
2541bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6},  // (4 + 2) x uzp1
2542bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4},  // 4 x uzp1
25430b57cec5SDimitry Andric 
2544fe6060f1SDimitry Andric     // Truncations on nxvmiN
2545fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2546fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2547fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2548fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2549fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2550fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2551fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2552fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2553fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2554fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2555fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2556fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2557fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2558fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2559fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2560fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2561fe6060f1SDimitry Andric 
25620b57cec5SDimitry Andric     // The number of shll instructions for the extension.
25630b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
25640b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
25650b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
25660b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
25670b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
25680b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
25690b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
25700b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
25710b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
25720b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
25730b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
25740b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
25750b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
25760b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
25770b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
25780b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
25790b57cec5SDimitry Andric 
25800b57cec5SDimitry Andric     // LowerVectorINT_TO_FP:
25810b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
25820b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
25830b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
25840b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
25850b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
25860b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
25870b57cec5SDimitry Andric 
25880b57cec5SDimitry Andric     // Complex: to v2f32
25890b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
25900b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
25910b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
25920b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
25930b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
25940b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
25950b57cec5SDimitry Andric 
25960b57cec5SDimitry Andric     // Complex: to v4f32
25970b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
25980b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
25990b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
26000b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
26010b57cec5SDimitry Andric 
26020b57cec5SDimitry Andric     // Complex: to v8f32
26030b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
26040b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
26050b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
26060b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
26070b57cec5SDimitry Andric 
26080b57cec5SDimitry Andric     // Complex: to v16f32
26090b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
26100b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
26110b57cec5SDimitry Andric 
26120b57cec5SDimitry Andric     // Complex: to v2f64
26130b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
26140b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
26150b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
26160b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
26170b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
26180b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
26190b57cec5SDimitry Andric 
2620bdd1243dSDimitry Andric     // Complex: to v4f64
2621bdd1243dSDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
2622bdd1243dSDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
26230b57cec5SDimitry Andric 
26240b57cec5SDimitry Andric     // LowerVectorFP_TO_INT
26250b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
26260b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
26270b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
26280b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
26290b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
26300b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
26310b57cec5SDimitry Andric 
26320b57cec5SDimitry Andric     // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
26330b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
26340b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
26350b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
26360b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
26370b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
26380b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
26390b57cec5SDimitry Andric 
26400b57cec5SDimitry Andric     // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
26410b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
26420b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
26430b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
26440b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
26450b57cec5SDimitry Andric 
2646fe6060f1SDimitry Andric     // Complex, from nxv2f32.
2647fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2648fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2649fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2650fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
2651fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2652fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2653fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2654fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
2655fe6060f1SDimitry Andric 
26560b57cec5SDimitry Andric     // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
26570b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
26580b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
26590b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
26600b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
26610b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
26620b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
2663fe6060f1SDimitry Andric 
2664fe6060f1SDimitry Andric     // Complex, from nxv2f64.
2665fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2666fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2667fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2668fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
2669fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2670fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2671fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2672fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
2673fe6060f1SDimitry Andric 
2674fe6060f1SDimitry Andric     // Complex, from nxv4f32.
2675fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2676fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2677fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2678fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
2679fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2680fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2681fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2682fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
2683fe6060f1SDimitry Andric 
2684fe6060f1SDimitry Andric     // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2685fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2686fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
2687fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2688fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
2689fe6060f1SDimitry Andric 
2690fe6060f1SDimitry Andric     // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2691fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2692fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2693fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
2694fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2695fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2696fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
2697fe6060f1SDimitry Andric 
2698fe6060f1SDimitry Andric     // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2699fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2700fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2701fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2702fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2703fe6060f1SDimitry Andric 
2704fe6060f1SDimitry Andric     // Complex, from nxv8f16.
2705fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2706fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2707fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2708fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2709fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2710fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2711fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2712fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2713fe6060f1SDimitry Andric 
2714fe6060f1SDimitry Andric     // Complex, from nxv4f16.
2715fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2716fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2717fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2718fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2719fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2720fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2721fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2722fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2723fe6060f1SDimitry Andric 
2724fe6060f1SDimitry Andric     // Complex, from nxv2f16.
2725fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2726fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2727fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2728fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2729fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2730fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2731fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2732fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2733fe6060f1SDimitry Andric 
2734fe6060f1SDimitry Andric     // Truncate from nxvmf32 to nxvmf16.
2735fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2736fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2737fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2738fe6060f1SDimitry Andric 
2739fe6060f1SDimitry Andric     // Truncate from nxvmf64 to nxvmf16.
2740fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2741fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2742fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2743fe6060f1SDimitry Andric 
2744fe6060f1SDimitry Andric     // Truncate from nxvmf64 to nxvmf32.
2745fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2746fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2747fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2748fe6060f1SDimitry Andric 
2749fe6060f1SDimitry Andric     // Extend from nxvmf16 to nxvmf32.
2750fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2751fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2752fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2753fe6060f1SDimitry Andric 
2754fe6060f1SDimitry Andric     // Extend from nxvmf16 to nxvmf64.
2755fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2756fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2757fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2758fe6060f1SDimitry Andric 
2759fe6060f1SDimitry Andric     // Extend from nxvmf32 to nxvmf64.
2760fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2761fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2762fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2763fe6060f1SDimitry Andric 
276404eeddc0SDimitry Andric     // Bitcasts from float to integer
276504eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
276604eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
276704eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
276804eeddc0SDimitry Andric 
276904eeddc0SDimitry Andric     // Bitcasts from integer to float
277004eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
277104eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
277204eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
277306c3fb27SDimitry Andric 
277406c3fb27SDimitry Andric     // Add cost for extending to illegal (too wide) scalable vectors.
277506c3fb27SDimitry Andric     // Zero/sign extends are implemented by multiple unpack operations,
277606c3fb27SDimitry Andric     // where each operation has a cost of 1.
277706c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
277806c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
277906c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
278006c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
278106c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
278206c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
278306c3fb27SDimitry Andric 
278406c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
278506c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
278606c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
278706c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
278806c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
278906c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
27900b57cec5SDimitry Andric   };
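  // Illustrative use of the table above (hedged, not from the original
  // source): a "trunc <4 x i32> %v to <4 x i16>" matches the
  // {ISD::TRUNCATE, v4i16, v4i32, 1} entry, i.e. a single xtn.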
27910b57cec5SDimitry Andric 
279206c3fb27SDimitry Andric   // We estimate the cost of a fixed-length vector operation performed on SVE
279306c3fb27SDimitry Andric   // registers by scaling it with the number of SVE registers required to
279406c3fb27SDimitry Andric   // represent the fixed-length type.
279506c3fb27SDimitry Andric   EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
279606c3fb27SDimitry Andric   if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
279706c3fb27SDimitry Andric       SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
279806c3fb27SDimitry Andric       ST->useSVEForFixedLengthVectors(WiderTy)) {
279906c3fb27SDimitry Andric     std::pair<InstructionCost, MVT> LT =
280006c3fb27SDimitry Andric         getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
280106c3fb27SDimitry Andric     unsigned NumElements = AArch64::SVEBitsPerBlock /
28020fca6ea1SDimitry Andric                            LT.second.getScalarSizeInBits();
280306c3fb27SDimitry Andric     return AdjustCost(
280406c3fb27SDimitry Andric         LT.first *
280506c3fb27SDimitry Andric         getCastInstrCost(
280606c3fb27SDimitry Andric             Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
280706c3fb27SDimitry Andric             ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
280806c3fb27SDimitry Andric             CostKind, I));
280906c3fb27SDimitry Andric   }
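  // Hedged illustration of the block above: since AArch64::SVEBitsPerBlock is
  // 128, a fixed-length v8i32 -> v8i64 extend is costed as LT.first copies of
  // the equivalent nxv2i32 -> nxv2i64 extend (128 / 64 == 2 elements per
  // register).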
281006c3fb27SDimitry Andric 
28110b57cec5SDimitry Andric   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
28120b57cec5SDimitry Andric                                                  DstTy.getSimpleVT(),
28130b57cec5SDimitry Andric                                                  SrcTy.getSimpleVT()))
28145ffd83dbSDimitry Andric     return AdjustCost(Entry->Cost);
28150b57cec5SDimitry Andric 
281681ad6265SDimitry Andric   static const TypeConversionCostTblEntry FP16Tbl[] = {
281781ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
281881ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
281981ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
282081ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
282181ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
282281ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
282381ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
282481ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
282581ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
282681ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
282781ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
282881ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
282981ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
283081ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
283181ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
283281ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
283381ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
283481ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
283581ad6265SDimitry Andric       {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // ushll + ucvtf
283681ad6265SDimitry Andric       {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // sshll + scvtf
283781ad6265SDimitry Andric       {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
283881ad6265SDimitry Andric       {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
283981ad6265SDimitry Andric   };
284081ad6265SDimitry Andric 
284181ad6265SDimitry Andric   if (ST->hasFullFP16())
284281ad6265SDimitry Andric     if (const auto *Entry = ConvertCostTableLookup(
284381ad6265SDimitry Andric             FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
284481ad6265SDimitry Andric       return AdjustCost(Entry->Cost);
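  // For illustration (hedged): with +fullfp16, an fptosi from v8f16 to v8i32
  // matches the entry above with cost 4 (2*fcvtl + 2*fcvtzs); without
  // +fullfp16 the query falls through to the generic handling below.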
284581ad6265SDimitry Andric 
28465f757f3fSDimitry Andric   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
28470fca6ea1SDimitry Andric       CCH == TTI::CastContextHint::Masked &&
28480fca6ea1SDimitry Andric       ST->isSVEorStreamingSVEAvailable() &&
28495f757f3fSDimitry Andric       TLI->getTypeAction(Src->getContext(), SrcTy) ==
28505f757f3fSDimitry Andric           TargetLowering::TypePromoteInteger &&
28515f757f3fSDimitry Andric       TLI->getTypeAction(Dst->getContext(), DstTy) ==
28525f757f3fSDimitry Andric           TargetLowering::TypeSplitVector) {
28535f757f3fSDimitry Andric     // The standard behaviour in the backend for these cases is to split the
28545f757f3fSDimitry Andric     // extend up into two parts:
28555f757f3fSDimitry Andric     //  1. Perform an extending load or masked load up to the legal type.
28565f757f3fSDimitry Andric     //  2. Extend the loaded data to the final type.
28575f757f3fSDimitry Andric     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
28585f757f3fSDimitry Andric     Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
28595f757f3fSDimitry Andric     InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
28605f757f3fSDimitry Andric         Opcode, LegalTy, Src, CCH, CostKind, I);
28615f757f3fSDimitry Andric     InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
28625f757f3fSDimitry Andric         Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
28635f757f3fSDimitry Andric     return Part1 + Part2;
28645f757f3fSDimitry Andric   }
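  // Hedged example of the split above: a masked zero-extending load from
  // <vscale x 8 x i8> to <vscale x 8 x i64> is costed as an extending masked
  // load up to the promoted <vscale x 8 x i16> plus a further
  // <vscale x 8 x i16> -> <vscale x 8 x i64> extend.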
28655f757f3fSDimitry Andric 
286606c3fb27SDimitry Andric   // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
286706c3fb27SDimitry Andric   // but we also want to include the TTI::CastContextHint::Masked case.
286806c3fb27SDimitry Andric   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
28690fca6ea1SDimitry Andric       CCH == TTI::CastContextHint::Masked &&
28700fca6ea1SDimitry Andric       ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
287106c3fb27SDimitry Andric     CCH = TTI::CastContextHint::Normal;
287206c3fb27SDimitry Andric 
2873e8d8bef9SDimitry Andric   return AdjustCost(
2874e8d8bef9SDimitry Andric       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
28750b57cec5SDimitry Andric }
28760b57cec5SDimitry Andric 
2877fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2878fe6060f1SDimitry Andric                                                          Type *Dst,
28790b57cec5SDimitry Andric                                                          VectorType *VecTy,
28800b57cec5SDimitry Andric                                                          unsigned Index) {
28810b57cec5SDimitry Andric 
28820b57cec5SDimitry Andric   // Make sure we were given a valid extend opcode.
28830b57cec5SDimitry Andric   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
28840b57cec5SDimitry Andric          "Invalid opcode");
28850b57cec5SDimitry Andric 
28860b57cec5SDimitry Andric   // We are extending an element we extract from a vector, so the source type
28870b57cec5SDimitry Andric   // of the extend is the element type of the vector.
28880b57cec5SDimitry Andric   auto *Src = VecTy->getElementType();
28890b57cec5SDimitry Andric 
28900b57cec5SDimitry Andric   // Sign- and zero-extends are for integer types only.
28910b57cec5SDimitry Andric   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
28920b57cec5SDimitry Andric 
28930b57cec5SDimitry Andric   // Get the cost for the extract. We compute the cost (if any) for the extend
28940b57cec5SDimitry Andric   // below.
2895bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2896bdd1243dSDimitry Andric   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2897bdd1243dSDimitry Andric                                             CostKind, Index, nullptr, nullptr);
28980b57cec5SDimitry Andric 
28990b57cec5SDimitry Andric   // Legalize the types.
2900bdd1243dSDimitry Andric   auto VecLT = getTypeLegalizationCost(VecTy);
29010b57cec5SDimitry Andric   auto DstVT = TLI->getValueType(DL, Dst);
29020b57cec5SDimitry Andric   auto SrcVT = TLI->getValueType(DL, Src);
29030b57cec5SDimitry Andric 
29040b57cec5SDimitry Andric   // If the resulting type is still a vector and the destination type is legal,
29050b57cec5SDimitry Andric   // we may get the extension for free. If not, get the default cost for the
29060b57cec5SDimitry Andric   // extend.
29070b57cec5SDimitry Andric   if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2908e8d8bef9SDimitry Andric     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2909e8d8bef9SDimitry Andric                                    CostKind);
29100b57cec5SDimitry Andric 
29110b57cec5SDimitry Andric   // The destination type should be larger than the element type. If not, get
29120b57cec5SDimitry Andric   // the default cost for the extend.
2913e8d8bef9SDimitry Andric   if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2914e8d8bef9SDimitry Andric     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2915e8d8bef9SDimitry Andric                                    CostKind);
29160b57cec5SDimitry Andric 
29170b57cec5SDimitry Andric   switch (Opcode) {
29180b57cec5SDimitry Andric   default:
29190b57cec5SDimitry Andric     llvm_unreachable("Opcode should be either SExt or ZExt");
29200b57cec5SDimitry Andric 
29210b57cec5SDimitry Andric   // For sign-extends, we only need a smov, which performs the extension
29220b57cec5SDimitry Andric   // automatically.
29230b57cec5SDimitry Andric   case Instruction::SExt:
29240b57cec5SDimitry Andric     return Cost;
29250b57cec5SDimitry Andric 
29260b57cec5SDimitry Andric   // For zero-extends, the extend is performed automatically by a umov unless
29270b57cec5SDimitry Andric   // the destination type is i64 and the element type is i8 or i16.
29280b57cec5SDimitry Andric   case Instruction::ZExt:
29290b57cec5SDimitry Andric     if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
29300b57cec5SDimitry Andric       return Cost;
29310b57cec5SDimitry Andric   }
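  // Illustration (hedged): zero-extending an extracted i32 lane to i64 is
  // treated as free above, while zero-extending an extracted i8 or i16 lane
  // to i64 falls through and is charged the default cast cost.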
29320b57cec5SDimitry Andric 
29330b57cec5SDimitry Andric   // If we are unable to perform the extend for free, get the default cost.
2934e8d8bef9SDimitry Andric   return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2935e8d8bef9SDimitry Andric                                  CostKind);
29365ffd83dbSDimitry Andric }
29375ffd83dbSDimitry Andric 
2938fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2939fe6060f1SDimitry Andric                                                TTI::TargetCostKind CostKind,
2940fe6060f1SDimitry Andric                                                const Instruction *I) {
29415ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
29425ffd83dbSDimitry Andric     return Opcode == Instruction::PHI ? 0 : 1;
29435ffd83dbSDimitry Andric   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
29445ffd83dbSDimitry Andric   // Branches are assumed to be predicted.
29455ffd83dbSDimitry Andric   return 0;
29460b57cec5SDimitry Andric }
29470b57cec5SDimitry Andric 
294806c3fb27SDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
294906c3fb27SDimitry Andric                                                          Type *Val,
2950bdd1243dSDimitry Andric                                                          unsigned Index,
2951bdd1243dSDimitry Andric                                                          bool HasRealUse) {
29520b57cec5SDimitry Andric   assert(Val->isVectorTy() && "This must be a vector type");
29530b57cec5SDimitry Andric 
29540b57cec5SDimitry Andric   if (Index != -1U) {
29550b57cec5SDimitry Andric     // Legalize the type.
2956bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
29570b57cec5SDimitry Andric 
29580b57cec5SDimitry Andric     // This type is legalized to a scalar type.
29590b57cec5SDimitry Andric     if (!LT.second.isVector())
29600b57cec5SDimitry Andric       return 0;
29610b57cec5SDimitry Andric 
296204eeddc0SDimitry Andric     // The type may be split. For fixed-width vectors we can normalize the
296304eeddc0SDimitry Andric     // index to the new type.
296404eeddc0SDimitry Andric     if (LT.second.isFixedLengthVector()) {
29650b57cec5SDimitry Andric       unsigned Width = LT.second.getVectorNumElements();
29660b57cec5SDimitry Andric       Index = Index % Width;
296704eeddc0SDimitry Andric     }
29680b57cec5SDimitry Andric 
29690b57cec5SDimitry Andric     // The element at index zero is already inside the vector.
2970bdd1243dSDimitry Andric     // - For a physical (HasRealUse==true) insert-element or extract-element
2971bdd1243dSDimitry Andric     // instruction that extracts integers, an explicit FPR -> GPR move is
2972bdd1243dSDimitry Andric     // needed, so it has a non-zero cost.
2973bdd1243dSDimitry Andric     // - For all other cases (a virtual instruction, or a floating-point
2974bdd1243dSDimitry Andric     // element type), consider the instruction free.
297506c3fb27SDimitry Andric     if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
297606c3fb27SDimitry Andric       return 0;
297706c3fb27SDimitry Andric 
297806c3fb27SDimitry Andric     // This recognises an LD1 (load one single-element structure to one lane
297906c3fb27SDimitry Andric     // of one register) instruction: if this is an `insertelement` instruction
298006c3fb27SDimitry Andric     // and its second operand is a load, then we will generate an LD1, which
298106c3fb27SDimitry Andric     // is an expensive instruction.
298206c3fb27SDimitry Andric     if (I && isa<LoadInst>(I->getOperand(1)))
298306c3fb27SDimitry Andric       return ST->getVectorInsertExtractBaseCost() + 1;
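    // Hedged IR example:
    //   %l = load i32, ptr %p
    //   %v = insertelement <4 x i32> %q, i32 %l, i32 1
    // selects "ld1 { v0.s }[1], [x0]".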
298406c3fb27SDimitry Andric 
298506c3fb27SDimitry Andric     // i1 inserts and extracts will include an extra cset or cmp of the vector
298606c3fb27SDimitry Andric     // value. Increase the cost by 1 to account for this.
298706c3fb27SDimitry Andric     if (Val->getScalarSizeInBits() == 1)
298806c3fb27SDimitry Andric       return ST->getVectorInsertExtractBaseCost() + 1;
298906c3fb27SDimitry Andric 
2990bdd1243dSDimitry Andric     // FIXME:
2991bdd1243dSDimitry Andric     // If the extract-element and insert-element instructions could be
2992bdd1243dSDimitry Andric     // simplified away (e.g., could be combined into users by looking at use-def
2993bdd1243dSDimitry Andric     // context), they have no cost. This is not done here, for compile-time
2994bdd1243dSDimitry Andric     // considerations.
29950b57cec5SDimitry Andric   }
29960b57cec5SDimitry Andric 
29970b57cec5SDimitry Andric   // All other insert/extracts cost this much.
29980b57cec5SDimitry Andric   return ST->getVectorInsertExtractBaseCost();
29990b57cec5SDimitry Andric }
30000b57cec5SDimitry Andric 
3001bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3002bdd1243dSDimitry Andric                                                    TTI::TargetCostKind CostKind,
3003bdd1243dSDimitry Andric                                                    unsigned Index, Value *Op0,
3004bdd1243dSDimitry Andric                                                    Value *Op1) {
300506c3fb27SDimitry Andric   bool HasRealUse =
300606c3fb27SDimitry Andric       Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
300706c3fb27SDimitry Andric   return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
3008bdd1243dSDimitry Andric }
3009bdd1243dSDimitry Andric 
3010bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3011bdd1243dSDimitry Andric                                                    Type *Val,
3012bdd1243dSDimitry Andric                                                    TTI::TargetCostKind CostKind,
3013bdd1243dSDimitry Andric                                                    unsigned Index) {
301406c3fb27SDimitry Andric   return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3015bdd1243dSDimitry Andric }
3016bdd1243dSDimitry Andric 
30175f757f3fSDimitry Andric InstructionCost AArch64TTIImpl::getScalarizationOverhead(
30185f757f3fSDimitry Andric     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
30195f757f3fSDimitry Andric     TTI::TargetCostKind CostKind) {
30205f757f3fSDimitry Andric   if (isa<ScalableVectorType>(Ty))
30215f757f3fSDimitry Andric     return InstructionCost::getInvalid();
30225f757f3fSDimitry Andric   if (Ty->getElementType()->isFloatingPointTy())
30235f757f3fSDimitry Andric     return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
30245f757f3fSDimitry Andric                                            CostKind);
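  // Hedged worked example of the formula below: DemandedElts = 0b0110 with
  // Insert == true and Extract == false gives 2 * (1 + 0) * the base
  // insert/extract cost.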
30255f757f3fSDimitry Andric   return DemandedElts.popcount() * (Insert + Extract) *
30265f757f3fSDimitry Andric          ST->getVectorInsertExtractBaseCost();
30275f757f3fSDimitry Andric }
30285f757f3fSDimitry Andric 
3029fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
30305ffd83dbSDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3031bdd1243dSDimitry Andric     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3032bdd1243dSDimitry Andric     ArrayRef<const Value *> Args,
3033480093f4SDimitry Andric     const Instruction *CxtI) {
3034bdd1243dSDimitry Andric 
3035*62987288SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3036*62987288SDimitry Andric   // of <vscale x 1 x eltty>, so return an invalid cost to avoid selecting
3037*62987288SDimitry Andric   // it. This change will be removed when code-generation for these types is
3038*62987288SDimitry Andric   // sufficiently reliable.
3039*62987288SDimitry Andric   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3040*62987288SDimitry Andric     if (VTy->getElementCount() == ElementCount::getScalable(1))
3041*62987288SDimitry Andric       return InstructionCost::getInvalid();
3042*62987288SDimitry Andric 
30435ffd83dbSDimitry Andric   // TODO: Handle more cost kinds.
30445ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
3045bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3046bdd1243dSDimitry Andric                                          Op2Info, Args, CxtI);
30475ffd83dbSDimitry Andric 
30480b57cec5SDimitry Andric   // Legalize the type.
3049bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
30500b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
30510b57cec5SDimitry Andric 
30520b57cec5SDimitry Andric   switch (ISD) {
30530b57cec5SDimitry Andric   default:
3054bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3055bdd1243dSDimitry Andric                                          Op2Info);
30560b57cec5SDimitry Andric   case ISD::SDIV:
3057bdd1243dSDimitry Andric     if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
30580b57cec5SDimitry Andric       // On AArch64, scalar signed division by a power-of-two constant is
30590b57cec5SDimitry Andric       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
30600b57cec5SDimitry Andric       // The OperandValue properties may not be the same as those of the
30610b57cec5SDimitry Andric       // previous operation; conservatively assume OP_None.
306281ad6265SDimitry Andric       InstructionCost Cost = getArithmeticInstrCost(
3063bdd1243dSDimitry Andric           Instruction::Add, Ty, CostKind,
3064bdd1243dSDimitry Andric           Op1Info.getNoProps(), Op2Info.getNoProps());
3065bdd1243dSDimitry Andric       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3066bdd1243dSDimitry Andric                                      Op1Info.getNoProps(), Op2Info.getNoProps());
306781ad6265SDimitry Andric       Cost += getArithmeticInstrCost(
3068bdd1243dSDimitry Andric           Instruction::Select, Ty, CostKind,
3069bdd1243dSDimitry Andric           Op1Info.getNoProps(), Op2Info.getNoProps());
3070bdd1243dSDimitry Andric       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3071bdd1243dSDimitry Andric                                      Op1Info.getNoProps(), Op2Info.getNoProps());
30720b57cec5SDimitry Andric       return Cost;
30730b57cec5SDimitry Andric     }
3074bdd1243dSDimitry Andric     [[fallthrough]];
307581ad6265SDimitry Andric   case ISD::UDIV: {
3076bdd1243dSDimitry Andric     if (Op2Info.isConstant() && Op2Info.isUniform()) {
30770b57cec5SDimitry Andric       auto VT = TLI->getValueType(DL, Ty);
30780b57cec5SDimitry Andric       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
30790b57cec5SDimitry Andric         // Vector signed division by a constant is expanded to the
30800b57cec5SDimitry Andric         // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
30810b57cec5SDimitry Andric         // to MULHU + SUB + SRL + ADD + SRL.
3082fe6060f1SDimitry Andric         InstructionCost MulCost = getArithmeticInstrCost(
3083bdd1243dSDimitry Andric             Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3084fe6060f1SDimitry Andric         InstructionCost AddCost = getArithmeticInstrCost(
3085bdd1243dSDimitry Andric             Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3086fe6060f1SDimitry Andric         InstructionCost ShrCost = getArithmeticInstrCost(
3087bdd1243dSDimitry Andric             Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
30880b57cec5SDimitry Andric         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
30890b57cec5SDimitry Andric       }
30900b57cec5SDimitry Andric     }
30910b57cec5SDimitry Andric 
309281ad6265SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
3093bdd1243dSDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info);
30940b57cec5SDimitry Andric     if (Ty->isVectorTy()) {
3095bdd1243dSDimitry Andric       if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3096bdd1243dSDimitry Andric         // If SDIV/UDIV operations are lowered using SVE, they have a lower
3097bdd1243dSDimitry Andric         // cost.
3098bdd1243dSDimitry Andric         if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3099bdd1243dSDimitry Andric                                                 ->getPrimitiveSizeInBits()
3100bdd1243dSDimitry Andric                                                 .getFixedValue() < 128) {
3101bdd1243dSDimitry Andric           EVT VT = TLI->getValueType(DL, Ty);
3102bdd1243dSDimitry Andric           static const CostTblEntry DivTbl[]{
3103bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
3104bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
3105bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3106bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
3107bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
3108bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3109bdd1243dSDimitry Andric 
3110bdd1243dSDimitry Andric           const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3111bdd1243dSDimitry Andric           if (nullptr != Entry)
3112bdd1243dSDimitry Andric             return Entry->Cost;
3113bdd1243dSDimitry Andric         }
3114bdd1243dSDimitry Andric         // For 8/16-bit elements, the cost is higher because the type
3115bdd1243dSDimitry Andric         // requires promotion and possibly splitting:
3116bdd1243dSDimitry Andric         if (LT.second.getScalarType() == MVT::i8)
3117bdd1243dSDimitry Andric           Cost *= 8;
3118bdd1243dSDimitry Andric         else if (LT.second.getScalarType() == MVT::i16)
3119bdd1243dSDimitry Andric           Cost *= 4;
3120bdd1243dSDimitry Andric         return Cost;
3121bdd1243dSDimitry Andric       } else {
3122bdd1243dSDimitry Andric         // If one of the operands is a uniform constant then the cost for each
3123bdd1243dSDimitry Andric         // element is the cost of insertion, extraction and division.
3124bdd1243dSDimitry Andric         // Insertion cost = 2, extraction cost = 2, division = cost of the
3125bdd1243dSDimitry Andric         // operation with the scalar type.
3126bdd1243dSDimitry Andric         if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3127bdd1243dSDimitry Andric             (Op2Info.isConstant() && Op2Info.isUniform())) {
3128bdd1243dSDimitry Andric           if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3129bdd1243dSDimitry Andric             InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3130bdd1243dSDimitry Andric                 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3131bdd1243dSDimitry Andric             return (4 + DivCost) * VTy->getNumElements();
3132bdd1243dSDimitry Andric           }
3133bdd1243dSDimitry Andric         }
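        // Hedged example: for a v4i32 division that reaches this point with a
        // uniform-constant operand, the cost is (4 + scalar div cost) * 4:
        // per lane, insert (2) + extract (2) + one scalar division.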
3134bdd1243dSDimitry Andric         // On AArch64, without SVE, vector divisions are expanded
3135bdd1243dSDimitry Andric         // into scalar divisions of each pair of elements.
3136bdd1243dSDimitry Andric         Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3137bdd1243dSDimitry Andric                                        CostKind, Op1Info, Op2Info);
31385ffd83dbSDimitry Andric         Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3139bdd1243dSDimitry Andric                                        Op1Info, Op2Info);
3140bdd1243dSDimitry Andric       }
3141bdd1243dSDimitry Andric 
31420b57cec5SDimitry Andric       // TODO: if one of the arguments is scalar, then it's not necessary to
31430b57cec5SDimitry Andric       // double the cost of handling the vector elements.
31440b57cec5SDimitry Andric       Cost += Cost;
31450b57cec5SDimitry Andric     }
31460b57cec5SDimitry Andric     return Cost;
314781ad6265SDimitry Andric   }
31480b57cec5SDimitry Andric   case ISD::MUL:
3149bdd1243dSDimitry Andric     // When SVE is available, we can lower the v2i64 operation using the SVE
3150bdd1243dSDimitry Andric     // mul instruction, which has a lower cost.
3151bdd1243dSDimitry Andric     if (LT.second == MVT::v2i64 && ST->hasSVE())
3152bdd1243dSDimitry Andric       return LT.first;
3153bdd1243dSDimitry Andric 
3154bdd1243dSDimitry Andric     // When SVE is not available, there is no MUL.2d instruction,
3155bdd1243dSDimitry Andric     // which means mul <2 x i64> is expensive as elements are extracted
3156bdd1243dSDimitry Andric     // from the vectors and the muls scalarized.
3157bdd1243dSDimitry Andric     // As getScalarizationOverhead is a bit too pessimistic, we
3158bdd1243dSDimitry Andric     // estimate the cost for a i64 vector directly here, which is:
315981ad6265SDimitry Andric     // - four 2-cost i64 extracts,
316081ad6265SDimitry Andric     // - two 2-cost i64 inserts, and
316181ad6265SDimitry Andric     // - two 1-cost muls.
316281ad6265SDimitry Andric     // So, for a v2i64 with LT.first = 1 the cost is 14 (4*2 + 2*2 + 2*1); for
316381ad6265SDimitry Andric     // a v4i64 with LT.first = 2 it is 28. If both operands are extensions it
316481ad6265SDimitry Andric     // will not need to scalarize, so the cost can be cheaper (smull or umull).
316681ad6265SDimitry Andric     if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
316781ad6265SDimitry Andric       return LT.first;
316881ad6265SDimitry Andric     return LT.first * 14;
3169e8d8bef9SDimitry Andric   case ISD::ADD:
31700b57cec5SDimitry Andric   case ISD::XOR:
31710b57cec5SDimitry Andric   case ISD::OR:
31720b57cec5SDimitry Andric   case ISD::AND:
317381ad6265SDimitry Andric   case ISD::SRL:
317481ad6265SDimitry Andric   case ISD::SRA:
317581ad6265SDimitry Andric   case ISD::SHL:
31760b57cec5SDimitry Andric     // These nodes are marked as 'custom' for combining purposes only.
31770b57cec5SDimitry Andric     // We know that they are legal. See LowerAdd in ISelLowering.
317881ad6265SDimitry Andric     return LT.first;
31795ffd83dbSDimitry Andric 
318006c3fb27SDimitry Andric   case ISD::FNEG:
31815ffd83dbSDimitry Andric   case ISD::FADD:
3182349cc55cSDimitry Andric   case ISD::FSUB:
318306c3fb27SDimitry Andric     // Increase the cost for half and bfloat types if not architecturally
318406c3fb27SDimitry Andric     // supported.
318506c3fb27SDimitry Andric     if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
318606c3fb27SDimitry Andric         (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
318706c3fb27SDimitry Andric       return 2 * LT.first;
318806c3fb27SDimitry Andric     if (!Ty->getScalarType()->isFP128Ty())
318906c3fb27SDimitry Andric       return LT.first;
319006c3fb27SDimitry Andric     [[fallthrough]];
3191349cc55cSDimitry Andric   case ISD::FMUL:
3192349cc55cSDimitry Andric   case ISD::FDIV:
31935ffd83dbSDimitry Andric     // These nodes are marked as 'custom' just to lower them to SVE.
31945ffd83dbSDimitry Andric     // We know said lowering will incur no additional cost.
3195349cc55cSDimitry Andric     if (!Ty->getScalarType()->isFP128Ty())
319681ad6265SDimitry Andric       return 2 * LT.first;
31975ffd83dbSDimitry Andric 
3198bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3199bdd1243dSDimitry Andric                                          Op2Info);
32000fca6ea1SDimitry Andric   case ISD::FREM:
32010fca6ea1SDimitry Andric     // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
32020fca6ea1SDimitry Andric     // those functions are not declared in the module.
32030fca6ea1SDimitry Andric     if (!Ty->isVectorTy())
32040fca6ea1SDimitry Andric       return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
32050fca6ea1SDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
32060fca6ea1SDimitry Andric                                          Op2Info);
32070b57cec5SDimitry Andric   }
32080b57cec5SDimitry Andric }
32090b57cec5SDimitry Andric 
3210fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3211fe6060f1SDimitry Andric                                                           ScalarEvolution *SE,
32120b57cec5SDimitry Andric                                                           const SCEV *Ptr) {
32130b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
32140b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
32150b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
32160b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
321706c3fb27SDimitry Andric   unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
32180b57cec5SDimitry Andric   int MaxMergeDistance = 64;
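  // Hedged illustration: a strided vector access whose stride is not provably
  // a small constant fails the check below and is charged
  // NumVectorInstToHideOverhead rather than the default cost of 1.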
32190b57cec5SDimitry Andric 
32200b57cec5SDimitry Andric   if (Ty->isVectorTy() && SE &&
32210b57cec5SDimitry Andric       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
32220b57cec5SDimitry Andric     return NumVectorInstToHideOverhead;
32230b57cec5SDimitry Andric 
32240b57cec5SDimitry Andric   // In many cases the address computation is not merged into the instruction
32250b57cec5SDimitry Andric   // addressing mode.
32260b57cec5SDimitry Andric   return 1;
32270b57cec5SDimitry Andric }
32280b57cec5SDimitry Andric 
3229fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3230fe6060f1SDimitry Andric                                                    Type *CondTy,
3231fe6060f1SDimitry Andric                                                    CmpInst::Predicate VecPred,
32325ffd83dbSDimitry Andric                                                    TTI::TargetCostKind CostKind,
32335ffd83dbSDimitry Andric                                                    const Instruction *I) {
32345ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
32355ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
3236e8d8bef9SDimitry Andric     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3237e8d8bef9SDimitry Andric                                      I);
32380b57cec5SDimitry Andric 
32390b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
32400b57cec5SDimitry Andric   // We don't lower some vector selects well that are wider than the register
32410b57cec5SDimitry Andric   // width.
3242e8d8bef9SDimitry Andric   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
32430b57cec5SDimitry Andric     // We would need this many instructions to hide the scalarization happening.
32440b57cec5SDimitry Andric     const int AmortizationCost = 20;
3245e8d8bef9SDimitry Andric 
3246e8d8bef9SDimitry Andric     // If VecPred is not set, check if we can get a predicate from the context
3247e8d8bef9SDimitry Andric     // instruction, if its type matches the requested ValTy.
3248e8d8bef9SDimitry Andric     if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3249e8d8bef9SDimitry Andric       CmpInst::Predicate CurrentPred;
3250e8d8bef9SDimitry Andric       if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3251e8d8bef9SDimitry Andric                             m_Value())))
3252e8d8bef9SDimitry Andric         VecPred = CurrentPred;
3253e8d8bef9SDimitry Andric     }
32541fd87a68SDimitry Andric     // Check if we have a compare/select chain that can be lowered using
32551fd87a68SDimitry Andric     // a (F)CMxx & BFI pair.
32561fd87a68SDimitry Andric     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
32571fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
32581fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
32591fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_UNE) {
32601fd87a68SDimitry Andric       static const auto ValidMinMaxTys = {
32611fd87a68SDimitry Andric           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
32621fd87a68SDimitry Andric           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
32631fd87a68SDimitry Andric       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
32641fd87a68SDimitry Andric 
3265bdd1243dSDimitry Andric       auto LT = getTypeLegalizationCost(ValTy);
32661fd87a68SDimitry Andric       if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
32671fd87a68SDimitry Andric           (ST->hasFullFP16() &&
32681fd87a68SDimitry Andric            any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3269e8d8bef9SDimitry Andric         return LT.first;
3270e8d8bef9SDimitry Andric     }
3271e8d8bef9SDimitry Andric 
32720b57cec5SDimitry Andric     static const TypeConversionCostTblEntry
32730b57cec5SDimitry Andric     VectorSelectTbl[] = {
327406c3fb27SDimitry Andric       { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
327506c3fb27SDimitry Andric       { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
327606c3fb27SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
327706c3fb27SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
327806c3fb27SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
32790b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
32800b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
32810b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
32820b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
32830b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
32840b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
32850b57cec5SDimitry Andric     };
32860b57cec5SDimitry Andric 
32870b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
32880b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
32890b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
32900b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
32910b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
32920b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
32930b57cec5SDimitry Andric         return Entry->Cost;
32940b57cec5SDimitry Andric     }
32950b57cec5SDimitry Andric   }
329606c3fb27SDimitry Andric 
329706c3fb27SDimitry Andric   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
329806c3fb27SDimitry Andric     auto LT = getTypeLegalizationCost(ValTy);
329906c3fb27SDimitry Andric     // Cost a v4f16 FCmp without FP16 support by converting to v4f32 and back.
330006c3fb27SDimitry Andric     if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
330106c3fb27SDimitry Andric       return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
330206c3fb27SDimitry Andric   }
330306c3fb27SDimitry Andric 
330406c3fb27SDimitry Andric   // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
330506c3fb27SDimitry Andric   // FIXME: This can apply to more conditions and add/sub if it can be shown to
330606c3fb27SDimitry Andric   // be profitable.
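  // Illustrative IR for the free icmp modelled here (a sketch):
  //   %a = and i64 %x, %y
  //   %c = icmp eq i64 %a, 0
  // The and+icmp pair can lower to a single flag-setting ANDS (TST), making
  // the compare itself free.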
330706c3fb27SDimitry Andric   if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
330806c3fb27SDimitry Andric       ICmpInst::isEquality(VecPred) &&
330906c3fb27SDimitry Andric       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
331006c3fb27SDimitry Andric       match(I->getOperand(1), m_Zero()) &&
331106c3fb27SDimitry Andric       match(I->getOperand(0), m_And(m_Value(), m_Value())))
331206c3fb27SDimitry Andric     return 0;
331306c3fb27SDimitry Andric 
3314e8d8bef9SDimitry Andric   // The base case handles scalable vectors fine for now, since it treats the
3315e8d8bef9SDimitry Andric   // cost as 1 * legalization cost.
3316e8d8bef9SDimitry Andric   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
33170b57cec5SDimitry Andric }
33180b57cec5SDimitry Andric 
33190b57cec5SDimitry Andric AArch64TTIImpl::TTI::MemCmpExpansionOptions
33200b57cec5SDimitry Andric AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
33210b57cec5SDimitry Andric   TTI::MemCmpExpansionOptions Options;
33225ffd83dbSDimitry Andric   if (ST->requiresStrictAlign()) {
33235ffd83dbSDimitry Andric     // TODO: Add cost modeling for strict align. Misaligned loads expand to
33245ffd83dbSDimitry Andric     // a bunch of instructions when strict align is enabled.
33255ffd83dbSDimitry Andric     return Options;
33265ffd83dbSDimitry Andric   }
33275ffd83dbSDimitry Andric   Options.AllowOverlappingLoads = true;
33280b57cec5SDimitry Andric   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
33290b57cec5SDimitry Andric   Options.NumLoadsPerBlock = Options.MaxNumLoads;
33300b57cec5SDimitry Andric   // TODO: Though vector loads usually perform well on AArch64, on some targets
33310b57cec5SDimitry Andric   // they may wake up the FP unit, which raises the power consumption.  Perhaps
33320b57cec5SDimitry Andric   // they could be used with no holds barred (-O3).
33330b57cec5SDimitry Andric   Options.LoadSizes = {8, 4, 2, 1};
33345f757f3fSDimitry Andric   Options.AllowedTailExpansions = {3, 5, 6};
33350b57cec5SDimitry Andric   return Options;
33360b57cec5SDimitry Andric }
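// Illustrative note: with the options returned above, a 7-byte memcmp can be
// expanded with two overlapping 4-byte loads per operand (at offsets 0 and 3),
// and a 3-byte tail may be assembled from a 2-byte plus a 1-byte load. This is
// a sketch of what the generic MemCmpExpansion may emit, not a guaranteed
// lowering.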
33370b57cec5SDimitry Andric 
333881ad6265SDimitry Andric bool AArch64TTIImpl::prefersVectorizedAddressing() const {
333981ad6265SDimitry Andric   return ST->hasSVE();
334081ad6265SDimitry Andric }
334181ad6265SDimitry Andric 
3342fe6060f1SDimitry Andric InstructionCost
3343fe6060f1SDimitry Andric AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3344fe6060f1SDimitry Andric                                       Align Alignment, unsigned AddressSpace,
3345fe6060f1SDimitry Andric                                       TTI::TargetCostKind CostKind) {
33460eae32dcSDimitry Andric   if (useNeonVector(Src))
3347fe6060f1SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3348fe6060f1SDimitry Andric                                         CostKind);
3349bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(Src);
3350fe6060f1SDimitry Andric   if (!LT.first.isValid())
3351fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3352fe6060f1SDimitry Andric 
33530fca6ea1SDimitry Andric   // Return an invalid cost for element types that we are unable to lower.
33540fca6ea1SDimitry Andric   auto *VT = cast<VectorType>(Src);
33550fca6ea1SDimitry Andric   if (VT->getElementType()->isIntegerTy(1))
33560fca6ea1SDimitry Andric     return InstructionCost::getInvalid();
33570fca6ea1SDimitry Andric 
3358fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3359fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3360fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3361fe6060f1SDimitry Andric   // sufficiently reliable.
33620fca6ea1SDimitry Andric   if (VT->getElementCount() == ElementCount::getScalable(1))
3363fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3364fe6060f1SDimitry Andric 
3365bdd1243dSDimitry Andric   return LT.first;
3366fe6060f1SDimitry Andric }
3367fe6060f1SDimitry Andric 
33680eae32dcSDimitry Andric static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
33690eae32dcSDimitry Andric   return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
33700eae32dcSDimitry Andric }
33710eae32dcSDimitry Andric 
3372fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3373e8d8bef9SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3374e8d8bef9SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
33755f757f3fSDimitry Andric   if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3376e8d8bef9SDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3377e8d8bef9SDimitry Andric                                          Alignment, CostKind, I);
3378e8d8bef9SDimitry Andric   auto *VT = cast<VectorType>(DataTy);
3379bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(DataTy);
3380fe6060f1SDimitry Andric   if (!LT.first.isValid())
3381fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3382e8d8bef9SDimitry Andric 
33830fca6ea1SDimitry Andric   // Return an invalid cost for element types that we are unable to lower.
33845f757f3fSDimitry Andric   if (!LT.second.isVector() ||
33850fca6ea1SDimitry Andric       !isElementTypeLegalForScalableVector(VT->getElementType()) ||
33860fca6ea1SDimitry Andric       VT->getElementType()->isIntegerTy(1))
33875f757f3fSDimitry Andric     return InstructionCost::getInvalid();
33885f757f3fSDimitry Andric 
3389fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3390fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3391fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3392fe6060f1SDimitry Andric   // sufficiently reliable.
33930fca6ea1SDimitry Andric   if (VT->getElementCount() == ElementCount::getScalable(1))
3394fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3395fe6060f1SDimitry Andric 
3396fe6060f1SDimitry Andric   ElementCount LegalVF = LT.second.getVectorElementCount();
3397fe6060f1SDimitry Andric   InstructionCost MemOpCost =
3398bdd1243dSDimitry Andric       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3399bdd1243dSDimitry Andric                       {TTI::OK_AnyValue, TTI::OP_None}, I);
34000eae32dcSDimitry Andric   // Add on an overhead cost for using gathers/scatters.
34010eae32dcSDimitry Andric   // TODO: At the moment this is applied unilaterally for all CPUs, but at some
34020eae32dcSDimitry Andric   // point we may want a per-CPU overhead.
34030eae32dcSDimitry Andric   MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3404fe6060f1SDimitry Andric   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3405e8d8bef9SDimitry Andric }
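// Worked sketch of the formula above (assuming the default
// sve-gather-overhead of 10): a gather of <vscale x 4 x i32> has LT.first = 1
// and a legal VF of <vscale x 4>, so the returned cost is
//   1 * (scalar i32 load cost * 10) * getMaxNumElements(<vscale x 4>).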
3406e8d8bef9SDimitry Andric 
3407e8d8bef9SDimitry Andric bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3408e8d8bef9SDimitry Andric   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3409e8d8bef9SDimitry Andric }
3410e8d8bef9SDimitry Andric 
3411fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3412fe6060f1SDimitry Andric                                                 MaybeAlign Alignment,
3413fe6060f1SDimitry Andric                                                 unsigned AddressSpace,
34145ffd83dbSDimitry Andric                                                 TTI::TargetCostKind CostKind,
3415bdd1243dSDimitry Andric                                                 TTI::OperandValueInfo OpInfo,
34160b57cec5SDimitry Andric                                                 const Instruction *I) {
3417fe6060f1SDimitry Andric   EVT VT = TLI->getValueType(DL, Ty, true);
34185ffd83dbSDimitry Andric   // Type legalization can't handle structs
3419fe6060f1SDimitry Andric   if (VT == MVT::Other)
34205ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
34215ffd83dbSDimitry Andric                                   CostKind);
34225ffd83dbSDimitry Andric 
3423bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(Ty);
3424fe6060f1SDimitry Andric   if (!LT.first.isValid())
3425fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3426fe6060f1SDimitry Andric 
3427fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3428fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3429fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3430fe6060f1SDimitry Andric   // sufficiently reliable.
34310fca6ea1SDimitry Andric   // We also only support full register predicate loads and stores.
3432fe6060f1SDimitry Andric   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
34330fca6ea1SDimitry Andric     if (VTy->getElementCount() == ElementCount::getScalable(1) ||
34340fca6ea1SDimitry Andric         (VTy->getElementType()->isIntegerTy(1) &&
34350fca6ea1SDimitry Andric          !VTy->getElementCount().isKnownMultipleOf(
34360fca6ea1SDimitry Andric              ElementCount::getScalable(16))))
3437fe6060f1SDimitry Andric       return InstructionCost::getInvalid();
3438fe6060f1SDimitry Andric 
3439fe6060f1SDimitry Andric   // TODO: consider latency as well for TCK_SizeAndLatency.
3440fe6060f1SDimitry Andric   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3441fe6060f1SDimitry Andric     return LT.first;
3442fe6060f1SDimitry Andric 
3443fe6060f1SDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
3444fe6060f1SDimitry Andric     return 1;
34450b57cec5SDimitry Andric 
34460b57cec5SDimitry Andric   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3447480093f4SDimitry Andric       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
34480b57cec5SDimitry Andric     // Unaligned stores are extremely inefficient. We don't split all
34490b57cec5SDimitry Andric     // unaligned 128-bit stores because of the negative impact that has been
34500b57cec5SDimitry Andric     // shown in practice on inlined block copy code.
34510b57cec5SDimitry Andric     // We make such stores expensive so that we will only vectorize if there
34520b57cec5SDimitry Andric     // are 6 other instructions getting vectorized.
34530b57cec5SDimitry Andric     const int AmortizationCost = 6;
34540b57cec5SDimitry Andric 
34550b57cec5SDimitry Andric     return LT.first * 2 * AmortizationCost;
34560b57cec5SDimitry Andric   }
34570b57cec5SDimitry Andric 
3458bdd1243dSDimitry Andric   // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3459bdd1243dSDimitry Andric   if (Ty->isPtrOrPtrVectorTy())
3460bdd1243dSDimitry Andric     return LT.first;
3461bdd1243dSDimitry Andric 
34627a6dacacSDimitry Andric   if (useNeonVector(Ty)) {
3463fe6060f1SDimitry Andric     // Check truncating stores and extending loads.
34647a6dacacSDimitry Andric     if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3465fe6060f1SDimitry Andric       // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3466fe6060f1SDimitry Andric       if (VT == MVT::v4i8)
3467fe6060f1SDimitry Andric         return 2;
3468fe6060f1SDimitry Andric       // Otherwise we need to scalarize.
3469fe6060f1SDimitry Andric       return cast<FixedVectorType>(Ty)->getNumElements() * 2;
34700b57cec5SDimitry Andric     }
34717a6dacacSDimitry Andric     EVT EltVT = VT.getVectorElementType();
34727a6dacacSDimitry Andric     unsigned EltSize = EltVT.getScalarSizeInBits();
34737a6dacacSDimitry Andric     if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
34747a6dacacSDimitry Andric         VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
34757a6dacacSDimitry Andric         *Alignment != Align(1))
34767a6dacacSDimitry Andric       return LT.first;
34777a6dacacSDimitry Andric     // FIXME: v3i8 lowering currently is very inefficient, due to automatic
34787a6dacacSDimitry Andric     // widening to v4i8, which produces suboptimal results.
34797a6dacacSDimitry Andric     if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
34807a6dacacSDimitry Andric       return LT.first;
34817a6dacacSDimitry Andric 
34827a6dacacSDimitry Andric     // Check non-power-of-2 loads/stores for legal vector element types with
34837a6dacacSDimitry Andric     // NEON. Non-power-of-2 memory ops will get broken down to a set of
34847a6dacacSDimitry Andric     // operations on smaller power-of-2 ops, including ld1/st1.
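    // For example, a v6i16 access decomposes into v4i16 + v2i16 (Cost = 2),
    // and a v7i16 access into v4i16 + v2i16 + v1i16 (Cost = 3).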
34857a6dacacSDimitry Andric     LLVMContext &C = Ty->getContext();
34867a6dacacSDimitry Andric     InstructionCost Cost(0);
34877a6dacacSDimitry Andric     SmallVector<EVT> TypeWorklist;
34887a6dacacSDimitry Andric     TypeWorklist.push_back(VT);
34897a6dacacSDimitry Andric     while (!TypeWorklist.empty()) {
34907a6dacacSDimitry Andric       EVT CurrVT = TypeWorklist.pop_back_val();
34917a6dacacSDimitry Andric       unsigned CurrNumElements = CurrVT.getVectorNumElements();
34927a6dacacSDimitry Andric       if (isPowerOf2_32(CurrNumElements)) {
34937a6dacacSDimitry Andric         Cost += 1;
34947a6dacacSDimitry Andric         continue;
34957a6dacacSDimitry Andric       }
34967a6dacacSDimitry Andric 
34977a6dacacSDimitry Andric       unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
34987a6dacacSDimitry Andric       TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
34997a6dacacSDimitry Andric       TypeWorklist.push_back(
35007a6dacacSDimitry Andric           EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
35017a6dacacSDimitry Andric     }
35027a6dacacSDimitry Andric     return Cost;
35037a6dacacSDimitry Andric   }
35040b57cec5SDimitry Andric 
35050b57cec5SDimitry Andric   return LT.first;
35060b57cec5SDimitry Andric }
35070b57cec5SDimitry Andric 
3508fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
35095ffd83dbSDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
35105ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
35115ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
35120b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
351306c3fb27SDimitry Andric   auto *VecVTy = cast<VectorType>(VecTy);
35140b57cec5SDimitry Andric 
351506c3fb27SDimitry Andric   if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
351606c3fb27SDimitry Andric     return InstructionCost::getInvalid();
351706c3fb27SDimitry Andric 
351806c3fb27SDimitry Andric   // Vectorization for masked interleaved accesses is only enabled for scalable
351906c3fb27SDimitry Andric   // VFs.
352006c3fb27SDimitry Andric   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
352106c3fb27SDimitry Andric     return InstructionCost::getInvalid();
352206c3fb27SDimitry Andric 
352306c3fb27SDimitry Andric   if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
352406c3fb27SDimitry Andric     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
35255ffd83dbSDimitry Andric     auto *SubVecTy =
352606c3fb27SDimitry Andric         VectorType::get(VecVTy->getElementType(),
352706c3fb27SDimitry Andric                         VecVTy->getElementCount().divideCoefficientBy(Factor));
35280b57cec5SDimitry Andric 
35290b57cec5SDimitry Andric     // ldN/stN only support legal vector types that are 64 or 128 bits in size.
35300b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
35310b57cec5SDimitry Andric     // matched to more than one ldN/stN instruction.
3532349cc55cSDimitry Andric     bool UseScalable;
353306c3fb27SDimitry Andric     if (MinElts % Factor == 0 &&
3534349cc55cSDimitry Andric         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3535349cc55cSDimitry Andric       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
35360b57cec5SDimitry Andric   }
35370b57cec5SDimitry Andric 
35380b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
35395ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
35400b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
35410b57cec5SDimitry Andric }
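// Illustrative IR (a sketch): a factor-2 deinterleaving access such as
//   %wide = load <8 x i32>, ptr %p
//   %even = shufflevector <8 x i32> %wide, <8 x i32> poison,
//                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// (plus the matching odd-lane shuffle) can be matched to a single LD2, which
// is what the ldN/stN costing above models.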
35420b57cec5SDimitry Andric 
3543fe6060f1SDimitry Andric InstructionCost
3544fe6060f1SDimitry Andric AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3545fe6060f1SDimitry Andric   InstructionCost Cost = 0;
35465ffd83dbSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
35470b57cec5SDimitry Andric   for (auto *I : Tys) {
35480b57cec5SDimitry Andric     if (!I->isVectorTy())
35490b57cec5SDimitry Andric       continue;
35505ffd83dbSDimitry Andric     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
35515ffd83dbSDimitry Andric         128)
35525ffd83dbSDimitry Andric       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
35535ffd83dbSDimitry Andric               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
35540b57cec5SDimitry Andric   }
35550b57cec5SDimitry Andric   return Cost;
35560b57cec5SDimitry Andric }
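// For example, keeping a <4 x i32> value (128 bits) live across a call is
// costed as one 128-bit store plus one 128-bit reload.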
35570b57cec5SDimitry Andric 
355806c3fb27SDimitry Andric unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
35590b57cec5SDimitry Andric   return ST->getMaxInterleaveFactor();
35600b57cec5SDimitry Andric }
35610b57cec5SDimitry Andric 
35620b57cec5SDimitry Andric // For Falkor, we want to avoid having too many strided loads in a loop since
35630b57cec5SDimitry Andric // that can exhaust the HW prefetcher resources.  We adjust the unroller
35640b57cec5SDimitry Andric // MaxCount preference below to attempt to ensure unrolling doesn't create too
35650b57cec5SDimitry Andric // many strided loads.
35660b57cec5SDimitry Andric static void
35670b57cec5SDimitry Andric getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
35680b57cec5SDimitry Andric                               TargetTransformInfo::UnrollingPreferences &UP) {
35690b57cec5SDimitry Andric   enum { MaxStridedLoads = 7 };
35700b57cec5SDimitry Andric   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
35710b57cec5SDimitry Andric     int StridedLoads = 0;
35720b57cec5SDimitry Andric     // FIXME? We could make this more precise by looking at the CFG and
35730b57cec5SDimitry Andric     // e.g. not counting loads in each side of an if-then-else diamond.
35740b57cec5SDimitry Andric     for (const auto BB : L->blocks()) {
35750b57cec5SDimitry Andric       for (auto &I : *BB) {
35760b57cec5SDimitry Andric         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
35770b57cec5SDimitry Andric         if (!LMemI)
35780b57cec5SDimitry Andric           continue;
35790b57cec5SDimitry Andric 
35800b57cec5SDimitry Andric         Value *PtrValue = LMemI->getPointerOperand();
35810b57cec5SDimitry Andric         if (L->isLoopInvariant(PtrValue))
35820b57cec5SDimitry Andric           continue;
35830b57cec5SDimitry Andric 
35840b57cec5SDimitry Andric         const SCEV *LSCEV = SE.getSCEV(PtrValue);
35850b57cec5SDimitry Andric         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
35860b57cec5SDimitry Andric         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
35870b57cec5SDimitry Andric           continue;
35880b57cec5SDimitry Andric 
35890b57cec5SDimitry Andric         // FIXME? We could take pairing of unrolled load copies into account
35900b57cec5SDimitry Andric         // by looking at the AddRec, but we would probably have to limit this
35910b57cec5SDimitry Andric         // to loops with no stores or other memory optimization barriers.
35920b57cec5SDimitry Andric         ++StridedLoads;
35930b57cec5SDimitry Andric         // We've seen enough strided loads that seeing more won't make a
35940b57cec5SDimitry Andric         // difference.
35950b57cec5SDimitry Andric         if (StridedLoads > MaxStridedLoads / 2)
35960b57cec5SDimitry Andric           return StridedLoads;
35970b57cec5SDimitry Andric       }
35980b57cec5SDimitry Andric     }
35990b57cec5SDimitry Andric     return StridedLoads;
36000b57cec5SDimitry Andric   };
36010b57cec5SDimitry Andric 
36020b57cec5SDimitry Andric   int StridedLoads = countStridedLoads(L, SE);
36030b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
36040b57cec5SDimitry Andric                     << " strided loads\n");
36050b57cec5SDimitry Andric   // Pick the largest power of 2 unroll count that won't result in too many
36060b57cec5SDimitry Andric   // strided loads.
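  // For instance, with MaxStridedLoads = 7, a loop containing 2 strided loads
  // gets MaxCount = 1 << Log2_32(7 / 2) = 2, while a single strided load still
  // allows MaxCount = 4.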
36070b57cec5SDimitry Andric   if (StridedLoads) {
36080b57cec5SDimitry Andric     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
36090b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
36100b57cec5SDimitry Andric                       << UP.MaxCount << '\n');
36110b57cec5SDimitry Andric   }
36120b57cec5SDimitry Andric }
36130b57cec5SDimitry Andric 
36140b57cec5SDimitry Andric void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3615349cc55cSDimitry Andric                                              TTI::UnrollingPreferences &UP,
3616349cc55cSDimitry Andric                                              OptimizationRemarkEmitter *ORE) {
36170b57cec5SDimitry Andric   // Enable partial unrolling and runtime unrolling.
3618349cc55cSDimitry Andric   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3619349cc55cSDimitry Andric 
3620349cc55cSDimitry Andric   UP.UpperBound = true;
36210b57cec5SDimitry Andric 
36220b57cec5SDimitry Andric   // An inner loop is more likely to be hot, and the runtime check can be
36230b57cec5SDimitry Andric   // hoisted out by the LICM pass, so the overhead is lower; try a larger
36240b57cec5SDimitry Andric   // threshold to unroll more loops.
36250b57cec5SDimitry Andric   if (L->getLoopDepth() > 1)
36260b57cec5SDimitry Andric     UP.PartialThreshold *= 2;
36270b57cec5SDimitry Andric 
36280b57cec5SDimitry Andric   // Disable partial & runtime unrolling on -Os.
36290b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
36300b57cec5SDimitry Andric 
36310b57cec5SDimitry Andric   if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
36320b57cec5SDimitry Andric       EnableFalkorHWPFUnrollFix)
36330b57cec5SDimitry Andric     getFalkorUnrollingPreferences(L, SE, UP);
3634fe6060f1SDimitry Andric 
3635fe6060f1SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
3636fe6060f1SDimitry Andric   // inlining. Don't unroll vector loops either, as they don't benefit much from
3637fe6060f1SDimitry Andric   // unrolling.
3638fe6060f1SDimitry Andric   for (auto *BB : L->getBlocks()) {
3639fe6060f1SDimitry Andric     for (auto &I : *BB) {
3640fe6060f1SDimitry Andric       // Don't unroll vectorised loops.
3641fe6060f1SDimitry Andric       if (I.getType()->isVectorTy())
3642fe6060f1SDimitry Andric         return;
3643fe6060f1SDimitry Andric 
3644fe6060f1SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3645fe6060f1SDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3646fe6060f1SDimitry Andric           if (!isLoweredToCall(F))
3647fe6060f1SDimitry Andric             continue;
3648fe6060f1SDimitry Andric         }
3649fe6060f1SDimitry Andric         return;
3650fe6060f1SDimitry Andric       }
3651fe6060f1SDimitry Andric     }
3652fe6060f1SDimitry Andric   }
3653fe6060f1SDimitry Andric 
3654fe6060f1SDimitry Andric   // Enable runtime unrolling for in-order models.
3655fe6060f1SDimitry Andric   // If -mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
3656fe6060f1SDimitry Andric   // by checking for that case we can ensure that the default behaviour is
3657fe6060f1SDimitry Andric   // unchanged.
3658fe6060f1SDimitry Andric   if (ST->getProcFamily() != AArch64Subtarget::Others &&
3659fe6060f1SDimitry Andric       !ST->getSchedModel().isOutOfOrder()) {
3660fe6060f1SDimitry Andric     UP.Runtime = true;
3661fe6060f1SDimitry Andric     UP.Partial = true;
3662fe6060f1SDimitry Andric     UP.UnrollRemainder = true;
3663fe6060f1SDimitry Andric     UP.DefaultUnrollRuntimeCount = 4;
3664fe6060f1SDimitry Andric 
3665fe6060f1SDimitry Andric     UP.UnrollAndJam = true;
3666fe6060f1SDimitry Andric     UP.UnrollAndJamInnerLoopThreshold = 60;
3667fe6060f1SDimitry Andric   }
36680b57cec5SDimitry Andric }
36690b57cec5SDimitry Andric 
36705ffd83dbSDimitry Andric void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
36715ffd83dbSDimitry Andric                                            TTI::PeelingPreferences &PP) {
36725ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
36735ffd83dbSDimitry Andric }
36745ffd83dbSDimitry Andric 
36750b57cec5SDimitry Andric Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
36760b57cec5SDimitry Andric                                                          Type *ExpectedType) {
36770b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
36780b57cec5SDimitry Andric   default:
36790b57cec5SDimitry Andric     return nullptr;
36800b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
36810b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
36820b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4: {
36830b57cec5SDimitry Andric     // The expected type must be a struct whose elements match the stored values.
36840b57cec5SDimitry Andric     StructType *ST = dyn_cast<StructType>(ExpectedType);
36850b57cec5SDimitry Andric     if (!ST)
36860b57cec5SDimitry Andric       return nullptr;
3687349cc55cSDimitry Andric     unsigned NumElts = Inst->arg_size() - 1;
36880b57cec5SDimitry Andric     if (ST->getNumElements() != NumElts)
36890b57cec5SDimitry Andric       return nullptr;
36900b57cec5SDimitry Andric     for (unsigned i = 0, e = NumElts; i != e; ++i) {
36910b57cec5SDimitry Andric       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
36920b57cec5SDimitry Andric         return nullptr;
36930b57cec5SDimitry Andric     }
3694bdd1243dSDimitry Andric     Value *Res = PoisonValue::get(ExpectedType);
36950b57cec5SDimitry Andric     IRBuilder<> Builder(Inst);
36960b57cec5SDimitry Andric     for (unsigned i = 0, e = NumElts; i != e; ++i) {
36970b57cec5SDimitry Andric       Value *L = Inst->getArgOperand(i);
36980b57cec5SDimitry Andric       Res = Builder.CreateInsertValue(Res, L, i);
36990b57cec5SDimitry Andric     }
37000b57cec5SDimitry Andric     return Res;
37010b57cec5SDimitry Andric   }
37020b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
37030b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
37040b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
37050b57cec5SDimitry Andric     if (Inst->getType() == ExpectedType)
37060b57cec5SDimitry Andric       return Inst;
37070b57cec5SDimitry Andric     return nullptr;
37080b57cec5SDimitry Andric   }
37090b57cec5SDimitry Andric }
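// Illustrative sketch for the stN case above (hypothetical IR): given
//   call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %a, <4 x i32> %b,
//                                             <4 x i32> %c, ptr %p)
// and an ExpectedType of { <4 x i32>, <4 x i32>, <4 x i32> }, the result is
// rebuilt with three insertvalue instructions, one per stored operand.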
37100b57cec5SDimitry Andric 
37110b57cec5SDimitry Andric bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
37120b57cec5SDimitry Andric                                         MemIntrinsicInfo &Info) {
37130b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
37140b57cec5SDimitry Andric   default:
37150b57cec5SDimitry Andric     break;
37160b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
37170b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
37180b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
37190b57cec5SDimitry Andric     Info.ReadMem = true;
37200b57cec5SDimitry Andric     Info.WriteMem = false;
37210b57cec5SDimitry Andric     Info.PtrVal = Inst->getArgOperand(0);
37220b57cec5SDimitry Andric     break;
37230b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
37240b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
37250b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4:
37260b57cec5SDimitry Andric     Info.ReadMem = false;
37270b57cec5SDimitry Andric     Info.WriteMem = true;
3728349cc55cSDimitry Andric     Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
37290b57cec5SDimitry Andric     break;
37300b57cec5SDimitry Andric   }
37310b57cec5SDimitry Andric 
37320b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
37330b57cec5SDimitry Andric   default:
37340b57cec5SDimitry Andric     return false;
37350b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
37360b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
37370b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
37380b57cec5SDimitry Andric     break;
37390b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
37400b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
37410b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
37420b57cec5SDimitry Andric     break;
37430b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
37440b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4:
37450b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
37460b57cec5SDimitry Andric     break;
37470b57cec5SDimitry Andric   }
37480b57cec5SDimitry Andric   return true;
37490b57cec5SDimitry Andric }
37500b57cec5SDimitry Andric 
37510b57cec5SDimitry Andric /// See if \p I should be considered for address type promotion. We check if
37520b57cec5SDimitry Andric /// \p I is a sext with the right type and used in memory accesses. If it is
37530b57cec5SDimitry Andric /// used in a "complex" getelementptr, we allow it to be promoted without
37540b57cec5SDimitry Andric /// finding other sext instructions that sign extended the same initial value.
37550b57cec5SDimitry Andric /// A getelementptr is considered "complex" if it has more than 2 operands.
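/// For example, a sext i32 -> i64 feeding
///   getelementptr inbounds [16 x i32], ptr %base, i64 %i, i64 %j
/// (more than 2 operands) is promotable without requiring a common header,
/// since the extra 64-bit index arithmetic is expected to fold into the
/// addressing computation.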
37560b57cec5SDimitry Andric bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
37570b57cec5SDimitry Andric     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
37580b57cec5SDimitry Andric   bool Considerable = false;
37590b57cec5SDimitry Andric   AllowPromotionWithoutCommonHeader = false;
37600b57cec5SDimitry Andric   if (!isa<SExtInst>(&I))
37610b57cec5SDimitry Andric     return false;
37620b57cec5SDimitry Andric   Type *ConsideredSExtType =
37630b57cec5SDimitry Andric       Type::getInt64Ty(I.getParent()->getParent()->getContext());
37640b57cec5SDimitry Andric   if (I.getType() != ConsideredSExtType)
37650b57cec5SDimitry Andric     return false;
37660b57cec5SDimitry Andric   // See if the sext is the one with the right type and used in at least one
37670b57cec5SDimitry Andric   // GetElementPtrInst.
37680b57cec5SDimitry Andric   for (const User *U : I.users()) {
37690b57cec5SDimitry Andric     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
37700b57cec5SDimitry Andric       Considerable = true;
37710b57cec5SDimitry Andric       // A getelementptr is considered as "complex" if it has more than 2
37720b57cec5SDimitry Andric       // operands. We will promote a SExt used in such complex GEP as we
37730b57cec5SDimitry Andric       // expect some computation to be merged if they are done on 64 bits.
37740b57cec5SDimitry Andric       if (GEPInst->getNumOperands() > 2) {
37750b57cec5SDimitry Andric         AllowPromotionWithoutCommonHeader = true;
37760b57cec5SDimitry Andric         break;
37770b57cec5SDimitry Andric       }
37780b57cec5SDimitry Andric     }
37790b57cec5SDimitry Andric   }
37800b57cec5SDimitry Andric   return Considerable;
37810b57cec5SDimitry Andric }
37820b57cec5SDimitry Andric 
3783fe6060f1SDimitry Andric bool AArch64TTIImpl::isLegalToVectorizeReduction(
3784fe6060f1SDimitry Andric     const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3785fe6060f1SDimitry Andric   if (!VF.isScalable())
3786fe6060f1SDimitry Andric     return true;
3787fe6060f1SDimitry Andric 
3788fe6060f1SDimitry Andric   Type *Ty = RdxDesc.getRecurrenceType();
3789fe6060f1SDimitry Andric   if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
37900b57cec5SDimitry Andric     return false;
3791fe6060f1SDimitry Andric 
3792fe6060f1SDimitry Andric   switch (RdxDesc.getRecurrenceKind()) {
3793fe6060f1SDimitry Andric   case RecurKind::Add:
3794fe6060f1SDimitry Andric   case RecurKind::FAdd:
3795fe6060f1SDimitry Andric   case RecurKind::And:
3796fe6060f1SDimitry Andric   case RecurKind::Or:
3797fe6060f1SDimitry Andric   case RecurKind::Xor:
3798fe6060f1SDimitry Andric   case RecurKind::SMin:
3799fe6060f1SDimitry Andric   case RecurKind::SMax:
3800fe6060f1SDimitry Andric   case RecurKind::UMin:
3801fe6060f1SDimitry Andric   case RecurKind::UMax:
3802fe6060f1SDimitry Andric   case RecurKind::FMin:
3803fe6060f1SDimitry Andric   case RecurKind::FMax:
38044824e7fdSDimitry Andric   case RecurKind::FMulAdd:
38055f757f3fSDimitry Andric   case RecurKind::IAnyOf:
38065f757f3fSDimitry Andric   case RecurKind::FAnyOf:
3807fe6060f1SDimitry Andric     return true;
38080b57cec5SDimitry Andric   default:
38090b57cec5SDimitry Andric     return false;
38100b57cec5SDimitry Andric   }
3811fe6060f1SDimitry Andric }
38120b57cec5SDimitry Andric 
3813fe6060f1SDimitry Andric InstructionCost
381406c3fb27SDimitry Andric AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
381506c3fb27SDimitry Andric                                        FastMathFlags FMF,
3816e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
3817*62987288SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3818*62987288SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3819*62987288SDimitry Andric   // it. This change will be removed when code-generation for these types is
3820*62987288SDimitry Andric   // sufficiently reliable.
3821*62987288SDimitry Andric   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3822*62987288SDimitry Andric     if (VTy->getElementCount() == ElementCount::getScalable(1))
3823*62987288SDimitry Andric       return InstructionCost::getInvalid();
3824*62987288SDimitry Andric 
3825bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3826349cc55cSDimitry Andric 
3827349cc55cSDimitry Andric   if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
382806c3fb27SDimitry Andric     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3829349cc55cSDimitry Andric 
3830fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
3831e8d8bef9SDimitry Andric   if (LT.first > 1) {
3832e8d8bef9SDimitry Andric     Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
383306c3fb27SDimitry Andric     IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3834349cc55cSDimitry Andric     LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3835e8d8bef9SDimitry Andric   }
3836e8d8bef9SDimitry Andric 
3837e8d8bef9SDimitry Andric   return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3838e8d8bef9SDimitry Andric }
3839e8d8bef9SDimitry Andric 
3840fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3841fe6060f1SDimitry Andric     unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3842bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3843fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
3844e8d8bef9SDimitry Andric   if (LT.first > 1) {
3845e8d8bef9SDimitry Andric     Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3846e8d8bef9SDimitry Andric     LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3847e8d8bef9SDimitry Andric     LegalizationCost *= LT.first - 1;
3848e8d8bef9SDimitry Andric   }
3849e8d8bef9SDimitry Andric 
3850e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3851e8d8bef9SDimitry Andric   assert(ISD && "Invalid opcode");
3852e8d8bef9SDimitry Andric   // Add the final reduction cost for the legal horizontal reduction
3853e8d8bef9SDimitry Andric   switch (ISD) {
3854e8d8bef9SDimitry Andric   case ISD::ADD:
3855e8d8bef9SDimitry Andric   case ISD::AND:
3856e8d8bef9SDimitry Andric   case ISD::OR:
3857e8d8bef9SDimitry Andric   case ISD::XOR:
3858e8d8bef9SDimitry Andric   case ISD::FADD:
3859e8d8bef9SDimitry Andric     return LegalizationCost + 2;
3860e8d8bef9SDimitry Andric   default:
3861fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3862e8d8bef9SDimitry Andric   }
3863e8d8bef9SDimitry Andric }
3864e8d8bef9SDimitry Andric 
3865fe6060f1SDimitry Andric InstructionCost
3866fe6060f1SDimitry Andric AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3867bdd1243dSDimitry Andric                                            std::optional<FastMathFlags> FMF,
38685ffd83dbSDimitry Andric                                            TTI::TargetCostKind CostKind) {
3869*62987288SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3870*62987288SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3871*62987288SDimitry Andric   // it. This change will be removed when code-generation for these types is
3872*62987288SDimitry Andric   // sufficiently reliable.
3873*62987288SDimitry Andric   if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
3874*62987288SDimitry Andric     if (VTy->getElementCount() == ElementCount::getScalable(1))
3875*62987288SDimitry Andric       return InstructionCost::getInvalid();
3876*62987288SDimitry Andric 
3877fe6060f1SDimitry Andric   if (TTI::requiresOrderedReduction(FMF)) {
3878349cc55cSDimitry Andric     if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3879349cc55cSDimitry Andric       InstructionCost BaseCost =
3880349cc55cSDimitry Andric           BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3881349cc55cSDimitry Andric       // Add on extra cost to reflect the extra overhead on some CPUs. We still
3882349cc55cSDimitry Andric       // end up vectorizing for more computationally intensive loops.
3883349cc55cSDimitry Andric       return BaseCost + FixedVTy->getNumElements();
3884349cc55cSDimitry Andric     }
3885fe6060f1SDimitry Andric 
3886fe6060f1SDimitry Andric     if (Opcode != Instruction::FAdd)
3887fe6060f1SDimitry Andric       return InstructionCost::getInvalid();
3888fe6060f1SDimitry Andric 
3889fe6060f1SDimitry Andric     auto *VTy = cast<ScalableVectorType>(ValTy);
3890fe6060f1SDimitry Andric     InstructionCost Cost =
3891fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3892fe6060f1SDimitry Andric     Cost *= getMaxNumElements(VTy->getElementCount());
3893fe6060f1SDimitry Andric     return Cost;
3894fe6060f1SDimitry Andric   }
38950b57cec5SDimitry Andric 
3896e8d8bef9SDimitry Andric   if (isa<ScalableVectorType>(ValTy))
3897fe6060f1SDimitry Andric     return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
38980b57cec5SDimitry Andric 
3899bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
39000b57cec5SDimitry Andric   MVT MTy = LT.second;
39010b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
39020b57cec5SDimitry Andric   assert(ISD && "Invalid opcode");
39030b57cec5SDimitry Andric 
39040b57cec5SDimitry Andric   // Horizontal adds can use the 'addv' instruction. We model the cost of these
3905fe6060f1SDimitry Andric   // instructions as twice a normal vector add, plus 1 for each legalization
3906fe6060f1SDimitry Andric   // step (LT.first). This is the only arithmetic vector reduction operation for
3907fe6060f1SDimitry Andric   // which we have an instruction.
3908fe6060f1SDimitry Andric   // OR, XOR and AND costs should match the codegen from:
3909fe6060f1SDimitry Andric   // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3910fe6060f1SDimitry Andric   // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3911fe6060f1SDimitry Andric   // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
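  // Worked example: an add reduction of v16i8 costs 2 (a single ADDV modelled
  // as two vector adds), while v32i8 is first split in two, costing
  // (LT.first - 1) + 2 = 3.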
39120b57cec5SDimitry Andric   static const CostTblEntry CostTblNoPairwise[]{
3913fe6060f1SDimitry Andric       {ISD::ADD, MVT::v8i8,   2},
3914fe6060f1SDimitry Andric       {ISD::ADD, MVT::v16i8,  2},
3915fe6060f1SDimitry Andric       {ISD::ADD, MVT::v4i16,  2},
3916fe6060f1SDimitry Andric       {ISD::ADD, MVT::v8i16,  2},
3917fe6060f1SDimitry Andric       {ISD::ADD, MVT::v4i32,  2},
3918bdd1243dSDimitry Andric       {ISD::ADD, MVT::v2i64,  2},
3919fe6060f1SDimitry Andric       {ISD::OR,  MVT::v8i8,  15},
3920fe6060f1SDimitry Andric       {ISD::OR,  MVT::v16i8, 17},
3921fe6060f1SDimitry Andric       {ISD::OR,  MVT::v4i16,  7},
3922fe6060f1SDimitry Andric       {ISD::OR,  MVT::v8i16,  9},
3923fe6060f1SDimitry Andric       {ISD::OR,  MVT::v2i32,  3},
3924fe6060f1SDimitry Andric       {ISD::OR,  MVT::v4i32,  5},
3925fe6060f1SDimitry Andric       {ISD::OR,  MVT::v2i64,  3},
3926fe6060f1SDimitry Andric       {ISD::XOR, MVT::v8i8,  15},
3927fe6060f1SDimitry Andric       {ISD::XOR, MVT::v16i8, 17},
3928fe6060f1SDimitry Andric       {ISD::XOR, MVT::v4i16,  7},
3929fe6060f1SDimitry Andric       {ISD::XOR, MVT::v8i16,  9},
3930fe6060f1SDimitry Andric       {ISD::XOR, MVT::v2i32,  3},
3931fe6060f1SDimitry Andric       {ISD::XOR, MVT::v4i32,  5},
3932fe6060f1SDimitry Andric       {ISD::XOR, MVT::v2i64,  3},
3933fe6060f1SDimitry Andric       {ISD::AND, MVT::v8i8,  15},
3934fe6060f1SDimitry Andric       {ISD::AND, MVT::v16i8, 17},
3935fe6060f1SDimitry Andric       {ISD::AND, MVT::v4i16,  7},
3936fe6060f1SDimitry Andric       {ISD::AND, MVT::v8i16,  9},
3937fe6060f1SDimitry Andric       {ISD::AND, MVT::v2i32,  3},
3938fe6060f1SDimitry Andric       {ISD::AND, MVT::v4i32,  5},
3939fe6060f1SDimitry Andric       {ISD::AND, MVT::v2i64,  3},
39400b57cec5SDimitry Andric   };
3941fe6060f1SDimitry Andric   switch (ISD) {
3942fe6060f1SDimitry Andric   default:
3943fe6060f1SDimitry Andric     break;
3944fe6060f1SDimitry Andric   case ISD::ADD:
39450b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3946fe6060f1SDimitry Andric       return (LT.first - 1) + Entry->Cost;
3947fe6060f1SDimitry Andric     break;
3948fe6060f1SDimitry Andric   case ISD::XOR:
3949fe6060f1SDimitry Andric   case ISD::AND:
3950fe6060f1SDimitry Andric   case ISD::OR:
3951fe6060f1SDimitry Andric     const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3952fe6060f1SDimitry Andric     if (!Entry)
3953fe6060f1SDimitry Andric       break;
3954fe6060f1SDimitry Andric     auto *ValVTy = cast<FixedVectorType>(ValTy);
395506c3fb27SDimitry Andric     if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3956fe6060f1SDimitry Andric         isPowerOf2_32(ValVTy->getNumElements())) {
3957fe6060f1SDimitry Andric       InstructionCost ExtraCost = 0;
3958fe6060f1SDimitry Andric       if (LT.first != 1) {
3959fe6060f1SDimitry Andric         // Type needs to be split, so there is an extra cost of LT.first - 1
3960fe6060f1SDimitry Andric         // arithmetic ops.
3961fe6060f1SDimitry Andric         auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3962fe6060f1SDimitry Andric                                         MTy.getVectorNumElements());
3963fe6060f1SDimitry Andric         ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3964fe6060f1SDimitry Andric         ExtraCost *= LT.first - 1;
3965fe6060f1SDimitry Andric       }
396606c3fb27SDimitry Andric       // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
396706c3fb27SDimitry Andric       auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
396806c3fb27SDimitry Andric       return Cost + ExtraCost;
3969fe6060f1SDimitry Andric     }
3970fe6060f1SDimitry Andric     break;
3971fe6060f1SDimitry Andric   }
3972fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
39730b57cec5SDimitry Andric }
39740b57cec5SDimitry Andric 
3975fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3976fe6060f1SDimitry Andric   static const CostTblEntry ShuffleTbl[] = {
3977fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv16i8,  1 },
3978fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8i16,  1 },
3979fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4i32,  1 },
3980fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2i64,  1 },
3981fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f16,  1 },
3982fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4f16,  1 },
3983fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8f16,  1 },
3984fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3985fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3986fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3987fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f32,  1 },
3988fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4f32,  1 },
3989fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f64,  1 },
3990fe6060f1SDimitry Andric   };
3991fe6060f1SDimitry Andric 
3992bdd1243dSDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3993bdd1243dSDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3994bdd1243dSDimitry Andric   // it. This change will be removed when code-generation for these types is
3995bdd1243dSDimitry Andric   // sufficiently reliable.
3996bdd1243dSDimitry Andric   if (Tp->getElementCount() == ElementCount::getScalable(1))
3997bdd1243dSDimitry Andric     return InstructionCost::getInvalid();
3998bdd1243dSDimitry Andric 
3999bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4000fe6060f1SDimitry Andric   Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4001fe6060f1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4002fe6060f1SDimitry Andric   EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4003fe6060f1SDimitry Andric                        ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4004fe6060f1SDimitry Andric                        : LT.second;
4005fe6060f1SDimitry Andric   Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4006fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
4007fe6060f1SDimitry Andric   if (Index < 0) {
4008fe6060f1SDimitry Andric     LegalizationCost =
4009fe6060f1SDimitry Andric         getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4010fe6060f1SDimitry Andric                            CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4011fe6060f1SDimitry Andric         getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4012fe6060f1SDimitry Andric                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
4013fe6060f1SDimitry Andric   }
4014fe6060f1SDimitry Andric 
4015fe6060f1SDimitry Andric   // Predicated splices are promoted during lowering; see
4016fe6060f1SDimitry Andric   // AArch64ISelLowering.cpp. The cost is computed on the promoted type.
4017fe6060f1SDimitry Andric   if (LT.second.getScalarType() == MVT::i1) {
4018fe6060f1SDimitry Andric     LegalizationCost +=
4019fe6060f1SDimitry Andric         getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4020fe6060f1SDimitry Andric                          TTI::CastContextHint::None, CostKind) +
4021fe6060f1SDimitry Andric         getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4022fe6060f1SDimitry Andric                          TTI::CastContextHint::None, CostKind);
4023fe6060f1SDimitry Andric   }
4024fe6060f1SDimitry Andric   const auto *Entry =
4025fe6060f1SDimitry Andric       CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4026fe6060f1SDimitry Andric   assert(Entry && "Illegal Type for Splice");
4027fe6060f1SDimitry Andric   LegalizationCost += Entry->Cost;
4028fe6060f1SDimitry Andric   return LegalizationCost * LT.first;
4029fe6060f1SDimitry Andric }
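// For example, a splice of <vscale x 4 x i1> is costed on its promoted type
// (<vscale x 4 x i32>): the splice table entry plus the surrounding
// zext/trunc costs, with an extra compare/select pair when Index is negative.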
4030fe6060f1SDimitry Andric 
40310fca6ea1SDimitry Andric InstructionCost AArch64TTIImpl::getShuffleCost(
40320fca6ea1SDimitry Andric     TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
40330fca6ea1SDimitry Andric     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
40340fca6ea1SDimitry Andric     ArrayRef<const Value *> Args, const Instruction *CxtI) {
4035bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
40360fca6ea1SDimitry Andric 
403781ad6265SDimitry Andric   // If we have a Mask, and the LT is being legalized somehow, split the Mask
403881ad6265SDimitry Andric   // into smaller vectors and sum the cost of each shuffle.
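  // For example, an <8 x i32> shuffle on a target whose legal type is v4i32
  // is split into two v4i32 sub-shuffles, each costed from its own chunk of
  // the original mask.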
403981ad6265SDimitry Andric   if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
404081ad6265SDimitry Andric       Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
40415f757f3fSDimitry Andric       Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
40420fca6ea1SDimitry Andric 
40430fca6ea1SDimitry Andric     // Check for LD3/LD4 instructions, which are represented in llvm IR as
40440fca6ea1SDimitry Andric     // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
40450fca6ea1SDimitry Andric     // but we model it with a cost of LT.first so that LD3/LD4 have a higher
40460fca6ea1SDimitry Andric     // cost than just the load.
40470fca6ea1SDimitry Andric     if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
40480fca6ea1SDimitry Andric         (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
40490fca6ea1SDimitry Andric          ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
40500fca6ea1SDimitry Andric       return std::max<InstructionCost>(1, LT.first / 4);
40510fca6ea1SDimitry Andric 
40520fca6ea1SDimitry Andric     // Check for ST3/ST4 instructions, which are represented in llvm IR as
40530fca6ea1SDimitry Andric     // store(interleaving-shuffle). The shuffle cost could potentially be free,
40540fca6ea1SDimitry Andric     // but we model it with a cost of LT.first so that ST3/ST4 have a higher
40550fca6ea1SDimitry Andric     // cost than just the store.
40560fca6ea1SDimitry Andric     if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
40570fca6ea1SDimitry Andric         (ShuffleVectorInst::isInterleaveMask(
40580fca6ea1SDimitry Andric              Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
40590fca6ea1SDimitry Andric          ShuffleVectorInst::isInterleaveMask(
40600fca6ea1SDimitry Andric              Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
40610fca6ea1SDimitry Andric       return LT.first;
40620fca6ea1SDimitry Andric 
40635f757f3fSDimitry Andric     unsigned TpNumElts = Mask.size();
406481ad6265SDimitry Andric     unsigned LTNumElts = LT.second.getVectorNumElements();
406581ad6265SDimitry Andric     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
406681ad6265SDimitry Andric     VectorType *NTp =
406781ad6265SDimitry Andric         VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
406881ad6265SDimitry Andric     InstructionCost Cost;
406981ad6265SDimitry Andric     for (unsigned N = 0; N < NumVecs; N++) {
407081ad6265SDimitry Andric       SmallVector<int> NMask;
407181ad6265SDimitry Andric       // Split the existing mask into chunks of size LTNumElts. Track the source
407281ad6265SDimitry Andric       // sub-vectors to ensure the result has at most 2 inputs.
407381ad6265SDimitry Andric       unsigned Source1, Source2;
407481ad6265SDimitry Andric       unsigned NumSources = 0;
407581ad6265SDimitry Andric       for (unsigned E = 0; E < LTNumElts; E++) {
407681ad6265SDimitry Andric         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
407706c3fb27SDimitry Andric                                                       : PoisonMaskElem;
407881ad6265SDimitry Andric         if (MaskElt < 0) {
407906c3fb27SDimitry Andric           NMask.push_back(PoisonMaskElem);
408081ad6265SDimitry Andric           continue;
408181ad6265SDimitry Andric         }
408281ad6265SDimitry Andric 
408381ad6265SDimitry Andric         // Calculate which source from the input this comes from and whether it
408481ad6265SDimitry Andric         // is new to us.
408581ad6265SDimitry Andric         unsigned Source = MaskElt / LTNumElts;
408681ad6265SDimitry Andric         if (NumSources == 0) {
408781ad6265SDimitry Andric           Source1 = Source;
408881ad6265SDimitry Andric           NumSources = 1;
408981ad6265SDimitry Andric         } else if (NumSources == 1 && Source != Source1) {
409081ad6265SDimitry Andric           Source2 = Source;
409181ad6265SDimitry Andric           NumSources = 2;
409281ad6265SDimitry Andric         } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
409381ad6265SDimitry Andric           NumSources++;
409481ad6265SDimitry Andric         }
409581ad6265SDimitry Andric 
409681ad6265SDimitry Andric         // Add to the new mask. For the NumSources > 2 case these indices are
409781ad6265SDimitry Andric         // not correct, but are only used for the modular lane number.
409881ad6265SDimitry Andric         if (Source == Source1)
409981ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts);
410081ad6265SDimitry Andric         else if (Source == Source2)
410181ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts + LTNumElts);
410281ad6265SDimitry Andric         else
410381ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts);
410481ad6265SDimitry Andric       }
410581ad6265SDimitry Andric       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
410681ad6265SDimitry Andric       // getShuffleCost. If not then cost it using the worst case.
410781ad6265SDimitry Andric       if (NumSources <= 2)
410881ad6265SDimitry Andric         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
410981ad6265SDimitry Andric                                                : TTI::SK_PermuteTwoSrc,
41100fca6ea1SDimitry Andric                                NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
411181ad6265SDimitry Andric       else if (any_of(enumerate(NMask), [&](const auto &ME) {
411281ad6265SDimitry Andric                  return ME.value() % LTNumElts == ME.index();
411381ad6265SDimitry Andric                }))
411481ad6265SDimitry Andric         Cost += LTNumElts - 1;
411581ad6265SDimitry Andric       else
411681ad6265SDimitry Andric         Cost += LTNumElts;
411781ad6265SDimitry Andric     }
411881ad6265SDimitry Andric     return Cost;
411981ad6265SDimitry Andric   }
412081ad6265SDimitry Andric 
41215f757f3fSDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  // Treat an extractsubvector as a single-source permutation.
41230fca6ea1SDimitry Andric   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
41240fca6ea1SDimitry Andric   if (IsExtractSubvector && LT.second.isFixedLengthVector())
41250fca6ea1SDimitry Andric     Kind = TTI::SK_PermuteSingleSrc;
412681ad6265SDimitry Andric 
  // Check for broadcast loads, which are supported by the LD1R instruction.
  // In terms of code size, the shuffle vector is free when the load + dup
  // pair gets folded into an LD1R. That is what we check and return here.
  // For performance and reciprocal throughput, an LD1R is not completely
  // free. In that case we return the cost for the broadcast below (i.e. 1
  // for most or all types), so that we model the load + dup sequence as
  // slightly more expensive, LD1R being a high-latency instruction.
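  // For example (illustrative):
  //   ldr  s0, [x0]        ; load
  //   dup  v0.4s, v0.s[0]  ; broadcast
  // folds into the single instruction:
  //   ld1r { v0.4s }, [x0]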
413406c3fb27SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
413581ad6265SDimitry Andric     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
413681ad6265SDimitry Andric     if (IsLoad && LT.second.isVector() &&
413781ad6265SDimitry Andric         isLegalBroadcastLoad(Tp->getElementType(),
413881ad6265SDimitry Andric                              LT.second.getVectorElementCount()))
413906c3fb27SDimitry Andric       return 0;
414081ad6265SDimitry Andric   }
414181ad6265SDimitry Andric 
414281ad6265SDimitry Andric   // If we have 4 elements for the shuffle and a Mask, get the cost straight
414381ad6265SDimitry Andric   // from the perfect shuffle tables.
414481ad6265SDimitry Andric   if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
414581ad6265SDimitry Andric       (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
414681ad6265SDimitry Andric       all_of(Mask, [](int E) { return E < 8; }))
414781ad6265SDimitry Andric     return getPerfectShuffleCost(Mask);
414881ad6265SDimitry Andric 
41490fca6ea1SDimitry Andric   // Check for identity masks, which we can treat as free.
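  // For example (illustrative), the v4i32 masks <0, 1, 2, 3> and
  // <0, -1, 2, -1> both pass the first source operand through unchanged.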
41500fca6ea1SDimitry Andric   if (!Mask.empty() && LT.second.isFixedLengthVector() &&
41510fca6ea1SDimitry Andric       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
41520fca6ea1SDimitry Andric       all_of(enumerate(Mask), [](const auto &M) {
41530fca6ea1SDimitry Andric         return M.value() < 0 || M.value() == (int)M.index();
41540fca6ea1SDimitry Andric       }))
41550fca6ea1SDimitry Andric     return 0;
41560fca6ea1SDimitry Andric 
  // Check for shuffles that do not match one of the specialized SK_ kinds
  // above but for which we still have single native instructions, for
  // example ZIP and UZP.
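  // For example (illustrative), the v4i32 mask <0, 4, 1, 5> is a zip1, the
  // mask <0, 2, 4, 6> is an uzp1, and <1, 1, 1, 1> is a non-zero lane splat
  // (dup v0.4s, v0.s[1]).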
41590fca6ea1SDimitry Andric   unsigned Unused;
41600fca6ea1SDimitry Andric   if (LT.second.isFixedLengthVector() &&
41610fca6ea1SDimitry Andric       LT.second.getVectorNumElements() == Mask.size() &&
41620fca6ea1SDimitry Andric       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
41630fca6ea1SDimitry Andric       (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
41640fca6ea1SDimitry Andric        isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
41650fca6ea1SDimitry Andric        // Check for non-zero lane splats
41660fca6ea1SDimitry Andric        all_of(drop_begin(Mask),
41670fca6ea1SDimitry Andric               [&Mask](int M) { return M < 0 || M == Mask[0]; })))
41680fca6ea1SDimitry Andric     return 1;
41690fca6ea1SDimitry Andric 
41700b57cec5SDimitry Andric   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4171fe6060f1SDimitry Andric       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4172bdd1243dSDimitry Andric       Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
41730b57cec5SDimitry Andric     static const CostTblEntry ShuffleTbl[] = {
41740b57cec5SDimitry Andric         // Broadcast shuffle kinds can be performed with 'dup'.
41750b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v8i8, 1},
41760b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v16i8, 1},
41770b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4i16, 1},
41780b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v8i16, 1},
41790b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2i32, 1},
41800b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4i32, 1},
41810b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2i64, 1},
418206c3fb27SDimitry Andric         {TTI::SK_Broadcast, MVT::v4f16, 1},
418306c3fb27SDimitry Andric         {TTI::SK_Broadcast, MVT::v8f16, 1},
41840b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2f32, 1},
41850b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4f32, 1},
41860b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2f64, 1},
41870b57cec5SDimitry Andric         // Transpose shuffle kinds can be performed with 'trn1/trn2' and
41880b57cec5SDimitry Andric         // 'zip1/zip2' instructions.
41890b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v8i8, 1},
41900b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v16i8, 1},
41910b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4i16, 1},
41920b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v8i16, 1},
41930b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2i32, 1},
41940b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4i32, 1},
41950b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2i64, 1},
419606c3fb27SDimitry Andric         {TTI::SK_Transpose, MVT::v4f16, 1},
419706c3fb27SDimitry Andric         {TTI::SK_Transpose, MVT::v8f16, 1},
41980b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2f32, 1},
41990b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4f32, 1},
42000b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2f64, 1},
42010b57cec5SDimitry Andric         // Select shuffle kinds.
42020b57cec5SDimitry Andric         // TODO: handle vXi8/vXi16.
42030b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2i32, 1}, // mov.
42040b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
42050b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2i64, 1}, // mov.
42060b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2f32, 1}, // mov.
42070b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
42080b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2f64, 1}, // mov.
42090b57cec5SDimitry Andric         // PermuteSingleSrc shuffle kinds.
42100b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
42110b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
42120b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
42130b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
42140b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
42150b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4216fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4217fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // perfectshuffle worst case.
4219fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8},  // constpool + load + tbl
4220fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8},  // constpool + load + tbl
4221fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4222fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8},   // constpool + load + tbl
4223fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8},  // constpool + load + tbl
4224fe6060f1SDimitry Andric         // Reverse can be lowered with `rev`.
4225bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4226fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4227bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4228bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4229fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4230bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
423181ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
423281ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
423381ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
423481ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
423581ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
423681ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8i8, 1},  // REV64
4237bdd1243dSDimitry Andric         // Splice can all be lowered as `ext`.
4238bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2i32, 1},
4239bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4i32, 1},
4240bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2i64, 1},
4241bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2f32, 1},
4242bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4f32, 1},
4243bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2f64, 1},
4244bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8f16, 1},
4245bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8bf16, 1},
4246bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8i16, 1},
4247bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v16i8, 1},
4248bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4bf16, 1},
4249bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4f16, 1},
4250bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4i16, 1},
4251bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8i8, 1},
4252fe6060f1SDimitry Andric         // Broadcast shuffle kinds for scalable vectors
4253fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4254fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4255fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4256fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4257fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4258fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4259fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4260fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4261fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4262fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4263fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4264fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4265fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4266fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4267fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4268fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4269fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4270fe6060f1SDimitry Andric         // Handle the cases for vector.reverse with scalable vectors
4271fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv16i8, 1},
4272fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8i16, 1},
4273fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4i32, 1},
4274fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2i64, 1},
4275fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f16, 1},
4276fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4f16, 1},
4277fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8f16, 1},
4278fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4279fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4280fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4281fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f32, 1},
4282fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4f32, 1},
4283fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f64, 1},
4284fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv16i1, 1},
4285fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8i1, 1},
4286fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4i1, 1},
4287fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2i1, 1},
42880b57cec5SDimitry Andric     };
42890b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
42900b57cec5SDimitry Andric       return LT.first * Entry->Cost;
42910b57cec5SDimitry Andric   }
429281ad6265SDimitry Andric 
4293fe6060f1SDimitry Andric   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4294fe6060f1SDimitry Andric     return getSpliceCost(Tp, Index);
429581ad6265SDimitry Andric 
  // Inserting a subvector can often be done with a single D, S or H register
  // move, so long as the inserted vector is "aligned".
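  // For example (illustrative), inserting a v2f32 subvector at element 2 of
  // a v4f32 is a single 64-bit register move: mov v0.d[1], v1.d[0].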
429881ad6265SDimitry Andric   if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
429981ad6265SDimitry Andric       LT.second.getSizeInBits() <= 128 && SubTp) {
4300bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
430181ad6265SDimitry Andric     if (SubLT.second.isVector()) {
430281ad6265SDimitry Andric       int NumElts = LT.second.getVectorNumElements();
430381ad6265SDimitry Andric       int NumSubElts = SubLT.second.getVectorNumElements();
430481ad6265SDimitry Andric       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
430581ad6265SDimitry Andric         return SubLT.first;
430681ad6265SDimitry Andric     }
430781ad6265SDimitry Andric   }
430881ad6265SDimitry Andric 
  // Restore the original kind before falling back to the base implementation.
43100fca6ea1SDimitry Andric   if (IsExtractSubvector)
43110fca6ea1SDimitry Andric     Kind = TTI::SK_ExtractSubvector;
43120fca6ea1SDimitry Andric   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
43130fca6ea1SDimitry Andric                                CxtI);
43140b57cec5SDimitry Andric }
4315fcaf7f86SDimitry Andric 
431606c3fb27SDimitry Andric static bool containsDecreasingPointers(Loop *TheLoop,
431706c3fb27SDimitry Andric                                        PredicatedScalarEvolution *PSE) {
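  // For example (illustrative), in a loop such as
  //   for (i = n - 1; i >= 0; i--)
  //     dst[i] = src[i] + 1.0f;
  // both access pointers have a stride of -1, so this returns true.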
  const DenseMap<Value *, const SCEV *> Strides;
431906c3fb27SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks()) {
432006c3fb27SDimitry Andric     // Scan the instructions in the block and look for addresses that are
432106c3fb27SDimitry Andric     // consecutive and decreasing.
432206c3fb27SDimitry Andric     for (Instruction &I : *BB) {
432306c3fb27SDimitry Andric       if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
432406c3fb27SDimitry Andric         Value *Ptr = getLoadStorePointerOperand(&I);
432506c3fb27SDimitry Andric         Type *AccessTy = getLoadStoreType(&I);
432606c3fb27SDimitry Andric         if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
432706c3fb27SDimitry Andric                          /*ShouldCheckWrap=*/false)
432806c3fb27SDimitry Andric                 .value_or(0) < 0)
432906c3fb27SDimitry Andric           return true;
433006c3fb27SDimitry Andric       }
433106c3fb27SDimitry Andric     }
433206c3fb27SDimitry Andric   }
433306c3fb27SDimitry Andric   return false;
433406c3fb27SDimitry Andric }
433506c3fb27SDimitry Andric 
433606c3fb27SDimitry Andric bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
433706c3fb27SDimitry Andric   if (!ST->hasSVE())
4338fcaf7f86SDimitry Andric     return false;
4339fcaf7f86SDimitry Andric 
  // We don't currently support vectorisation with interleaving for SVE; with
  // such loops we're better off not using tail-folding. This gives us a
  // chance to fall back on fixed-width vectorisation using NEON's
  // ld2/st2/etc.
434306c3fb27SDimitry Andric   if (TFI->IAI->hasGroups())
4344bdd1243dSDimitry Andric     return false;
4345bdd1243dSDimitry Andric 
434606c3fb27SDimitry Andric   TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (!TFI->LVL->getReductionVars().empty())
434806c3fb27SDimitry Andric     Required |= TailFoldingOpts::Reductions;
  if (!TFI->LVL->getFixedOrderRecurrences().empty())
435006c3fb27SDimitry Andric     Required |= TailFoldingOpts::Recurrences;
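  // For example (illustrative), a loop with a reduction but no recurrences
  // and no negative strides ends up with Required == Reductions, which the
  // effective tail-folding options (the target default, possibly overridden
  // via -sve-tail-folding) must then permit.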
4351fcaf7f86SDimitry Andric 
  // We call this to discover whether any load/store pointers in the loop
  // have negative strides. If they do, tail-folding needs extra work to
  // reverse the loop predicate, which may be expensive.
435506c3fb27SDimitry Andric   if (containsDecreasingPointers(TFI->LVL->getLoop(),
435606c3fb27SDimitry Andric                                  TFI->LVL->getPredicatedScalarEvolution()))
435706c3fb27SDimitry Andric     Required |= TailFoldingOpts::Reverse;
435806c3fb27SDimitry Andric   if (Required == TailFoldingOpts::Disabled)
435906c3fb27SDimitry Andric     Required |= TailFoldingOpts::Simple;
436006c3fb27SDimitry Andric 
436106c3fb27SDimitry Andric   if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
436206c3fb27SDimitry Andric                                       Required))
436306c3fb27SDimitry Andric     return false;
436406c3fb27SDimitry Andric 
436506c3fb27SDimitry Andric   // Don't tail-fold for tight loops where we would be better off interleaving
436606c3fb27SDimitry Andric   // with an unpredicated loop.
436706c3fb27SDimitry Andric   unsigned NumInsns = 0;
436806c3fb27SDimitry Andric   for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
436906c3fb27SDimitry Andric     NumInsns += BB->sizeWithoutDebug();
437006c3fb27SDimitry Andric   }
437106c3fb27SDimitry Andric 
  // We expect 4 of these to be the IV PHI, IV add, IV compare and branch.
437306c3fb27SDimitry Andric   return NumInsns >= SVETailFoldInsnThreshold;
4374fcaf7f86SDimitry Andric }
4375bdd1243dSDimitry Andric 
4376bdd1243dSDimitry Andric InstructionCost
4377bdd1243dSDimitry Andric AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
43780fca6ea1SDimitry Andric                                      StackOffset BaseOffset, bool HasBaseReg,
4379bdd1243dSDimitry Andric                                      int64_t Scale, unsigned AddrSpace) const {
4380bdd1243dSDimitry Andric   // Scaling factors are not free at all.
4381bdd1243dSDimitry Andric   // Operands                     | Rt Latency
4382bdd1243dSDimitry Andric   // -------------------------------------------
4383bdd1243dSDimitry Andric   // Rt, [Xn, Xm]                 | 4
4384bdd1243dSDimitry Andric   // -------------------------------------------
4385bdd1243dSDimitry Andric   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
4386bdd1243dSDimitry Andric   // Rt, [Xn, Wm, <extend> #imm]  |
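  // For example (illustrative), the scaled-index form
  //   ldr x0, [x1, x2, lsl #3]
  // has extra latency on the index register compared to
  //   ldr x0, [x1, x2]
  // so a non-trivial scale is costed as 1 rather than 0 below.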
4387bdd1243dSDimitry Andric   TargetLoweringBase::AddrMode AM;
4388bdd1243dSDimitry Andric   AM.BaseGV = BaseGV;
43890fca6ea1SDimitry Andric   AM.BaseOffs = BaseOffset.getFixed();
4390bdd1243dSDimitry Andric   AM.HasBaseReg = HasBaseReg;
4391bdd1243dSDimitry Andric   AM.Scale = Scale;
43920fca6ea1SDimitry Andric   AM.ScalableOffset = BaseOffset.getScalable();
4393bdd1243dSDimitry Andric   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, so charge a cost of 1 when the scale
    // is neither 0 nor 1.
4396bdd1243dSDimitry Andric     return AM.Scale != 0 && AM.Scale != 1;
4397bdd1243dSDimitry Andric   return -1;
4398bdd1243dSDimitry Andric }
43997a6dacacSDimitry Andric 
44007a6dacacSDimitry Andric bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  // For binary operators (e.g. or) we need to be more careful than with
  // selects; here we only transform them if they are already at a natural
  // break point in the code, i.e. the end of a block whose terminator is an
  // unconditional branch.
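  // For example (illustrative), IR such as
  //   %c = or i1 %a, %b
  //   br label %next
  // ends the block with an unconditional branch, so the 'or' may be treated
  // like a select.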
44057a6dacacSDimitry Andric   if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
44067a6dacacSDimitry Andric       isa<BranchInst>(I->getNextNode()) &&
44077a6dacacSDimitry Andric       cast<BranchInst>(I->getNextNode())->isUnconditional())
44087a6dacacSDimitry Andric     return true;
44097a6dacacSDimitry Andric   return BaseT::shouldTreatInstructionLikeSelect(I);
44107a6dacacSDimitry Andric }
44110fca6ea1SDimitry Andric 
44120fca6ea1SDimitry Andric bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
44130fca6ea1SDimitry Andric                                    const TargetTransformInfo::LSRCost &C2) {
  // What is AArch64-specific here is adding the number of instructions to
  // the comparison (though not as the first consideration, as some targets
  // do) and changing the priority of the number of base additions.
  // TODO: Maybe a more nuanced tradeoff between instruction count and
  // number of registers? To be investigated at a later date.
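  // For example (illustrative), std::tie compares lexicographically: a
  // solution using fewer registers always wins, ties fall through to the
  // instruction count, then to the number of base adds, and so on.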
44190fca6ea1SDimitry Andric   if (EnableLSRCostOpt)
44200fca6ea1SDimitry Andric     return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
44210fca6ea1SDimitry Andric                     C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
44220fca6ea1SDimitry Andric            std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
44230fca6ea1SDimitry Andric                     C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
44240fca6ea1SDimitry Andric 
44250fca6ea1SDimitry Andric   return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
44260fca6ea1SDimitry Andric }
4427