//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
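  //
  // For example (following the parsing in operator= below):
  // "-sve-tail-folding=all+noreverse" sets InitialBits to TailFoldingOpts::All
  // and adds TailFoldingOpts::Reverse to DisableBits, whereas
  // "-sve-tail-folding=default+reductions" keeps NeedsDefault set and adds
  // TailFoldingOpts::Reductions to EnableBits.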
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
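  // For example, a callee whose interface is streaming-compatible but whose
  // body is locally streaming (aarch64_pstate_sm_body) executes in streaming
  // mode once inlined, so for the checks below treat it as a streaming
  // (SM_Enabled) function rather than a streaming-compatible one.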
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1)  F:
  //          call from F -> G (the call here is Call)
  //
  //      For (1), Call.getCaller() == F, so it will always return a high cost
  //      if a streaming-mode change is required (thus promoting the need to
  //      inline the function)
  //
  // (2)  F:
  //          call from F -> G (the call here is not Call)
  //      G:
  //          call from G -> H (the call here is Call)
  //
  //      For (2), if after inlining the body of G into F the call to H requires
  //      a streaming-mode change, and the call to G from F would also require a
  //      streaming-mode change, then there is benefit to do the streaming-mode
  //      change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
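/// For example, a 128-bit immediate is split into two 64-bit chunks that are
/// each costed independently via the helper above, so a constant whose high
/// chunk is all-zero only pays for materializing the low chunk.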
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
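  // (AArch64 shift instructions encode an immediate shift amount directly in
  // the instruction, so the amount never needs to be materialized.)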
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements

  // Only allow (32b and 64b) integers or pointers for now...
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
      (EltTy->getScalarSizeInBits() != 32 &&
       EltTy->getScalarSizeInBits() != 64))
    return InstructionCost::getInvalid();

  // FIXME: Hacky check for legal vector types. We can promote smaller types
  // but we cannot legalize vectors via splitting for histcnt.
  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
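  // In other words, the check below only accepts scalable bucket-pointer
  // vectors with a known-minimum element count of 2 or 4, e.g.
  // <vscale x 2 x ptr> or <vscale x 4 x ptr>.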
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())
      return InstructionCost::getInvalid();

  return InstructionCost(BaseHistCntCost);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
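    // ("Unpacked" here means a scalable type whose known minimum size is less
    // than a full 128-bit SVE register, e.g. <vscale x 2 x i32>.)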
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret casts (convert.to/from.svbool) whose operand
/// is a PHI node, i.e. in the presence of control flow.
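///
/// A minimal sketch of the rewrite, for a <vscale x 4 x i1> result:
///   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb1 ], [ %b.sv, %bb2 ]
///   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
/// where %a.sv and %b.sv are convert.to.svbool of <vscale x 4 x i1> values,
/// becomes a single PHI over those narrower values, making the conversion
/// pair dead.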
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  if (!BinOp)
    return std::nullopt;

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(II, NarrowedBinOp);
}

static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
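  // (svcount_t is modelled as a target extension type rather than a scalable
  // vector type, so the lane-count reasoning below does not apply to it.)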
97106c3fb27SDimitry Andric   if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
97206c3fb27SDimitry Andric       isa<TargetExtType>(II.getType()))
97306c3fb27SDimitry Andric     return std::nullopt;
97406c3fb27SDimitry Andric 
975fe6060f1SDimitry Andric   SmallVector<Instruction *, 32> CandidatesForRemoval;
976fe6060f1SDimitry Andric   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
977fe6060f1SDimitry Andric 
978fe6060f1SDimitry Andric   const auto *IVTy = cast<VectorType>(II.getType());
979fe6060f1SDimitry Andric 
980fe6060f1SDimitry Andric   // Walk the chain of conversions.
981fe6060f1SDimitry Andric   while (Cursor) {
982fe6060f1SDimitry Andric     // If the type of the cursor has fewer lanes than the final result, zeroing
983fe6060f1SDimitry Andric     // must take place, which breaks the equivalence chain.
984fe6060f1SDimitry Andric     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
985fe6060f1SDimitry Andric     if (CursorVTy->getElementCount().getKnownMinValue() <
986fe6060f1SDimitry Andric         IVTy->getElementCount().getKnownMinValue())
987fe6060f1SDimitry Andric       break;
988fe6060f1SDimitry Andric 
989fe6060f1SDimitry Andric     // If the cursor has the same type as I, it is a viable replacement.
990fe6060f1SDimitry Andric     if (Cursor->getType() == IVTy)
991fe6060f1SDimitry Andric       EarliestReplacement = Cursor;
992fe6060f1SDimitry Andric 
993fe6060f1SDimitry Andric     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
994fe6060f1SDimitry Andric 
995fe6060f1SDimitry Andric     // If this is not an SVE conversion intrinsic, this is the end of the chain.
996fe6060f1SDimitry Andric     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
997fe6060f1SDimitry Andric                                   Intrinsic::aarch64_sve_convert_to_svbool ||
998fe6060f1SDimitry Andric                               IntrinsicCursor->getIntrinsicID() ==
999fe6060f1SDimitry Andric                                   Intrinsic::aarch64_sve_convert_from_svbool))
1000fe6060f1SDimitry Andric       break;
1001fe6060f1SDimitry Andric 
1002fe6060f1SDimitry Andric     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1003fe6060f1SDimitry Andric     Cursor = IntrinsicCursor->getOperand(0);
1004fe6060f1SDimitry Andric   }
1005fe6060f1SDimitry Andric 
1006fe6060f1SDimitry Andric   // If no viable replacement in the conversion chain was found, there is
1007fe6060f1SDimitry Andric   // nothing to do.
1008fe6060f1SDimitry Andric   if (!EarliestReplacement)
1009bdd1243dSDimitry Andric     return std::nullopt;
1010fe6060f1SDimitry Andric 
1011fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, EarliestReplacement);
1012fe6060f1SDimitry Andric }
1013fe6060f1SDimitry Andric 
10145f757f3fSDimitry Andric static bool isAllActivePredicate(Value *Pred) {
10155f757f3fSDimitry Andric   // Look through the convert.from.svbool(convert.to.svbool(...)) chain.
10165f757f3fSDimitry Andric   Value *UncastedPred;
10175f757f3fSDimitry Andric   if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
10185f757f3fSDimitry Andric                       m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
10195f757f3fSDimitry Andric                           m_Value(UncastedPred)))))
10205f757f3fSDimitry Andric     // If the predicate has the same or fewer lanes than the uncasted
10215f757f3fSDimitry Andric     // predicate then we know the casting has no effect.
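    // E.g. with %p : <vscale x 4 x i1>, to_svbool widens to <vscale x 16 x i1>
    // and from_svbool merely drops the zeroed lanes again, so the round trip
    // cannot change which lanes are active (illustrative example).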
10225f757f3fSDimitry Andric     if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
10235f757f3fSDimitry Andric         cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
10245f757f3fSDimitry Andric       Pred = UncastedPred;
10255f757f3fSDimitry Andric 
10265f757f3fSDimitry Andric   return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
10275f757f3fSDimitry Andric                          m_ConstantInt<AArch64SVEPredPattern::all>()));
10285f757f3fSDimitry Andric }
10295f757f3fSDimitry Andric 
10300fca6ea1SDimitry Andric // Erase unary operations where the predicate has all inactive lanes.
10310fca6ea1SDimitry Andric static std::optional<Instruction *>
10320fca6ea1SDimitry Andric instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
10330fca6ea1SDimitry Andric                                  int PredPos) {
10340fca6ea1SDimitry Andric   if (match(II.getOperand(PredPos), m_ZeroInt())) {
10350fca6ea1SDimitry Andric     return IC.eraseInstFromFunction(II);
10360fca6ea1SDimitry Andric   }
10370fca6ea1SDimitry Andric   return std::nullopt;
10380fca6ea1SDimitry Andric }
10390fca6ea1SDimitry Andric 
10400fca6ea1SDimitry Andric // Simplify unary operations where the predicate has all inactive lanes by
10410fca6ea1SDimitry Andric // replacing the instruction with a zeroed object.
10420fca6ea1SDimitry Andric static std::optional<Instruction *>
10430fca6ea1SDimitry Andric instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
10440fca6ea1SDimitry Andric   if (match(II.getOperand(0), m_ZeroInt())) {
10450fca6ea1SDimitry Andric     Constant *Node;
10460fca6ea1SDimitry Andric     Type *RetTy = II.getType();
10470fca6ea1SDimitry Andric     if (RetTy->isStructTy()) {
10480fca6ea1SDimitry Andric       auto StructT = cast<StructType>(RetTy);
10490fca6ea1SDimitry Andric       auto VecT = StructT->getElementType(0);
10500fca6ea1SDimitry Andric       SmallVector<llvm::Constant *, 4> ZerVec;
10510fca6ea1SDimitry Andric       for (unsigned i = 0; i < StructT->getNumElements(); i++) {
10520fca6ea1SDimitry Andric         ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
10530fca6ea1SDimitry Andric                                                   : ConstantInt::get(VecT, 0));
10540fca6ea1SDimitry Andric       }
10550fca6ea1SDimitry Andric       Node = ConstantStruct::get(StructT, ZerVec);
10560fca6ea1SDimitry Andric     } else if (RetTy->isFPOrFPVectorTy())
10570fca6ea1SDimitry Andric       Node = ConstantFP::get(RetTy, 0.0);
10580fca6ea1SDimitry Andric     else
10590fca6ea1SDimitry Andric       Node = ConstantInt::get(II.getType(), 0);
10600fca6ea1SDimitry Andric 
10610fca6ea1SDimitry Andric     IC.replaceInstUsesWith(II, Node);
10620fca6ea1SDimitry Andric     return IC.eraseInstFromFunction(II);
10630fca6ea1SDimitry Andric   }
10640fca6ea1SDimitry Andric   return std::nullopt;
10650fca6ea1SDimitry Andric }
10660fca6ea1SDimitry Andric 
1067bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
106881ad6265SDimitry Andric                                                       IntrinsicInst &II) {
10695f757f3fSDimitry Andric   // svsel(ptrue, x, y) => x
10705f757f3fSDimitry Andric   auto *OpPredicate = II.getOperand(0);
10715f757f3fSDimitry Andric   if (isAllActivePredicate(OpPredicate))
10725f757f3fSDimitry Andric     return IC.replaceInstUsesWith(II, II.getOperand(1));
10735f757f3fSDimitry Andric 
10745f757f3fSDimitry Andric   auto Select =
10755f757f3fSDimitry Andric       IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
107681ad6265SDimitry Andric   return IC.replaceInstUsesWith(II, Select);
107781ad6265SDimitry Andric }
107881ad6265SDimitry Andric 
1079bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1080fe6060f1SDimitry Andric                                                       IntrinsicInst &II) {
1081fe6060f1SDimitry Andric   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1082fe6060f1SDimitry Andric   if (!Pg)
1083bdd1243dSDimitry Andric     return std::nullopt;
1084fe6060f1SDimitry Andric 
1085fe6060f1SDimitry Andric   if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1086bdd1243dSDimitry Andric     return std::nullopt;
1087fe6060f1SDimitry Andric 
1088fe6060f1SDimitry Andric   const auto PTruePattern =
1089fe6060f1SDimitry Andric       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1090fe6060f1SDimitry Andric   if (PTruePattern != AArch64SVEPredPattern::vl1)
1091bdd1243dSDimitry Andric     return std::nullopt;
1092fe6060f1SDimitry Andric 
1093fe6060f1SDimitry Andric   // The intrinsic is inserting into lane zero so use an insert instead.
1094fe6060f1SDimitry Andric   auto *IdxTy = Type::getInt64Ty(II.getContext());
1095fe6060f1SDimitry Andric   auto *Insert = InsertElementInst::Create(
1096fe6060f1SDimitry Andric       II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1097fe6060f1SDimitry Andric   Insert->insertBefore(&II);
1098fe6060f1SDimitry Andric   Insert->takeName(&II);
1099fe6060f1SDimitry Andric 
1100fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, Insert);
1101fe6060f1SDimitry Andric }
1102fe6060f1SDimitry Andric 
1103bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1104349cc55cSDimitry Andric                                                        IntrinsicInst &II) {
1105349cc55cSDimitry Andric   // Replace DupX with a regular IR splat.
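  // E.g. (illustrative types):
  //   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
  // becomes the generic insertelement + shufflevector splat idiom, which the
  // rest of the optimizer understands.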
1106349cc55cSDimitry Andric   auto *RetTy = cast<ScalableVectorType>(II.getType());
110706c3fb27SDimitry Andric   Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
110806c3fb27SDimitry Andric                                               II.getArgOperand(0));
1109349cc55cSDimitry Andric   Splat->takeName(&II);
1110349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, Splat);
1111349cc55cSDimitry Andric }
1112349cc55cSDimitry Andric 
1113bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1114fe6060f1SDimitry Andric                                                         IntrinsicInst &II) {
1115fe6060f1SDimitry Andric   LLVMContext &Ctx = II.getContext();
1116fe6060f1SDimitry Andric 
1117fe6060f1SDimitry Andric   // Check that the predicate is all active
1118fe6060f1SDimitry Andric   auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1119fe6060f1SDimitry Andric   if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1120bdd1243dSDimitry Andric     return std::nullopt;
1121fe6060f1SDimitry Andric 
1122fe6060f1SDimitry Andric   const auto PTruePattern =
1123fe6060f1SDimitry Andric       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1124fe6060f1SDimitry Andric   if (PTruePattern != AArch64SVEPredPattern::all)
1125bdd1243dSDimitry Andric     return std::nullopt;
1126fe6060f1SDimitry Andric 
1127fe6060f1SDimitry Andric   // Check that we have a compare of zero..
1128349cc55cSDimitry Andric   auto *SplatValue =
1129349cc55cSDimitry Andric       dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1130349cc55cSDimitry Andric   if (!SplatValue || !SplatValue->isZero())
1131bdd1243dSDimitry Andric     return std::nullopt;
1132fe6060f1SDimitry Andric 
1133fe6060f1SDimitry Andric   // ..against a dupq
1134fe6060f1SDimitry Andric   auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1135fe6060f1SDimitry Andric   if (!DupQLane ||
1136fe6060f1SDimitry Andric       DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1137bdd1243dSDimitry Andric     return std::nullopt;
1138fe6060f1SDimitry Andric 
1139fe6060f1SDimitry Andric   // Where the dupq is a lane 0 replicate of a vector insert
1140fe6060f1SDimitry Andric   if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1141bdd1243dSDimitry Andric     return std::nullopt;
1142fe6060f1SDimitry Andric 
1143fe6060f1SDimitry Andric   auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
114481ad6265SDimitry Andric   if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1145bdd1243dSDimitry Andric     return std::nullopt;
1146fe6060f1SDimitry Andric 
1147fe6060f1SDimitry Andric   // Where the vector insert is a fixed constant vector insert into undef at
1148fe6060f1SDimitry Andric   // index zero
1149fe6060f1SDimitry Andric   if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1150bdd1243dSDimitry Andric     return std::nullopt;
1151fe6060f1SDimitry Andric 
1152fe6060f1SDimitry Andric   if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1153bdd1243dSDimitry Andric     return std::nullopt;
1154fe6060f1SDimitry Andric 
1155fe6060f1SDimitry Andric   auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1156fe6060f1SDimitry Andric   if (!ConstVec)
1157bdd1243dSDimitry Andric     return std::nullopt;
1158fe6060f1SDimitry Andric 
1159fe6060f1SDimitry Andric   auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1160fe6060f1SDimitry Andric   auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1161fe6060f1SDimitry Andric   if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1162bdd1243dSDimitry Andric     return std::nullopt;
1163fe6060f1SDimitry Andric 
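  // A note on the expansion below (illustrative summary): an SVE predicate
  // has one bit per byte of a 128-bit granule, i.e. 16 bits per granule, and
  // an element of width W bytes owns W consecutive predicate bits. That is
  // why element I of the fixed constant vector maps to predicate bit
  // I * (16 / NumElts).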
1164fe6060f1SDimitry Andric   unsigned NumElts = VecTy->getNumElements();
1165fe6060f1SDimitry Andric   unsigned PredicateBits = 0;
1166fe6060f1SDimitry Andric 
1167fe6060f1SDimitry Andric   // Expand intrinsic operands to a 16-bit byte level predicate
1168fe6060f1SDimitry Andric   for (unsigned I = 0; I < NumElts; ++I) {
1169fe6060f1SDimitry Andric     auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1170fe6060f1SDimitry Andric     if (!Arg)
1171bdd1243dSDimitry Andric       return std::nullopt;
1172fe6060f1SDimitry Andric     if (!Arg->isZero())
1173fe6060f1SDimitry Andric       PredicateBits |= 1 << (I * (16 / NumElts));
1174fe6060f1SDimitry Andric   }
1175fe6060f1SDimitry Andric 
1176fe6060f1SDimitry Andric   // If all bits are zero bail early with an empty predicate
1177fe6060f1SDimitry Andric   if (PredicateBits == 0) {
1178fe6060f1SDimitry Andric     auto *PFalse = Constant::getNullValue(II.getType());
1179fe6060f1SDimitry Andric     PFalse->takeName(&II);
1180fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, PFalse);
1181fe6060f1SDimitry Andric   }
1182fe6060f1SDimitry Andric 
1183fe6060f1SDimitry Andric   // Calculate largest predicate type used (where byte predicate is largest)
1184fe6060f1SDimitry Andric   unsigned Mask = 8;
1185fe6060f1SDimitry Andric   for (unsigned I = 0; I < 16; ++I)
1186fe6060f1SDimitry Andric     if ((PredicateBits & (1 << I)) != 0)
1187fe6060f1SDimitry Andric       Mask |= (I % 8);
1188fe6060f1SDimitry Andric 
1189fe6060f1SDimitry Andric   unsigned PredSize = Mask & -Mask;
1190fe6060f1SDimitry Andric   auto *PredType = ScalableVectorType::get(
1191fe6060f1SDimitry Andric       Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1192fe6060f1SDimitry Andric 
1193fe6060f1SDimitry Andric   // Ensure all relevant bits are set
1194fe6060f1SDimitry Andric   for (unsigned I = 0; I < 16; I += PredSize)
1195fe6060f1SDimitry Andric     if ((PredicateBits & (1 << I)) == 0)
1196bdd1243dSDimitry Andric       return std::nullopt;
1197fe6060f1SDimitry Andric 
1198fe6060f1SDimitry Andric   auto *PTruePat =
1199fe6060f1SDimitry Andric       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
120006c3fb27SDimitry Andric   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1201fe6060f1SDimitry Andric                                            {PredType}, {PTruePat});
120206c3fb27SDimitry Andric   auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1203fe6060f1SDimitry Andric       Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1204fe6060f1SDimitry Andric   auto *ConvertFromSVBool =
120506c3fb27SDimitry Andric       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1206fe6060f1SDimitry Andric                                  {II.getType()}, {ConvertToSVBool});
1207fe6060f1SDimitry Andric 
1208fe6060f1SDimitry Andric   ConvertFromSVBool->takeName(&II);
1209fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1210fe6060f1SDimitry Andric }
1211fe6060f1SDimitry Andric 
1212bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1213fe6060f1SDimitry Andric                                                        IntrinsicInst &II) {
1214fe6060f1SDimitry Andric   Value *Pg = II.getArgOperand(0);
1215fe6060f1SDimitry Andric   Value *Vec = II.getArgOperand(1);
1216349cc55cSDimitry Andric   auto IntrinsicID = II.getIntrinsicID();
1217349cc55cSDimitry Andric   bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1218fe6060f1SDimitry Andric 
1219fe6060f1SDimitry Andric   // lastX(splat(X)) --> X
1220fe6060f1SDimitry Andric   if (auto *SplatVal = getSplatValue(Vec))
1221fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, SplatVal);
1222fe6060f1SDimitry Andric 
1223349cc55cSDimitry Andric   // If x and/or y is a splat value then:
1224349cc55cSDimitry Andric   // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1225349cc55cSDimitry Andric   Value *LHS, *RHS;
1226349cc55cSDimitry Andric   if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1227349cc55cSDimitry Andric     if (isSplatValue(LHS) || isSplatValue(RHS)) {
1228349cc55cSDimitry Andric       auto *OldBinOp = cast<BinaryOperator>(Vec);
1229349cc55cSDimitry Andric       auto OpC = OldBinOp->getOpcode();
1230349cc55cSDimitry Andric       auto *NewLHS =
123106c3fb27SDimitry Andric           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1232349cc55cSDimitry Andric       auto *NewRHS =
123306c3fb27SDimitry Andric           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1234349cc55cSDimitry Andric       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
12350fca6ea1SDimitry Andric           OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1236349cc55cSDimitry Andric       return IC.replaceInstUsesWith(II, NewBinOp);
1237349cc55cSDimitry Andric     }
1238349cc55cSDimitry Andric   }
1239349cc55cSDimitry Andric 
1240fe6060f1SDimitry Andric   auto *C = dyn_cast<Constant>(Pg);
1241fe6060f1SDimitry Andric   if (IsAfter && C && C->isNullValue()) {
1242fe6060f1SDimitry Andric     // The intrinsic is extracting lane 0 so use an extract instead.
1243fe6060f1SDimitry Andric     auto *IdxTy = Type::getInt64Ty(II.getContext());
1244fe6060f1SDimitry Andric     auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1245fe6060f1SDimitry Andric     Extract->insertBefore(&II);
1246fe6060f1SDimitry Andric     Extract->takeName(&II);
1247fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, Extract);
1248fe6060f1SDimitry Andric   }
1249fe6060f1SDimitry Andric 
1250fe6060f1SDimitry Andric   auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1251fe6060f1SDimitry Andric   if (!IntrPG)
1252bdd1243dSDimitry Andric     return std::nullopt;
1253fe6060f1SDimitry Andric 
1254fe6060f1SDimitry Andric   if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1255bdd1243dSDimitry Andric     return std::nullopt;
1256fe6060f1SDimitry Andric 
1257fe6060f1SDimitry Andric   const auto PTruePattern =
1258fe6060f1SDimitry Andric       cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1259fe6060f1SDimitry Andric 
1260fe6060f1SDimitry Andric   // Can the intrinsic's predicate be converted to a known constant index?
1261349cc55cSDimitry Andric   unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1262349cc55cSDimitry Andric   if (!MinNumElts)
1263bdd1243dSDimitry Andric     return std::nullopt;
1264fe6060f1SDimitry Andric 
1265349cc55cSDimitry Andric   unsigned Idx = MinNumElts - 1;
1266fe6060f1SDimitry Andric   // Increment the index if extracting the element after the last active
1267fe6060f1SDimitry Andric   // predicate element.
1268fe6060f1SDimitry Andric   if (IsAfter)
1269fe6060f1SDimitry Andric     ++Idx;
1270fe6060f1SDimitry Andric 
1271fe6060f1SDimitry Andric   // Ignore extracts whose index is larger than the known minimum vector
1272fe6060f1SDimitry Andric   // length. NOTE: This is an artificial constraint where we prefer to
1273fe6060f1SDimitry Andric   // maintain what the user asked for until an alternative is proven faster.
1274fe6060f1SDimitry Andric   auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1275fe6060f1SDimitry Andric   if (Idx >= PgVTy->getMinNumElements())
1276bdd1243dSDimitry Andric     return std::nullopt;
1277fe6060f1SDimitry Andric 
1278fe6060f1SDimitry Andric   // The intrinsic is extracting a fixed lane so use an extract instead.
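  // E.g. with a ptrue.vl4 predicate (illustrative), lastb reads lane 3 and
  // lasta reads lane 4 (the element after the last active one), which the
  // bounds check above has already validated.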
1279fe6060f1SDimitry Andric   auto *IdxTy = Type::getInt64Ty(II.getContext());
1280fe6060f1SDimitry Andric   auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1281fe6060f1SDimitry Andric   Extract->insertBefore(&II);
1282fe6060f1SDimitry Andric   Extract->takeName(&II);
1283fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, Extract);
1284fe6060f1SDimitry Andric }
1285fe6060f1SDimitry Andric 
1286bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1287753f127fSDimitry Andric                                                            IntrinsicInst &II) {
1288753f127fSDimitry Andric   // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1289753f127fSDimitry Andric   // integer variant across a variety of micro-architectures. Replace scalar
1290753f127fSDimitry Andric   // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1291753f127fSDimitry Andric   // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1292753f127fSDimitry Andric   // depending on the micro-architecture, but has been observed as generally
1293753f127fSDimitry Andric   // being faster, particularly when the CLAST[AB] op is a loop-carried
1294753f127fSDimitry Andric   // dependency.
1295753f127fSDimitry Andric   Value *Pg = II.getArgOperand(0);
1296753f127fSDimitry Andric   Value *Fallback = II.getArgOperand(1);
1297753f127fSDimitry Andric   Value *Vec = II.getArgOperand(2);
1298753f127fSDimitry Andric   Type *Ty = II.getType();
1299753f127fSDimitry Andric 
1300753f127fSDimitry Andric   if (!Ty->isIntegerTy())
1301bdd1243dSDimitry Andric     return std::nullopt;
1302753f127fSDimitry Andric 
1303753f127fSDimitry Andric   Type *FPTy;
1304753f127fSDimitry Andric   switch (cast<IntegerType>(Ty)->getBitWidth()) {
1305753f127fSDimitry Andric   default:
1306bdd1243dSDimitry Andric     return std::nullopt;
1307753f127fSDimitry Andric   case 16:
130806c3fb27SDimitry Andric     FPTy = IC.Builder.getHalfTy();
1309753f127fSDimitry Andric     break;
1310753f127fSDimitry Andric   case 32:
131106c3fb27SDimitry Andric     FPTy = IC.Builder.getFloatTy();
1312753f127fSDimitry Andric     break;
1313753f127fSDimitry Andric   case 64:
131406c3fb27SDimitry Andric     FPTy = IC.Builder.getDoubleTy();
1315753f127fSDimitry Andric     break;
1316753f127fSDimitry Andric   }
1317753f127fSDimitry Andric 
131806c3fb27SDimitry Andric   Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1319753f127fSDimitry Andric   auto *FPVTy = VectorType::get(
1320753f127fSDimitry Andric       FPTy, cast<VectorType>(Vec->getType())->getElementCount());
132106c3fb27SDimitry Andric   Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
132206c3fb27SDimitry Andric   auto *FPII = IC.Builder.CreateIntrinsic(
132306c3fb27SDimitry Andric       II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
132406c3fb27SDimitry Andric   Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1325753f127fSDimitry Andric   return IC.replaceInstUsesWith(II, FPIItoInt);
1326753f127fSDimitry Andric }
1327753f127fSDimitry Andric 
1328bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1329fe6060f1SDimitry Andric                                                      IntrinsicInst &II) {
1330fe6060f1SDimitry Andric   LLVMContext &Ctx = II.getContext();
1331fe6060f1SDimitry Andric   // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1332fe6060f1SDimitry Andric   // can work with RDFFR_PP for ptest elimination.
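  // I.e. rdffr() becomes rdffr.z(ptrue all), which should be equivalent
  // because an all-active governing predicate leaves every FFR element
  // visible (illustrative summary of the rewrite below).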
1333fe6060f1SDimitry Andric   auto *AllPat =
1334fe6060f1SDimitry Andric       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
133506c3fb27SDimitry Andric   auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1336fe6060f1SDimitry Andric                                            {II.getType()}, {AllPat});
1337fe6060f1SDimitry Andric   auto *RDFFR =
133806c3fb27SDimitry Andric       IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1339fe6060f1SDimitry Andric   RDFFR->takeName(&II);
1340fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, RDFFR);
1341fe6060f1SDimitry Andric }
1342fe6060f1SDimitry Andric 
1343bdd1243dSDimitry Andric static std::optional<Instruction *>
1344fe6060f1SDimitry Andric instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1345fe6060f1SDimitry Andric   const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1346fe6060f1SDimitry Andric 
1347fe6060f1SDimitry Andric   if (Pattern == AArch64SVEPredPattern::all) {
1348fe6060f1SDimitry Andric     Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
134906c3fb27SDimitry Andric     auto *VScale = IC.Builder.CreateVScale(StepVal);
1350fe6060f1SDimitry Andric     VScale->takeName(&II);
1351fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, VScale);
1352fe6060f1SDimitry Andric   }
1353fe6060f1SDimitry Andric 
1354349cc55cSDimitry Andric   unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1355fe6060f1SDimitry Andric 
1356349cc55cSDimitry Andric   return MinNumElts && NumElts >= MinNumElts
1357bdd1243dSDimitry Andric              ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1358fe6060f1SDimitry Andric                    II, ConstantInt::get(II.getType(), MinNumElts)))
1359bdd1243dSDimitry Andric              : std::nullopt;
1360fe6060f1SDimitry Andric }
1361fe6060f1SDimitry Andric 
1362bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1363fe6060f1SDimitry Andric                                                         IntrinsicInst &II) {
1364bdd1243dSDimitry Andric   Value *PgVal = II.getArgOperand(0);
1365bdd1243dSDimitry Andric   Value *OpVal = II.getArgOperand(1);
1366fe6060f1SDimitry Andric 
1367bdd1243dSDimitry Andric   // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1368bdd1243dSDimitry Andric   // Later optimizations prefer this form.
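  // (When both operands are the same value, the first and last active lanes
  // of X within X are trivially set whenever any lane is active, so all
  // three tests reduce to "any active".)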
1369bdd1243dSDimitry Andric   if (PgVal == OpVal &&
1370bdd1243dSDimitry Andric       (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1371bdd1243dSDimitry Andric        II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1372bdd1243dSDimitry Andric     Value *Ops[] = {PgVal, OpVal};
1373bdd1243dSDimitry Andric     Type *Tys[] = {PgVal->getType()};
1374bdd1243dSDimitry Andric 
1375bdd1243dSDimitry Andric     auto *PTest =
137606c3fb27SDimitry Andric         IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1377bdd1243dSDimitry Andric     PTest->takeName(&II);
1378bdd1243dSDimitry Andric 
1379bdd1243dSDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1380bdd1243dSDimitry Andric   }
1381bdd1243dSDimitry Andric 
1382bdd1243dSDimitry Andric   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1383bdd1243dSDimitry Andric   IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1384bdd1243dSDimitry Andric 
1385bdd1243dSDimitry Andric   if (!Pg || !Op)
1386bdd1243dSDimitry Andric     return std::nullopt;
1387bdd1243dSDimitry Andric 
1388bdd1243dSDimitry Andric   Intrinsic::ID OpIID = Op->getIntrinsicID();
1389bdd1243dSDimitry Andric 
1390bdd1243dSDimitry Andric   if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1391bdd1243dSDimitry Andric       OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1392bdd1243dSDimitry Andric       Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1393bdd1243dSDimitry Andric     Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1394bdd1243dSDimitry Andric     Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1395fe6060f1SDimitry Andric 
139606c3fb27SDimitry Andric     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1397fe6060f1SDimitry Andric 
1398fe6060f1SDimitry Andric     PTest->takeName(&II);
1399fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1400fe6060f1SDimitry Andric   }
1401fe6060f1SDimitry Andric 
1402bdd1243dSDimitry Andric   // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
1403bdd1243dSDimitry Andric   // Later optimizations may rewrite sequence to use the flag-setting variant
1404bdd1243dSDimitry Andric   // of instruction X to remove PTEST.
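  // E.g. (ptest.any PG, (brka.z PG, P1)) can later be folded to reuse the
  // flags already produced by the flag-setting BRK form, making the separate
  // PTEST redundant; the intrinsics listed below all have such flag-setting
  // twins (illustrative example).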
1405bdd1243dSDimitry Andric   if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1406bdd1243dSDimitry Andric       ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1407bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1408bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1409bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1410bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1411bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_and_z) ||
1412bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1413bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1414bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1415bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1416bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1417bdd1243dSDimitry Andric        (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1418bdd1243dSDimitry Andric     Value *Ops[] = {Pg->getArgOperand(0), Pg};
1419bdd1243dSDimitry Andric     Type *Tys[] = {Pg->getType()};
1420bdd1243dSDimitry Andric 
142106c3fb27SDimitry Andric     auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1422bdd1243dSDimitry Andric     PTest->takeName(&II);
1423bdd1243dSDimitry Andric 
1424bdd1243dSDimitry Andric     return IC.replaceInstUsesWith(II, PTest);
1425fe6060f1SDimitry Andric   }
1426fe6060f1SDimitry Andric 
1427bdd1243dSDimitry Andric   return std::nullopt;
1428bdd1243dSDimitry Andric }
1429bdd1243dSDimitry Andric 
1430bdd1243dSDimitry Andric template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1431bdd1243dSDimitry Andric static std::optional<Instruction *>
1432bdd1243dSDimitry Andric instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1433bdd1243dSDimitry Andric                                   bool MergeIntoAddendOp) {
1434349cc55cSDimitry Andric   Value *P = II.getOperand(0);
1435bdd1243dSDimitry Andric   Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1436bdd1243dSDimitry Andric   if (MergeIntoAddendOp) {
1437bdd1243dSDimitry Andric     AddendOp = II.getOperand(1);
1438bdd1243dSDimitry Andric     Mul = II.getOperand(2);
1439bdd1243dSDimitry Andric   } else {
1440bdd1243dSDimitry Andric     AddendOp = II.getOperand(2);
1441bdd1243dSDimitry Andric     Mul = II.getOperand(1);
1442bdd1243dSDimitry Andric   }
1443349cc55cSDimitry Andric 
1444bdd1243dSDimitry Andric   if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1445bdd1243dSDimitry Andric                                       m_Value(MulOp1))))
1446bdd1243dSDimitry Andric     return std::nullopt;
1447349cc55cSDimitry Andric 
1448bdd1243dSDimitry Andric   if (!Mul->hasOneUse())
1449bdd1243dSDimitry Andric     return std::nullopt;
1450bdd1243dSDimitry Andric 
1451bdd1243dSDimitry Andric   Instruction *FMFSource = nullptr;
1452bdd1243dSDimitry Andric   if (II.getType()->isFPOrFPVectorTy()) {
1453349cc55cSDimitry Andric     llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1454bdd1243dSDimitry Andric     // Stop the combine when the flags on the inputs differ in case dropping
1455bdd1243dSDimitry Andric     // flags would lead to us missing out on more beneficial optimizations.
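    // (The 'contract' requirement below is what licenses turning, e.g.,
    // fadd(a, fmul(b, c)) into a fused fmla(a, b, c): fusing skips the
    // intermediate rounding of the multiply, so it needs explicit permission.)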
1456bdd1243dSDimitry Andric     if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1457bdd1243dSDimitry Andric       return std::nullopt;
1458349cc55cSDimitry Andric     if (!FAddFlags.allowContract())
1459bdd1243dSDimitry Andric       return std::nullopt;
1460bdd1243dSDimitry Andric     FMFSource = &II;
1461bdd1243dSDimitry Andric   }
1462349cc55cSDimitry Andric 
1463bdd1243dSDimitry Andric   CallInst *Res;
1464bdd1243dSDimitry Andric   if (MergeIntoAddendOp)
146506c3fb27SDimitry Andric     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1466bdd1243dSDimitry Andric                                      {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1467bdd1243dSDimitry Andric   else
146806c3fb27SDimitry Andric     Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1469bdd1243dSDimitry Andric                                      {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1470bdd1243dSDimitry Andric 
1471bdd1243dSDimitry Andric   return IC.replaceInstUsesWith(II, Res);
1472349cc55cSDimitry Andric }
1473349cc55cSDimitry Andric 
1474bdd1243dSDimitry Andric static std::optional<Instruction *>
1475349cc55cSDimitry Andric instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1476349cc55cSDimitry Andric   Value *Pred = II.getOperand(0);
1477349cc55cSDimitry Andric   Value *PtrOp = II.getOperand(1);
1478349cc55cSDimitry Andric   Type *VecTy = II.getType();
1479349cc55cSDimitry Andric 
14800fca6ea1SDimitry Andric   // Replace by zero constant when all lanes are inactive
14810fca6ea1SDimitry Andric   if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
14820fca6ea1SDimitry Andric     return II_NA;
14830fca6ea1SDimitry Andric 
14840eae32dcSDimitry Andric   if (isAllActivePredicate(Pred)) {
148506c3fb27SDimitry Andric     LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
148681ad6265SDimitry Andric     Load->copyMetadata(II);
1487349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, Load);
1488349cc55cSDimitry Andric   }
1489349cc55cSDimitry Andric 
1490349cc55cSDimitry Andric   CallInst *MaskedLoad =
149106c3fb27SDimitry Andric       IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1492349cc55cSDimitry Andric                                   Pred, ConstantAggregateZero::get(VecTy));
149381ad6265SDimitry Andric   MaskedLoad->copyMetadata(II);
1494349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, MaskedLoad);
1495349cc55cSDimitry Andric }
1496349cc55cSDimitry Andric 
1497bdd1243dSDimitry Andric static std::optional<Instruction *>
1498349cc55cSDimitry Andric instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1499349cc55cSDimitry Andric   Value *VecOp = II.getOperand(0);
1500349cc55cSDimitry Andric   Value *Pred = II.getOperand(1);
1501349cc55cSDimitry Andric   Value *PtrOp = II.getOperand(2);
1502349cc55cSDimitry Andric 
15030eae32dcSDimitry Andric   if (isAllActivePredicate(Pred)) {
150406c3fb27SDimitry Andric     StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
150581ad6265SDimitry Andric     Store->copyMetadata(II);
1506349cc55cSDimitry Andric     return IC.eraseInstFromFunction(II);
1507349cc55cSDimitry Andric   }
1508349cc55cSDimitry Andric 
150906c3fb27SDimitry Andric   CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
151006c3fb27SDimitry Andric       VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
151181ad6265SDimitry Andric   MaskedStore->copyMetadata(II);
1512349cc55cSDimitry Andric   return IC.eraseInstFromFunction(II);
1513349cc55cSDimitry Andric }
1514349cc55cSDimitry Andric 
1515349cc55cSDimitry Andric static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1516349cc55cSDimitry Andric   switch (Intrinsic) {
151706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul_u:
1518349cc55cSDimitry Andric     return Instruction::BinaryOps::FMul;
151906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fadd_u:
1520349cc55cSDimitry Andric     return Instruction::BinaryOps::FAdd;
152106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub_u:
1522349cc55cSDimitry Andric     return Instruction::BinaryOps::FSub;
1523349cc55cSDimitry Andric   default:
1524349cc55cSDimitry Andric     return Instruction::BinaryOpsEnd;
1525349cc55cSDimitry Andric   }
1526349cc55cSDimitry Andric }
1527349cc55cSDimitry Andric 
1528bdd1243dSDimitry Andric static std::optional<Instruction *>
1529bdd1243dSDimitry Andric instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
153006c3fb27SDimitry Andric   // Bail due to missing support for ISD::STRICT_ scalable vector operations.
153106c3fb27SDimitry Andric   if (II.isStrictFP())
153206c3fb27SDimitry Andric     return std::nullopt;
153306c3fb27SDimitry Andric 
1534349cc55cSDimitry Andric   auto *OpPredicate = II.getOperand(0);
1535349cc55cSDimitry Andric   auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1536349cc55cSDimitry Andric   if (BinOpCode == Instruction::BinaryOpsEnd ||
1537349cc55cSDimitry Andric       !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1538349cc55cSDimitry Andric                               m_ConstantInt<AArch64SVEPredPattern::all>())))
1539bdd1243dSDimitry Andric     return std::nullopt;
154006c3fb27SDimitry Andric   IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
154106c3fb27SDimitry Andric   IC.Builder.setFastMathFlags(II.getFastMathFlags());
1542349cc55cSDimitry Andric   auto BinOp =
154306c3fb27SDimitry Andric       IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1544349cc55cSDimitry Andric   return IC.replaceInstUsesWith(II, BinOp);
1545349cc55cSDimitry Andric }
1546349cc55cSDimitry Andric 
154706c3fb27SDimitry Andric // Canonicalise operations that take an all active predicate (e.g. sve.add ->
154806c3fb27SDimitry Andric // sve.add_u).
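// With an all-active governing predicate the merging semantics of the
// predicated form are unobservable, so the intrinsic can simply be
// re-declared as its undef-inactive-lanes _u twin, e.g. (illustrative):
//   sve.add(ptrue-all, %a, %b)  -->  sve.add_u(ptrue-all, %a, %b)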
154906c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
155006c3fb27SDimitry Andric                                                             Intrinsic::ID IID) {
155106c3fb27SDimitry Andric   auto *OpPredicate = II.getOperand(0);
155206c3fb27SDimitry Andric   if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
155306c3fb27SDimitry Andric                               m_ConstantInt<AArch64SVEPredPattern::all>())))
155406c3fb27SDimitry Andric     return std::nullopt;
155506c3fb27SDimitry Andric 
155606c3fb27SDimitry Andric   auto *Mod = II.getModule();
155706c3fb27SDimitry Andric   auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
155806c3fb27SDimitry Andric   II.setCalledFunction(NewDecl);
155906c3fb27SDimitry Andric 
156006c3fb27SDimitry Andric   return &II;
156106c3fb27SDimitry Andric }
156206c3fb27SDimitry Andric 
1563297eecfbSDimitry Andric // Simplify operations where the predicate has all inactive lanes, or try to
1564297eecfbSDimitry Andric // replace with the _u form when all lanes are active.
1565297eecfbSDimitry Andric static std::optional<Instruction *>
1566297eecfbSDimitry Andric instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1567297eecfbSDimitry Andric                             Intrinsic::ID IID) {
1568297eecfbSDimitry Andric   if (match(II.getOperand(0), m_ZeroInt())) {
1569297eecfbSDimitry Andric     // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1570297eecfbSDimitry Andric     // inactive for sv[func]_m
1571297eecfbSDimitry Andric     return IC.replaceInstUsesWith(II, II.getOperand(1));
1572297eecfbSDimitry Andric   }
1573297eecfbSDimitry Andric   return instCombineSVEAllActive(II, IID);
1574297eecfbSDimitry Andric }
1575297eecfbSDimitry Andric 
1576bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1577349cc55cSDimitry Andric                                                             IntrinsicInst &II) {
1578297eecfbSDimitry Andric   if (auto II_U =
1579297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
158006c3fb27SDimitry Andric     return II_U;
158106c3fb27SDimitry Andric   if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
158206c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mla>(
158306c3fb27SDimitry Andric           IC, II, true))
158406c3fb27SDimitry Andric     return MLA;
158506c3fb27SDimitry Andric   if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
158606c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mad>(
158706c3fb27SDimitry Andric           IC, II, false))
158806c3fb27SDimitry Andric     return MAD;
158906c3fb27SDimitry Andric   return std::nullopt;
159006c3fb27SDimitry Andric }
159106c3fb27SDimitry Andric 
159206c3fb27SDimitry Andric static std::optional<Instruction *>
159306c3fb27SDimitry Andric instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1594297eecfbSDimitry Andric   if (auto II_U =
1595297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
159606c3fb27SDimitry Andric     return II_U;
1597bdd1243dSDimitry Andric   if (auto FMLA =
1598bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1599bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1600bdd1243dSDimitry Andric                                                                          true))
1601349cc55cSDimitry Andric     return FMLA;
1602bdd1243dSDimitry Andric   if (auto FMAD =
1603bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1604bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmad>(IC, II,
1605bdd1243dSDimitry Andric                                                                          false))
1606bdd1243dSDimitry Andric     return FMAD;
160706c3fb27SDimitry Andric   if (auto FMLA =
160806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
160906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
161006c3fb27SDimitry Andric                                                                          true))
161106c3fb27SDimitry Andric     return FMLA;
161206c3fb27SDimitry Andric   return std::nullopt;
161306c3fb27SDimitry Andric }
161406c3fb27SDimitry Andric 
161506c3fb27SDimitry Andric static std::optional<Instruction *>
161606c3fb27SDimitry Andric instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
161706c3fb27SDimitry Andric   if (auto FMLA =
161806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
161906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla>(IC, II,
162006c3fb27SDimitry Andric                                                                          true))
162106c3fb27SDimitry Andric     return FMLA;
162206c3fb27SDimitry Andric   if (auto FMAD =
162306c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
162406c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmad>(IC, II,
162506c3fb27SDimitry Andric                                                                          false))
162606c3fb27SDimitry Andric     return FMAD;
162706c3fb27SDimitry Andric   if (auto FMLA_U =
162806c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
162906c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmla_u>(
163006c3fb27SDimitry Andric               IC, II, true))
163106c3fb27SDimitry Andric     return FMLA_U;
1632349cc55cSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1633349cc55cSDimitry Andric }
1634349cc55cSDimitry Andric 
163506c3fb27SDimitry Andric static std::optional<Instruction *>
163606c3fb27SDimitry Andric instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1637297eecfbSDimitry Andric   if (auto II_U =
1638297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
163906c3fb27SDimitry Andric     return II_U;
1640bdd1243dSDimitry Andric   if (auto FMLS =
1641bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1642bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1643bdd1243dSDimitry Andric                                                                          true))
1644bdd1243dSDimitry Andric     return FMLS;
1645bdd1243dSDimitry Andric   if (auto FMSB =
1646bdd1243dSDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1647bdd1243dSDimitry Andric                                             Intrinsic::aarch64_sve_fnmsb>(
1648bdd1243dSDimitry Andric               IC, II, false))
1649bdd1243dSDimitry Andric     return FMSB;
165006c3fb27SDimitry Andric   if (auto FMLS =
165106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
165206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
165306c3fb27SDimitry Andric                                                                          true))
165406c3fb27SDimitry Andric     return FMLS;
165506c3fb27SDimitry Andric   return std::nullopt;
165606c3fb27SDimitry Andric }
165706c3fb27SDimitry Andric 
165806c3fb27SDimitry Andric static std::optional<Instruction *>
165906c3fb27SDimitry Andric instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
166006c3fb27SDimitry Andric   if (auto FMLS =
166106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
166206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls>(IC, II,
166306c3fb27SDimitry Andric                                                                          true))
166406c3fb27SDimitry Andric     return FMLS;
166506c3fb27SDimitry Andric   if (auto FMSB =
166606c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
166706c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fnmsb>(
166806c3fb27SDimitry Andric               IC, II, false))
166906c3fb27SDimitry Andric     return FMSB;
167006c3fb27SDimitry Andric   if (auto FMLS_U =
167106c3fb27SDimitry Andric           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
167206c3fb27SDimitry Andric                                             Intrinsic::aarch64_sve_fmls_u>(
167306c3fb27SDimitry Andric               IC, II, true))
167406c3fb27SDimitry Andric     return FMLS_U;
1675bdd1243dSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1676bdd1243dSDimitry Andric }
1677bdd1243dSDimitry Andric 
167806c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1679fe6060f1SDimitry Andric                                                             IntrinsicInst &II) {
1680297eecfbSDimitry Andric   if (auto II_U =
1681297eecfbSDimitry Andric           instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
168206c3fb27SDimitry Andric     return II_U;
168306c3fb27SDimitry Andric   if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
168406c3fb27SDimitry Andric                                                    Intrinsic::aarch64_sve_mls>(
168506c3fb27SDimitry Andric           IC, II, true))
168606c3fb27SDimitry Andric     return MLS;
168706c3fb27SDimitry Andric   return std::nullopt;
168806c3fb27SDimitry Andric }
168906c3fb27SDimitry Andric 
169006c3fb27SDimitry Andric static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
169106c3fb27SDimitry Andric                                                             IntrinsicInst &II,
169206c3fb27SDimitry Andric                                                             Intrinsic::ID IID) {
1693fe6060f1SDimitry Andric   auto *OpPredicate = II.getOperand(0);
1694fe6060f1SDimitry Andric   auto *OpMultiplicand = II.getOperand(1);
1695fe6060f1SDimitry Andric   auto *OpMultiplier = II.getOperand(2);
1696fe6060f1SDimitry Andric 
1697349cc55cSDimitry Andric   // Return true if a given instruction is a unit splat value, false otherwise.
1698349cc55cSDimitry Andric   auto IsUnitSplat = [](auto *I) {
1699349cc55cSDimitry Andric     auto *SplatValue = getSplatValue(I);
1700349cc55cSDimitry Andric     if (!SplatValue)
1701fe6060f1SDimitry Andric       return false;
1702fe6060f1SDimitry Andric     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1703fe6060f1SDimitry Andric   };
1704fe6060f1SDimitry Andric 
1705fe6060f1SDimitry Andric   // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1706fe6060f1SDimitry Andric   // with a unit splat value, false otherwise.
1707fe6060f1SDimitry Andric   auto IsUnitDup = [](auto *I) {
1708fe6060f1SDimitry Andric     auto *IntrI = dyn_cast<IntrinsicInst>(I);
1709fe6060f1SDimitry Andric     if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1710fe6060f1SDimitry Andric       return false;
1711fe6060f1SDimitry Andric 
1712fe6060f1SDimitry Andric     auto *SplatValue = IntrI->getOperand(2);
1713fe6060f1SDimitry Andric     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1714fe6060f1SDimitry Andric   };
1715fe6060f1SDimitry Andric 
1716349cc55cSDimitry Andric   if (IsUnitSplat(OpMultiplier)) {
17174824e7fdSDimitry Andric     // [f]mul pg %n, (dupx 1) => %n
1718fe6060f1SDimitry Andric     OpMultiplicand->takeName(&II);
1719fe6060f1SDimitry Andric     return IC.replaceInstUsesWith(II, OpMultiplicand);
1720fe6060f1SDimitry Andric   } else if (IsUnitDup(OpMultiplier)) {
17214824e7fdSDimitry Andric     // [f]mul pg %n, (dup pg 1) => %n
1722fe6060f1SDimitry Andric     auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1723fe6060f1SDimitry Andric     auto *DupPg = DupInst->getOperand(1);
1724fe6060f1SDimitry Andric     // TODO: this is naive. The optimization is still valid if DupPg
1725fe6060f1SDimitry Andric     // 'encompasses' OpPredicate, not only if they're the same predicate.
1726fe6060f1SDimitry Andric     if (OpPredicate == DupPg) {
1727fe6060f1SDimitry Andric       OpMultiplicand->takeName(&II);
1728fe6060f1SDimitry Andric       return IC.replaceInstUsesWith(II, OpMultiplicand);
1729fe6060f1SDimitry Andric     }
1730fe6060f1SDimitry Andric   }
1731fe6060f1SDimitry Andric 
1732349cc55cSDimitry Andric   return instCombineSVEVectorBinOp(IC, II);
1733fe6060f1SDimitry Andric }
1734fe6060f1SDimitry Andric 
1735bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1736349cc55cSDimitry Andric                                                          IntrinsicInst &II) {
1737349cc55cSDimitry Andric   Value *UnpackArg = II.getArgOperand(0);
1738349cc55cSDimitry Andric   auto *RetTy = cast<ScalableVectorType>(II.getType());
1739349cc55cSDimitry Andric   bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1740349cc55cSDimitry Andric                   II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1741349cc55cSDimitry Andric 
1742349cc55cSDimitry Andric   // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1743349cc55cSDimitry Andric   // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1744349cc55cSDimitry Andric   if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1745349cc55cSDimitry Andric     ScalarArg =
174606c3fb27SDimitry Andric         IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1747349cc55cSDimitry Andric     Value *NewVal =
174806c3fb27SDimitry Andric         IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1749349cc55cSDimitry Andric     NewVal->takeName(&II);
1750349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, NewVal);
1751349cc55cSDimitry Andric   }
1752349cc55cSDimitry Andric 
1753bdd1243dSDimitry Andric   return std::nullopt;
1754349cc55cSDimitry Andric }
1755bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1756fe6060f1SDimitry Andric                                                       IntrinsicInst &II) {
1757fe6060f1SDimitry Andric   auto *OpVal = II.getOperand(0);
1758fe6060f1SDimitry Andric   auto *OpIndices = II.getOperand(1);
1759fe6060f1SDimitry Andric   VectorType *VTy = cast<VectorType>(II.getType());
1760fe6060f1SDimitry Andric 
1761349cc55cSDimitry Andric   // Check whether OpIndices is a constant splat value < minimal element count
1762349cc55cSDimitry Andric   // of result.
1763349cc55cSDimitry Andric   auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1764fe6060f1SDimitry Andric   if (!SplatValue ||
1765fe6060f1SDimitry Andric       SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1766bdd1243dSDimitry Andric     return std::nullopt;
1767fe6060f1SDimitry Andric 
1768fe6060f1SDimitry Andric   // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1769fe6060f1SDimitry Andric   // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
177006c3fb27SDimitry Andric   auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1771fe6060f1SDimitry Andric   auto *VectorSplat =
177206c3fb27SDimitry Andric       IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1773fe6060f1SDimitry Andric 
1774fe6060f1SDimitry Andric   VectorSplat->takeName(&II);
1775fe6060f1SDimitry Andric   return IC.replaceInstUsesWith(II, VectorSplat);
1776fe6060f1SDimitry Andric }
1777fe6060f1SDimitry Andric 
17780fca6ea1SDimitry Andric static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
17790fca6ea1SDimitry Andric                                                        IntrinsicInst &II) {
17800fca6ea1SDimitry Andric   Value *A, *B;
17810fca6ea1SDimitry Andric   Type *RetTy = II.getType();
17820fca6ea1SDimitry Andric   constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
17830fca6ea1SDimitry Andric   constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
17840fca6ea1SDimitry Andric 
17850fca6ea1SDimitry Andric   // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
17860fca6ea1SDimitry Andric   // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
17870fca6ea1SDimitry Andric   if ((match(II.getArgOperand(0),
17880fca6ea1SDimitry Andric              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
17890fca6ea1SDimitry Andric        match(II.getArgOperand(1),
17900fca6ea1SDimitry Andric              m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
17910fca6ea1SDimitry Andric       (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
17920fca6ea1SDimitry Andric        match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
17930fca6ea1SDimitry Andric     auto *TyA = cast<ScalableVectorType>(A->getType());
17940fca6ea1SDimitry Andric     if (TyA == B->getType() &&
17950fca6ea1SDimitry Andric         RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
17960fca6ea1SDimitry Andric       auto *SubVec = IC.Builder.CreateInsertVector(
17970fca6ea1SDimitry Andric           RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
17980fca6ea1SDimitry Andric       auto *ConcatVec = IC.Builder.CreateInsertVector(
17990fca6ea1SDimitry Andric           RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
18000fca6ea1SDimitry Andric       ConcatVec->takeName(&II);
18010fca6ea1SDimitry Andric       return IC.replaceInstUsesWith(II, ConcatVec);
18020fca6ea1SDimitry Andric     }
18030fca6ea1SDimitry Andric   }
18040fca6ea1SDimitry Andric 
18050fca6ea1SDimitry Andric   return std::nullopt;
18060fca6ea1SDimitry Andric }
18070fca6ea1SDimitry Andric 
1808bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1809349cc55cSDimitry Andric                                                       IntrinsicInst &II) {
1810349cc55cSDimitry Andric   // zip1(uzp1(A, B), uzp2(A, B)) --> A
1811349cc55cSDimitry Andric   // zip2(uzp1(A, B), uzp2(A, B)) --> B
1812349cc55cSDimitry Andric   Value *A, *B;
1813349cc55cSDimitry Andric   if (match(II.getArgOperand(0),
1814349cc55cSDimitry Andric             m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1815349cc55cSDimitry Andric       match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1816349cc55cSDimitry Andric                                      m_Specific(A), m_Specific(B))))
1817349cc55cSDimitry Andric     return IC.replaceInstUsesWith(
1818349cc55cSDimitry Andric         II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1819349cc55cSDimitry Andric 
1820349cc55cSDimitry Andric   return std::nullopt;
1821349cc55cSDimitry Andric }
1822349cc55cSDimitry Andric 
1823bdd1243dSDimitry Andric static std::optional<Instruction *>
1824bdd1243dSDimitry Andric instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1825349cc55cSDimitry Andric   Value *Mask = II.getOperand(0);
1826349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(1);
1827349cc55cSDimitry Andric   Value *Index = II.getOperand(2);
1828349cc55cSDimitry Andric   Type *Ty = II.getType();
1829349cc55cSDimitry Andric   Value *PassThru = ConstantAggregateZero::get(Ty);
1830349cc55cSDimitry Andric 
18310fca6ea1SDimitry Andric   // Replace by zero constant when all lanes are inactive
18320fca6ea1SDimitry Andric   if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
18330fca6ea1SDimitry Andric     return II_NA;
18340fca6ea1SDimitry Andric 
1835349cc55cSDimitry Andric   // Contiguous gather => masked load.
1836349cc55cSDimitry Andric   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1837349cc55cSDimitry Andric   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1838349cc55cSDimitry Andric   Value *IndexBase;
1839349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1840349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1841349cc55cSDimitry Andric     Align Alignment =
18420fca6ea1SDimitry Andric         BasePtr->getPointerAlignment(II.getDataLayout());
1843349cc55cSDimitry Andric 
1844349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
184506c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1846bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
184706c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1848349cc55cSDimitry Andric     CallInst *MaskedLoad =
184906c3fb27SDimitry Andric         IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1850349cc55cSDimitry Andric     MaskedLoad->takeName(&II);
1851349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, MaskedLoad);
1852349cc55cSDimitry Andric   }
1853349cc55cSDimitry Andric 
1854bdd1243dSDimitry Andric   return std::nullopt;
1855349cc55cSDimitry Andric }
1856349cc55cSDimitry Andric 
1857bdd1243dSDimitry Andric static std::optional<Instruction *>
1858bdd1243dSDimitry Andric instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1859349cc55cSDimitry Andric   Value *Val = II.getOperand(0);
1860349cc55cSDimitry Andric   Value *Mask = II.getOperand(1);
1861349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(2);
1862349cc55cSDimitry Andric   Value *Index = II.getOperand(3);
1863349cc55cSDimitry Andric   Type *Ty = Val->getType();
1864349cc55cSDimitry Andric 
1865349cc55cSDimitry Andric   // Contiguous scatter => masked store.
186681ad6265SDimitry Andric   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1867349cc55cSDimitry Andric   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1868349cc55cSDimitry Andric   Value *IndexBase;
1869349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1870349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1871349cc55cSDimitry Andric     Align Alignment =
18720fca6ea1SDimitry Andric         BasePtr->getPointerAlignment(II.getDataLayout());
1873349cc55cSDimitry Andric 
187406c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1875bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
1876349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
187706c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1878349cc55cSDimitry Andric 
187906c3fb27SDimitry Andric     (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1880349cc55cSDimitry Andric 
1881349cc55cSDimitry Andric     return IC.eraseInstFromFunction(II);
1882349cc55cSDimitry Andric   }
1883349cc55cSDimitry Andric 
1884bdd1243dSDimitry Andric   return std::nullopt;
1885349cc55cSDimitry Andric }
1886349cc55cSDimitry Andric 
1887bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
18880eae32dcSDimitry Andric                                                        IntrinsicInst &II) {
188906c3fb27SDimitry Andric   Type *Int32Ty = IC.Builder.getInt32Ty();
18900eae32dcSDimitry Andric   Value *Pred = II.getOperand(0);
18910eae32dcSDimitry Andric   Value *Vec = II.getOperand(1);
18920eae32dcSDimitry Andric   Value *DivVec = II.getOperand(2);
18930eae32dcSDimitry Andric 
18940eae32dcSDimitry Andric   Value *SplatValue = getSplatValue(DivVec);
18950eae32dcSDimitry Andric   ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
18960eae32dcSDimitry Andric   if (!SplatConstantInt)
1897bdd1243dSDimitry Andric     return std::nullopt;
18980eae32dcSDimitry Andric   APInt Divisor = SplatConstantInt->getValue();
18990eae32dcSDimitry Andric 
19000eae32dcSDimitry Andric   if (Divisor.isPowerOf2()) {
19010eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
190206c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
19030eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
19040eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, ASRD);
19050eae32dcSDimitry Andric   }
19060eae32dcSDimitry Andric   if (Divisor.isNegatedPowerOf2()) {
19070eae32dcSDimitry Andric     Divisor.negate();
19080eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
190906c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
19100eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
191106c3fb27SDimitry Andric     auto NEG = IC.Builder.CreateIntrinsic(
191206c3fb27SDimitry Andric         Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
19130eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, NEG);
19140eae32dcSDimitry Andric   }
19150eae32dcSDimitry Andric 
1916bdd1243dSDimitry Andric   return std::nullopt;
19170eae32dcSDimitry Andric }
19180eae32dcSDimitry Andric 
1919bdd1243dSDimitry Andric bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1920bdd1243dSDimitry Andric   size_t VecSize = Vec.size();
1921bdd1243dSDimitry Andric   if (VecSize == 1)
1922bdd1243dSDimitry Andric     return true;
1923bdd1243dSDimitry Andric   if (!isPowerOf2_64(VecSize))
1924bdd1243dSDimitry Andric     return false;
false; 1925bdd1243dSDimitry Andric size_t HalfVecSize = VecSize / 2; 1926bdd1243dSDimitry Andric 1927bdd1243dSDimitry Andric for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; 1928bdd1243dSDimitry Andric RHS != Vec.end(); LHS++, RHS++) { 1929bdd1243dSDimitry Andric if (*LHS != nullptr && *RHS != nullptr) { 1930bdd1243dSDimitry Andric if (*LHS == *RHS) 1931bdd1243dSDimitry Andric continue; 1932bdd1243dSDimitry Andric else 1933bdd1243dSDimitry Andric return false; 1934bdd1243dSDimitry Andric } 1935bdd1243dSDimitry Andric if (!AllowPoison) 1936bdd1243dSDimitry Andric return false; 1937bdd1243dSDimitry Andric if (*LHS == nullptr && *RHS != nullptr) 1938bdd1243dSDimitry Andric *LHS = *RHS; 1939bdd1243dSDimitry Andric } 1940bdd1243dSDimitry Andric 1941bdd1243dSDimitry Andric Vec.resize(HalfVecSize); 1942bdd1243dSDimitry Andric SimplifyValuePattern(Vec, AllowPoison); 1943bdd1243dSDimitry Andric return true; 1944bdd1243dSDimitry Andric } 1945bdd1243dSDimitry Andric 1946bdd1243dSDimitry Andric // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) 1947bdd1243dSDimitry Andric // to dupqlane(f64(C)) where C is A concatenated with B 1948bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, 1949bdd1243dSDimitry Andric IntrinsicInst &II) { 1950bdd1243dSDimitry Andric Value *CurrentInsertElt = nullptr, *Default = nullptr; 1951bdd1243dSDimitry Andric if (!match(II.getOperand(0), 1952bdd1243dSDimitry Andric m_Intrinsic<Intrinsic::vector_insert>( 1953bdd1243dSDimitry Andric m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || 1954bdd1243dSDimitry Andric !isa<FixedVectorType>(CurrentInsertElt->getType())) 1955bdd1243dSDimitry Andric return std::nullopt; 1956bdd1243dSDimitry Andric auto IIScalableTy = cast<ScalableVectorType>(II.getType()); 1957bdd1243dSDimitry Andric 1958bdd1243dSDimitry Andric // Insert the scalars into a container ordered by InsertElement index 1959bdd1243dSDimitry Andric SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); 1960bdd1243dSDimitry Andric while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { 1961bdd1243dSDimitry Andric auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); 1962bdd1243dSDimitry Andric Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); 1963bdd1243dSDimitry Andric CurrentInsertElt = InsertElt->getOperand(0); 1964bdd1243dSDimitry Andric } 1965bdd1243dSDimitry Andric 1966bdd1243dSDimitry Andric bool AllowPoison = 1967bdd1243dSDimitry Andric isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); 1968bdd1243dSDimitry Andric if (!SimplifyValuePattern(Elts, AllowPoison)) 1969bdd1243dSDimitry Andric return std::nullopt; 1970bdd1243dSDimitry Andric 1971bdd1243dSDimitry Andric // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) 1972bdd1243dSDimitry Andric Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); 1973bdd1243dSDimitry Andric for (size_t I = 0; I < Elts.size(); I++) { 1974bdd1243dSDimitry Andric if (Elts[I] == nullptr) 1975bdd1243dSDimitry Andric continue; 197606c3fb27SDimitry Andric InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I], 197706c3fb27SDimitry Andric IC.Builder.getInt64(I)); 1978bdd1243dSDimitry Andric } 1979bdd1243dSDimitry Andric if (InsertEltChain == nullptr) 1980bdd1243dSDimitry Andric return std::nullopt; 1981bdd1243dSDimitry Andric 1982bdd1243dSDimitry Andric // Splat the simplified sequence, e.g. 
(f16 a, f16 b, f16 c, f16 d) as one i64 1983bdd1243dSDimitry Andric // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector 1984bdd1243dSDimitry Andric // be bitcast to a type wide enough to fit the sequence, be splatted, and then 1985bdd1243dSDimitry Andric // be narrowed back to the original type. 1986bdd1243dSDimitry Andric unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); 1987bdd1243dSDimitry Andric unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * 1988bdd1243dSDimitry Andric IIScalableTy->getMinNumElements() / 1989bdd1243dSDimitry Andric PatternWidth; 1990bdd1243dSDimitry Andric 199106c3fb27SDimitry Andric IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth); 1992bdd1243dSDimitry Andric auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); 1993bdd1243dSDimitry Andric auto *WideShuffleMaskTy = 199406c3fb27SDimitry Andric ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount); 1995bdd1243dSDimitry Andric 199606c3fb27SDimitry Andric auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0)); 199706c3fb27SDimitry Andric auto InsertSubvector = IC.Builder.CreateInsertVector( 1998bdd1243dSDimitry Andric II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); 1999bdd1243dSDimitry Andric auto WideBitcast = 200006c3fb27SDimitry Andric IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); 2001bdd1243dSDimitry Andric auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); 200206c3fb27SDimitry Andric auto WideShuffle = IC.Builder.CreateShuffleVector( 2003bdd1243dSDimitry Andric WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); 2004bdd1243dSDimitry Andric auto NarrowBitcast = 200506c3fb27SDimitry Andric IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); 2006bdd1243dSDimitry Andric 2007bdd1243dSDimitry Andric return IC.replaceInstUsesWith(II, NarrowBitcast); 2008bdd1243dSDimitry Andric } 2009bdd1243dSDimitry Andric 2010bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 201181ad6265SDimitry Andric IntrinsicInst &II) { 201281ad6265SDimitry Andric Value *A = II.getArgOperand(0); 201381ad6265SDimitry Andric Value *B = II.getArgOperand(1); 201481ad6265SDimitry Andric if (A == B) 201581ad6265SDimitry Andric return IC.replaceInstUsesWith(II, A); 201681ad6265SDimitry Andric 2017bdd1243dSDimitry Andric return std::nullopt; 201881ad6265SDimitry Andric } 201981ad6265SDimitry Andric 2020bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 202181ad6265SDimitry Andric IntrinsicInst &II) { 202281ad6265SDimitry Andric Value *Pred = II.getOperand(0); 202381ad6265SDimitry Andric Value *Vec = II.getOperand(1); 202481ad6265SDimitry Andric Value *Shift = II.getOperand(2); 202581ad6265SDimitry Andric 202681ad6265SDimitry Andric // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 
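// An illustrative sketch of the rewrite (the value names and the nxv4i32
// element type are invented for this example, not taken from a real test):
//   %a = call @llvm.aarch64.sve.abs.nxv4i32(undef, %pg, %x)
//   %r = call @llvm.aarch64.sve.srshl.nxv4i32(%pg, %a, %nonneg_shift)
// becomes
//   %r = call @llvm.aarch64.sve.lsl.nxv4i32(%pg, %a, %nonneg_shift)
// since a non-negative shift amount never triggers SRSHL's rounding.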
202781ad6265SDimitry Andric Value *AbsPred, *MergedValue; 202881ad6265SDimitry Andric if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 202981ad6265SDimitry Andric m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 203081ad6265SDimitry Andric !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 203181ad6265SDimitry Andric m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 203281ad6265SDimitry Andric 2033bdd1243dSDimitry Andric return std::nullopt; 203481ad6265SDimitry Andric 203581ad6265SDimitry Andric // Transform is valid if any of the following are true: 203681ad6265SDimitry Andric // * The ABS merge value is an undef or non-negative 203781ad6265SDimitry Andric // * The ABS predicate is all active 203881ad6265SDimitry Andric // * The ABS predicate and the SRSHL predicates are the same 2039bdd1243dSDimitry Andric if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && 204081ad6265SDimitry Andric AbsPred != Pred && !isAllActivePredicate(AbsPred)) 2041bdd1243dSDimitry Andric return std::nullopt; 204281ad6265SDimitry Andric 204381ad6265SDimitry Andric // Only valid when the shift amount is non-negative, otherwise the rounding 204481ad6265SDimitry Andric // behaviour of SRSHL cannot be ignored. 204581ad6265SDimitry Andric if (!match(Shift, m_NonNegative())) 2046bdd1243dSDimitry Andric return std::nullopt; 204781ad6265SDimitry Andric 204806c3fb27SDimitry Andric auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, 204906c3fb27SDimitry Andric {II.getType()}, {Pred, Vec, Shift}); 205081ad6265SDimitry Andric 205181ad6265SDimitry Andric return IC.replaceInstUsesWith(II, LSL); 205281ad6265SDimitry Andric } 205381ad6265SDimitry Andric 2054bdd1243dSDimitry Andric std::optional<Instruction *> 2055fe6060f1SDimitry Andric AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 2056fe6060f1SDimitry Andric IntrinsicInst &II) const { 2057fe6060f1SDimitry Andric Intrinsic::ID IID = II.getIntrinsicID(); 2058fe6060f1SDimitry Andric switch (IID) { 2059fe6060f1SDimitry Andric default: 2060fe6060f1SDimitry Andric break; 20610fca6ea1SDimitry Andric 20620fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter: 20630fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 20640fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_sxtw: 20650fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 20660fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_uxtw: 20670fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 20680fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1dq: 20690fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1q_scatter_index: 20700fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: 20710fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: 20720fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st1wq: 20730fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_stnt1: 20740fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_stnt1_scatter: 20750fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_stnt1_scatter_index: 20760fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 20770fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 20780fca6ea1SDimitry Andric return instCombineSVENoActiveUnaryErase(IC, II, 1); 20790fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st2: 20800fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st2q: 20810fca6ea1SDimitry Andric return 
instCombineSVENoActiveUnaryErase(IC, II, 2); 20820fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st3: 20830fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st3q: 20840fca6ea1SDimitry Andric return instCombineSVENoActiveUnaryErase(IC, II, 3); 20850fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st4: 20860fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_st4q: 20870fca6ea1SDimitry Andric return instCombineSVENoActiveUnaryErase(IC, II, 4); 20880fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather: 20890fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 20900fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_sxtw: 20910fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 20920fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_uxtw: 20930fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 20940fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1q_gather_index: 20950fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: 20960fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: 20970fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1ro: 20980fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1rq: 20990fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1udq: 21000fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld1uwq: 21010fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld2_sret: 21020fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld2q_sret: 21030fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld3_sret: 21040fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld3q_sret: 21050fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld4_sret: 21060fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ld4q_sret: 21070fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1: 21080fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather: 21090fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_index: 21100fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 21110fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 21120fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 21130fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 21140fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 21150fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnf1: 21160fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnt1: 21170fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnt1_gather: 21180fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnt1_gather_index: 21190fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 21200fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 21210fca6ea1SDimitry Andric return instCombineSVENoActiveUnaryZero(IC, II); 212281ad6265SDimitry Andric case Intrinsic::aarch64_neon_fmaxnm: 212381ad6265SDimitry Andric case Intrinsic::aarch64_neon_fminnm: 212481ad6265SDimitry Andric return instCombineMaxMinNM(IC, II); 2125fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_convert_from_svbool: 2126fe6060f1SDimitry Andric return instCombineConvertFromSVBool(IC, II); 2127fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_dup: 2128fe6060f1SDimitry Andric return instCombineSVEDup(IC, II); 2129349cc55cSDimitry Andric case Intrinsic::aarch64_sve_dup_x: 2130349cc55cSDimitry Andric return instCombineSVEDupX(IC, II); 2131fe6060f1SDimitry Andric case 
Intrinsic::aarch64_sve_cmpne: 2132fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_cmpne_wide: 2133fe6060f1SDimitry Andric return instCombineSVECmpNE(IC, II); 2134fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_rdffr: 2135fe6060f1SDimitry Andric return instCombineRDFFR(IC, II); 2136fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_lasta: 2137fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_lastb: 2138fe6060f1SDimitry Andric return instCombineSVELast(IC, II); 2139753f127fSDimitry Andric case Intrinsic::aarch64_sve_clasta_n: 2140753f127fSDimitry Andric case Intrinsic::aarch64_sve_clastb_n: 2141753f127fSDimitry Andric return instCombineSVECondLast(IC, II); 2142fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_cntd: 2143fe6060f1SDimitry Andric return instCombineSVECntElts(IC, II, 2); 2144fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_cntw: 2145fe6060f1SDimitry Andric return instCombineSVECntElts(IC, II, 4); 2146fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_cnth: 2147fe6060f1SDimitry Andric return instCombineSVECntElts(IC, II, 8); 2148fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_cntb: 2149fe6060f1SDimitry Andric return instCombineSVECntElts(IC, II, 16); 2150fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_ptest_any: 2151fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_ptest_first: 2152fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_ptest_last: 2153fe6060f1SDimitry Andric return instCombineSVEPTest(IC, II); 215406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fabd: 2155297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u); 2156349cc55cSDimitry Andric case Intrinsic::aarch64_sve_fadd: 215706c3fb27SDimitry Andric return instCombineSVEVectorFAdd(IC, II); 215806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fadd_u: 215906c3fb27SDimitry Andric return instCombineSVEVectorFAddU(IC, II); 216006c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fdiv: 2161297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u); 216206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmax: 2163297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u); 216406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmaxnm: 2165297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u); 216606c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmin: 2167297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u); 216806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fminnm: 2169297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u); 217006c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmla: 2171297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u); 217206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmls: 2173297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u); 217406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmul: 2175297eecfbSDimitry Andric if (auto II_U = 2176297eecfbSDimitry Andric instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u)) 2177297eecfbSDimitry Andric return II_U; 2178297eecfbSDimitry Andric return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); 217906c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmul_u: 218006c3fb27SDimitry Andric return instCombineSVEVectorMul(IC, II, 
Intrinsic::aarch64_sve_fmul_u); 218106c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fmulx: 2182297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u); 218306c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fnmla: 2184297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u); 218506c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fnmls: 2186297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u); 218706c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fsub: 218806c3fb27SDimitry Andric return instCombineSVEVectorFSub(IC, II); 218906c3fb27SDimitry Andric case Intrinsic::aarch64_sve_fsub_u: 219006c3fb27SDimitry Andric return instCombineSVEVectorFSubU(IC, II); 2191bdd1243dSDimitry Andric case Intrinsic::aarch64_sve_add: 2192bdd1243dSDimitry Andric return instCombineSVEVectorAdd(IC, II); 219306c3fb27SDimitry Andric case Intrinsic::aarch64_sve_add_u: 219406c3fb27SDimitry Andric return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 219506c3fb27SDimitry Andric Intrinsic::aarch64_sve_mla_u>( 219606c3fb27SDimitry Andric IC, II, true); 219706c3fb27SDimitry Andric case Intrinsic::aarch64_sve_mla: 2198297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u); 219906c3fb27SDimitry Andric case Intrinsic::aarch64_sve_mls: 2200297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u); 220106c3fb27SDimitry Andric case Intrinsic::aarch64_sve_mul: 2202297eecfbSDimitry Andric if (auto II_U = 2203297eecfbSDimitry Andric instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u)) 2204297eecfbSDimitry Andric return II_U; 2205297eecfbSDimitry Andric return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 220606c3fb27SDimitry Andric case Intrinsic::aarch64_sve_mul_u: 220706c3fb27SDimitry Andric return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 220806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_sabd: 2209297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u); 221006c3fb27SDimitry Andric case Intrinsic::aarch64_sve_smax: 2211297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u); 221206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_smin: 2213297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u); 221406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_smulh: 2215297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u); 2216bdd1243dSDimitry Andric case Intrinsic::aarch64_sve_sub: 2217bdd1243dSDimitry Andric return instCombineSVEVectorSub(IC, II); 221806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_sub_u: 221906c3fb27SDimitry Andric return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 222006c3fb27SDimitry Andric Intrinsic::aarch64_sve_mls_u>( 222106c3fb27SDimitry Andric IC, II, true); 222206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_uabd: 2223297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u); 222406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_umax: 2225297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u); 222606c3fb27SDimitry Andric case Intrinsic::aarch64_sve_umin: 2227297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, 
Intrinsic::aarch64_sve_umin_u); 222806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_umulh: 2229297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u); 223006c3fb27SDimitry Andric case Intrinsic::aarch64_sve_asr: 2231297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u); 223206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_lsl: 2233297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u); 223406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_lsr: 2235297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u); 223606c3fb27SDimitry Andric case Intrinsic::aarch64_sve_and: 2237297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u); 223806c3fb27SDimitry Andric case Intrinsic::aarch64_sve_bic: 2239297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u); 224006c3fb27SDimitry Andric case Intrinsic::aarch64_sve_eor: 2241297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u); 224206c3fb27SDimitry Andric case Intrinsic::aarch64_sve_orr: 2243297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u); 224406c3fb27SDimitry Andric case Intrinsic::aarch64_sve_sqsub: 2245297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u); 224606c3fb27SDimitry Andric case Intrinsic::aarch64_sve_uqsub: 2247297eecfbSDimitry Andric return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u); 2248fe6060f1SDimitry Andric case Intrinsic::aarch64_sve_tbl: 2249fe6060f1SDimitry Andric return instCombineSVETBL(IC, II); 2250349cc55cSDimitry Andric case Intrinsic::aarch64_sve_uunpkhi: 2251349cc55cSDimitry Andric case Intrinsic::aarch64_sve_uunpklo: 2252349cc55cSDimitry Andric case Intrinsic::aarch64_sve_sunpkhi: 2253349cc55cSDimitry Andric case Intrinsic::aarch64_sve_sunpklo: 2254349cc55cSDimitry Andric return instCombineSVEUnpack(IC, II); 22550fca6ea1SDimitry Andric case Intrinsic::aarch64_sve_uzp1: 22560fca6ea1SDimitry Andric return instCombineSVEUzp1(IC, II); 2257349cc55cSDimitry Andric case Intrinsic::aarch64_sve_zip1: 2258349cc55cSDimitry Andric case Intrinsic::aarch64_sve_zip2: 2259349cc55cSDimitry Andric return instCombineSVEZip(IC, II); 2260349cc55cSDimitry Andric case Intrinsic::aarch64_sve_ld1_gather_index: 2261349cc55cSDimitry Andric return instCombineLD1GatherIndex(IC, II); 2262349cc55cSDimitry Andric case Intrinsic::aarch64_sve_st1_scatter_index: 2263349cc55cSDimitry Andric return instCombineST1ScatterIndex(IC, II); 2264349cc55cSDimitry Andric case Intrinsic::aarch64_sve_ld1: 2265349cc55cSDimitry Andric return instCombineSVELD1(IC, II, DL); 2266349cc55cSDimitry Andric case Intrinsic::aarch64_sve_st1: 2267349cc55cSDimitry Andric return instCombineSVEST1(IC, II, DL); 22680eae32dcSDimitry Andric case Intrinsic::aarch64_sve_sdiv: 22690eae32dcSDimitry Andric return instCombineSVESDIV(IC, II); 227081ad6265SDimitry Andric case Intrinsic::aarch64_sve_sel: 227181ad6265SDimitry Andric return instCombineSVESel(IC, II); 227281ad6265SDimitry Andric case Intrinsic::aarch64_sve_srshl: 227381ad6265SDimitry Andric return instCombineSVESrshl(IC, II); 2274bdd1243dSDimitry Andric case Intrinsic::aarch64_sve_dupq_lane: 2275bdd1243dSDimitry Andric return instCombineSVEDupqLane(IC, II); 2276fe6060f1SDimitry Andric } 2277fe6060f1SDimitry Andric 
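// No combine above matched: return std::nullopt so the generic InstCombine
// simplifications still get a chance to run on this intrinsic.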
2278bdd1243dSDimitry Andric return std::nullopt; 2279fe6060f1SDimitry Andric } 2280fe6060f1SDimitry Andric 2281bdd1243dSDimitry Andric std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 228204eeddc0SDimitry Andric InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 228304eeddc0SDimitry Andric APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 228404eeddc0SDimitry Andric std::function<void(Instruction *, unsigned, APInt, APInt &)> 228504eeddc0SDimitry Andric SimplifyAndSetOp) const { 228604eeddc0SDimitry Andric switch (II.getIntrinsicID()) { 228704eeddc0SDimitry Andric default: 228804eeddc0SDimitry Andric break; 228904eeddc0SDimitry Andric case Intrinsic::aarch64_neon_fcvtxn: 229004eeddc0SDimitry Andric case Intrinsic::aarch64_neon_rshrn: 229104eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqrshrn: 229204eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqrshrun: 229304eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqshrn: 229404eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqshrun: 229504eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqxtn: 229604eeddc0SDimitry Andric case Intrinsic::aarch64_neon_sqxtun: 229704eeddc0SDimitry Andric case Intrinsic::aarch64_neon_uqrshrn: 229804eeddc0SDimitry Andric case Intrinsic::aarch64_neon_uqshrn: 229904eeddc0SDimitry Andric case Intrinsic::aarch64_neon_uqxtn: 230004eeddc0SDimitry Andric SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 230104eeddc0SDimitry Andric break; 230204eeddc0SDimitry Andric } 230304eeddc0SDimitry Andric 2304bdd1243dSDimitry Andric return std::nullopt; 2305bdd1243dSDimitry Andric } 2306bdd1243dSDimitry Andric 2307*62987288SDimitry Andric bool AArch64TTIImpl::enableScalableVectorization() const { 2308*62987288SDimitry Andric return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 2309*62987288SDimitry Andric EnableScalableAutovecInStreamingMode); 2310*62987288SDimitry Andric } 2311*62987288SDimitry Andric 2312bdd1243dSDimitry Andric TypeSize 2313bdd1243dSDimitry Andric AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 2314bdd1243dSDimitry Andric switch (K) { 2315bdd1243dSDimitry Andric case TargetTransformInfo::RGK_Scalar: 2316bdd1243dSDimitry Andric return TypeSize::getFixed(64); 2317bdd1243dSDimitry Andric case TargetTransformInfo::RGK_FixedWidthVector: 23180fca6ea1SDimitry Andric if (ST->useSVEForFixedLengthVectors() && 23190fca6ea1SDimitry Andric (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) 2320bdd1243dSDimitry Andric return TypeSize::getFixed( 2321bdd1243dSDimitry Andric std::max(ST->getMinSVEVectorSizeInBits(), 128u)); 23220fca6ea1SDimitry Andric else if (ST->isNeonAvailable()) 23230fca6ea1SDimitry Andric return TypeSize::getFixed(128); 23240fca6ea1SDimitry Andric else 23250fca6ea1SDimitry Andric return TypeSize::getFixed(0); 2326bdd1243dSDimitry Andric case TargetTransformInfo::RGK_ScalableVector: 23270fca6ea1SDimitry Andric if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 23280fca6ea1SDimitry Andric EnableScalableAutovecInStreamingMode)) 23290fca6ea1SDimitry Andric return TypeSize::getScalable(128); 23300fca6ea1SDimitry Andric else 2331bdd1243dSDimitry Andric return TypeSize::getScalable(0); 2332bdd1243dSDimitry Andric } 2333bdd1243dSDimitry Andric llvm_unreachable("Unsupported register kind"); 233404eeddc0SDimitry Andric } 233504eeddc0SDimitry Andric 23360b57cec5SDimitry Andric bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 233706c3fb27SDimitry Andric 
ArrayRef<const Value *> Args,
233806c3fb27SDimitry Andric Type *SrcOverrideTy) {
23390b57cec5SDimitry Andric // A helper that returns a vector type from the given type. The number of
234081ad6265SDimitry Andric // elements in type Ty determines the vector width.
23410b57cec5SDimitry Andric auto toVectorTy = [&](Type *ArgTy) {
2342e8d8bef9SDimitry Andric return VectorType::get(ArgTy->getScalarType(),
2343e8d8bef9SDimitry Andric cast<VectorType>(DstTy)->getElementCount());
23440b57cec5SDimitry Andric };
23450b57cec5SDimitry Andric
234606c3fb27SDimitry Andric // Exit early if DstTy is not a vector type whose elements are one of [i16,
234706c3fb27SDimitry Andric // i32, i64]. SVE doesn't generally have the same set of instructions to
2348bdd1243dSDimitry Andric // perform an extend with the add/sub/mul. There are SMULLB style
2349bdd1243dSDimitry Andric // instructions, but they operate on top/bottom, requiring some sort of lane
2350bdd1243dSDimitry Andric // interleaving to be used with zext/sext.
235106c3fb27SDimitry Andric unsigned DstEltSize = DstTy->getScalarSizeInBits();
235206c3fb27SDimitry Andric if (!useNeonVector(DstTy) || Args.size() != 2 ||
235306c3fb27SDimitry Andric (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
23540b57cec5SDimitry Andric return false;
23550b57cec5SDimitry Andric
23560b57cec5SDimitry Andric // Determine if the operation has a widening variant. We consider both the
23570b57cec5SDimitry Andric // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
23580b57cec5SDimitry Andric // instructions.
23590b57cec5SDimitry Andric //
236081ad6265SDimitry Andric // TODO: Add additional widening operations (e.g., shl, etc.) once we
23610b57cec5SDimitry Andric // verify that their extending operands are eliminated during code
23620b57cec5SDimitry Andric // generation.
236306c3fb27SDimitry Andric Type *SrcTy = SrcOverrideTy;
23640b57cec5SDimitry Andric switch (Opcode) {
23650b57cec5SDimitry Andric case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
23660b57cec5SDimitry Andric case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
236706c3fb27SDimitry Andric // The second operand needs to be an extend.
236806c3fb27SDimitry Andric if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
236906c3fb27SDimitry Andric if (!SrcTy)
237006c3fb27SDimitry Andric SrcTy =
237106c3fb27SDimitry Andric toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
237206c3fb27SDimitry Andric } else
237306c3fb27SDimitry Andric return false;
23740b57cec5SDimitry Andric break;
237506c3fb27SDimitry Andric case Instruction::Mul: { // SMULL(2), UMULL(2)
237606c3fb27SDimitry Andric // Both operands need to be extends of the same type.
237706c3fb27SDimitry Andric if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
237806c3fb27SDimitry Andric (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
237906c3fb27SDimitry Andric if (!SrcTy)
238006c3fb27SDimitry Andric SrcTy =
238106c3fb27SDimitry Andric toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
238206c3fb27SDimitry Andric } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
238306c3fb27SDimitry Andric // If one of the operands is a Zext and the other has enough zero bits to
238406c3fb27SDimitry Andric // be treated as unsigned, we can still generate a umull, meaning the zext
238506c3fb27SDimitry Andric // is free.
238606c3fb27SDimitry Andric KnownBits Known =
238706c3fb27SDimitry Andric computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); 238806c3fb27SDimitry Andric if (Args[0]->getType()->getScalarSizeInBits() - 238906c3fb27SDimitry Andric Known.Zero.countLeadingOnes() > 239006c3fb27SDimitry Andric DstTy->getScalarSizeInBits() / 2) 239106c3fb27SDimitry Andric return false; 239206c3fb27SDimitry Andric if (!SrcTy) 239306c3fb27SDimitry Andric SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), 239406c3fb27SDimitry Andric DstTy->getScalarSizeInBits() / 2)); 239506c3fb27SDimitry Andric } else 239606c3fb27SDimitry Andric return false; 239706c3fb27SDimitry Andric break; 239806c3fb27SDimitry Andric } 23990b57cec5SDimitry Andric default: 24000b57cec5SDimitry Andric return false; 24010b57cec5SDimitry Andric } 24020b57cec5SDimitry Andric 24030b57cec5SDimitry Andric // Legalize the destination type and ensure it can be used in a widening 24040b57cec5SDimitry Andric // operation. 2405bdd1243dSDimitry Andric auto DstTyL = getTypeLegalizationCost(DstTy); 240606c3fb27SDimitry Andric if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) 24070b57cec5SDimitry Andric return false; 24080b57cec5SDimitry Andric 24090b57cec5SDimitry Andric // Legalize the source type and ensure it can be used in a widening 24100b57cec5SDimitry Andric // operation. 241106c3fb27SDimitry Andric assert(SrcTy && "Expected some SrcTy"); 2412bdd1243dSDimitry Andric auto SrcTyL = getTypeLegalizationCost(SrcTy); 24130b57cec5SDimitry Andric unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); 24140b57cec5SDimitry Andric if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) 24150b57cec5SDimitry Andric return false; 24160b57cec5SDimitry Andric 24170b57cec5SDimitry Andric // Get the total number of vector elements in the legalized types. 2418fe6060f1SDimitry Andric InstructionCost NumDstEls = 2419fe6060f1SDimitry Andric DstTyL.first * DstTyL.second.getVectorMinNumElements(); 2420fe6060f1SDimitry Andric InstructionCost NumSrcEls = 2421fe6060f1SDimitry Andric SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); 24220b57cec5SDimitry Andric 24230b57cec5SDimitry Andric // Return true if the legalized types have the same number of vector elements 24240b57cec5SDimitry Andric // and the destination element type size is twice that of the source type. 242506c3fb27SDimitry Andric return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; 24260b57cec5SDimitry Andric } 24270b57cec5SDimitry Andric 24285f757f3fSDimitry Andric // s/urhadd instructions implement the following pattern, making the 24295f757f3fSDimitry Andric // extends free: 24305f757f3fSDimitry Andric // %x = add ((zext i8 -> i16), 1) 24315f757f3fSDimitry Andric // %y = (zext i8 -> i16) 24325f757f3fSDimitry Andric // trunc i16 (lshr (add %x, %y), 1) -> i8 24335f757f3fSDimitry Andric // 24345f757f3fSDimitry Andric bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, 24355f757f3fSDimitry Andric Type *Src) { 24365f757f3fSDimitry Andric // The source should be a legal vector type. 24375f757f3fSDimitry Andric if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) || 24385f757f3fSDimitry Andric (Src->isScalableTy() && !ST->hasSVE2())) 24395f757f3fSDimitry Andric return false; 24405f757f3fSDimitry Andric 24415f757f3fSDimitry Andric if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) 24425f757f3fSDimitry Andric return false; 24435f757f3fSDimitry Andric 24445f757f3fSDimitry Andric // Look for trunc/shl/add before trying to match the pattern. 
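// The use chain walked below is: add (ExtUser) [-> optional second add] ->
// lshr -> trunc, with each link required to have a unique undroppable user.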
24455f757f3fSDimitry Andric const Instruction *Add = ExtUser; 24465f757f3fSDimitry Andric auto *AddUser = 24475f757f3fSDimitry Andric dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 24485f757f3fSDimitry Andric if (AddUser && AddUser->getOpcode() == Instruction::Add) 24495f757f3fSDimitry Andric Add = AddUser; 24505f757f3fSDimitry Andric 24515f757f3fSDimitry Andric auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 24525f757f3fSDimitry Andric if (!Shr || Shr->getOpcode() != Instruction::LShr) 24535f757f3fSDimitry Andric return false; 24545f757f3fSDimitry Andric 24555f757f3fSDimitry Andric auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser()); 24565f757f3fSDimitry Andric if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || 24575f757f3fSDimitry Andric Src->getScalarSizeInBits() != 24585f757f3fSDimitry Andric cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits()) 24595f757f3fSDimitry Andric return false; 24605f757f3fSDimitry Andric 24615f757f3fSDimitry Andric // Try to match the whole pattern. Ext could be either the first or second 24625f757f3fSDimitry Andric // m_ZExtOrSExt matched. 24635f757f3fSDimitry Andric Instruction *Ex1, *Ex2; 24645f757f3fSDimitry Andric if (!(match(Add, m_c_Add(m_Instruction(Ex1), 24655f757f3fSDimitry Andric m_c_Add(m_Instruction(Ex2), m_SpecificInt(1)))))) 24665f757f3fSDimitry Andric return false; 24675f757f3fSDimitry Andric 24685f757f3fSDimitry Andric // Ensure both extends are of the same type 24695f757f3fSDimitry Andric if (match(Ex1, m_ZExtOrSExt(m_Value())) && 24705f757f3fSDimitry Andric Ex1->getOpcode() == Ex2->getOpcode()) 24715f757f3fSDimitry Andric return true; 24725f757f3fSDimitry Andric 24735f757f3fSDimitry Andric return false; 24745f757f3fSDimitry Andric } 24755f757f3fSDimitry Andric 2476fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 2477fe6060f1SDimitry Andric Type *Src, 2478e8d8bef9SDimitry Andric TTI::CastContextHint CCH, 24795ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 24800b57cec5SDimitry Andric const Instruction *I) { 24810b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 24820b57cec5SDimitry Andric assert(ISD && "Invalid opcode"); 24830b57cec5SDimitry Andric // If the cast is observable, and it is used by a widening instruction (e.g., 24840b57cec5SDimitry Andric // uaddl, saddw, etc.), it may be free. 248581ad6265SDimitry Andric if (I && I->hasOneUser()) { 24860b57cec5SDimitry Andric auto *SingleUser = cast<Instruction>(*I->user_begin()); 24870b57cec5SDimitry Andric SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 248806c3fb27SDimitry Andric if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { 248906c3fb27SDimitry Andric // For adds only count the second operand as free if both operands are 249006c3fb27SDimitry Andric // extends but not the same operation. (i.e both operands are not free in 249106c3fb27SDimitry Andric // add(sext, zext)). 249206c3fb27SDimitry Andric if (SingleUser->getOpcode() == Instruction::Add) { 249306c3fb27SDimitry Andric if (I == SingleUser->getOperand(1) || 249406c3fb27SDimitry Andric (isa<CastInst>(SingleUser->getOperand(1)) && 249506c3fb27SDimitry Andric cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) 24960b57cec5SDimitry Andric return 0; 249706c3fb27SDimitry Andric } else // Others are free so long as isWideningInstruction returned true. 
24980b57cec5SDimitry Andric return 0; 24990b57cec5SDimitry Andric } 25005f757f3fSDimitry Andric 25015f757f3fSDimitry Andric // The cast will be free for the s/urhadd instructions 25025f757f3fSDimitry Andric if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && 25035f757f3fSDimitry Andric isExtPartOfAvgExpr(SingleUser, Dst, Src)) 25045f757f3fSDimitry Andric return 0; 25050b57cec5SDimitry Andric } 25060b57cec5SDimitry Andric 25075ffd83dbSDimitry Andric // TODO: Allow non-throughput costs that aren't binary. 2508fe6060f1SDimitry Andric auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 25095ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 25105ffd83dbSDimitry Andric return Cost == 0 ? 0 : 1; 25115ffd83dbSDimitry Andric return Cost; 25125ffd83dbSDimitry Andric }; 25135ffd83dbSDimitry Andric 25140b57cec5SDimitry Andric EVT SrcTy = TLI->getValueType(DL, Src); 25150b57cec5SDimitry Andric EVT DstTy = TLI->getValueType(DL, Dst); 25160b57cec5SDimitry Andric 25170b57cec5SDimitry Andric if (!SrcTy.isSimple() || !DstTy.isSimple()) 2518e8d8bef9SDimitry Andric return AdjustCost( 2519e8d8bef9SDimitry Andric BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 25200b57cec5SDimitry Andric 25210b57cec5SDimitry Andric static const TypeConversionCostTblEntry 25220b57cec5SDimitry Andric ConversionTbl[] = { 2523bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn 2524bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn 2525bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn 2526bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn 2527bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 2528bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn 2529bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn 2530bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 2531bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn 2532bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn 2533bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn 2534bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 2535bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 2536bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 2537bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 2538bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 2539bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 2540bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 2541bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 2542bdd1243dSDimitry Andric { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 25430b57cec5SDimitry Andric 2544fe6060f1SDimitry Andric // Truncations on nxvmiN 2545fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, 2546fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, 2547fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, 2548fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, 2549fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, 2550fe6060f1SDimitry Andric { 
ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, 2551fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, 2552fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, 2553fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, 2554fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, 2555fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, 2556fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, 2557fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, 2558fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, 2559fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, 2560fe6060f1SDimitry Andric { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, 2561fe6060f1SDimitry Andric 25620b57cec5SDimitry Andric // The number of shll instructions for the extension. 25630b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 25640b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 25650b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 25660b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 25670b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 25680b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 25690b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 25700b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 25710b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 25720b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 25730b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 25740b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 25750b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 25760b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 25770b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 25780b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 25790b57cec5SDimitry Andric 25800b57cec5SDimitry Andric // LowerVectorINT_TO_FP: 25810b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 25820b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 25830b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 25840b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 25850b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 25860b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 25870b57cec5SDimitry Andric 25880b57cec5SDimitry Andric // Complex: to v2f32 25890b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 25900b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 25910b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 25920b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 25930b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, 25940b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, 25950b57cec5SDimitry Andric 25960b57cec5SDimitry Andric // Complex: to v4f32 25970b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, 25980b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 25990b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 
26000b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 26010b57cec5SDimitry Andric 26020b57cec5SDimitry Andric // Complex: to v8f32 26030b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 26040b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 26050b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, 26060b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 26070b57cec5SDimitry Andric 26080b57cec5SDimitry Andric // Complex: to v16f32 26090b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 26100b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, 26110b57cec5SDimitry Andric 26120b57cec5SDimitry Andric // Complex: to v2f64 26130b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 26140b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 26150b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 26160b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 26170b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, 26180b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 26190b57cec5SDimitry Andric 2620bdd1243dSDimitry Andric // Complex: to v4f64 2621bdd1243dSDimitry Andric { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 2622bdd1243dSDimitry Andric { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, 26230b57cec5SDimitry Andric 26240b57cec5SDimitry Andric // LowerVectorFP_TO_INT 26250b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, 26260b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 26270b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 26280b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 26290b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 26300b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 26310b57cec5SDimitry Andric 26320b57cec5SDimitry Andric // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 26330b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, 26340b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, 26350b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, 26360b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, 26370b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, 26380b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, 26390b57cec5SDimitry Andric 26400b57cec5SDimitry Andric // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 26410b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 26420b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, 26430b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 26440b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, 26450b57cec5SDimitry Andric 2646fe6060f1SDimitry Andric // Complex, from nxv2f32. 
2647fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 2648fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 2649fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 2650fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 2651fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, 2652fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, 2653fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, 2654fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, 2655fe6060f1SDimitry Andric 26560b57cec5SDimitry Andric // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 26570b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 26580b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, 26590b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, 26600b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 26610b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, 26620b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, 2663fe6060f1SDimitry Andric 2664fe6060f1SDimitry Andric // Complex, from nxv2f64. 2665fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 2666fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 2667fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 2668fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 2669fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, 2670fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, 2671fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, 2672fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, 2673fe6060f1SDimitry Andric 2674fe6060f1SDimitry Andric // Complex, from nxv4f32. 2675fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 2676fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 2677fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 2678fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 2679fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, 2680fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, 2681fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, 2682fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, 2683fe6060f1SDimitry Andric 2684fe6060f1SDimitry Andric // Complex, from nxv8f64. Illegal -> illegal conversions not required. 2685fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 2686fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 2687fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, 2688fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, 2689fe6060f1SDimitry Andric 2690fe6060f1SDimitry Andric // Complex, from nxv4f64. Illegal -> illegal conversions not required. 
2691fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 2692fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 2693fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 2694fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, 2695fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, 2696fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, 2697fe6060f1SDimitry Andric 2698fe6060f1SDimitry Andric // Complex, from nxv8f32. Illegal -> illegal conversions not required. 2699fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2700fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2701fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, 2702fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, 2703fe6060f1SDimitry Andric 2704fe6060f1SDimitry Andric // Complex, from nxv8f16. 2705fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2706fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2707fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2708fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2709fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, 2710fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, 2711fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, 2712fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, 2713fe6060f1SDimitry Andric 2714fe6060f1SDimitry Andric // Complex, from nxv4f16. 2715fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2716fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2717fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2718fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2719fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, 2720fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, 2721fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, 2722fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, 2723fe6060f1SDimitry Andric 2724fe6060f1SDimitry Andric // Complex, from nxv2f16. 2725fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2726fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2727fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2728fe6060f1SDimitry Andric { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2729fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, 2730fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, 2731fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, 2732fe6060f1SDimitry Andric { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, 2733fe6060f1SDimitry Andric 2734fe6060f1SDimitry Andric // Truncate from nxvmf32 to nxvmf16. 2735fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, 2736fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, 2737fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, 2738fe6060f1SDimitry Andric 2739fe6060f1SDimitry Andric // Truncate from nxvmf64 to nxvmf16. 
2740fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2741fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2742fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2743fe6060f1SDimitry Andric
2744fe6060f1SDimitry Andric // Truncate from nxvmf64 to nxvmf32.
2745fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2746fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2747fe6060f1SDimitry Andric { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2748fe6060f1SDimitry Andric
2749fe6060f1SDimitry Andric // Extend from nxvmf16 to nxvmf32.
2750fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2751fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2752fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2753fe6060f1SDimitry Andric
2754fe6060f1SDimitry Andric // Extend from nxvmf16 to nxvmf64.
2755fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2756fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2757fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2758fe6060f1SDimitry Andric
2759fe6060f1SDimitry Andric // Extend from nxvmf32 to nxvmf64.
2760fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2761fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2762fe6060f1SDimitry Andric { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2763fe6060f1SDimitry Andric
276404eeddc0SDimitry Andric // Bitcasts from float to integer
276504eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
276604eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
276704eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
276804eeddc0SDimitry Andric
276904eeddc0SDimitry Andric // Bitcasts from integer to float
277004eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
277104eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
277204eeddc0SDimitry Andric { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
277306c3fb27SDimitry Andric
277406c3fb27SDimitry Andric // Add cost for extending to illegal (too wide) scalable vectors.
277506c3fb27SDimitry Andric // Zero/sign extends are implemented by multiple unpack operations,
277606c3fb27SDimitry Andric // where each operation has a cost of 1.
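// As a worked example of the entries below: extending nxv16i8 to nxv16i32
// takes 2 unpacks to reach two nxv8i16 halves plus 4 more to reach four
// nxv4i32 quarters, for a total cost of 6; continuing to nxv16i64 adds 8
// further unpacks, giving 14.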
277706c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
277806c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
277906c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
278006c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
278106c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
278206c3fb27SDimitry Andric { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
278306c3fb27SDimitry Andric
278406c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
278506c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
278606c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
278706c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
278806c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
278906c3fb27SDimitry Andric { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
27900b57cec5SDimitry Andric };
27910b57cec5SDimitry Andric
279206c3fb27SDimitry Andric // We have to estimate the cost of a fixed-length operation performed on
279306c3fb27SDimitry Andric // SVE registers as the number of SVE registers required to hold the
279406c3fb27SDimitry Andric // fixed-length type times the cost of the equivalent scalable operation.
279506c3fb27SDimitry Andric EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
279606c3fb27SDimitry Andric if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
279706c3fb27SDimitry Andric SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
279806c3fb27SDimitry Andric ST->useSVEForFixedLengthVectors(WiderTy)) {
279906c3fb27SDimitry Andric std::pair<InstructionCost, MVT> LT =
280006c3fb27SDimitry Andric getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
280106c3fb27SDimitry Andric unsigned NumElements = AArch64::SVEBitsPerBlock /
28020fca6ea1SDimitry Andric LT.second.getScalarSizeInBits();
280306c3fb27SDimitry Andric return AdjustCost(
280406c3fb27SDimitry Andric LT.first *
280506c3fb27SDimitry Andric getCastInstrCost(
280606c3fb27SDimitry Andric Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
280706c3fb27SDimitry Andric ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
280806c3fb27SDimitry Andric CostKind, I));
280906c3fb27SDimitry Andric }
281006c3fb27SDimitry Andric
28110b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
28120b57cec5SDimitry Andric DstTy.getSimpleVT(),
28130b57cec5SDimitry Andric SrcTy.getSimpleVT()))
28145ffd83dbSDimitry Andric return AdjustCost(Entry->Cost);
28150b57cec5SDimitry Andric
281681ad6265SDimitry Andric static const TypeConversionCostTblEntry FP16Tbl[] = {
281781ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
281881ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
281981ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
282081ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
282181ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
282281ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
282381ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
282481ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
282581ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
282681ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
282781ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 282881ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 282981ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 283081ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 283181ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 283281ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 283381ad6265SDimitry Andric {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 283481ad6265SDimitry Andric {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 283581ad6265SDimitry Andric {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 283681ad6265SDimitry Andric {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 283781ad6265SDimitry Andric {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 283881ad6265SDimitry Andric {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 283981ad6265SDimitry Andric }; 284081ad6265SDimitry Andric 284181ad6265SDimitry Andric if (ST->hasFullFP16()) 284281ad6265SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 284381ad6265SDimitry Andric FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 284481ad6265SDimitry Andric return AdjustCost(Entry->Cost); 284581ad6265SDimitry Andric 28465f757f3fSDimitry Andric if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 28470fca6ea1SDimitry Andric CCH == TTI::CastContextHint::Masked && 28480fca6ea1SDimitry Andric ST->isSVEorStreamingSVEAvailable() && 28495f757f3fSDimitry Andric TLI->getTypeAction(Src->getContext(), SrcTy) == 28505f757f3fSDimitry Andric TargetLowering::TypePromoteInteger && 28515f757f3fSDimitry Andric TLI->getTypeAction(Dst->getContext(), DstTy) == 28525f757f3fSDimitry Andric TargetLowering::TypeSplitVector) { 28535f757f3fSDimitry Andric // The standard behaviour in the backend for these cases is to split the 28545f757f3fSDimitry Andric // extend up into two parts: 28555f757f3fSDimitry Andric // 1. Perform an extending load or masked load up to the legal type. 28565f757f3fSDimitry Andric // 2. Extend the loaded data to the final type. 28575f757f3fSDimitry Andric std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 28585f757f3fSDimitry Andric Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext()); 28595f757f3fSDimitry Andric InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( 28605f757f3fSDimitry Andric Opcode, LegalTy, Src, CCH, CostKind, I); 28615f757f3fSDimitry Andric InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( 28625f757f3fSDimitry Andric Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I); 28635f757f3fSDimitry Andric return Part1 + Part2; 28645f757f3fSDimitry Andric } 28655f757f3fSDimitry Andric 286606c3fb27SDimitry Andric // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, 286706c3fb27SDimitry Andric // but we also want to include the TTI::CastContextHint::Masked case too. 
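// As a sketch of the intent (not an exhaustive rule): a zero- or
// sign-extend fed by a masked load of a legal type can typically be folded
// into an extending masked load, so it should not be costed worse than the
// Normal (unmasked) context handled below.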
286806c3fb27SDimitry Andric if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 28690fca6ea1SDimitry Andric CCH == TTI::CastContextHint::Masked && 28700fca6ea1SDimitry Andric ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy)) 287106c3fb27SDimitry Andric CCH = TTI::CastContextHint::Normal; 287206c3fb27SDimitry Andric 2873e8d8bef9SDimitry Andric return AdjustCost( 2874e8d8bef9SDimitry Andric BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 28750b57cec5SDimitry Andric } 28760b57cec5SDimitry Andric 2877fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 2878fe6060f1SDimitry Andric Type *Dst, 28790b57cec5SDimitry Andric VectorType *VecTy, 28800b57cec5SDimitry Andric unsigned Index) { 28810b57cec5SDimitry Andric 28820b57cec5SDimitry Andric // Make sure we were given a valid extend opcode. 28830b57cec5SDimitry Andric assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 28840b57cec5SDimitry Andric "Invalid opcode"); 28850b57cec5SDimitry Andric 28860b57cec5SDimitry Andric // We are extending an element we extract from a vector, so the source type 28870b57cec5SDimitry Andric // of the extend is the element type of the vector. 28880b57cec5SDimitry Andric auto *Src = VecTy->getElementType(); 28890b57cec5SDimitry Andric 28900b57cec5SDimitry Andric // Sign- and zero-extends are for integer types only. 28910b57cec5SDimitry Andric assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 28920b57cec5SDimitry Andric 28930b57cec5SDimitry Andric // Get the cost for the extract. We compute the cost (if any) for the extend 28940b57cec5SDimitry Andric // below. 2895bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2896bdd1243dSDimitry Andric InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, 2897bdd1243dSDimitry Andric CostKind, Index, nullptr, nullptr); 28980b57cec5SDimitry Andric 28990b57cec5SDimitry Andric // Legalize the types. 2900bdd1243dSDimitry Andric auto VecLT = getTypeLegalizationCost(VecTy); 29010b57cec5SDimitry Andric auto DstVT = TLI->getValueType(DL, Dst); 29020b57cec5SDimitry Andric auto SrcVT = TLI->getValueType(DL, Src); 29030b57cec5SDimitry Andric 29040b57cec5SDimitry Andric // If the resulting type is still a vector and the destination type is legal, 29050b57cec5SDimitry Andric // we may get the extension for free. If not, get the default cost for the 29060b57cec5SDimitry Andric // extend. 29070b57cec5SDimitry Andric if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 2908e8d8bef9SDimitry Andric return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2909e8d8bef9SDimitry Andric CostKind); 29100b57cec5SDimitry Andric 29110b57cec5SDimitry Andric // The destination type should be larger than the element type. If not, get 29120b57cec5SDimitry Andric // the default cost for the extend. 2913e8d8bef9SDimitry Andric if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 2914e8d8bef9SDimitry Andric return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2915e8d8bef9SDimitry Andric CostKind); 29160b57cec5SDimitry Andric 29170b57cec5SDimitry Andric switch (Opcode) { 29180b57cec5SDimitry Andric default: 29190b57cec5SDimitry Andric llvm_unreachable("Opcode should be either SExt or ZExt"); 29200b57cec5SDimitry Andric 29210b57cec5SDimitry Andric // For sign-extends, we only need a smov, which performs the extension 29220b57cec5SDimitry Andric // automatically. 
29230b57cec5SDimitry Andric case Instruction::SExt: 29240b57cec5SDimitry Andric return Cost; 29250b57cec5SDimitry Andric 29260b57cec5SDimitry Andric // For zero-extends, the extend is performed automatically by a umov unless 29270b57cec5SDimitry Andric // the destination type is i64 and the element type is i8 or i16. 29280b57cec5SDimitry Andric case Instruction::ZExt: 29290b57cec5SDimitry Andric if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 29300b57cec5SDimitry Andric return Cost; 29310b57cec5SDimitry Andric } 29320b57cec5SDimitry Andric 29330b57cec5SDimitry Andric // If we are unable to perform the extend for free, get the default cost. 2934e8d8bef9SDimitry Andric return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 2935e8d8bef9SDimitry Andric CostKind); 29365ffd83dbSDimitry Andric } 29375ffd83dbSDimitry Andric 2938fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 2939fe6060f1SDimitry Andric TTI::TargetCostKind CostKind, 2940fe6060f1SDimitry Andric const Instruction *I) { 29415ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 29425ffd83dbSDimitry Andric return Opcode == Instruction::PHI ? 0 : 1; 29435ffd83dbSDimitry Andric assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 29445ffd83dbSDimitry Andric // Branches are assumed to be predicted. 29455ffd83dbSDimitry Andric return 0; 29460b57cec5SDimitry Andric } 29470b57cec5SDimitry Andric 294806c3fb27SDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, 294906c3fb27SDimitry Andric Type *Val, 2950bdd1243dSDimitry Andric unsigned Index, 2951bdd1243dSDimitry Andric bool HasRealUse) { 29520b57cec5SDimitry Andric assert(Val->isVectorTy() && "This must be a vector type"); 29530b57cec5SDimitry Andric 29540b57cec5SDimitry Andric if (Index != -1U) { 29550b57cec5SDimitry Andric // Legalize the type. 2956bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 29570b57cec5SDimitry Andric 29580b57cec5SDimitry Andric // This type is legalized to a scalar type. 29590b57cec5SDimitry Andric if (!LT.second.isVector()) 29600b57cec5SDimitry Andric return 0; 29610b57cec5SDimitry Andric 296204eeddc0SDimitry Andric // The type may be split. For fixed-width vectors we can normalize the 296304eeddc0SDimitry Andric // index to the new type. 296404eeddc0SDimitry Andric if (LT.second.isFixedLengthVector()) { 29650b57cec5SDimitry Andric unsigned Width = LT.second.getVectorNumElements(); 29660b57cec5SDimitry Andric Index = Index % Width; 296704eeddc0SDimitry Andric } 29680b57cec5SDimitry Andric 29690b57cec5SDimitry Andric // The element at index zero is already inside the vector. 2970bdd1243dSDimitry Andric // - For a physical (HasRealUse==true) insert-element or extract-element 2971bdd1243dSDimitry Andric // instruction that extracts integers, an explicit FPR -> GPR move is 2972bdd1243dSDimitry Andric // needed. So it has non-zero cost. 2973bdd1243dSDimitry Andric // - For the rest of cases (virtual instruction or element type is float), 2974bdd1243dSDimitry Andric // consider the instruction free. 297506c3fb27SDimitry Andric if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) 297606c3fb27SDimitry Andric return 0; 297706c3fb27SDimitry Andric 297806c3fb27SDimitry Andric // This is recognising a LD1 single-element structure to one lane of one 297906c3fb27SDimitry Andric // register instruction. 
I.e., if this is an `insertelement` instruction,
298006c3fb27SDimitry Andric // and its second operand is a load, then we will generate a LD1, which
298106c3fb27SDimitry Andric // is an expensive instruction.
298206c3fb27SDimitry Andric if (I && isa<LoadInst>(I->getOperand(1)))
298306c3fb27SDimitry Andric return ST->getVectorInsertExtractBaseCost() + 1;
298406c3fb27SDimitry Andric
298506c3fb27SDimitry Andric // i1 inserts and extracts will include an extra cset or cmp of the vector
298606c3fb27SDimitry Andric // value. Increase the cost by 1 to account for this.
298706c3fb27SDimitry Andric if (Val->getScalarSizeInBits() == 1)
298806c3fb27SDimitry Andric return ST->getVectorInsertExtractBaseCost() + 1;
298906c3fb27SDimitry Andric
2990bdd1243dSDimitry Andric // FIXME:
2991bdd1243dSDimitry Andric // If the extract-element and insert-element instructions could be
2992bdd1243dSDimitry Andric // simplified away (e.g., could be combined into users by looking at use-def
2993bdd1243dSDimitry Andric // context), they have no cost. This is not done in the first place for
2994bdd1243dSDimitry Andric // compile-time considerations.
29950b57cec5SDimitry Andric }
29960b57cec5SDimitry Andric
29970b57cec5SDimitry Andric // All other insert/extracts cost this much.
29980b57cec5SDimitry Andric return ST->getVectorInsertExtractBaseCost();
29990b57cec5SDimitry Andric }
30000b57cec5SDimitry Andric
3001bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3002bdd1243dSDimitry Andric TTI::TargetCostKind CostKind,
3003bdd1243dSDimitry Andric unsigned Index, Value *Op0,
3004bdd1243dSDimitry Andric Value *Op1) {
300506c3fb27SDimitry Andric bool HasRealUse =
300606c3fb27SDimitry Andric Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
300706c3fb27SDimitry Andric return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
3008bdd1243dSDimitry Andric }
3009bdd1243dSDimitry Andric
3010bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3011bdd1243dSDimitry Andric Type *Val,
3012bdd1243dSDimitry Andric TTI::TargetCostKind CostKind,
3013bdd1243dSDimitry Andric unsigned Index) {
301406c3fb27SDimitry Andric return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3015bdd1243dSDimitry Andric }
3016bdd1243dSDimitry Andric
30175f757f3fSDimitry Andric InstructionCost AArch64TTIImpl::getScalarizationOverhead(
30185f757f3fSDimitry Andric VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
30195f757f3fSDimitry Andric TTI::TargetCostKind CostKind) {
30205f757f3fSDimitry Andric if (isa<ScalableVectorType>(Ty))
30215f757f3fSDimitry Andric return InstructionCost::getInvalid();
30225f757f3fSDimitry Andric if (Ty->getElementType()->isFloatingPointTy())
30235f757f3fSDimitry Andric return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
30245f757f3fSDimitry Andric CostKind);
30255f757f3fSDimitry Andric return DemandedElts.popcount() * (Insert + Extract) *
30265f757f3fSDimitry Andric ST->getVectorInsertExtractBaseCost();
30275f757f3fSDimitry Andric }
30285f757f3fSDimitry Andric
3029fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
30305ffd83dbSDimitry Andric unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3031bdd1243dSDimitry Andric TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3032bdd1243dSDimitry Andric ArrayRef<const Value *> Args,
3033480093f4SDimitry Andric const Instruction *CxtI) {
3034bdd1243dSDimitry Andric
3035*62987288SDimitry Andric // The code-generator is currently not able to handle scalable vectors
3036*62987288SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3037*62987288SDimitry Andric // it. This change will be removed when code-generation for these types is
3038*62987288SDimitry Andric // sufficiently reliable.
3039*62987288SDimitry Andric if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3040*62987288SDimitry Andric if (VTy->getElementCount() == ElementCount::getScalable(1))
3041*62987288SDimitry Andric return InstructionCost::getInvalid();
3042*62987288SDimitry Andric
30435ffd83dbSDimitry Andric // TODO: Handle more cost kinds.
30445ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput)
3045bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3046bdd1243dSDimitry Andric Op2Info, Args, CxtI);
30475ffd83dbSDimitry Andric
30480b57cec5SDimitry Andric // Legalize the type.
3049bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
30500b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
30510b57cec5SDimitry Andric
30520b57cec5SDimitry Andric switch (ISD) {
30530b57cec5SDimitry Andric default:
3054bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3055bdd1243dSDimitry Andric Op2Info);
30560b57cec5SDimitry Andric case ISD::SDIV:
3057bdd1243dSDimitry Andric if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
30580b57cec5SDimitry Andric // On AArch64, scalar signed division by a power-of-two constant is
30590b57cec5SDimitry Andric // normally expanded to the sequence ADD + CMP + SELECT + SRA.
30600b57cec5SDimitry Andric // The OperandValue properties may not be the same as those of the
30610b57cec5SDimitry Andric // previous operation; conservatively assume OP_None.
306281ad6265SDimitry Andric InstructionCost Cost = getArithmeticInstrCost(
3063bdd1243dSDimitry Andric Instruction::Add, Ty, CostKind,
3064bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps());
3065bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3066bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps());
306781ad6265SDimitry Andric Cost += getArithmeticInstrCost(
3068bdd1243dSDimitry Andric Instruction::Select, Ty, CostKind,
3069bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps());
3070bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3071bdd1243dSDimitry Andric Op1Info.getNoProps(), Op2Info.getNoProps());
30720b57cec5SDimitry Andric return Cost;
30730b57cec5SDimitry Andric }
3074bdd1243dSDimitry Andric [[fallthrough]];
307581ad6265SDimitry Andric case ISD::UDIV: {
3076bdd1243dSDimitry Andric if (Op2Info.isConstant() && Op2Info.isUniform()) {
30770b57cec5SDimitry Andric auto VT = TLI->getValueType(DL, Ty);
30780b57cec5SDimitry Andric if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
30790b57cec5SDimitry Andric // Vector signed division by a constant is expanded to the
30800b57cec5SDimitry Andric // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
30810b57cec5SDimitry Andric // to MULHU + SUB + SRL + ADD + SRL.
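// With that expansion, the cost returned below is approximated as two
// multiplies, two adds/subs and two shifts plus one, i.e.
// 2 * MulCost + 2 * AddCost + 2 * ShrCost + 1.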
3082fe6060f1SDimitry Andric InstructionCost MulCost = getArithmeticInstrCost(
3083bdd1243dSDimitry Andric Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3084fe6060f1SDimitry Andric InstructionCost AddCost = getArithmeticInstrCost(
3085bdd1243dSDimitry Andric Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3086fe6060f1SDimitry Andric InstructionCost ShrCost = getArithmeticInstrCost(
3087bdd1243dSDimitry Andric Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
30880b57cec5SDimitry Andric return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
30890b57cec5SDimitry Andric }
30900b57cec5SDimitry Andric }
30910b57cec5SDimitry Andric
309281ad6265SDimitry Andric InstructionCost Cost = BaseT::getArithmeticInstrCost(
3093bdd1243dSDimitry Andric Opcode, Ty, CostKind, Op1Info, Op2Info);
30940b57cec5SDimitry Andric if (Ty->isVectorTy()) {
3095bdd1243dSDimitry Andric if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3096bdd1243dSDimitry Andric // If SDIV/UDIV operations are lowered using SVE, they can be costed
3097bdd1243dSDimitry Andric // more cheaply.
3098bdd1243dSDimitry Andric if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3099bdd1243dSDimitry Andric ->getPrimitiveSizeInBits()
3100bdd1243dSDimitry Andric .getFixedValue() < 128) {
3101bdd1243dSDimitry Andric EVT VT = TLI->getValueType(DL, Ty);
3102bdd1243dSDimitry Andric static const CostTblEntry DivTbl[]{
3103bdd1243dSDimitry Andric {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3104bdd1243dSDimitry Andric {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3105bdd1243dSDimitry Andric {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3106bdd1243dSDimitry Andric {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3107bdd1243dSDimitry Andric {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3108bdd1243dSDimitry Andric {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3109bdd1243dSDimitry Andric
3110bdd1243dSDimitry Andric const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3111bdd1243dSDimitry Andric if (nullptr != Entry)
3112bdd1243dSDimitry Andric return Entry->Cost;
3113bdd1243dSDimitry Andric }
3114bdd1243dSDimitry Andric // For 8/16-bit elements, the cost is higher because the type
3115bdd1243dSDimitry Andric // requires promotion and possibly splitting:
3116bdd1243dSDimitry Andric if (LT.second.getScalarType() == MVT::i8)
3117bdd1243dSDimitry Andric Cost *= 8;
3118bdd1243dSDimitry Andric else if (LT.second.getScalarType() == MVT::i16)
3119bdd1243dSDimitry Andric Cost *= 4;
3120bdd1243dSDimitry Andric return Cost;
3121bdd1243dSDimitry Andric } else {
3122bdd1243dSDimitry Andric // If one of the operands is a uniform constant then the cost for each
3123bdd1243dSDimitry Andric // element is the cost of insertion, extraction and division.
3124bdd1243dSDimitry Andric // Insertion cost = 2, extraction cost = 2, and division = the cost of
3125bdd1243dSDimitry Andric // the operation on the scalar type.
3126bdd1243dSDimitry Andric if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3127bdd1243dSDimitry Andric (Op2Info.isConstant() && Op2Info.isUniform())) {
3128bdd1243dSDimitry Andric if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3129bdd1243dSDimitry Andric InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3130bdd1243dSDimitry Andric Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3131bdd1243dSDimitry Andric return (4 + DivCost) * VTy->getNumElements();
3132bdd1243dSDimitry Andric }
3133bdd1243dSDimitry Andric }
3134bdd1243dSDimitry Andric // On AArch64, without SVE, vector divisions are expanded
3135bdd1243dSDimitry Andric // into scalar divisions of each pair of elements.
3136bdd1243dSDimitry Andric Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3137bdd1243dSDimitry Andric CostKind, Op1Info, Op2Info);
31385ffd83dbSDimitry Andric Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3139bdd1243dSDimitry Andric Op1Info, Op2Info);
3140bdd1243dSDimitry Andric }
3141bdd1243dSDimitry Andric
31420b57cec5SDimitry Andric // TODO: if one of the arguments is scalar, then it's not necessary to
31430b57cec5SDimitry Andric // double the cost of handling the vector elements.
31440b57cec5SDimitry Andric Cost += Cost;
31450b57cec5SDimitry Andric }
31460b57cec5SDimitry Andric return Cost;
314781ad6265SDimitry Andric }
31480b57cec5SDimitry Andric case ISD::MUL:
3149bdd1243dSDimitry Andric // When SVE is available, we can lower the v2i64 operation using
3150bdd1243dSDimitry Andric // the SVE mul instruction, which has a lower cost.
3151bdd1243dSDimitry Andric if (LT.second == MVT::v2i64 && ST->hasSVE())
3152bdd1243dSDimitry Andric return LT.first;
3153bdd1243dSDimitry Andric
3154bdd1243dSDimitry Andric // When SVE is not available, there is no MUL.2d instruction,
3155bdd1243dSDimitry Andric // which means mul <2 x i64> is expensive as elements are extracted
3156bdd1243dSDimitry Andric // from the vectors and the muls scalarized.
3157bdd1243dSDimitry Andric // As getScalarizationOverhead is a bit too pessimistic, we
3158bdd1243dSDimitry Andric // estimate the cost for an i64 vector directly here, which is:
315981ad6265SDimitry Andric // - four 2-cost i64 extracts,
316081ad6265SDimitry Andric // - two 2-cost i64 inserts, and
316181ad6265SDimitry Andric // - two 1-cost muls.
316281ad6265SDimitry Andric // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
316381ad6265SDimitry Andric // LT.first = 2 the cost is 28. If both operands are extensions it will not
316481ad6265SDimitry Andric // need to scalarize, so the cost can be cheaper (smull or umull).
316681ad6265SDimitry Andric if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
316781ad6265SDimitry Andric return LT.first;
316881ad6265SDimitry Andric return LT.first * 14;
3169e8d8bef9SDimitry Andric case ISD::ADD:
31700b57cec5SDimitry Andric case ISD::XOR:
31710b57cec5SDimitry Andric case ISD::OR:
31720b57cec5SDimitry Andric case ISD::AND:
317381ad6265SDimitry Andric case ISD::SRL:
317481ad6265SDimitry Andric case ISD::SRA:
317581ad6265SDimitry Andric case ISD::SHL:
31760b57cec5SDimitry Andric // These nodes are marked as 'custom' for combining purposes only.
31770b57cec5SDimitry Andric // We know that they are legal.
See LowerAdd in ISelLowering. 317881ad6265SDimitry Andric return LT.first; 31795ffd83dbSDimitry Andric 318006c3fb27SDimitry Andric case ISD::FNEG: 31815ffd83dbSDimitry Andric case ISD::FADD: 3182349cc55cSDimitry Andric case ISD::FSUB: 318306c3fb27SDimitry Andric // Increase the cost for half and bfloat types if not architecturally 318406c3fb27SDimitry Andric // supported. 318506c3fb27SDimitry Andric if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || 318606c3fb27SDimitry Andric (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) 318706c3fb27SDimitry Andric return 2 * LT.first; 318806c3fb27SDimitry Andric if (!Ty->getScalarType()->isFP128Ty()) 318906c3fb27SDimitry Andric return LT.first; 319006c3fb27SDimitry Andric [[fallthrough]]; 3191349cc55cSDimitry Andric case ISD::FMUL: 3192349cc55cSDimitry Andric case ISD::FDIV: 31935ffd83dbSDimitry Andric // These nodes are marked as 'custom' just to lower them to SVE. 31945ffd83dbSDimitry Andric // We know said lowering will incur no additional cost. 3195349cc55cSDimitry Andric if (!Ty->getScalarType()->isFP128Ty()) 319681ad6265SDimitry Andric return 2 * LT.first; 31975ffd83dbSDimitry Andric 3198bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 3199bdd1243dSDimitry Andric Op2Info); 32000fca6ea1SDimitry Andric case ISD::FREM: 32010fca6ea1SDimitry Andric // Pass nullptr as fmod/fmodf calls are emitted by the backend even when 32020fca6ea1SDimitry Andric // those functions are not declared in the module. 32030fca6ea1SDimitry Andric if (!Ty->isVectorTy()) 32040fca6ea1SDimitry Andric return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); 32050fca6ea1SDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 32060fca6ea1SDimitry Andric Op2Info); 32070b57cec5SDimitry Andric } 32080b57cec5SDimitry Andric } 32090b57cec5SDimitry Andric 3210fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 3211fe6060f1SDimitry Andric ScalarEvolution *SE, 32120b57cec5SDimitry Andric const SCEV *Ptr) { 32130b57cec5SDimitry Andric // Address computations in vectorized code with non-consecutive addresses will 32140b57cec5SDimitry Andric // likely result in more instructions compared to scalar code where the 32150b57cec5SDimitry Andric // computation can more often be merged into the index mode. The resulting 32160b57cec5SDimitry Andric // extra micro-ops can significantly decrease throughput. 321706c3fb27SDimitry Andric unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; 32180b57cec5SDimitry Andric int MaxMergeDistance = 64; 32190b57cec5SDimitry Andric 32200b57cec5SDimitry Andric if (Ty->isVectorTy() && SE && 32210b57cec5SDimitry Andric !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 32220b57cec5SDimitry Andric return NumVectorInstToHideOverhead; 32230b57cec5SDimitry Andric 32240b57cec5SDimitry Andric // In many cases the address computation is not merged into the instruction 32250b57cec5SDimitry Andric // addressing mode. 32260b57cec5SDimitry Andric return 1; 32270b57cec5SDimitry Andric } 32280b57cec5SDimitry Andric 3229fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 3230fe6060f1SDimitry Andric Type *CondTy, 3231fe6060f1SDimitry Andric CmpInst::Predicate VecPred, 32325ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 32335ffd83dbSDimitry Andric const Instruction *I) { 32345ffd83dbSDimitry Andric // TODO: Handle other cost kinds. 
32355ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput)
3236e8d8bef9SDimitry Andric return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3237e8d8bef9SDimitry Andric I);
32380b57cec5SDimitry Andric
32390b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
32400b57cec5SDimitry Andric // We don't lower some vector selects well when they are wider than the
32410b57cec5SDimitry Andric // register width.
3242e8d8bef9SDimitry Andric if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
32430b57cec5SDimitry Andric // We would need this many instructions to hide the scalarization happening.
32440b57cec5SDimitry Andric const int AmortizationCost = 20;
3245e8d8bef9SDimitry Andric
3246e8d8bef9SDimitry Andric // If VecPred is not set, check if we can get a predicate from the context
3247e8d8bef9SDimitry Andric // instruction, if its type matches the requested ValTy.
3248e8d8bef9SDimitry Andric if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3249e8d8bef9SDimitry Andric CmpInst::Predicate CurrentPred;
3250e8d8bef9SDimitry Andric if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3251e8d8bef9SDimitry Andric m_Value())))
3252e8d8bef9SDimitry Andric VecPred = CurrentPred;
3253e8d8bef9SDimitry Andric }
32541fd87a68SDimitry Andric // Check if we have a compare/select chain that can be lowered using
32551fd87a68SDimitry Andric // a (F)CMxx & BFI pair.
32561fd87a68SDimitry Andric if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
32571fd87a68SDimitry Andric VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
32581fd87a68SDimitry Andric VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
32591fd87a68SDimitry Andric VecPred == CmpInst::FCMP_UNE) {
32601fd87a68SDimitry Andric static const auto ValidMinMaxTys = {
32611fd87a68SDimitry Andric MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
32621fd87a68SDimitry Andric MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
32631fd87a68SDimitry Andric static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
32641fd87a68SDimitry Andric
3265bdd1243dSDimitry Andric auto LT = getTypeLegalizationCost(ValTy);
32661fd87a68SDimitry Andric if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
32671fd87a68SDimitry Andric (ST->hasFullFP16() &&
32681fd87a68SDimitry Andric any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3269e8d8bef9SDimitry Andric return LT.first;
3270e8d8bef9SDimitry Andric }
3271e8d8bef9SDimitry Andric
32720b57cec5SDimitry Andric static const TypeConversionCostTblEntry
32730b57cec5SDimitry Andric VectorSelectTbl[] = {
327406c3fb27SDimitry Andric { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
327506c3fb27SDimitry Andric { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
327606c3fb27SDimitry Andric { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
327706c3fb27SDimitry Andric { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
327806c3fb27SDimitry Andric { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
32790b57cec5SDimitry Andric { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
32800b57cec5SDimitry Andric { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
32810b57cec5SDimitry Andric { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
32820b57cec5SDimitry Andric { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
32830b57cec5SDimitry Andric { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
32840b57cec5SDimitry Andric { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
32850b57cec5SDimitry Andric }; 32860b57cec5SDimitry Andric 32870b57cec5SDimitry Andric EVT SelCondTy = TLI->getValueType(DL, CondTy); 32880b57cec5SDimitry Andric EVT SelValTy = TLI->getValueType(DL, ValTy); 32890b57cec5SDimitry Andric if (SelCondTy.isSimple() && SelValTy.isSimple()) { 32900b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, 32910b57cec5SDimitry Andric SelCondTy.getSimpleVT(), 32920b57cec5SDimitry Andric SelValTy.getSimpleVT())) 32930b57cec5SDimitry Andric return Entry->Cost; 32940b57cec5SDimitry Andric } 32950b57cec5SDimitry Andric } 329606c3fb27SDimitry Andric 329706c3fb27SDimitry Andric if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) { 329806c3fb27SDimitry Andric auto LT = getTypeLegalizationCost(ValTy); 329906c3fb27SDimitry Andric // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back. 330006c3fb27SDimitry Andric if (LT.second == MVT::v4f16 && !ST->hasFullFP16()) 330106c3fb27SDimitry Andric return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn 330206c3fb27SDimitry Andric } 330306c3fb27SDimitry Andric 330406c3fb27SDimitry Andric // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. 330506c3fb27SDimitry Andric // FIXME: This can apply to more conditions and add/sub if it can be shown to 330606c3fb27SDimitry Andric // be profitable. 330706c3fb27SDimitry Andric if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && 330806c3fb27SDimitry Andric ICmpInst::isEquality(VecPred) && 330906c3fb27SDimitry Andric TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && 331006c3fb27SDimitry Andric match(I->getOperand(1), m_Zero()) && 331106c3fb27SDimitry Andric match(I->getOperand(0), m_And(m_Value(), m_Value()))) 331206c3fb27SDimitry Andric return 0; 331306c3fb27SDimitry Andric 3314e8d8bef9SDimitry Andric // The base case handles scalable vectors fine for now, since it treats the 3315e8d8bef9SDimitry Andric // cost as 1 * legalization cost. 3316e8d8bef9SDimitry Andric return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 33170b57cec5SDimitry Andric } 33180b57cec5SDimitry Andric 33190b57cec5SDimitry Andric AArch64TTIImpl::TTI::MemCmpExpansionOptions 33200b57cec5SDimitry Andric AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 33210b57cec5SDimitry Andric TTI::MemCmpExpansionOptions Options; 33225ffd83dbSDimitry Andric if (ST->requiresStrictAlign()) { 33235ffd83dbSDimitry Andric // TODO: Add cost modeling for strict align. Misaligned loads expand to 33245ffd83dbSDimitry Andric // a bunch of instructions when strict align is enabled. 33255ffd83dbSDimitry Andric return Options; 33265ffd83dbSDimitry Andric } 33275ffd83dbSDimitry Andric Options.AllowOverlappingLoads = true; 33280b57cec5SDimitry Andric Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 33290b57cec5SDimitry Andric Options.NumLoadsPerBlock = Options.MaxNumLoads; 33300b57cec5SDimitry Andric // TODO: Though vector loads usually perform well on AArch64, in some targets 33310b57cec5SDimitry Andric // they may wake up the FP unit, which raises the power consumption. Perhaps 33320b57cec5SDimitry Andric // they could be used with no holds barred (-O3). 
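// For example, with overlapping loads allowed a 7-byte memcmp can be
// expanded as two 4-byte loads at offsets 0 and 3 rather than a
// 4 + 2 + 1 byte load sequence.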
33330b57cec5SDimitry Andric Options.LoadSizes = {8, 4, 2, 1}; 33345f757f3fSDimitry Andric Options.AllowedTailExpansions = {3, 5, 6}; 33350b57cec5SDimitry Andric return Options; 33360b57cec5SDimitry Andric } 33370b57cec5SDimitry Andric 333881ad6265SDimitry Andric bool AArch64TTIImpl::prefersVectorizedAddressing() const { 333981ad6265SDimitry Andric return ST->hasSVE(); 334081ad6265SDimitry Andric } 334181ad6265SDimitry Andric 3342fe6060f1SDimitry Andric InstructionCost 3343fe6060f1SDimitry Andric AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 3344fe6060f1SDimitry Andric Align Alignment, unsigned AddressSpace, 3345fe6060f1SDimitry Andric TTI::TargetCostKind CostKind) { 33460eae32dcSDimitry Andric if (useNeonVector(Src)) 3347fe6060f1SDimitry Andric return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 3348fe6060f1SDimitry Andric CostKind); 3349bdd1243dSDimitry Andric auto LT = getTypeLegalizationCost(Src); 3350fe6060f1SDimitry Andric if (!LT.first.isValid()) 3351fe6060f1SDimitry Andric return InstructionCost::getInvalid(); 3352fe6060f1SDimitry Andric 33530fca6ea1SDimitry Andric // Return an invalid cost for element types that we are unable to lower. 33540fca6ea1SDimitry Andric auto *VT = cast<VectorType>(Src); 33550fca6ea1SDimitry Andric if (VT->getElementType()->isIntegerTy(1)) 33560fca6ea1SDimitry Andric return InstructionCost::getInvalid(); 33570fca6ea1SDimitry Andric 3358fe6060f1SDimitry Andric // The code-generator is currently not able to handle scalable vectors 3359fe6060f1SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3360fe6060f1SDimitry Andric // it. This change will be removed when code-generation for these types is 3361fe6060f1SDimitry Andric // sufficiently reliable. 33620fca6ea1SDimitry Andric if (VT->getElementCount() == ElementCount::getScalable(1)) 3363fe6060f1SDimitry Andric return InstructionCost::getInvalid(); 3364fe6060f1SDimitry Andric 3365bdd1243dSDimitry Andric return LT.first; 3366fe6060f1SDimitry Andric } 3367fe6060f1SDimitry Andric 33680eae32dcSDimitry Andric static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { 33690eae32dcSDimitry Andric return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; 33700eae32dcSDimitry Andric } 33710eae32dcSDimitry Andric 3372fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 3373e8d8bef9SDimitry Andric unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 3374e8d8bef9SDimitry Andric Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 33755f757f3fSDimitry Andric if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy)) 3376e8d8bef9SDimitry Andric return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 3377e8d8bef9SDimitry Andric Alignment, CostKind, I); 3378e8d8bef9SDimitry Andric auto *VT = cast<VectorType>(DataTy); 3379bdd1243dSDimitry Andric auto LT = getTypeLegalizationCost(DataTy); 3380fe6060f1SDimitry Andric if (!LT.first.isValid()) 3381fe6060f1SDimitry Andric return InstructionCost::getInvalid(); 3382e8d8bef9SDimitry Andric 33830fca6ea1SDimitry Andric // Return an invalid cost for element types that we are unable to lower. 
33845f757f3fSDimitry Andric if (!LT.second.isVector() || 33850fca6ea1SDimitry Andric !isElementTypeLegalForScalableVector(VT->getElementType()) || 33860fca6ea1SDimitry Andric VT->getElementType()->isIntegerTy(1)) 33875f757f3fSDimitry Andric return InstructionCost::getInvalid(); 33885f757f3fSDimitry Andric 3389fe6060f1SDimitry Andric // The code-generator is currently not able to handle scalable vectors 3390fe6060f1SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3391fe6060f1SDimitry Andric // it. This change will be removed when code-generation for these types is 3392fe6060f1SDimitry Andric // sufficiently reliable. 33930fca6ea1SDimitry Andric if (VT->getElementCount() == ElementCount::getScalable(1)) 3394fe6060f1SDimitry Andric return InstructionCost::getInvalid(); 3395fe6060f1SDimitry Andric 3396fe6060f1SDimitry Andric ElementCount LegalVF = LT.second.getVectorElementCount(); 3397fe6060f1SDimitry Andric InstructionCost MemOpCost = 3398bdd1243dSDimitry Andric getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, 3399bdd1243dSDimitry Andric {TTI::OK_AnyValue, TTI::OP_None}, I); 34000eae32dcSDimitry Andric // Add on an overhead cost for using gathers/scatters. 34010eae32dcSDimitry Andric // TODO: At the moment this is applied unilaterally for all CPUs, but at some 34020eae32dcSDimitry Andric // point we may want a per-CPU overhead. 34030eae32dcSDimitry Andric MemOpCost *= getSVEGatherScatterOverhead(Opcode); 3404fe6060f1SDimitry Andric return LT.first * MemOpCost * getMaxNumElements(LegalVF); 3405e8d8bef9SDimitry Andric } 3406e8d8bef9SDimitry Andric 3407e8d8bef9SDimitry Andric bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { 3408e8d8bef9SDimitry Andric return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); 3409e8d8bef9SDimitry Andric } 3410e8d8bef9SDimitry Andric 3411fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, 3412fe6060f1SDimitry Andric MaybeAlign Alignment, 3413fe6060f1SDimitry Andric unsigned AddressSpace, 34145ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 3415bdd1243dSDimitry Andric TTI::OperandValueInfo OpInfo, 34160b57cec5SDimitry Andric const Instruction *I) { 3417fe6060f1SDimitry Andric EVT VT = TLI->getValueType(DL, Ty, true); 34185ffd83dbSDimitry Andric // Type legalization can't handle structs 3419fe6060f1SDimitry Andric if (VT == MVT::Other) 34205ffd83dbSDimitry Andric return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, 34215ffd83dbSDimitry Andric CostKind); 34225ffd83dbSDimitry Andric 3423bdd1243dSDimitry Andric auto LT = getTypeLegalizationCost(Ty); 3424fe6060f1SDimitry Andric if (!LT.first.isValid()) 3425fe6060f1SDimitry Andric return InstructionCost::getInvalid(); 3426fe6060f1SDimitry Andric 3427fe6060f1SDimitry Andric // The code-generator is currently not able to handle scalable vectors 3428fe6060f1SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3429fe6060f1SDimitry Andric // it. This change will be removed when code-generation for these types is 3430fe6060f1SDimitry Andric // sufficiently reliable. 34310fca6ea1SDimitry Andric // We also only support full register predicate loads and stores. 
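// For example, a load of <vscale x 16 x i1> or <vscale x 32 x i1> (whole
// predicate registers) is costed normally, while <vscale x 8 x i1> is
// reported as invalid.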
3432fe6060f1SDimitry Andric if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
34330fca6ea1SDimitry Andric if (VTy->getElementCount() == ElementCount::getScalable(1) ||
34340fca6ea1SDimitry Andric (VTy->getElementType()->isIntegerTy(1) &&
34350fca6ea1SDimitry Andric !VTy->getElementCount().isKnownMultipleOf(
34360fca6ea1SDimitry Andric ElementCount::getScalable(16))))
3437fe6060f1SDimitry Andric return InstructionCost::getInvalid();
3438fe6060f1SDimitry Andric
3439fe6060f1SDimitry Andric // TODO: consider latency as well for TCK_SizeAndLatency.
3440fe6060f1SDimitry Andric if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3441fe6060f1SDimitry Andric return LT.first;
3442fe6060f1SDimitry Andric
3443fe6060f1SDimitry Andric if (CostKind != TTI::TCK_RecipThroughput)
3444fe6060f1SDimitry Andric return 1;
34450b57cec5SDimitry Andric
34460b57cec5SDimitry Andric if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3447480093f4SDimitry Andric LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
34480b57cec5SDimitry Andric // Unaligned stores are extremely inefficient. We don't split all
34490b57cec5SDimitry Andric // unaligned 128-bit stores because of the negative impact that has shown
34500b57cec5SDimitry Andric // in practice on inlined block copy code.
34510b57cec5SDimitry Andric // We make such stores expensive so that we will only vectorize if there
34520b57cec5SDimitry Andric // are 6 other instructions getting vectorized.
34530b57cec5SDimitry Andric const int AmortizationCost = 6;
34540b57cec5SDimitry Andric
34550b57cec5SDimitry Andric return LT.first * 2 * AmortizationCost;
34560b57cec5SDimitry Andric }
34570b57cec5SDimitry Andric
3458bdd1243dSDimitry Andric // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3459bdd1243dSDimitry Andric if (Ty->isPtrOrPtrVectorTy())
3460bdd1243dSDimitry Andric return LT.first;
3461bdd1243dSDimitry Andric
34627a6dacacSDimitry Andric if (useNeonVector(Ty)) {
3463fe6060f1SDimitry Andric // Check truncating stores and extending loads.
34647a6dacacSDimitry Andric if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3465fe6060f1SDimitry Andric // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3466fe6060f1SDimitry Andric if (VT == MVT::v4i8)
3467fe6060f1SDimitry Andric return 2;
3468fe6060f1SDimitry Andric // Otherwise we need to scalarize.
3469fe6060f1SDimitry Andric return cast<FixedVectorType>(Ty)->getNumElements() * 2;
34700b57cec5SDimitry Andric }
34717a6dacacSDimitry Andric EVT EltVT = VT.getVectorElementType();
34727a6dacacSDimitry Andric unsigned EltSize = EltVT.getScalarSizeInBits();
34737a6dacacSDimitry Andric if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
34747a6dacacSDimitry Andric VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
34757a6dacacSDimitry Andric *Alignment != Align(1))
34767a6dacacSDimitry Andric return LT.first;
34777a6dacacSDimitry Andric // FIXME: v3i8 lowering is currently very inefficient, due to automatic
34787a6dacacSDimitry Andric // widening to v4i8, which produces suboptimal results.
34797a6dacacSDimitry Andric if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
34807a6dacacSDimitry Andric return LT.first;
34817a6dacacSDimitry Andric
34827a6dacacSDimitry Andric // Check non-power-of-2 loads/stores for legal vector element types with
34837a6dacacSDimitry Andric // NEON.
Non-power-of-2 memory ops will get broken down to a set of 34847a6dacacSDimitry Andric // operations on smaller power-of-2 ops, including ld1/st1. 34857a6dacacSDimitry Andric LLVMContext &C = Ty->getContext(); 34867a6dacacSDimitry Andric InstructionCost Cost(0); 34877a6dacacSDimitry Andric SmallVector<EVT> TypeWorklist; 34887a6dacacSDimitry Andric TypeWorklist.push_back(VT); 34897a6dacacSDimitry Andric while (!TypeWorklist.empty()) { 34907a6dacacSDimitry Andric EVT CurrVT = TypeWorklist.pop_back_val(); 34917a6dacacSDimitry Andric unsigned CurrNumElements = CurrVT.getVectorNumElements(); 34927a6dacacSDimitry Andric if (isPowerOf2_32(CurrNumElements)) { 34937a6dacacSDimitry Andric Cost += 1; 34947a6dacacSDimitry Andric continue; 34957a6dacacSDimitry Andric } 34967a6dacacSDimitry Andric 34977a6dacacSDimitry Andric unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2; 34987a6dacacSDimitry Andric TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2)); 34997a6dacacSDimitry Andric TypeWorklist.push_back( 35007a6dacacSDimitry Andric EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2)); 35017a6dacacSDimitry Andric } 35027a6dacacSDimitry Andric return Cost; 35037a6dacacSDimitry Andric } 35040b57cec5SDimitry Andric 35050b57cec5SDimitry Andric return LT.first; 35060b57cec5SDimitry Andric } 35070b57cec5SDimitry Andric 3508fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 35095ffd83dbSDimitry Andric unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 35105ffd83dbSDimitry Andric Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 35115ffd83dbSDimitry Andric bool UseMaskForCond, bool UseMaskForGaps) { 35120b57cec5SDimitry Andric assert(Factor >= 2 && "Invalid interleave factor"); 351306c3fb27SDimitry Andric auto *VecVTy = cast<VectorType>(VecTy); 35140b57cec5SDimitry Andric 351506c3fb27SDimitry Andric if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2)) 351606c3fb27SDimitry Andric return InstructionCost::getInvalid(); 351706c3fb27SDimitry Andric 351806c3fb27SDimitry Andric // Vectorization for masked interleaved accesses is only enabled for scalable 351906c3fb27SDimitry Andric // VF. 352006c3fb27SDimitry Andric if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) 352106c3fb27SDimitry Andric return InstructionCost::getInvalid(); 352206c3fb27SDimitry Andric 352306c3fb27SDimitry Andric if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { 352406c3fb27SDimitry Andric unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); 35255ffd83dbSDimitry Andric auto *SubVecTy = 352606c3fb27SDimitry Andric VectorType::get(VecVTy->getElementType(), 352706c3fb27SDimitry Andric VecVTy->getElementCount().divideCoefficientBy(Factor)); 35280b57cec5SDimitry Andric 35290b57cec5SDimitry Andric // ldN/stN only support legal vector types of size 64 or 128 in bits. 35300b57cec5SDimitry Andric // Accesses having vector types that are a multiple of 128 bits can be 35310b57cec5SDimitry Andric // matched to more than one ldN/stN instruction. 
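// For example, de-interleaving a v16i32 with Factor == 2 uses a v8i32
// SubVecTy (256 bits), i.e. two 128-bit ld2 accesses, so the cost returned
// below is Factor (2) * getNumInterleavedAccesses (2) = 4.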
3532349cc55cSDimitry Andric bool UseScalable; 353306c3fb27SDimitry Andric if (MinElts % Factor == 0 && 3534349cc55cSDimitry Andric TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 3535349cc55cSDimitry Andric return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 35360b57cec5SDimitry Andric } 35370b57cec5SDimitry Andric 35380b57cec5SDimitry Andric return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 35395ffd83dbSDimitry Andric Alignment, AddressSpace, CostKind, 35400b57cec5SDimitry Andric UseMaskForCond, UseMaskForGaps); 35410b57cec5SDimitry Andric } 35420b57cec5SDimitry Andric 3543fe6060f1SDimitry Andric InstructionCost 3544fe6060f1SDimitry Andric AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 3545fe6060f1SDimitry Andric InstructionCost Cost = 0; 35465ffd83dbSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 35470b57cec5SDimitry Andric for (auto *I : Tys) { 35480b57cec5SDimitry Andric if (!I->isVectorTy()) 35490b57cec5SDimitry Andric continue; 35505ffd83dbSDimitry Andric if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 35515ffd83dbSDimitry Andric 128) 35525ffd83dbSDimitry Andric Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 35535ffd83dbSDimitry Andric getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 35540b57cec5SDimitry Andric } 35550b57cec5SDimitry Andric return Cost; 35560b57cec5SDimitry Andric } 35570b57cec5SDimitry Andric 355806c3fb27SDimitry Andric unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { 35590b57cec5SDimitry Andric return ST->getMaxInterleaveFactor(); 35600b57cec5SDimitry Andric } 35610b57cec5SDimitry Andric 35620b57cec5SDimitry Andric // For Falkor, we want to avoid having too many strided loads in a loop since 35630b57cec5SDimitry Andric // that can exhaust the HW prefetcher resources. We adjust the unroller 35640b57cec5SDimitry Andric // MaxCount preference below to attempt to ensure unrolling doesn't create too 35650b57cec5SDimitry Andric // many strided loads. 35660b57cec5SDimitry Andric static void 35670b57cec5SDimitry Andric getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 35680b57cec5SDimitry Andric TargetTransformInfo::UnrollingPreferences &UP) { 35690b57cec5SDimitry Andric enum { MaxStridedLoads = 7 }; 35700b57cec5SDimitry Andric auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 35710b57cec5SDimitry Andric int StridedLoads = 0; 35720b57cec5SDimitry Andric // FIXME? We could make this more precise by looking at the CFG and 35730b57cec5SDimitry Andric // e.g. not counting loads in each side of an if-then-else diamond. 35740b57cec5SDimitry Andric for (const auto BB : L->blocks()) { 35750b57cec5SDimitry Andric for (auto &I : *BB) { 35760b57cec5SDimitry Andric LoadInst *LMemI = dyn_cast<LoadInst>(&I); 35770b57cec5SDimitry Andric if (!LMemI) 35780b57cec5SDimitry Andric continue; 35790b57cec5SDimitry Andric 35800b57cec5SDimitry Andric Value *PtrValue = LMemI->getPointerOperand(); 35810b57cec5SDimitry Andric if (L->isLoopInvariant(PtrValue)) 35820b57cec5SDimitry Andric continue; 35830b57cec5SDimitry Andric 35840b57cec5SDimitry Andric const SCEV *LSCEV = SE.getSCEV(PtrValue); 35850b57cec5SDimitry Andric const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); 35860b57cec5SDimitry Andric if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) 35870b57cec5SDimitry Andric continue; 35880b57cec5SDimitry Andric 35890b57cec5SDimitry Andric // FIXME? 
We could take pairing of unrolled load copies into account
35900b57cec5SDimitry Andric // by looking at the AddRec, but we would probably have to limit this
35910b57cec5SDimitry Andric // to loops with no stores or other memory optimization barriers.
35920b57cec5SDimitry Andric ++StridedLoads;
35930b57cec5SDimitry Andric // We've seen enough strided loads that seeing more won't make a
35940b57cec5SDimitry Andric // difference.
35950b57cec5SDimitry Andric if (StridedLoads > MaxStridedLoads / 2)
35960b57cec5SDimitry Andric return StridedLoads;
35970b57cec5SDimitry Andric }
35980b57cec5SDimitry Andric }
35990b57cec5SDimitry Andric return StridedLoads;
36000b57cec5SDimitry Andric };
36010b57cec5SDimitry Andric
36020b57cec5SDimitry Andric int StridedLoads = countStridedLoads(L, SE);
36030b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
36040b57cec5SDimitry Andric << " strided loads\n");
36050b57cec5SDimitry Andric // Pick the largest power of 2 unroll count that won't result in too many
36060b57cec5SDimitry Andric // strided loads.
36070b57cec5SDimitry Andric if (StridedLoads) {
36080b57cec5SDimitry Andric UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
36090b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
36100b57cec5SDimitry Andric << UP.MaxCount << '\n');
36110b57cec5SDimitry Andric }
36120b57cec5SDimitry Andric }
36130b57cec5SDimitry Andric
36140b57cec5SDimitry Andric void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3615349cc55cSDimitry Andric TTI::UnrollingPreferences &UP,
3616349cc55cSDimitry Andric OptimizationRemarkEmitter *ORE) {
36170b57cec5SDimitry Andric // Enable partial unrolling and runtime unrolling.
3618349cc55cSDimitry Andric BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3619349cc55cSDimitry Andric
3620349cc55cSDimitry Andric UP.UpperBound = true;
36210b57cec5SDimitry Andric
36220b57cec5SDimitry Andric // An inner loop is more likely to be hot, and its runtime check can be
36230b57cec5SDimitry Andric // hoisted out by the LICM pass, so the overhead is lower; try a larger
36240b57cec5SDimitry Andric // threshold to unroll more loops.
36250b57cec5SDimitry Andric if (L->getLoopDepth() > 1)
36260b57cec5SDimitry Andric UP.PartialThreshold *= 2;
36270b57cec5SDimitry Andric
36280b57cec5SDimitry Andric // Disable partial & runtime unrolling on -Os.
36290b57cec5SDimitry Andric UP.PartialOptSizeThreshold = 0;
36300b57cec5SDimitry Andric
36310b57cec5SDimitry Andric if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
36320b57cec5SDimitry Andric EnableFalkorHWPFUnrollFix)
36330b57cec5SDimitry Andric getFalkorUnrollingPreferences(L, SE, UP);
3634fe6060f1SDimitry Andric
3635fe6060f1SDimitry Andric // Scan the loop: don't unroll loops with calls as this could prevent
3636fe6060f1SDimitry Andric // inlining. Don't unroll vector loops either, as they don't benefit much from
3637fe6060f1SDimitry Andric // unrolling.
3638fe6060f1SDimitry Andric for (auto *BB : L->getBlocks()) {
3639fe6060f1SDimitry Andric for (auto &I : *BB) {
3640fe6060f1SDimitry Andric // Don't unroll vectorised loops.
3641fe6060f1SDimitry Andric if (I.getType()->isVectorTy()) 3642fe6060f1SDimitry Andric return; 3643fe6060f1SDimitry Andric 3644fe6060f1SDimitry Andric if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 3645fe6060f1SDimitry Andric if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 3646fe6060f1SDimitry Andric if (!isLoweredToCall(F)) 3647fe6060f1SDimitry Andric continue; 3648fe6060f1SDimitry Andric } 3649fe6060f1SDimitry Andric return; 3650fe6060f1SDimitry Andric } 3651fe6060f1SDimitry Andric } 3652fe6060f1SDimitry Andric } 3653fe6060f1SDimitry Andric 3654fe6060f1SDimitry Andric // Enable runtime unrolling for in-order models 3655fe6060f1SDimitry Andric // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 3656fe6060f1SDimitry Andric // checking for that case, we can ensure that the default behaviour is 3657fe6060f1SDimitry Andric // unchanged 3658fe6060f1SDimitry Andric if (ST->getProcFamily() != AArch64Subtarget::Others && 3659fe6060f1SDimitry Andric !ST->getSchedModel().isOutOfOrder()) { 3660fe6060f1SDimitry Andric UP.Runtime = true; 3661fe6060f1SDimitry Andric UP.Partial = true; 3662fe6060f1SDimitry Andric UP.UnrollRemainder = true; 3663fe6060f1SDimitry Andric UP.DefaultUnrollRuntimeCount = 4; 3664fe6060f1SDimitry Andric 3665fe6060f1SDimitry Andric UP.UnrollAndJam = true; 3666fe6060f1SDimitry Andric UP.UnrollAndJamInnerLoopThreshold = 60; 3667fe6060f1SDimitry Andric } 36680b57cec5SDimitry Andric } 36690b57cec5SDimitry Andric 36705ffd83dbSDimitry Andric void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 36715ffd83dbSDimitry Andric TTI::PeelingPreferences &PP) { 36725ffd83dbSDimitry Andric BaseT::getPeelingPreferences(L, SE, PP); 36735ffd83dbSDimitry Andric } 36745ffd83dbSDimitry Andric 36750b57cec5SDimitry Andric Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 36760b57cec5SDimitry Andric Type *ExpectedType) { 36770b57cec5SDimitry Andric switch (Inst->getIntrinsicID()) { 36780b57cec5SDimitry Andric default: 36790b57cec5SDimitry Andric return nullptr; 36800b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st2: 36810b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st3: 36820b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st4: { 36830b57cec5SDimitry Andric // Create a struct type 36840b57cec5SDimitry Andric StructType *ST = dyn_cast<StructType>(ExpectedType); 36850b57cec5SDimitry Andric if (!ST) 36860b57cec5SDimitry Andric return nullptr; 3687349cc55cSDimitry Andric unsigned NumElts = Inst->arg_size() - 1; 36880b57cec5SDimitry Andric if (ST->getNumElements() != NumElts) 36890b57cec5SDimitry Andric return nullptr; 36900b57cec5SDimitry Andric for (unsigned i = 0, e = NumElts; i != e; ++i) { 36910b57cec5SDimitry Andric if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 36920b57cec5SDimitry Andric return nullptr; 36930b57cec5SDimitry Andric } 3694bdd1243dSDimitry Andric Value *Res = PoisonValue::get(ExpectedType); 36950b57cec5SDimitry Andric IRBuilder<> Builder(Inst); 36960b57cec5SDimitry Andric for (unsigned i = 0, e = NumElts; i != e; ++i) { 36970b57cec5SDimitry Andric Value *L = Inst->getArgOperand(i); 36980b57cec5SDimitry Andric Res = Builder.CreateInsertValue(Res, L, i); 36990b57cec5SDimitry Andric } 37000b57cec5SDimitry Andric return Res; 37010b57cec5SDimitry Andric } 37020b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld2: 37030b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld3: 37040b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld4: 37050b57cec5SDimitry Andric if 
37060b57cec5SDimitry Andric return Inst;
37070b57cec5SDimitry Andric return nullptr;
37080b57cec5SDimitry Andric }
37090b57cec5SDimitry Andric }
37100b57cec5SDimitry Andric 
37110b57cec5SDimitry Andric bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
37120b57cec5SDimitry Andric MemIntrinsicInfo &Info) {
37130b57cec5SDimitry Andric switch (Inst->getIntrinsicID()) {
37140b57cec5SDimitry Andric default:
37150b57cec5SDimitry Andric break;
37160b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld2:
37170b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld3:
37180b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld4:
37190b57cec5SDimitry Andric Info.ReadMem = true;
37200b57cec5SDimitry Andric Info.WriteMem = false;
37210b57cec5SDimitry Andric Info.PtrVal = Inst->getArgOperand(0);
37220b57cec5SDimitry Andric break;
37230b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st2:
37240b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st3:
37250b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st4:
37260b57cec5SDimitry Andric Info.ReadMem = false;
37270b57cec5SDimitry Andric Info.WriteMem = true;
3728349cc55cSDimitry Andric Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
37290b57cec5SDimitry Andric break;
37300b57cec5SDimitry Andric }
37310b57cec5SDimitry Andric 
37320b57cec5SDimitry Andric switch (Inst->getIntrinsicID()) {
37330b57cec5SDimitry Andric default:
37340b57cec5SDimitry Andric return false;
37350b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld2:
37360b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st2:
37370b57cec5SDimitry Andric Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
37380b57cec5SDimitry Andric break;
37390b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld3:
37400b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st3:
37410b57cec5SDimitry Andric Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
37420b57cec5SDimitry Andric break;
37430b57cec5SDimitry Andric case Intrinsic::aarch64_neon_ld4:
37440b57cec5SDimitry Andric case Intrinsic::aarch64_neon_st4:
37450b57cec5SDimitry Andric Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
37460b57cec5SDimitry Andric break;
37470b57cec5SDimitry Andric }
37480b57cec5SDimitry Andric return true;
37490b57cec5SDimitry Andric }
37500b57cec5SDimitry Andric 
37510b57cec5SDimitry Andric /// See if \p I should be considered for address type promotion. We check if
37520b57cec5SDimitry Andric /// \p I is a sext with the right type that is used in memory accesses. If it
37530b57cec5SDimitry Andric /// is used in a "complex" getelementptr, we allow it to be promoted without
37540b57cec5SDimitry Andric /// finding other sext instructions that sign-extended the same initial value.
37550b57cec5SDimitry Andric /// A getelementptr is considered "complex" if it has more than 2 operands.
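/// For example (an illustrative IR sketch, not taken from the tests): in
///   %idxprom = sext i32 %i to i64
///   %p = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idxprom
/// the getelementptr has three operands, so the sext feeding it may be
/// promoted without first finding a common header of sext instructions.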
37560b57cec5SDimitry Andric bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
37570b57cec5SDimitry Andric const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
37580b57cec5SDimitry Andric bool Considerable = false;
37590b57cec5SDimitry Andric AllowPromotionWithoutCommonHeader = false;
37600b57cec5SDimitry Andric if (!isa<SExtInst>(&I))
37610b57cec5SDimitry Andric return false;
37620b57cec5SDimitry Andric Type *ConsideredSExtType =
37630b57cec5SDimitry Andric Type::getInt64Ty(I.getParent()->getParent()->getContext());
37640b57cec5SDimitry Andric if (I.getType() != ConsideredSExtType)
37650b57cec5SDimitry Andric return false;
37660b57cec5SDimitry Andric // See if the sext is the one with the right type and used in at least one
37670b57cec5SDimitry Andric // GetElementPtrInst.
37680b57cec5SDimitry Andric for (const User *U : I.users()) {
37690b57cec5SDimitry Andric if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
37700b57cec5SDimitry Andric Considerable = true;
37710b57cec5SDimitry Andric // A getelementptr is considered "complex" if it has more than 2
37720b57cec5SDimitry Andric // operands. We will promote a SExt used in such a complex GEP, as we
37730b57cec5SDimitry Andric // expect some of the computation to be merged if it is done on 64 bits.
37740b57cec5SDimitry Andric if (GEPInst->getNumOperands() > 2) {
37750b57cec5SDimitry Andric AllowPromotionWithoutCommonHeader = true;
37760b57cec5SDimitry Andric break;
37770b57cec5SDimitry Andric }
37780b57cec5SDimitry Andric }
37790b57cec5SDimitry Andric }
37800b57cec5SDimitry Andric return Considerable;
37810b57cec5SDimitry Andric }
37820b57cec5SDimitry Andric 
3783fe6060f1SDimitry Andric bool AArch64TTIImpl::isLegalToVectorizeReduction(
3784fe6060f1SDimitry Andric const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3785fe6060f1SDimitry Andric if (!VF.isScalable())
3786fe6060f1SDimitry Andric return true;
3787fe6060f1SDimitry Andric 
3788fe6060f1SDimitry Andric Type *Ty = RdxDesc.getRecurrenceType();
3789fe6060f1SDimitry Andric if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
37900b57cec5SDimitry Andric return false;
3791fe6060f1SDimitry Andric 
3792fe6060f1SDimitry Andric switch (RdxDesc.getRecurrenceKind()) {
3793fe6060f1SDimitry Andric case RecurKind::Add:
3794fe6060f1SDimitry Andric case RecurKind::FAdd:
3795fe6060f1SDimitry Andric case RecurKind::And:
3796fe6060f1SDimitry Andric case RecurKind::Or:
3797fe6060f1SDimitry Andric case RecurKind::Xor:
3798fe6060f1SDimitry Andric case RecurKind::SMin:
3799fe6060f1SDimitry Andric case RecurKind::SMax:
3800fe6060f1SDimitry Andric case RecurKind::UMin:
3801fe6060f1SDimitry Andric case RecurKind::UMax:
3802fe6060f1SDimitry Andric case RecurKind::FMin:
3803fe6060f1SDimitry Andric case RecurKind::FMax:
38044824e7fdSDimitry Andric case RecurKind::FMulAdd:
38055f757f3fSDimitry Andric case RecurKind::IAnyOf:
38065f757f3fSDimitry Andric case RecurKind::FAnyOf:
3807fe6060f1SDimitry Andric return true;
38080b57cec5SDimitry Andric default:
38090b57cec5SDimitry Andric return false;
38100b57cec5SDimitry Andric }
3811fe6060f1SDimitry Andric }
38120b57cec5SDimitry Andric 
3813fe6060f1SDimitry Andric InstructionCost
381406c3fb27SDimitry Andric AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
381506c3fb27SDimitry Andric FastMathFlags FMF,
3816e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) {
3817*62987288SDimitry Andric // The code-generator is currently not able to handle scalable vectors
3818*62987288SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3819*62987288SDimitry Andric // it. This change will be removed when code-generation for these types is
3820*62987288SDimitry Andric // sufficiently reliable.
3821*62987288SDimitry Andric if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3822*62987288SDimitry Andric if (VTy->getElementCount() == ElementCount::getScalable(1))
3823*62987288SDimitry Andric return InstructionCost::getInvalid();
3824*62987288SDimitry Andric 
3825bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3826349cc55cSDimitry Andric 
3827349cc55cSDimitry Andric if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
382806c3fb27SDimitry Andric return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3829349cc55cSDimitry Andric 
3830fe6060f1SDimitry Andric InstructionCost LegalizationCost = 0;
3831e8d8bef9SDimitry Andric if (LT.first > 1) {
3832e8d8bef9SDimitry Andric Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
383306c3fb27SDimitry Andric IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3834349cc55cSDimitry Andric LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3835e8d8bef9SDimitry Andric }
3836e8d8bef9SDimitry Andric 
3837e8d8bef9SDimitry Andric return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3838e8d8bef9SDimitry Andric }
3839e8d8bef9SDimitry Andric 
3840fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3841fe6060f1SDimitry Andric unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3842bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3843fe6060f1SDimitry Andric InstructionCost LegalizationCost = 0;
3844e8d8bef9SDimitry Andric if (LT.first > 1) {
3845e8d8bef9SDimitry Andric Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3846e8d8bef9SDimitry Andric LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3847e8d8bef9SDimitry Andric LegalizationCost *= LT.first - 1;
3848e8d8bef9SDimitry Andric }
3849e8d8bef9SDimitry Andric 
3850e8d8bef9SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
3851e8d8bef9SDimitry Andric assert(ISD && "Invalid opcode");
3852e8d8bef9SDimitry Andric // Add the final reduction cost for the legal horizontal reduction
3853e8d8bef9SDimitry Andric switch (ISD) {
3854e8d8bef9SDimitry Andric case ISD::ADD:
3855e8d8bef9SDimitry Andric case ISD::AND:
3856e8d8bef9SDimitry Andric case ISD::OR:
3857e8d8bef9SDimitry Andric case ISD::XOR:
3858e8d8bef9SDimitry Andric case ISD::FADD:
3859e8d8bef9SDimitry Andric return LegalizationCost + 2;
3860e8d8bef9SDimitry Andric default:
3861fe6060f1SDimitry Andric return InstructionCost::getInvalid();
3862e8d8bef9SDimitry Andric }
3863e8d8bef9SDimitry Andric }
3864e8d8bef9SDimitry Andric 
3865fe6060f1SDimitry Andric InstructionCost
3866fe6060f1SDimitry Andric AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3867bdd1243dSDimitry Andric std::optional<FastMathFlags> FMF,
38685ffd83dbSDimitry Andric TTI::TargetCostKind CostKind) {
3869*62987288SDimitry Andric // The code-generator is currently not able to handle scalable vectors
3870*62987288SDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3871*62987288SDimitry Andric // it. This change will be removed when code-generation for these types is
3872*62987288SDimitry Andric // sufficiently reliable.
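// For example (illustrative IR, not from a test file), a reduction such as
//   %r = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %v)
// hits the check below and is reported as invalid rather than mis-costed.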
3873*62987288SDimitry Andric if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
3874*62987288SDimitry Andric if (VTy->getElementCount() == ElementCount::getScalable(1))
3875*62987288SDimitry Andric return InstructionCost::getInvalid();
3876*62987288SDimitry Andric 
3877fe6060f1SDimitry Andric if (TTI::requiresOrderedReduction(FMF)) {
3878349cc55cSDimitry Andric if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3879349cc55cSDimitry Andric InstructionCost BaseCost =
3880349cc55cSDimitry Andric BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3881349cc55cSDimitry Andric // Add on extra cost to reflect the extra overhead on some CPUs. We still
3882349cc55cSDimitry Andric // end up vectorizing for more computationally intensive loops.
3883349cc55cSDimitry Andric return BaseCost + FixedVTy->getNumElements();
3884349cc55cSDimitry Andric }
3885fe6060f1SDimitry Andric 
3886fe6060f1SDimitry Andric if (Opcode != Instruction::FAdd)
3887fe6060f1SDimitry Andric return InstructionCost::getInvalid();
3888fe6060f1SDimitry Andric 
3889fe6060f1SDimitry Andric auto *VTy = cast<ScalableVectorType>(ValTy);
3890fe6060f1SDimitry Andric InstructionCost Cost =
3891fe6060f1SDimitry Andric getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3892fe6060f1SDimitry Andric Cost *= getMaxNumElements(VTy->getElementCount());
3893fe6060f1SDimitry Andric return Cost;
3894fe6060f1SDimitry Andric }
38950b57cec5SDimitry Andric 
3896e8d8bef9SDimitry Andric if (isa<ScalableVectorType>(ValTy))
3897fe6060f1SDimitry Andric return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
38980b57cec5SDimitry Andric 
3899bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
39000b57cec5SDimitry Andric MVT MTy = LT.second;
39010b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode);
39020b57cec5SDimitry Andric assert(ISD && "Invalid opcode");
39030b57cec5SDimitry Andric 
39040b57cec5SDimitry Andric // Horizontal adds can use the 'addv' instruction. We model the cost of these
3905fe6060f1SDimitry Andric // instructions as twice a normal vector add, plus 1 for each legalization
3906fe6060f1SDimitry Andric // step (LT.first). This is the only arithmetic vector reduction operation for
3907fe6060f1SDimitry Andric // which we have an instruction.
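// For example (illustrative, not from a test file):
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// lowers to a single "addv s0, v0.4s", which the {ISD::ADD, MVT::v4i32, 2}
// entry below models with a cost of 2.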
3908fe6060f1SDimitry Andric // OR, XOR and AND costs should match the codegen from:
3909fe6060f1SDimitry Andric // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3910fe6060f1SDimitry Andric // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3911fe6060f1SDimitry Andric // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
39120b57cec5SDimitry Andric static const CostTblEntry CostTblNoPairwise[]{
3913fe6060f1SDimitry Andric {ISD::ADD, MVT::v8i8, 2},
3914fe6060f1SDimitry Andric {ISD::ADD, MVT::v16i8, 2},
3915fe6060f1SDimitry Andric {ISD::ADD, MVT::v4i16, 2},
3916fe6060f1SDimitry Andric {ISD::ADD, MVT::v8i16, 2},
3917fe6060f1SDimitry Andric {ISD::ADD, MVT::v4i32, 2},
3918bdd1243dSDimitry Andric {ISD::ADD, MVT::v2i64, 2},
3919fe6060f1SDimitry Andric {ISD::OR, MVT::v8i8, 15},
3920fe6060f1SDimitry Andric {ISD::OR, MVT::v16i8, 17},
3921fe6060f1SDimitry Andric {ISD::OR, MVT::v4i16, 7},
3922fe6060f1SDimitry Andric {ISD::OR, MVT::v8i16, 9},
3923fe6060f1SDimitry Andric {ISD::OR, MVT::v2i32, 3},
3924fe6060f1SDimitry Andric {ISD::OR, MVT::v4i32, 5},
3925fe6060f1SDimitry Andric {ISD::OR, MVT::v2i64, 3},
3926fe6060f1SDimitry Andric {ISD::XOR, MVT::v8i8, 15},
3927fe6060f1SDimitry Andric {ISD::XOR, MVT::v16i8, 17},
3928fe6060f1SDimitry Andric {ISD::XOR, MVT::v4i16, 7},
3929fe6060f1SDimitry Andric {ISD::XOR, MVT::v8i16, 9},
3930fe6060f1SDimitry Andric {ISD::XOR, MVT::v2i32, 3},
3931fe6060f1SDimitry Andric {ISD::XOR, MVT::v4i32, 5},
3932fe6060f1SDimitry Andric {ISD::XOR, MVT::v2i64, 3},
3933fe6060f1SDimitry Andric {ISD::AND, MVT::v8i8, 15},
3934fe6060f1SDimitry Andric {ISD::AND, MVT::v16i8, 17},
3935fe6060f1SDimitry Andric {ISD::AND, MVT::v4i16, 7},
3936fe6060f1SDimitry Andric {ISD::AND, MVT::v8i16, 9},
3937fe6060f1SDimitry Andric {ISD::AND, MVT::v2i32, 3},
3938fe6060f1SDimitry Andric {ISD::AND, MVT::v4i32, 5},
3939fe6060f1SDimitry Andric {ISD::AND, MVT::v2i64, 3},
39400b57cec5SDimitry Andric };
3941fe6060f1SDimitry Andric switch (ISD) {
3942fe6060f1SDimitry Andric default:
3943fe6060f1SDimitry Andric break;
3944fe6060f1SDimitry Andric case ISD::ADD:
39450b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3946fe6060f1SDimitry Andric return (LT.first - 1) + Entry->Cost;
3947fe6060f1SDimitry Andric break;
3948fe6060f1SDimitry Andric case ISD::XOR:
3949fe6060f1SDimitry Andric case ISD::AND:
3950fe6060f1SDimitry Andric case ISD::OR:
3951fe6060f1SDimitry Andric const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3952fe6060f1SDimitry Andric if (!Entry)
3953fe6060f1SDimitry Andric break;
3954fe6060f1SDimitry Andric auto *ValVTy = cast<FixedVectorType>(ValTy);
395506c3fb27SDimitry Andric if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3956fe6060f1SDimitry Andric isPowerOf2_32(ValVTy->getNumElements())) {
3957fe6060f1SDimitry Andric InstructionCost ExtraCost = 0;
3958fe6060f1SDimitry Andric if (LT.first != 1) {
3959fe6060f1SDimitry Andric // Type needs to be split, so there is an extra cost of LT.first - 1
3960fe6060f1SDimitry Andric // arithmetic ops.
3961fe6060f1SDimitry Andric auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3962fe6060f1SDimitry Andric MTy.getVectorNumElements());
3963fe6060f1SDimitry Andric ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3964fe6060f1SDimitry Andric ExtraCost *= LT.first - 1;
3965fe6060f1SDimitry Andric }
396606c3fb27SDimitry Andric // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
396706c3fb27SDimitry Andric auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
396806c3fb27SDimitry Andric return Cost + ExtraCost;
3969fe6060f1SDimitry Andric }
3970fe6060f1SDimitry Andric break;
3971fe6060f1SDimitry Andric }
3972fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
39730b57cec5SDimitry Andric }
39740b57cec5SDimitry Andric 
3975fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3976fe6060f1SDimitry Andric static const CostTblEntry ShuffleTbl[] = {
3977fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv16i8, 1 },
3978fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv8i16, 1 },
3979fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv4i32, 1 },
3980fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv2i64, 1 },
3981fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv2f16, 1 },
3982fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv4f16, 1 },
3983fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv8f16, 1 },
3984fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3985fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3986fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3987fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv2f32, 1 },
3988fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv4f32, 1 },
3989fe6060f1SDimitry Andric { TTI::SK_Splice, MVT::nxv2f64, 1 },
3990fe6060f1SDimitry Andric };
3991fe6060f1SDimitry Andric 
3992bdd1243dSDimitry Andric // The code-generator is currently not able to handle scalable vectors
3993bdd1243dSDimitry Andric // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3994bdd1243dSDimitry Andric // it. This change will be removed when code-generation for these types is
3995bdd1243dSDimitry Andric // sufficiently reliable.
3996bdd1243dSDimitry Andric if (Tp->getElementCount() == ElementCount::getScalable(1))
3997bdd1243dSDimitry Andric return InstructionCost::getInvalid();
3998bdd1243dSDimitry Andric 
3999bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4000fe6060f1SDimitry Andric Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4001fe6060f1SDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4002fe6060f1SDimitry Andric EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4003fe6060f1SDimitry Andric ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4004fe6060f1SDimitry Andric : LT.second;
4005fe6060f1SDimitry Andric Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4006fe6060f1SDimitry Andric InstructionCost LegalizationCost = 0;
4007fe6060f1SDimitry Andric if (Index < 0) {
4008fe6060f1SDimitry Andric LegalizationCost =
4009fe6060f1SDimitry Andric getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4010fe6060f1SDimitry Andric CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4011fe6060f1SDimitry Andric getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4012fe6060f1SDimitry Andric CmpInst::BAD_ICMP_PREDICATE, CostKind);
4013fe6060f1SDimitry Andric }
4014fe6060f1SDimitry Andric 
4015fe6060f1SDimitry Andric // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
4016fe6060f1SDimitry Andric // The cost is therefore computed on the promoted type.
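// For example, splicing two nxv4i1 predicates is costed as if it were
// performed on the promoted nxv4i32 type, adding the zext/trunc round-trip
// below (an illustrative reading of the promotion, not an exact codegen
// trace).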
4017fe6060f1SDimitry Andric if (LT.second.getScalarType() == MVT::i1) {
4018fe6060f1SDimitry Andric LegalizationCost +=
4019fe6060f1SDimitry Andric getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4020fe6060f1SDimitry Andric TTI::CastContextHint::None, CostKind) +
4021fe6060f1SDimitry Andric getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4022fe6060f1SDimitry Andric TTI::CastContextHint::None, CostKind);
4023fe6060f1SDimitry Andric }
4024fe6060f1SDimitry Andric const auto *Entry =
4025fe6060f1SDimitry Andric CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4026fe6060f1SDimitry Andric assert(Entry && "Illegal Type for Splice");
4027fe6060f1SDimitry Andric LegalizationCost += Entry->Cost;
4028fe6060f1SDimitry Andric return LegalizationCost * LT.first;
4029fe6060f1SDimitry Andric }
4030fe6060f1SDimitry Andric 
40310fca6ea1SDimitry Andric InstructionCost AArch64TTIImpl::getShuffleCost(
40320fca6ea1SDimitry Andric TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
40330fca6ea1SDimitry Andric TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
40340fca6ea1SDimitry Andric ArrayRef<const Value *> Args, const Instruction *CxtI) {
4035bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
40360fca6ea1SDimitry Andric 
403781ad6265SDimitry Andric // If we have a Mask, and the LT is being legalized somehow, split the Mask
403881ad6265SDimitry Andric // into smaller vectors and sum the cost of each shuffle.
403981ad6265SDimitry Andric if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
404081ad6265SDimitry Andric Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
40415f757f3fSDimitry Andric Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
40420fca6ea1SDimitry Andric 
40430fca6ea1SDimitry Andric // Check for LD3/LD4 instructions, which are represented in llvm IR as
40440fca6ea1SDimitry Andric // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
40450fca6ea1SDimitry Andric // but we model it with a cost of LT.first so that LD3/LD4 have a higher
40460fca6ea1SDimitry Andric // cost than just the load.
40470fca6ea1SDimitry Andric if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
40480fca6ea1SDimitry Andric (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
40490fca6ea1SDimitry Andric ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
40500fca6ea1SDimitry Andric return std::max<InstructionCost>(1, LT.first / 4);
40510fca6ea1SDimitry Andric 
40520fca6ea1SDimitry Andric // Check for ST3/ST4 instructions, which are represented in llvm IR as
40530fca6ea1SDimitry Andric // store(interleaving-shuffle). The shuffle cost could potentially be free,
40540fca6ea1SDimitry Andric // but we model it with a cost of LT.first so that ST3/ST4 have a higher
40550fca6ea1SDimitry Andric // cost than just the store.
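// For example, an IR mask that interleaves three fields of the (possibly
// undef-padded) concatenated sources, e.g. <0, 4, 8, 1, 5, 9, ...>, feeding
// a single store is the pattern that becomes an st3 (an illustrative sketch
// of the shape, not an exact mask from a test).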
40560fca6ea1SDimitry Andric if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
40570fca6ea1SDimitry Andric (ShuffleVectorInst::isInterleaveMask(
40580fca6ea1SDimitry Andric Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
40590fca6ea1SDimitry Andric ShuffleVectorInst::isInterleaveMask(
40600fca6ea1SDimitry Andric Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
40610fca6ea1SDimitry Andric return LT.first;
40620fca6ea1SDimitry Andric 
40635f757f3fSDimitry Andric unsigned TpNumElts = Mask.size();
406481ad6265SDimitry Andric unsigned LTNumElts = LT.second.getVectorNumElements();
406581ad6265SDimitry Andric unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
406681ad6265SDimitry Andric VectorType *NTp =
406781ad6265SDimitry Andric VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
406881ad6265SDimitry Andric InstructionCost Cost;
406981ad6265SDimitry Andric for (unsigned N = 0; N < NumVecs; N++) {
407081ad6265SDimitry Andric SmallVector<int> NMask;
407181ad6265SDimitry Andric // Split the existing mask into chunks of size LTNumElts. Track the source
407281ad6265SDimitry Andric // sub-vectors to ensure the result has at most 2 inputs.
407381ad6265SDimitry Andric unsigned Source1, Source2;
407481ad6265SDimitry Andric unsigned NumSources = 0;
407581ad6265SDimitry Andric for (unsigned E = 0; E < LTNumElts; E++) {
407681ad6265SDimitry Andric int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
407706c3fb27SDimitry Andric : PoisonMaskElem;
407881ad6265SDimitry Andric if (MaskElt < 0) {
407906c3fb27SDimitry Andric NMask.push_back(PoisonMaskElem);
408081ad6265SDimitry Andric continue;
408181ad6265SDimitry Andric }
408281ad6265SDimitry Andric 
408381ad6265SDimitry Andric // Calculate which source from the input this comes from and whether it
408481ad6265SDimitry Andric // is new to us.
408581ad6265SDimitry Andric unsigned Source = MaskElt / LTNumElts;
408681ad6265SDimitry Andric if (NumSources == 0) {
408781ad6265SDimitry Andric Source1 = Source;
408881ad6265SDimitry Andric NumSources = 1;
408981ad6265SDimitry Andric } else if (NumSources == 1 && Source != Source1) {
409081ad6265SDimitry Andric Source2 = Source;
409181ad6265SDimitry Andric NumSources = 2;
409281ad6265SDimitry Andric } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
409381ad6265SDimitry Andric NumSources++;
409481ad6265SDimitry Andric }
409581ad6265SDimitry Andric 
409681ad6265SDimitry Andric // Add to the new mask. For the NumSources>2 case these are not correct,
409781ad6265SDimitry Andric // but are only used for the modular lane number.
409881ad6265SDimitry Andric if (Source == Source1)
409981ad6265SDimitry Andric NMask.push_back(MaskElt % LTNumElts);
410081ad6265SDimitry Andric else if (Source == Source2)
410181ad6265SDimitry Andric NMask.push_back(MaskElt % LTNumElts + LTNumElts);
410281ad6265SDimitry Andric else
410381ad6265SDimitry Andric NMask.push_back(MaskElt % LTNumElts);
410481ad6265SDimitry Andric }
410581ad6265SDimitry Andric // If the sub-mask has at most 2 input sub-vectors then re-cost it using
410681ad6265SDimitry Andric // getShuffleCost. If not then cost it using the worst case.
410781ad6265SDimitry Andric if (NumSources <= 2)
410881ad6265SDimitry Andric Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
410981ad6265SDimitry Andric : TTI::SK_PermuteTwoSrc,
41100fca6ea1SDimitry Andric NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
411181ad6265SDimitry Andric else if (any_of(enumerate(NMask), [&](const auto &ME) {
411281ad6265SDimitry Andric return ME.value() % LTNumElts == ME.index();
411381ad6265SDimitry Andric }))
411481ad6265SDimitry Andric Cost += LTNumElts - 1;
411581ad6265SDimitry Andric else
411681ad6265SDimitry Andric Cost += LTNumElts;
411781ad6265SDimitry Andric }
411881ad6265SDimitry Andric return Cost;
411981ad6265SDimitry Andric }
412081ad6265SDimitry Andric 
41215f757f3fSDimitry Andric Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
41220fca6ea1SDimitry Andric // Treat extractsubvector as single op permutation.
41230fca6ea1SDimitry Andric bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
41240fca6ea1SDimitry Andric if (IsExtractSubvector && LT.second.isFixedLengthVector())
41250fca6ea1SDimitry Andric Kind = TTI::SK_PermuteSingleSrc;
412681ad6265SDimitry Andric 
412706c3fb27SDimitry Andric // Check for broadcast loads, which are supported by the LD1R instruction.
412806c3fb27SDimitry Andric // In terms of code-size, the shuffle vector is free when a load + dup get
412906c3fb27SDimitry Andric // folded into a LD1R. That's what we check and return here. For performance
413006c3fb27SDimitry Andric // and reciprocal throughput, a LD1R is not completely free. In this case, we
413106c3fb27SDimitry Andric // return the cost for the broadcast below (i.e. 1 for most/all types), so
413206c3fb27SDimitry Andric // that we model the load + dup sequence slightly higher because LD1R is a
413306c3fb27SDimitry Andric // high latency instruction.
413406c3fb27SDimitry Andric if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
413581ad6265SDimitry Andric bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
413681ad6265SDimitry Andric if (IsLoad && LT.second.isVector() &&
413781ad6265SDimitry Andric isLegalBroadcastLoad(Tp->getElementType(),
413881ad6265SDimitry Andric LT.second.getVectorElementCount()))
413906c3fb27SDimitry Andric return 0;
414081ad6265SDimitry Andric }
414181ad6265SDimitry Andric 
414281ad6265SDimitry Andric // If we have 4 elements for the shuffle and a Mask, get the cost straight
414381ad6265SDimitry Andric // from the perfect shuffle tables.
414481ad6265SDimitry Andric if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
414581ad6265SDimitry Andric (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
414681ad6265SDimitry Andric all_of(Mask, [](int E) { return E < 8; }))
414781ad6265SDimitry Andric return getPerfectShuffleCost(Mask);
414881ad6265SDimitry Andric 
41490fca6ea1SDimitry Andric // Check for identity masks, which we can treat as free.
41500fca6ea1SDimitry Andric if (!Mask.empty() && LT.second.isFixedLengthVector() &&
41510fca6ea1SDimitry Andric (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
41520fca6ea1SDimitry Andric all_of(enumerate(Mask), [](const auto &M) {
41530fca6ea1SDimitry Andric return M.value() < 0 || M.value() == (int)M.index();
41540fca6ea1SDimitry Andric }))
41550fca6ea1SDimitry Andric return 0;
41560fca6ea1SDimitry Andric 
41570fca6ea1SDimitry Andric // Check for other shuffles that are not SK_ kinds but we have native
41580fca6ea1SDimitry Andric // instructions for, for example ZIP and UZP.
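// For example, the v4i32 mask <0, 4, 1, 5> is recognised by isZIPMask and
// maps to a single "zip1 v0.4s, v0.4s, v1.4s", hence the cost of 1 returned
// below (an illustrative pairing; the exact matching is done by isZIPMask).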
41590fca6ea1SDimitry Andric unsigned Unused;
41600fca6ea1SDimitry Andric if (LT.second.isFixedLengthVector() &&
41610fca6ea1SDimitry Andric LT.second.getVectorNumElements() == Mask.size() &&
41620fca6ea1SDimitry Andric (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
41630fca6ea1SDimitry Andric (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
41640fca6ea1SDimitry Andric isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
41650fca6ea1SDimitry Andric // Check for non-zero lane splats
41660fca6ea1SDimitry Andric all_of(drop_begin(Mask),
41670fca6ea1SDimitry Andric [&Mask](int M) { return M < 0 || M == Mask[0]; })))
41680fca6ea1SDimitry Andric return 1;
41690fca6ea1SDimitry Andric 
41700b57cec5SDimitry Andric if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4171fe6060f1SDimitry Andric Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4172bdd1243dSDimitry Andric Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
41730b57cec5SDimitry Andric static const CostTblEntry ShuffleTbl[] = {
41740b57cec5SDimitry Andric // Broadcast shuffle kinds can be performed with 'dup'.
41750b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i8, 1},
41760b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v16i8, 1},
41770b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4i16, 1},
41780b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v8i16, 1},
41790b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2i32, 1},
41800b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4i32, 1},
41810b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2i64, 1},
418206c3fb27SDimitry Andric {TTI::SK_Broadcast, MVT::v4f16, 1},
418306c3fb27SDimitry Andric {TTI::SK_Broadcast, MVT::v8f16, 1},
41840b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2f32, 1},
41850b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v4f32, 1},
41860b57cec5SDimitry Andric {TTI::SK_Broadcast, MVT::v2f64, 1},
41870b57cec5SDimitry Andric // Transpose shuffle kinds can be performed with 'trn1/trn2' and
41880b57cec5SDimitry Andric // 'zip1/zip2' instructions.
41890b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v8i8, 1},
41900b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v16i8, 1},
41910b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v4i16, 1},
41920b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v8i16, 1},
41930b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v2i32, 1},
41940b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v4i32, 1},
41950b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v2i64, 1},
419606c3fb27SDimitry Andric {TTI::SK_Transpose, MVT::v4f16, 1},
419706c3fb27SDimitry Andric {TTI::SK_Transpose, MVT::v8f16, 1},
41980b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v2f32, 1},
41990b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v4f32, 1},
42000b57cec5SDimitry Andric {TTI::SK_Transpose, MVT::v2f64, 1},
42010b57cec5SDimitry Andric // Select shuffle kinds.
42020b57cec5SDimitry Andric // TODO: handle vXi8/vXi16.
42030b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2i32, 1}, // mov.
42040b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
42050b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2i64, 1}, // mov.
42060b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2f32, 1}, // mov.
42070b57cec5SDimitry Andric {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
42080b57cec5SDimitry Andric {TTI::SK_Select, MVT::v2f64, 1}, // mov.
42090b57cec5SDimitry Andric // PermuteSingleSrc shuffle kinds.
42100b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
42110b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
42120b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
42130b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
42140b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
42150b57cec5SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4216fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4217fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4218bdd1243dSDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4219fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4220fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4221fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4222fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4223fe6060f1SDimitry Andric {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4224fe6060f1SDimitry Andric // Reverse can be lowered with `rev`.
4225bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4226fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4227bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4228bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4229fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4230bdd1243dSDimitry Andric {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
423181ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
423281ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
423381ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
423481ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
423581ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
423681ad6265SDimitry Andric {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4237bdd1243dSDimitry Andric // Splices can all be lowered as `ext`.
4238bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2i32, 1},
4239bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i32, 1},
4240bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2i64, 1},
4241bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2f32, 1},
4242bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4f32, 1},
4243bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v2f64, 1},
4244bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8f16, 1},
4245bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8bf16, 1},
4246bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i16, 1},
4247bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v16i8, 1},
4248bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4bf16, 1},
4249bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4f16, 1},
4250bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v4i16, 1},
4251bdd1243dSDimitry Andric {TTI::SK_Splice, MVT::v8i8, 1},
4252fe6060f1SDimitry Andric // Broadcast shuffle kinds for scalable vectors
4253fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4254fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4255fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4256fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4257fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4258fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4259fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4260fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4261fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4262fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4263fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4264fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4265fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4266fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4267fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4268fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4269fe6060f1SDimitry Andric {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4270fe6060f1SDimitry Andric // Handle the cases for vector.reverse with scalable vectors
4271fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv16i8, 1},
4272fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv8i16, 1},
4273fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv4i32, 1},
4274fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2i64, 1},
4275fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2f16, 1},
4276fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv4f16, 1},
4277fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv8f16, 1},
4278fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4279fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4280fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4281fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2f32, 1},
4282fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv4f32, 1},
4283fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2f64, 1},
4284fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv16i1, 1},
4285fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv8i1, 1},
4286fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv4i1, 1},
4287fe6060f1SDimitry Andric {TTI::SK_Reverse, MVT::nxv2i1, 1},
42880b57cec5SDimitry Andric };
42890b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
42900b57cec5SDimitry Andric return LT.first * Entry->Cost;
42910b57cec5SDimitry Andric }
429281ad6265SDimitry Andric 
4293fe6060f1SDimitry Andric if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4294fe6060f1SDimitry Andric return getSpliceCost(Tp, Index);
429581ad6265SDimitry Andric 
429681ad6265SDimitry Andric // Inserting a subvector can often be done with either a D, S or H register
429781ad6265SDimitry Andric // move, so long as the inserted vector is "aligned".
429881ad6265SDimitry Andric if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
429981ad6265SDimitry Andric LT.second.getSizeInBits() <= 128 && SubTp) {
4300bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
430181ad6265SDimitry Andric if (SubLT.second.isVector()) {
430281ad6265SDimitry Andric int NumElts = LT.second.getVectorNumElements();
430381ad6265SDimitry Andric int NumSubElts = SubLT.second.getVectorNumElements();
430481ad6265SDimitry Andric if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
430581ad6265SDimitry Andric return SubLT.first;
430681ad6265SDimitry Andric }
430781ad6265SDimitry Andric }
430881ad6265SDimitry Andric 
43090fca6ea1SDimitry Andric // Restore optimal kind.
43100fca6ea1SDimitry Andric if (IsExtractSubvector)
43110fca6ea1SDimitry Andric Kind = TTI::SK_ExtractSubvector;
43120fca6ea1SDimitry Andric return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
43130fca6ea1SDimitry Andric CxtI);
43140b57cec5SDimitry Andric }
4315fcaf7f86SDimitry Andric 
431606c3fb27SDimitry Andric static bool containsDecreasingPointers(Loop *TheLoop,
431706c3fb27SDimitry Andric PredicatedScalarEvolution *PSE) {
431806c3fb27SDimitry Andric const auto &Strides = DenseMap<Value *, const SCEV *>();
431906c3fb27SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
432006c3fb27SDimitry Andric // Scan the instructions in the block and look for addresses that are
432106c3fb27SDimitry Andric // consecutive and decreasing.
432206c3fb27SDimitry Andric for (Instruction &I : *BB) {
432306c3fb27SDimitry Andric if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
432406c3fb27SDimitry Andric Value *Ptr = getLoadStorePointerOperand(&I);
432506c3fb27SDimitry Andric Type *AccessTy = getLoadStoreType(&I);
432606c3fb27SDimitry Andric if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
432706c3fb27SDimitry Andric /*ShouldCheckWrap=*/false)
432806c3fb27SDimitry Andric .value_or(0) < 0)
432906c3fb27SDimitry Andric return true;
433006c3fb27SDimitry Andric }
433106c3fb27SDimitry Andric }
433206c3fb27SDimitry Andric }
433306c3fb27SDimitry Andric return false;
433406c3fb27SDimitry Andric }
433506c3fb27SDimitry Andric 
433606c3fb27SDimitry Andric bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
433706c3fb27SDimitry Andric if (!ST->hasSVE())
4338fcaf7f86SDimitry Andric return false;
4339fcaf7f86SDimitry Andric 
4340bdd1243dSDimitry Andric // We don't currently support vectorisation with interleaving for SVE - with
4341bdd1243dSDimitry Andric // such loops we're better off not using tail-folding. This gives us a chance
4342bdd1243dSDimitry Andric // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
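// For example, a loop that reads both a[2*i] and a[2*i+1] forms an
// interleave group; NEON can handle it with a single ld2, so unpredicated
// fixed-width vectorisation is preferred there (an illustrative case, not
// one drawn from a specific test).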
434306c3fb27SDimitry Andric if (TFI->IAI->hasGroups())
4344bdd1243dSDimitry Andric return false;
4345bdd1243dSDimitry Andric 
434606c3fb27SDimitry Andric TailFoldingOpts Required = TailFoldingOpts::Disabled;
434706c3fb27SDimitry Andric if (TFI->LVL->getReductionVars().size())
434806c3fb27SDimitry Andric Required |= TailFoldingOpts::Reductions;
434906c3fb27SDimitry Andric if (TFI->LVL->getFixedOrderRecurrences().size())
435006c3fb27SDimitry Andric Required |= TailFoldingOpts::Recurrences;
4351fcaf7f86SDimitry Andric 
435206c3fb27SDimitry Andric // We call this to discover whether any load/store pointers in the loop have
435306c3fb27SDimitry Andric // negative strides. This will require extra work to reverse the loop
435406c3fb27SDimitry Andric // predicate, which may be expensive.
435506c3fb27SDimitry Andric if (containsDecreasingPointers(TFI->LVL->getLoop(),
435606c3fb27SDimitry Andric TFI->LVL->getPredicatedScalarEvolution()))
435706c3fb27SDimitry Andric Required |= TailFoldingOpts::Reverse;
435806c3fb27SDimitry Andric if (Required == TailFoldingOpts::Disabled)
435906c3fb27SDimitry Andric Required |= TailFoldingOpts::Simple;
436006c3fb27SDimitry Andric 
436106c3fb27SDimitry Andric if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
436206c3fb27SDimitry Andric Required))
436306c3fb27SDimitry Andric return false;
436406c3fb27SDimitry Andric 
436506c3fb27SDimitry Andric // Don't tail-fold for tight loops where we would be better off interleaving
436606c3fb27SDimitry Andric // with an unpredicated loop.
436706c3fb27SDimitry Andric unsigned NumInsns = 0;
436806c3fb27SDimitry Andric for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
436906c3fb27SDimitry Andric NumInsns += BB->sizeWithoutDebug();
437006c3fb27SDimitry Andric }
437106c3fb27SDimitry Andric 
437206c3fb27SDimitry Andric // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
437306c3fb27SDimitry Andric return NumInsns >= SVETailFoldInsnThreshold;
4374fcaf7f86SDimitry Andric }
4375bdd1243dSDimitry Andric 
4376bdd1243dSDimitry Andric InstructionCost
4377bdd1243dSDimitry Andric AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
43780fca6ea1SDimitry Andric StackOffset BaseOffset, bool HasBaseReg,
4379bdd1243dSDimitry Andric int64_t Scale, unsigned AddrSpace) const {
4380bdd1243dSDimitry Andric // Scaling factors are not free at all.
4381bdd1243dSDimitry Andric // Operands                    | Rt Latency
4382bdd1243dSDimitry Andric // -------------------------------------------
4383bdd1243dSDimitry Andric // Rt, [Xn, Xm]                | 4
4384bdd1243dSDimitry Andric // -------------------------------------------
4385bdd1243dSDimitry Andric // Rt, [Xn, Xm, lsl #imm]      | Rn: 4 Rm: 5
4386bdd1243dSDimitry Andric // Rt, [Xn, Wm, <extend> #imm] |
4387bdd1243dSDimitry Andric TargetLoweringBase::AddrMode AM;
4388bdd1243dSDimitry Andric AM.BaseGV = BaseGV;
43890fca6ea1SDimitry Andric AM.BaseOffs = BaseOffset.getFixed();
4390bdd1243dSDimitry Andric AM.HasBaseReg = HasBaseReg;
4391bdd1243dSDimitry Andric AM.Scale = Scale;
43920fca6ea1SDimitry Andric AM.ScalableOffset = BaseOffset.getScalable();
4393bdd1243dSDimitry Andric if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4394bdd1243dSDimitry Andric // Scale represents reg2 * scale, thus account for 1 if
4395bdd1243dSDimitry Andric // it is not equal to 0 or 1.
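// For example, "ldr w0, [x1, x2, lsl #2]" uses Scale == 4 and is legal for
// an i32 access, so it is costed 1, while "ldr w0, [x1, x2]" (Scale == 1)
// is costed 0 (illustrative encodings matching the latency table above).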
4396bdd1243dSDimitry Andric return AM.Scale != 0 && AM.Scale != 1;
4397bdd1243dSDimitry Andric return -1;
4398bdd1243dSDimitry Andric }
43997a6dacacSDimitry Andric 
44007a6dacacSDimitry Andric bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
44017a6dacacSDimitry Andric // For the binary operators (e.g. or) we need to be more careful than for
44027a6dacacSDimitry Andric // selects; here we only transform them if they are already at a natural
44037a6dacacSDimitry Andric // break point in the code, i.e. the end of a block with an unconditional
44047a6dacacSDimitry Andric // terminator.
44057a6dacacSDimitry Andric if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
44067a6dacacSDimitry Andric isa<BranchInst>(I->getNextNode()) &&
44077a6dacacSDimitry Andric cast<BranchInst>(I->getNextNode())->isUnconditional())
44087a6dacacSDimitry Andric return true;
44097a6dacacSDimitry Andric return BaseT::shouldTreatInstructionLikeSelect(I);
44107a6dacacSDimitry Andric }
44110fca6ea1SDimitry Andric 
44120fca6ea1SDimitry Andric bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
44130fca6ea1SDimitry Andric const TargetTransformInfo::LSRCost &C2) {
44140fca6ea1SDimitry Andric // What is AArch64-specific here is adding the number of instructions to the
44150fca6ea1SDimitry Andric // comparison (though not as the first consideration, as some targets do)
44160fca6ea1SDimitry Andric // along with changing the priority of the base additions.
44170fca6ea1SDimitry Andric // TODO: Maybe a more nuanced tradeoff between instruction count
44180fca6ea1SDimitry Andric // and number of registers? To be investigated at a later date.
44190fca6ea1SDimitry Andric if (EnableLSRCostOpt)
44200fca6ea1SDimitry Andric return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
44210fca6ea1SDimitry Andric C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
44220fca6ea1SDimitry Andric std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
44230fca6ea1SDimitry Andric C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
44240fca6ea1SDimitry Andric 
44250fca6ea1SDimitry Andric return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
44260fca6ea1SDimitry Andric }
4427
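// Illustrative note (not part of the upstream sources): because std::tie
// compares lexicographically, with EnableLSRCostOpt a solution with
// {NumRegs = 2, Insns = 6} is preferred over one with {NumRegs = 3,
// Insns = 4}; NumRegs is compared first, and only ties fall through to Insns.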