1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AArch64TargetTransformInfo.h" 10 #include "AArch64ExpandImm.h" 11 #include "AArch64PerfectShuffle.h" 12 #include "MCTargetDesc/AArch64AddressingModes.h" 13 #include "Utils/AArch64SMEAttributes.h" 14 #include "llvm/ADT/DenseMap.h" 15 #include "llvm/Analysis/IVDescriptors.h" 16 #include "llvm/Analysis/LoopInfo.h" 17 #include "llvm/Analysis/TargetTransformInfo.h" 18 #include "llvm/CodeGen/BasicTTIImpl.h" 19 #include "llvm/CodeGen/CostTable.h" 20 #include "llvm/CodeGen/TargetLowering.h" 21 #include "llvm/IR/IntrinsicInst.h" 22 #include "llvm/IR/Intrinsics.h" 23 #include "llvm/IR/IntrinsicsAArch64.h" 24 #include "llvm/IR/PatternMatch.h" 25 #include "llvm/Support/Debug.h" 26 #include "llvm/TargetParser/AArch64TargetParser.h" 27 #include "llvm/Transforms/InstCombine/InstCombiner.h" 28 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 29 #include <algorithm> 30 #include <optional> 31 using namespace llvm; 32 using namespace llvm::PatternMatch; 33 34 #define DEBUG_TYPE "aarch64tti" 35 36 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", 37 cl::init(true), cl::Hidden); 38 39 static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost( 40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden); 41 42 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), 43 cl::Hidden); 44 45 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", 46 cl::init(10), cl::Hidden); 47 48 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", 49 cl::init(15), cl::Hidden); 50 51 static cl::opt<unsigned> 52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), 53 cl::Hidden); 54 55 static cl::opt<unsigned> CallPenaltyChangeSM( 56 "call-penalty-sm-change", cl::init(5), cl::Hidden, 57 cl::desc( 58 "Penalty of calling a function that requires a change to PSTATE.SM")); 59 60 static cl::opt<unsigned> InlineCallPenaltyChangeSM( 61 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden, 62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM")); 63 64 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select", 65 cl::init(true), cl::Hidden); 66 67 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", 68 cl::init(true), cl::Hidden); 69 70 // A complete guess as to a reasonable cost. 71 static cl::opt<unsigned> 72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, 73 cl::desc("The cost of a histcnt instruction")); 74 75 static cl::opt<unsigned> DMBLookaheadThreshold( 76 "dmb-lookahead-threshold", cl::init(10), cl::Hidden, 77 cl::desc("The number of instructions to search for a redundant dmb")); 78 79 namespace { 80 class TailFoldingOption { 81 // These bitfields will only ever be set to something non-zero in operator=, 82 // when setting the -sve-tail-folding option. This option should always be of 83 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here 84 // InitialBits is one of (disabled|all|simple). EnableBits represents 85 // additional flags we're enabling, and DisableBits for those flags we're 86 // disabling. 
The default flag is tracked in the variable NeedsDefault, since 87 // at the time of setting the option we may not know what the default value 88 // for the CPU is. 89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; 90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; 91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; 92 93 // This value needs to be initialised to true in case the user does not 94 // explicitly set the -sve-tail-folding option. 95 bool NeedsDefault = true; 96 97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } 98 99 void setNeedsDefault(bool V) { NeedsDefault = V; } 100 101 void setEnableBit(TailFoldingOpts Bit) { 102 EnableBits |= Bit; 103 DisableBits &= ~Bit; 104 } 105 106 void setDisableBit(TailFoldingOpts Bit) { 107 EnableBits &= ~Bit; 108 DisableBits |= Bit; 109 } 110 111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { 112 TailFoldingOpts Bits = TailFoldingOpts::Disabled; 113 114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && 115 "Initial bits should only include one of " 116 "(disabled|all|simple|default)"); 117 Bits = NeedsDefault ? DefaultBits : InitialBits; 118 Bits |= EnableBits; 119 Bits &= ~DisableBits; 120 121 return Bits; 122 } 123 124 void reportError(std::string Opt) { 125 errs() << "invalid argument '" << Opt 126 << "' to -sve-tail-folding=; the option should be of the form\n" 127 " (disabled|all|default|simple)[+(reductions|recurrences" 128 "|reverse|noreductions|norecurrences|noreverse)]\n"; 129 report_fatal_error("Unrecognised tail-folding option"); 130 } 131 132 public: 133 134 void operator=(const std::string &Val) { 135 // If the user explicitly sets -sve-tail-folding= then treat as an error. 136 if (Val.empty()) { 137 reportError(""); 138 return; 139 } 140 141 // Since the user is explicitly setting the option we don't automatically 142 // need the default unless they require it. 
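    // Illustrative examples of how values are parsed below (assuming the
    // documented syntax): "-sve-tail-folding=all+noreverse" sets InitialBits
    // to All and then records Reverse in DisableBits, while
    // "-sve-tail-folding=default+reductions" sets NeedsDefault back to true
    // (keeping the CPU default) and adds Reductions to EnableBits.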
143 setNeedsDefault(false); 144 145 SmallVector<StringRef, 4> TailFoldTypes; 146 StringRef(Val).split(TailFoldTypes, '+', -1, false); 147 148 unsigned StartIdx = 1; 149 if (TailFoldTypes[0] == "disabled") 150 setInitialBits(TailFoldingOpts::Disabled); 151 else if (TailFoldTypes[0] == "all") 152 setInitialBits(TailFoldingOpts::All); 153 else if (TailFoldTypes[0] == "default") 154 setNeedsDefault(true); 155 else if (TailFoldTypes[0] == "simple") 156 setInitialBits(TailFoldingOpts::Simple); 157 else { 158 StartIdx = 0; 159 setInitialBits(TailFoldingOpts::Disabled); 160 } 161 162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { 163 if (TailFoldTypes[I] == "reductions") 164 setEnableBit(TailFoldingOpts::Reductions); 165 else if (TailFoldTypes[I] == "recurrences") 166 setEnableBit(TailFoldingOpts::Recurrences); 167 else if (TailFoldTypes[I] == "reverse") 168 setEnableBit(TailFoldingOpts::Reverse); 169 else if (TailFoldTypes[I] == "noreductions") 170 setDisableBit(TailFoldingOpts::Reductions); 171 else if (TailFoldTypes[I] == "norecurrences") 172 setDisableBit(TailFoldingOpts::Recurrences); 173 else if (TailFoldTypes[I] == "noreverse") 174 setDisableBit(TailFoldingOpts::Reverse); 175 else 176 reportError(Val); 177 } 178 } 179 180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { 181 return (getBits(DefaultBits) & Required) == Required; 182 } 183 }; 184 } // namespace 185 186 TailFoldingOption TailFoldingOptionLoc; 187 188 cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( 189 "sve-tail-folding", 190 cl::desc( 191 "Control the use of vectorisation using tail-folding for SVE where the" 192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" 193 "\ndisabled (Initial) No loop types will vectorize using " 194 "tail-folding" 195 "\ndefault (Initial) Uses the default tail-folding settings for " 196 "the target CPU" 197 "\nall (Initial) All legal loop types will vectorize using " 198 "tail-folding" 199 "\nsimple (Initial) Use tail-folding for simple loops (not " 200 "reductions or recurrences)" 201 "\nreductions Use tail-folding for loops containing reductions" 202 "\nnoreductions Inverse of above" 203 "\nrecurrences Use tail-folding for loops containing fixed order " 204 "recurrences" 205 "\nnorecurrences Inverse of above" 206 "\nreverse Use tail-folding for loops requiring reversed " 207 "predicates" 208 "\nnoreverse Inverse of above"), 209 cl::location(TailFoldingOptionLoc)); 210 211 // Experimental option that will only be fully functional when the 212 // code-generator is changed to use SVE instead of NEON for all fixed-width 213 // operations. 214 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( 215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden); 216 217 // Experimental option that will only be fully functional when the cost-model 218 // and code-generator have been changed to avoid using scalable vector 219 // instructions that are not legal in streaming SVE mode. 
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
  StringRef AttributeStr =
      isMultiversionedFunction(F) ? "fmv-features" : "target-features";
  StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  SmallVector<StringRef, 8> Features;
  FeatureStr.split(Features, ",");
  return AArch64::getFMVPriority(Features);
}

bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
      CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  return BaseT::areInlineCompatible(Caller, Callee);
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
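  // For example (illustrative): promoting a pointer argument to a by-value
  // <8 x float> (256 bits) is rejected by the check below, whereas a
  // <4 x float> (128 bits) pointee is fine because it can be treated as a
  // NEON type.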
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //       call from F -> G (the call here is Call)
  //
  //     For (1), Call.getCaller() == F, so it will always return a high cost
  //     if a streaming-mode change is required (thus promoting the need to
  //     inline the function).
  //
  // (2) F:
  //       call from F -> G (the call here is not Call)
  //     G:
  //       call from G -> H (the call here is Call)
  //
  //     For (2), if after inlining the body of G into F the call to H requires
  //     a streaming-mode change, and the call to G from F would also require a
  //     streaming-mode change, then there is benefit to do the streaming-mode
  //     change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
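  // For example (illustrative): an i128 constant is costed above as the sum
  // of the expansion costs of its two 64-bit halves, clamped below to at
  // least one instruction.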
395 return std::max<InstructionCost>(1, Cost); 396 } 397 398 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 399 const APInt &Imm, Type *Ty, 400 TTI::TargetCostKind CostKind, 401 Instruction *Inst) { 402 assert(Ty->isIntegerTy()); 403 404 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 405 // There is no cost model for constants with a bit size of 0. Return TCC_Free 406 // here, so that constant hoisting will ignore this constant. 407 if (BitSize == 0) 408 return TTI::TCC_Free; 409 410 unsigned ImmIdx = ~0U; 411 switch (Opcode) { 412 default: 413 return TTI::TCC_Free; 414 case Instruction::GetElementPtr: 415 // Always hoist the base address of a GetElementPtr. 416 if (Idx == 0) 417 return 2 * TTI::TCC_Basic; 418 return TTI::TCC_Free; 419 case Instruction::Store: 420 ImmIdx = 0; 421 break; 422 case Instruction::Add: 423 case Instruction::Sub: 424 case Instruction::Mul: 425 case Instruction::UDiv: 426 case Instruction::SDiv: 427 case Instruction::URem: 428 case Instruction::SRem: 429 case Instruction::And: 430 case Instruction::Or: 431 case Instruction::Xor: 432 case Instruction::ICmp: 433 ImmIdx = 1; 434 break; 435 // Always return TCC_Free for the shift value of a shift instruction. 436 case Instruction::Shl: 437 case Instruction::LShr: 438 case Instruction::AShr: 439 if (Idx == 1) 440 return TTI::TCC_Free; 441 break; 442 case Instruction::Trunc: 443 case Instruction::ZExt: 444 case Instruction::SExt: 445 case Instruction::IntToPtr: 446 case Instruction::PtrToInt: 447 case Instruction::BitCast: 448 case Instruction::PHI: 449 case Instruction::Call: 450 case Instruction::Select: 451 case Instruction::Ret: 452 case Instruction::Load: 453 break; 454 } 455 456 if (Idx == ImmIdx) { 457 int NumConstants = (BitSize + 63) / 64; 458 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 459 return (Cost <= NumConstants * TTI::TCC_Basic) 460 ? static_cast<int>(TTI::TCC_Free) 461 : Cost; 462 } 463 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 464 } 465 466 InstructionCost 467 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 468 const APInt &Imm, Type *Ty, 469 TTI::TargetCostKind CostKind) { 470 assert(Ty->isIntegerTy()); 471 472 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 473 // There is no cost model for constants with a bit size of 0. Return TCC_Free 474 // here, so that constant hoisting will ignore this constant. 475 if (BitSize == 0) 476 return TTI::TCC_Free; 477 478 // Most (all?) AArch64 intrinsics do not support folding immediates into the 479 // selected instruction, so we compute the materialization cost for the 480 // immediate directly. 481 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) 482 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 483 484 switch (IID) { 485 default: 486 return TTI::TCC_Free; 487 case Intrinsic::sadd_with_overflow: 488 case Intrinsic::uadd_with_overflow: 489 case Intrinsic::ssub_with_overflow: 490 case Intrinsic::usub_with_overflow: 491 case Intrinsic::smul_with_overflow: 492 case Intrinsic::umul_with_overflow: 493 if (Idx == 1) { 494 int NumConstants = (BitSize + 63) / 64; 495 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 496 return (Cost <= NumConstants * TTI::TCC_Basic) 497 ? 
static_cast<int>(TTI::TCC_Free) 498 : Cost; 499 } 500 break; 501 case Intrinsic::experimental_stackmap: 502 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 503 return TTI::TCC_Free; 504 break; 505 case Intrinsic::experimental_patchpoint_void: 506 case Intrinsic::experimental_patchpoint: 507 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 508 return TTI::TCC_Free; 509 break; 510 case Intrinsic::experimental_gc_statepoint: 511 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 512 return TTI::TCC_Free; 513 break; 514 } 515 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); 516 } 517 518 TargetTransformInfo::PopcntSupportKind 519 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { 520 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 521 if (TyWidth == 32 || TyWidth == 64) 522 return TTI::PSK_FastHardware; 523 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 524 return TTI::PSK_Software; 525 } 526 527 static bool isUnpackedVectorVT(EVT VecVT) { 528 return VecVT.isScalableVector() && 529 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; 530 } 531 532 static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { 533 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers 534 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements 535 unsigned TotalHistCnts = 1; 536 537 unsigned EltSize = EltTy->getScalarSizeInBits(); 538 // Only allow (up to 64b) integers or pointers 539 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64) 540 return InstructionCost::getInvalid(); 541 542 // FIXME: We should be able to generate histcnt for fixed-length vectors 543 // using ptrue with a specific VL. 544 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) { 545 unsigned EC = VTy->getElementCount().getKnownMinValue(); 546 if (!isPowerOf2_64(EC) || !VTy->isScalableTy()) 547 return InstructionCost::getInvalid(); 548 549 // HistCnt only supports 32b and 64b element types 550 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64; 551 552 if (EC == 2 || (LegalEltSize == 32 && EC == 4)) 553 return InstructionCost(BaseHistCntCost); 554 555 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; 556 TotalHistCnts = EC / NaturalVectorWidth; 557 } 558 559 return InstructionCost(BaseHistCntCost * TotalHistCnts); 560 } 561 562 InstructionCost 563 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 564 TTI::TargetCostKind CostKind) { 565 // The code-generator is currently not able to handle scalable vectors 566 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 567 // it. This change will be removed when code-generation for these types is 568 // sufficiently reliable. 
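  // For example, an intrinsic returning <vscale x 1 x i64> is given an
  // invalid cost by the check below so that the vectorizer does not choose
  // that VF.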
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                        MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16,
                                        MVT::nxv4i32, MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ?
getTLI()->getValueType(DL, RetTy) 658 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]); 659 // Skip this if either the vector or subvector types are unpacked 660 // SVE types; they may get lowered to stack stores and loads. 661 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT)) 662 break; 663 664 TargetLoweringBase::LegalizeKind SubVecLK = 665 getTLI()->getTypeConversion(C, SubVecVT); 666 TargetLoweringBase::LegalizeKind VecLK = 667 getTLI()->getTypeConversion(C, VecVT); 668 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; 669 const ConstantInt *CIdx = cast<ConstantInt>(Idx); 670 if (SubVecLK.first == TargetLoweringBase::TypeLegal && 671 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) 672 return TTI::TCC_Free; 673 break; 674 } 675 case Intrinsic::bitreverse: { 676 static const CostTblEntry BitreverseTbl[] = { 677 {Intrinsic::bitreverse, MVT::i32, 1}, 678 {Intrinsic::bitreverse, MVT::i64, 1}, 679 {Intrinsic::bitreverse, MVT::v8i8, 1}, 680 {Intrinsic::bitreverse, MVT::v16i8, 1}, 681 {Intrinsic::bitreverse, MVT::v4i16, 2}, 682 {Intrinsic::bitreverse, MVT::v8i16, 2}, 683 {Intrinsic::bitreverse, MVT::v2i32, 2}, 684 {Intrinsic::bitreverse, MVT::v4i32, 2}, 685 {Intrinsic::bitreverse, MVT::v1i64, 2}, 686 {Intrinsic::bitreverse, MVT::v2i64, 2}, 687 }; 688 const auto LegalisationCost = getTypeLegalizationCost(RetTy); 689 const auto *Entry = 690 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); 691 if (Entry) { 692 // Cost Model is using the legal type(i32) that i8 and i16 will be 693 // converted to +1 so that we match the actual lowering cost 694 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || 695 TLI->getValueType(DL, RetTy, true) == MVT::i16) 696 return LegalisationCost.first * Entry->Cost + 1; 697 698 return LegalisationCost.first * Entry->Cost; 699 } 700 break; 701 } 702 case Intrinsic::ctpop: { 703 if (!ST->hasNEON()) { 704 // 32-bit or 64-bit ctpop without NEON is 12 instructions. 705 return getTypeLegalizationCost(RetTy).first * 12; 706 } 707 static const CostTblEntry CtpopCostTbl[] = { 708 {ISD::CTPOP, MVT::v2i64, 4}, 709 {ISD::CTPOP, MVT::v4i32, 3}, 710 {ISD::CTPOP, MVT::v8i16, 2}, 711 {ISD::CTPOP, MVT::v16i8, 1}, 712 {ISD::CTPOP, MVT::i64, 4}, 713 {ISD::CTPOP, MVT::v2i32, 3}, 714 {ISD::CTPOP, MVT::v4i16, 2}, 715 {ISD::CTPOP, MVT::v8i8, 1}, 716 {ISD::CTPOP, MVT::i32, 5}, 717 }; 718 auto LT = getTypeLegalizationCost(RetTy); 719 MVT MTy = LT.second; 720 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { 721 // Extra cost of +1 when illegal vector types are legalized by promoting 722 // the integer type. 723 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != 724 RetTy->getScalarSizeInBits() 725 ? 
1 726 : 0; 727 return LT.first * Entry->Cost + ExtraCost; 728 } 729 break; 730 } 731 case Intrinsic::sadd_with_overflow: 732 case Intrinsic::uadd_with_overflow: 733 case Intrinsic::ssub_with_overflow: 734 case Intrinsic::usub_with_overflow: 735 case Intrinsic::smul_with_overflow: 736 case Intrinsic::umul_with_overflow: { 737 static const CostTblEntry WithOverflowCostTbl[] = { 738 {Intrinsic::sadd_with_overflow, MVT::i8, 3}, 739 {Intrinsic::uadd_with_overflow, MVT::i8, 3}, 740 {Intrinsic::sadd_with_overflow, MVT::i16, 3}, 741 {Intrinsic::uadd_with_overflow, MVT::i16, 3}, 742 {Intrinsic::sadd_with_overflow, MVT::i32, 1}, 743 {Intrinsic::uadd_with_overflow, MVT::i32, 1}, 744 {Intrinsic::sadd_with_overflow, MVT::i64, 1}, 745 {Intrinsic::uadd_with_overflow, MVT::i64, 1}, 746 {Intrinsic::ssub_with_overflow, MVT::i8, 3}, 747 {Intrinsic::usub_with_overflow, MVT::i8, 3}, 748 {Intrinsic::ssub_with_overflow, MVT::i16, 3}, 749 {Intrinsic::usub_with_overflow, MVT::i16, 3}, 750 {Intrinsic::ssub_with_overflow, MVT::i32, 1}, 751 {Intrinsic::usub_with_overflow, MVT::i32, 1}, 752 {Intrinsic::ssub_with_overflow, MVT::i64, 1}, 753 {Intrinsic::usub_with_overflow, MVT::i64, 1}, 754 {Intrinsic::smul_with_overflow, MVT::i8, 5}, 755 {Intrinsic::umul_with_overflow, MVT::i8, 4}, 756 {Intrinsic::smul_with_overflow, MVT::i16, 5}, 757 {Intrinsic::umul_with_overflow, MVT::i16, 4}, 758 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst 759 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw 760 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp 761 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr 762 }; 763 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); 764 if (MTy.isSimple()) 765 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), 766 MTy.getSimpleVT())) 767 return Entry->Cost; 768 break; 769 } 770 case Intrinsic::fptosi_sat: 771 case Intrinsic::fptoui_sat: { 772 if (ICA.getArgTypes().empty()) 773 break; 774 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; 775 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); 776 EVT MTy = TLI->getValueType(DL, RetTy); 777 // Check for the legal types, which are where the size of the input and the 778 // output are the same, or we are using cvt f64->i32 or f32->i64. 779 if ((LT.second == MVT::f32 || LT.second == MVT::f64 || 780 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || 781 LT.second == MVT::v2f64)) { 782 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || 783 (LT.second == MVT::f64 && MTy == MVT::i32) || 784 (LT.second == MVT::f32 && MTy == MVT::i64))) 785 return LT.first; 786 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2 787 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() && 788 MTy.getScalarSizeInBits() == 64) 789 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2); 790 } 791 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to 792 // f32. 
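    // For example (illustrative): llvm.fptosi.sat on a v4f16 source without
    // +fullfp16 is costed below as the equivalent f32 intrinsic plus the
    // legalisation count for the extra conversions.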
793 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 794 return LT.first + getIntrinsicInstrCost( 795 {ICA.getID(), 796 RetTy, 797 {ICA.getArgTypes()[0]->getWithNewType( 798 Type::getFloatTy(RetTy->getContext()))}}, 799 CostKind); 800 if ((LT.second == MVT::f16 && MTy == MVT::i32) || 801 (LT.second == MVT::f16 && MTy == MVT::i64) || 802 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && 803 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))) 804 return LT.first; 805 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2 806 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && 807 MTy.getScalarSizeInBits() == 32) 808 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2); 809 // Extending vector types v8f16->v8i32. These current scalarize but the 810 // codegen could be better. 811 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && 812 MTy.getScalarSizeInBits() == 64) 813 return MTy.getVectorNumElements() * 3; 814 815 // If we can we use a legal convert followed by a min+max 816 if ((LT.second.getScalarType() == MVT::f32 || 817 LT.second.getScalarType() == MVT::f64 || 818 LT.second.getScalarType() == MVT::f16) && 819 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { 820 Type *LegalTy = 821 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); 822 if (LT.second.isVector()) 823 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); 824 InstructionCost Cost = 1; 825 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, 826 LegalTy, {LegalTy, LegalTy}); 827 Cost += getIntrinsicInstrCost(Attrs1, CostKind); 828 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, 829 LegalTy, {LegalTy, LegalTy}); 830 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 831 return LT.first * Cost + 832 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0 833 : 1); 834 } 835 // Otherwise we need to follow the default expansion that clamps the value 836 // using a float min/max with a fcmp+sel for nan handling when signed. 837 Type *FPTy = ICA.getArgTypes()[0]->getScalarType(); 838 RetTy = RetTy->getScalarType(); 839 if (LT.second.isVector()) { 840 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount()); 841 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount()); 842 } 843 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy}); 844 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind); 845 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy}); 846 Cost += getIntrinsicInstrCost(Attrs2, CostKind); 847 Cost += 848 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI, 849 RetTy, FPTy, TTI::CastContextHint::None, CostKind); 850 if (IsSigned) { 851 Type *CondTy = RetTy->getWithNewBitWidth(1); 852 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy, 853 CmpInst::FCMP_UNO, CostKind); 854 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 855 CmpInst::FCMP_UNO, CostKind); 856 } 857 return LT.first * Cost; 858 } 859 case Intrinsic::fshl: 860 case Intrinsic::fshr: { 861 if (ICA.getArgs().empty()) 862 break; 863 864 // TODO: Add handling for fshl where third argument is not a constant. 
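    // For example, fshl(i32 %a, i32 %b, i32 3) with a constant shift maps to
    // a single EXTR (or ROR when %a == %b), which is why only constant shift
    // amounts are costed cheaply here.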
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  case Intrinsic::experimental_vector_match: {
    auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
    EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    unsigned SearchSize = NeedleTy->getNumElements();
    if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
      // Base cost for MATCH instructions. At least on the Neoverse V2 and
      // Neoverse V3, these are cheap operations with the same latency as a
      // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
      // instructions besides the MATCH.
936 InstructionCost Cost = 4; 937 if (isa<FixedVectorType>(RetTy)) 938 Cost += 10; 939 return Cost; 940 } 941 break; 942 } 943 default: 944 break; 945 } 946 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 947 } 948 949 /// The function will remove redundant reinterprets casting in the presence 950 /// of the control flow 951 static std::optional<Instruction *> processPhiNode(InstCombiner &IC, 952 IntrinsicInst &II) { 953 SmallVector<Instruction *, 32> Worklist; 954 auto RequiredType = II.getType(); 955 956 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); 957 assert(PN && "Expected Phi Node!"); 958 959 // Don't create a new Phi unless we can remove the old one. 960 if (!PN->hasOneUse()) 961 return std::nullopt; 962 963 for (Value *IncValPhi : PN->incoming_values()) { 964 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); 965 if (!Reinterpret || 966 Reinterpret->getIntrinsicID() != 967 Intrinsic::aarch64_sve_convert_to_svbool || 968 RequiredType != Reinterpret->getArgOperand(0)->getType()) 969 return std::nullopt; 970 } 971 972 // Create the new Phi 973 IC.Builder.SetInsertPoint(PN); 974 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); 975 Worklist.push_back(PN); 976 977 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { 978 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); 979 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); 980 Worklist.push_back(Reinterpret); 981 } 982 983 // Cleanup Phi Node and reinterprets 984 return IC.replaceInstUsesWith(II, NPN); 985 } 986 987 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) 988 // => (binop (pred) (from_svbool _) (from_svbool _)) 989 // 990 // The above transformation eliminates a `to_svbool` in the predicate 991 // operand of bitwise operation `binop` by narrowing the vector width of 992 // the operation. For example, it would convert a `<vscale x 16 x i1> 993 // and` into a `<vscale x 4 x i1> and`. This is profitable because 994 // to_svbool must zero the new lanes during widening, whereas 995 // from_svbool is free. 
996 static std::optional<Instruction *> 997 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { 998 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); 999 if (!BinOp) 1000 return std::nullopt; 1001 1002 auto IntrinsicID = BinOp->getIntrinsicID(); 1003 switch (IntrinsicID) { 1004 case Intrinsic::aarch64_sve_and_z: 1005 case Intrinsic::aarch64_sve_bic_z: 1006 case Intrinsic::aarch64_sve_eor_z: 1007 case Intrinsic::aarch64_sve_nand_z: 1008 case Intrinsic::aarch64_sve_nor_z: 1009 case Intrinsic::aarch64_sve_orn_z: 1010 case Intrinsic::aarch64_sve_orr_z: 1011 break; 1012 default: 1013 return std::nullopt; 1014 } 1015 1016 auto BinOpPred = BinOp->getOperand(0); 1017 auto BinOpOp1 = BinOp->getOperand(1); 1018 auto BinOpOp2 = BinOp->getOperand(2); 1019 1020 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); 1021 if (!PredIntr || 1022 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) 1023 return std::nullopt; 1024 1025 auto PredOp = PredIntr->getOperand(0); 1026 auto PredOpTy = cast<VectorType>(PredOp->getType()); 1027 if (PredOpTy != II.getType()) 1028 return std::nullopt; 1029 1030 SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; 1031 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( 1032 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); 1033 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 1034 if (BinOpOp1 == BinOpOp2) 1035 NarrowedBinOpArgs.push_back(NarrowBinOpOp1); 1036 else 1037 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic( 1038 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); 1039 1040 auto NarrowedBinOp = 1041 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); 1042 return IC.replaceInstUsesWith(II, NarrowedBinOp); 1043 } 1044 1045 static std::optional<Instruction *> 1046 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { 1047 // If the reinterpret instruction operand is a PHI Node 1048 if (isa<PHINode>(II.getArgOperand(0))) 1049 return processPhiNode(IC, II); 1050 1051 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) 1052 return BinOpCombine; 1053 1054 // Ignore converts to/from svcount_t. 1055 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) || 1056 isa<TargetExtType>(II.getType())) 1057 return std::nullopt; 1058 1059 SmallVector<Instruction *, 32> CandidatesForRemoval; 1060 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; 1061 1062 const auto *IVTy = cast<VectorType>(II.getType()); 1063 1064 // Walk the chain of conversions. 1065 while (Cursor) { 1066 // If the type of the cursor has fewer lanes than the final result, zeroing 1067 // must take place, which breaks the equivalence chain. 1068 const auto *CursorVTy = cast<VectorType>(Cursor->getType()); 1069 if (CursorVTy->getElementCount().getKnownMinValue() < 1070 IVTy->getElementCount().getKnownMinValue()) 1071 break; 1072 1073 // If the cursor has the same type as I, it is a viable replacement. 1074 if (Cursor->getType() == IVTy) 1075 EarliestReplacement = Cursor; 1076 1077 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); 1078 1079 // If this is not an SVE conversion intrinsic, this is the end of the chain. 
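    // For example, when walking from_svbool(to_svbool(%p)) where %p already
    // has the required type, the loop stops at %p and records it as the
    // earliest replacement.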
1080 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == 1081 Intrinsic::aarch64_sve_convert_to_svbool || 1082 IntrinsicCursor->getIntrinsicID() == 1083 Intrinsic::aarch64_sve_convert_from_svbool)) 1084 break; 1085 1086 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); 1087 Cursor = IntrinsicCursor->getOperand(0); 1088 } 1089 1090 // If no viable replacement in the conversion chain was found, there is 1091 // nothing to do. 1092 if (!EarliestReplacement) 1093 return std::nullopt; 1094 1095 return IC.replaceInstUsesWith(II, EarliestReplacement); 1096 } 1097 1098 static bool isAllActivePredicate(Value *Pred) { 1099 // Look through convert.from.svbool(convert.to.svbool(...) chain. 1100 Value *UncastedPred; 1101 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( 1102 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( 1103 m_Value(UncastedPred))))) 1104 // If the predicate has the same or less lanes than the uncasted 1105 // predicate then we know the casting has no effect. 1106 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= 1107 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) 1108 Pred = UncastedPred; 1109 1110 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1111 m_ConstantInt<AArch64SVEPredPattern::all>())); 1112 } 1113 1114 // Simplify unary operation where predicate has all inactive lanes by replacing 1115 // instruction with its operand 1116 static std::optional<Instruction *> 1117 instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II, 1118 bool hasInactiveVector) { 1119 int PredOperand = hasInactiveVector ? 1 : 0; 1120 int ReplaceOperand = hasInactiveVector ? 0 : 1; 1121 if (match(II.getOperand(PredOperand), m_ZeroInt())) { 1122 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand)); 1123 return IC.eraseInstFromFunction(II); 1124 } 1125 return std::nullopt; 1126 } 1127 1128 // Simplify unary operation where predicate has all inactive lanes or 1129 // replace unused first operand with undef when all lanes are active 1130 static std::optional<Instruction *> 1131 instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) { 1132 if (isAllActivePredicate(II.getOperand(1)) && 1133 !isa<llvm::UndefValue>(II.getOperand(0)) && 1134 !isa<llvm::PoisonValue>(II.getOperand(0))) { 1135 Value *Undef = llvm::UndefValue::get(II.getType()); 1136 return IC.replaceOperand(II, 0, Undef); 1137 } 1138 return instCombineSVENoActiveReplace(IC, II, true); 1139 } 1140 1141 // Erase unary operation where predicate has all inactive lanes 1142 static std::optional<Instruction *> 1143 instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, 1144 int PredPos) { 1145 if (match(II.getOperand(PredPos), m_ZeroInt())) { 1146 return IC.eraseInstFromFunction(II); 1147 } 1148 return std::nullopt; 1149 } 1150 1151 // Simplify operation where predicate has all inactive lanes by replacing 1152 // instruction with zeroed object 1153 static std::optional<Instruction *> 1154 instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) { 1155 if (match(II.getOperand(0), m_ZeroInt())) { 1156 Constant *Node; 1157 Type *RetTy = II.getType(); 1158 if (RetTy->isStructTy()) { 1159 auto StructT = cast<StructType>(RetTy); 1160 auto VecT = StructT->getElementType(0); 1161 SmallVector<llvm::Constant *, 4> ZerVec; 1162 for (unsigned i = 0; i < StructT->getNumElements(); i++) { 1163 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? 
ConstantFP::get(VecT, 0.0) 1164 : ConstantInt::get(VecT, 0)); 1165 } 1166 Node = ConstantStruct::get(StructT, ZerVec); 1167 } else 1168 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0) 1169 : ConstantInt::get(II.getType(), 0); 1170 1171 IC.replaceInstUsesWith(II, Node); 1172 return IC.eraseInstFromFunction(II); 1173 } 1174 return std::nullopt; 1175 } 1176 1177 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, 1178 IntrinsicInst &II) { 1179 // svsel(ptrue, x, y) => x 1180 auto *OpPredicate = II.getOperand(0); 1181 if (isAllActivePredicate(OpPredicate)) 1182 return IC.replaceInstUsesWith(II, II.getOperand(1)); 1183 1184 auto Select = 1185 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2)); 1186 return IC.replaceInstUsesWith(II, Select); 1187 } 1188 1189 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, 1190 IntrinsicInst &II) { 1191 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 1192 if (!Pg) 1193 return std::nullopt; 1194 1195 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1196 return std::nullopt; 1197 1198 const auto PTruePattern = 1199 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 1200 if (PTruePattern != AArch64SVEPredPattern::vl1) 1201 return std::nullopt; 1202 1203 // The intrinsic is inserting into lane zero so use an insert instead. 1204 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1205 auto *Insert = InsertElementInst::Create( 1206 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); 1207 Insert->insertBefore(II.getIterator()); 1208 Insert->takeName(&II); 1209 1210 return IC.replaceInstUsesWith(II, Insert); 1211 } 1212 1213 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, 1214 IntrinsicInst &II) { 1215 // Replace DupX with a regular IR splat. 1216 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1217 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(), 1218 II.getArgOperand(0)); 1219 Splat->takeName(&II); 1220 return IC.replaceInstUsesWith(II, Splat); 1221 } 1222 1223 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, 1224 IntrinsicInst &II) { 1225 LLVMContext &Ctx = II.getContext(); 1226 1227 // Replace by zero constant when all lanes are inactive 1228 if (auto II_NA = instCombineSVENoActiveZero(IC, II)) 1229 return II_NA; 1230 1231 // Check that the predicate is all active 1232 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); 1233 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1234 return std::nullopt; 1235 1236 const auto PTruePattern = 1237 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); 1238 if (PTruePattern != AArch64SVEPredPattern::all) 1239 return std::nullopt; 1240 1241 // Check that we have a compare of zero.. 
1242 auto *SplatValue = 1243 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); 1244 if (!SplatValue || !SplatValue->isZero()) 1245 return std::nullopt; 1246 1247 // ..against a dupq 1248 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); 1249 if (!DupQLane || 1250 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) 1251 return std::nullopt; 1252 1253 // Where the dupq is a lane 0 replicate of a vector insert 1254 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1)); 1255 if (!DupQLaneIdx || !DupQLaneIdx->isZero()) 1256 return std::nullopt; 1257 1258 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); 1259 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) 1260 return std::nullopt; 1261 1262 // Where the vector insert is a fixed constant vector insert into undef at 1263 // index zero 1264 if (!isa<UndefValue>(VecIns->getArgOperand(0))) 1265 return std::nullopt; 1266 1267 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) 1268 return std::nullopt; 1269 1270 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); 1271 if (!ConstVec) 1272 return std::nullopt; 1273 1274 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); 1275 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); 1276 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) 1277 return std::nullopt; 1278 1279 unsigned NumElts = VecTy->getNumElements(); 1280 unsigned PredicateBits = 0; 1281 1282 // Expand intrinsic operands to a 16-bit byte level predicate 1283 for (unsigned I = 0; I < NumElts; ++I) { 1284 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); 1285 if (!Arg) 1286 return std::nullopt; 1287 if (!Arg->isZero()) 1288 PredicateBits |= 1 << (I * (16 / NumElts)); 1289 } 1290 1291 // If all bits are zero bail early with an empty predicate 1292 if (PredicateBits == 0) { 1293 auto *PFalse = Constant::getNullValue(II.getType()); 1294 PFalse->takeName(&II); 1295 return IC.replaceInstUsesWith(II, PFalse); 1296 } 1297 1298 // Calculate largest predicate type used (where byte predicate is largest) 1299 unsigned Mask = 8; 1300 for (unsigned I = 0; I < 16; ++I) 1301 if ((PredicateBits & (1 << I)) != 0) 1302 Mask |= (I % 8); 1303 1304 unsigned PredSize = Mask & -Mask; 1305 auto *PredType = ScalableVectorType::get( 1306 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); 1307 1308 // Ensure all relevant bits are set 1309 for (unsigned I = 0; I < 16; I += PredSize) 1310 if ((PredicateBits & (1 << I)) == 0) 1311 return std::nullopt; 1312 1313 auto *PTruePat = 1314 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 1315 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 1316 {PredType}, {PTruePat}); 1317 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( 1318 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); 1319 auto *ConvertFromSVBool = 1320 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 1321 {II.getType()}, {ConvertToSVBool}); 1322 1323 ConvertFromSVBool->takeName(&II); 1324 return IC.replaceInstUsesWith(II, ConvertFromSVBool); 1325 } 1326 1327 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, 1328 IntrinsicInst &II) { 1329 Value *Pg = II.getArgOperand(0); 1330 Value *Vec = II.getArgOperand(1); 1331 auto IntrinsicID = II.getIntrinsicID(); 1332 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; 1333 1334 // lastX(splat(X)) --> X 1335 if (auto *SplatVal = 
getSplatValue(Vec)) 1336 return IC.replaceInstUsesWith(II, SplatVal); 1337 1338 // If x and/or y is a splat value then: 1339 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) 1340 Value *LHS, *RHS; 1341 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { 1342 if (isSplatValue(LHS) || isSplatValue(RHS)) { 1343 auto *OldBinOp = cast<BinaryOperator>(Vec); 1344 auto OpC = OldBinOp->getOpcode(); 1345 auto *NewLHS = 1346 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); 1347 auto *NewRHS = 1348 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); 1349 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( 1350 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator()); 1351 return IC.replaceInstUsesWith(II, NewBinOp); 1352 } 1353 } 1354 1355 auto *C = dyn_cast<Constant>(Pg); 1356 if (IsAfter && C && C->isNullValue()) { 1357 // The intrinsic is extracting lane 0 so use an extract instead. 1358 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1359 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); 1360 Extract->insertBefore(II.getIterator()); 1361 Extract->takeName(&II); 1362 return IC.replaceInstUsesWith(II, Extract); 1363 } 1364 1365 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); 1366 if (!IntrPG) 1367 return std::nullopt; 1368 1369 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 1370 return std::nullopt; 1371 1372 const auto PTruePattern = 1373 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); 1374 1375 // Can the intrinsic's predicate be converted to a known constant index? 1376 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); 1377 if (!MinNumElts) 1378 return std::nullopt; 1379 1380 unsigned Idx = MinNumElts - 1; 1381 // Increment the index if extracting the element after the last active 1382 // predicate element. 1383 if (IsAfter) 1384 ++Idx; 1385 1386 // Ignore extracts whose index is larger than the known minimum vector 1387 // length. NOTE: This is an artificial constraint where we prefer to 1388 // maintain what the user asked for until an alternative is proven faster. 1389 auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); 1390 if (Idx >= PgVTy->getMinNumElements()) 1391 return std::nullopt; 1392 1393 // The intrinsic is extracting a fixed lane so use an extract instead. 1394 auto *IdxTy = Type::getInt64Ty(II.getContext()); 1395 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); 1396 Extract->insertBefore(II.getIterator()); 1397 Extract->takeName(&II); 1398 return IC.replaceInstUsesWith(II, Extract); 1399 } 1400 1401 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, 1402 IntrinsicInst &II) { 1403 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar 1404 // integer variant across a variety of micro-architectures. Replace scalar 1405 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple 1406 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more 1407 // depending on the micro-architecture, but has been observed as generally 1408 // being faster, particularly when the CLAST[AB] op is a loop-carried 1409 // dependency. 
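  // For example, an i64 clast[ab] is rewritten below by bitcasting the scalar
  // fallback and the vector operand to their f64 equivalents, calling the
  // floating-point clast[ab], and bitcasting the result back to i64.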
1410 Value *Pg = II.getArgOperand(0); 1411 Value *Fallback = II.getArgOperand(1); 1412 Value *Vec = II.getArgOperand(2); 1413 Type *Ty = II.getType(); 1414 1415 if (!Ty->isIntegerTy()) 1416 return std::nullopt; 1417 1418 Type *FPTy; 1419 switch (cast<IntegerType>(Ty)->getBitWidth()) { 1420 default: 1421 return std::nullopt; 1422 case 16: 1423 FPTy = IC.Builder.getHalfTy(); 1424 break; 1425 case 32: 1426 FPTy = IC.Builder.getFloatTy(); 1427 break; 1428 case 64: 1429 FPTy = IC.Builder.getDoubleTy(); 1430 break; 1431 } 1432 1433 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy); 1434 auto *FPVTy = VectorType::get( 1435 FPTy, cast<VectorType>(Vec->getType())->getElementCount()); 1436 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy); 1437 auto *FPII = IC.Builder.CreateIntrinsic( 1438 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec}); 1439 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType()); 1440 return IC.replaceInstUsesWith(II, FPIItoInt); 1441 } 1442 1443 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, 1444 IntrinsicInst &II) { 1445 LLVMContext &Ctx = II.getContext(); 1446 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr 1447 // can work with RDFFR_PP for ptest elimination. 1448 auto *AllPat = 1449 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); 1450 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, 1451 {II.getType()}, {AllPat}); 1452 auto *RDFFR = 1453 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); 1454 RDFFR->takeName(&II); 1455 return IC.replaceInstUsesWith(II, RDFFR); 1456 } 1457 1458 static std::optional<Instruction *> 1459 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { 1460 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); 1461 1462 if (Pattern == AArch64SVEPredPattern::all) { 1463 Constant *StepVal = ConstantInt::get(II.getType(), NumElts); 1464 auto *VScale = IC.Builder.CreateVScale(StepVal); 1465 VScale->takeName(&II); 1466 return IC.replaceInstUsesWith(II, VScale); 1467 } 1468 1469 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); 1470 1471 return MinNumElts && NumElts >= MinNumElts 1472 ? std::optional<Instruction *>(IC.replaceInstUsesWith( 1473 II, ConstantInt::get(II.getType(), MinNumElts))) 1474 : std::nullopt; 1475 } 1476 1477 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, 1478 IntrinsicInst &II) { 1479 Value *PgVal = II.getArgOperand(0); 1480 Value *OpVal = II.getArgOperand(1); 1481 1482 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). 1483 // Later optimizations prefer this form. 
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
  // Later optimizations may rewrite the sequence to use the flag-setting
  // variant of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}

template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                      m_Value(MulOp1))))
    return std::nullopt;

  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
1571 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) 1572 return std::nullopt; 1573 if (!FAddFlags.allowContract()) 1574 return std::nullopt; 1575 FMFSource = &II; 1576 } 1577 1578 CallInst *Res; 1579 if (MergeIntoAddendOp) 1580 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1581 {P, AddendOp, MulOp0, MulOp1}, FMFSource); 1582 else 1583 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, 1584 {P, MulOp0, MulOp1, AddendOp}, FMFSource); 1585 1586 return IC.replaceInstUsesWith(II, Res); 1587 } 1588 1589 static std::optional<Instruction *> 1590 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1591 Value *Pred = II.getOperand(0); 1592 Value *PtrOp = II.getOperand(1); 1593 Type *VecTy = II.getType(); 1594 1595 // Replace by zero constant when all lanes are inactive 1596 if (auto II_NA = instCombineSVENoActiveZero(IC, II)) 1597 return II_NA; 1598 1599 if (isAllActivePredicate(Pred)) { 1600 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp); 1601 Load->copyMetadata(II); 1602 return IC.replaceInstUsesWith(II, Load); 1603 } 1604 1605 CallInst *MaskedLoad = 1606 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL), 1607 Pred, ConstantAggregateZero::get(VecTy)); 1608 MaskedLoad->copyMetadata(II); 1609 return IC.replaceInstUsesWith(II, MaskedLoad); 1610 } 1611 1612 static std::optional<Instruction *> 1613 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { 1614 Value *VecOp = II.getOperand(0); 1615 Value *Pred = II.getOperand(1); 1616 Value *PtrOp = II.getOperand(2); 1617 1618 if (isAllActivePredicate(Pred)) { 1619 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp); 1620 Store->copyMetadata(II); 1621 return IC.eraseInstFromFunction(II); 1622 } 1623 1624 CallInst *MaskedStore = IC.Builder.CreateMaskedStore( 1625 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred); 1626 MaskedStore->copyMetadata(II); 1627 return IC.eraseInstFromFunction(II); 1628 } 1629 1630 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { 1631 switch (Intrinsic) { 1632 case Intrinsic::aarch64_sve_fmul_u: 1633 return Instruction::BinaryOps::FMul; 1634 case Intrinsic::aarch64_sve_fadd_u: 1635 return Instruction::BinaryOps::FAdd; 1636 case Intrinsic::aarch64_sve_fsub_u: 1637 return Instruction::BinaryOps::FSub; 1638 default: 1639 return Instruction::BinaryOpsEnd; 1640 } 1641 } 1642 1643 static std::optional<Instruction *> 1644 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { 1645 // Bail due to missing support for ISD::STRICT_ scalable vector operations. 1646 if (II.isStrictFP()) 1647 return std::nullopt; 1648 1649 auto *OpPredicate = II.getOperand(0); 1650 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); 1651 if (BinOpCode == Instruction::BinaryOpsEnd || 1652 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1653 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1654 return std::nullopt; 1655 auto BinOp = IC.Builder.CreateBinOpFMF( 1656 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags()); 1657 return IC.replaceInstUsesWith(II, BinOp); 1658 } 1659 1660 // Canonicalise operations that take an all active predicate (e.g. sve.add -> 1661 // sve.add_u). 
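// The _u forms leave the inactive lanes undefined, which is a weaker contract
// than the merging forms and so gives instruction selection more freedom once
// the predicate is known to be all active.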
1662 static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II, 1663 Intrinsic::ID IID) { 1664 auto *OpPredicate = II.getOperand(0); 1665 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( 1666 m_ConstantInt<AArch64SVEPredPattern::all>()))) 1667 return std::nullopt; 1668 1669 auto *Mod = II.getModule(); 1670 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()}); 1671 II.setCalledFunction(NewDecl); 1672 1673 return &II; 1674 } 1675 1676 // Simplify operations where predicate has all inactive lanes or try to replace 1677 // with _u form when all lanes are active 1678 static std::optional<Instruction *> 1679 instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, 1680 Intrinsic::ID IID) { 1681 if (match(II.getOperand(0), m_ZeroInt())) { 1682 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are 1683 // inactive for sv[func]_m 1684 return IC.replaceInstUsesWith(II, II.getOperand(1)); 1685 } 1686 return instCombineSVEAllActive(II, IID); 1687 } 1688 1689 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, 1690 IntrinsicInst &II) { 1691 if (auto II_U = 1692 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u)) 1693 return II_U; 1694 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1695 Intrinsic::aarch64_sve_mla>( 1696 IC, II, true)) 1697 return MLA; 1698 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1699 Intrinsic::aarch64_sve_mad>( 1700 IC, II, false)) 1701 return MAD; 1702 return std::nullopt; 1703 } 1704 1705 static std::optional<Instruction *> 1706 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { 1707 if (auto II_U = 1708 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u)) 1709 return II_U; 1710 if (auto FMLA = 1711 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1712 Intrinsic::aarch64_sve_fmla>(IC, II, 1713 true)) 1714 return FMLA; 1715 if (auto FMAD = 1716 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1717 Intrinsic::aarch64_sve_fmad>(IC, II, 1718 false)) 1719 return FMAD; 1720 if (auto FMLA = 1721 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1722 Intrinsic::aarch64_sve_fmla>(IC, II, 1723 true)) 1724 return FMLA; 1725 return std::nullopt; 1726 } 1727 1728 static std::optional<Instruction *> 1729 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { 1730 if (auto FMLA = 1731 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1732 Intrinsic::aarch64_sve_fmla>(IC, II, 1733 true)) 1734 return FMLA; 1735 if (auto FMAD = 1736 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1737 Intrinsic::aarch64_sve_fmad>(IC, II, 1738 false)) 1739 return FMAD; 1740 if (auto FMLA_U = 1741 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1742 Intrinsic::aarch64_sve_fmla_u>( 1743 IC, II, true)) 1744 return FMLA_U; 1745 return instCombineSVEVectorBinOp(IC, II); 1746 } 1747 1748 static std::optional<Instruction *> 1749 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { 1750 if (auto II_U = 1751 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u)) 1752 return II_U; 1753 if (auto FMLS = 1754 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1755 Intrinsic::aarch64_sve_fmls>(IC, II, 1756 true)) 1757 return FMLS; 1758 if (auto FMSB = 1759 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1760 Intrinsic::aarch64_sve_fnmsb>( 1761 IC, II, false)) 1762 
return FMSB; 1763 if (auto FMLS = 1764 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1765 Intrinsic::aarch64_sve_fmls>(IC, II, 1766 true)) 1767 return FMLS; 1768 return std::nullopt; 1769 } 1770 1771 static std::optional<Instruction *> 1772 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { 1773 if (auto FMLS = 1774 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1775 Intrinsic::aarch64_sve_fmls>(IC, II, 1776 true)) 1777 return FMLS; 1778 if (auto FMSB = 1779 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, 1780 Intrinsic::aarch64_sve_fnmsb>( 1781 IC, II, false)) 1782 return FMSB; 1783 if (auto FMLS_U = 1784 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, 1785 Intrinsic::aarch64_sve_fmls_u>( 1786 IC, II, true)) 1787 return FMLS_U; 1788 return instCombineSVEVectorBinOp(IC, II); 1789 } 1790 1791 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, 1792 IntrinsicInst &II) { 1793 if (auto II_U = 1794 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u)) 1795 return II_U; 1796 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, 1797 Intrinsic::aarch64_sve_mls>( 1798 IC, II, true)) 1799 return MLS; 1800 return std::nullopt; 1801 } 1802 1803 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, 1804 IntrinsicInst &II, 1805 Intrinsic::ID IID) { 1806 auto *OpPredicate = II.getOperand(0); 1807 auto *OpMultiplicand = II.getOperand(1); 1808 auto *OpMultiplier = II.getOperand(2); 1809 1810 // Return true if a given instruction is a unit splat value, false otherwise. 1811 auto IsUnitSplat = [](auto *I) { 1812 auto *SplatValue = getSplatValue(I); 1813 if (!SplatValue) 1814 return false; 1815 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1816 }; 1817 1818 // Return true if a given instruction is an aarch64_sve_dup intrinsic call 1819 // with a unit splat value, false otherwise. 1820 auto IsUnitDup = [](auto *I) { 1821 auto *IntrI = dyn_cast<IntrinsicInst>(I); 1822 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) 1823 return false; 1824 1825 auto *SplatValue = IntrI->getOperand(2); 1826 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); 1827 }; 1828 1829 if (IsUnitSplat(OpMultiplier)) { 1830 // [f]mul pg %n, (dupx 1) => %n 1831 OpMultiplicand->takeName(&II); 1832 return IC.replaceInstUsesWith(II, OpMultiplicand); 1833 } else if (IsUnitDup(OpMultiplier)) { 1834 // [f]mul pg %n, (dup pg 1) => %n 1835 auto *DupInst = cast<IntrinsicInst>(OpMultiplier); 1836 auto *DupPg = DupInst->getOperand(1); 1837 // TODO: this is naive. The optimization is still valid if DupPg 1838 // 'encompasses' OpPredicate, not only if they're the same predicate. 
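    // For example, a dup merged under an all-active predicate defines every
    // lane as 1, so the fold would also be valid for any narrower OpPredicate.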
1839 if (OpPredicate == DupPg) { 1840 OpMultiplicand->takeName(&II); 1841 return IC.replaceInstUsesWith(II, OpMultiplicand); 1842 } 1843 } 1844 1845 return instCombineSVEVectorBinOp(IC, II); 1846 } 1847 1848 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, 1849 IntrinsicInst &II) { 1850 Value *UnpackArg = II.getArgOperand(0); 1851 auto *RetTy = cast<ScalableVectorType>(II.getType()); 1852 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || 1853 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; 1854 1855 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) 1856 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) 1857 if (auto *ScalarArg = getSplatValue(UnpackArg)) { 1858 ScalarArg = 1859 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); 1860 Value *NewVal = 1861 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); 1862 NewVal->takeName(&II); 1863 return IC.replaceInstUsesWith(II, NewVal); 1864 } 1865 1866 return std::nullopt; 1867 } 1868 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, 1869 IntrinsicInst &II) { 1870 auto *OpVal = II.getOperand(0); 1871 auto *OpIndices = II.getOperand(1); 1872 VectorType *VTy = cast<VectorType>(II.getType()); 1873 1874 // Check whether OpIndices is a constant splat value < minimal element count 1875 // of result. 1876 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); 1877 if (!SplatValue || 1878 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) 1879 return std::nullopt; 1880 1881 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to 1882 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. 1883 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue); 1884 auto *VectorSplat = 1885 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract); 1886 1887 VectorSplat->takeName(&II); 1888 return IC.replaceInstUsesWith(II, VectorSplat); 1889 } 1890 1891 static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, 1892 IntrinsicInst &II) { 1893 Value *A, *B; 1894 Type *RetTy = II.getType(); 1895 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; 1896 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; 1897 1898 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> 1899 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> 1900 if ((match(II.getArgOperand(0), 1901 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) && 1902 match(II.getArgOperand(1), 1903 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) || 1904 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) && 1905 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) { 1906 auto *TyA = cast<ScalableVectorType>(A->getType()); 1907 if (TyA == B->getType() && 1908 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) { 1909 auto *SubVec = IC.Builder.CreateInsertVector( 1910 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0)); 1911 auto *ConcatVec = IC.Builder.CreateInsertVector( 1912 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements())); 1913 ConcatVec->takeName(&II); 1914 return IC.replaceInstUsesWith(II, ConcatVec); 1915 } 1916 } 1917 1918 return std::nullopt; 1919 } 1920 1921 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, 1922 IntrinsicInst &II) { 1923 // zip1(uzp1(A, B), uzp2(A, B)) --> A 1924 // zip2(uzp1(A, B), uzp2(A, B)) --> B 1925 Value *A, *B; 1926 if 
(match(II.getArgOperand(0), 1927 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && 1928 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( 1929 m_Specific(A), m_Specific(B)))) 1930 return IC.replaceInstUsesWith( 1931 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); 1932 1933 return std::nullopt; 1934 } 1935 1936 static std::optional<Instruction *> 1937 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { 1938 Value *Mask = II.getOperand(0); 1939 Value *BasePtr = II.getOperand(1); 1940 Value *Index = II.getOperand(2); 1941 Type *Ty = II.getType(); 1942 Value *PassThru = ConstantAggregateZero::get(Ty); 1943 1944 // Replace by zero constant when all lanes are inactive 1945 if (auto II_NA = instCombineSVENoActiveZero(IC, II)) 1946 return II_NA; 1947 1948 // Contiguous gather => masked load. 1949 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) 1950 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) 1951 Value *IndexBase; 1952 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1953 m_Value(IndexBase), m_SpecificInt(1)))) { 1954 Align Alignment = 1955 BasePtr->getPointerAlignment(II.getDataLayout()); 1956 1957 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1958 BasePtr, IndexBase); 1959 CallInst *MaskedLoad = 1960 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); 1961 MaskedLoad->takeName(&II); 1962 return IC.replaceInstUsesWith(II, MaskedLoad); 1963 } 1964 1965 return std::nullopt; 1966 } 1967 1968 static std::optional<Instruction *> 1969 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { 1970 Value *Val = II.getOperand(0); 1971 Value *Mask = II.getOperand(1); 1972 Value *BasePtr = II.getOperand(2); 1973 Value *Index = II.getOperand(3); 1974 Type *Ty = Val->getType(); 1975 1976 // Contiguous scatter => masked store. 
1977 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) 1978 // => (masked.store Value (gep BasePtr IndexBase) Align Mask) 1979 Value *IndexBase; 1980 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( 1981 m_Value(IndexBase), m_SpecificInt(1)))) { 1982 Align Alignment = 1983 BasePtr->getPointerAlignment(II.getDataLayout()); 1984 1985 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), 1986 BasePtr, IndexBase); 1987 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); 1988 1989 return IC.eraseInstFromFunction(II); 1990 } 1991 1992 return std::nullopt; 1993 } 1994 1995 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, 1996 IntrinsicInst &II) { 1997 Type *Int32Ty = IC.Builder.getInt32Ty(); 1998 Value *Pred = II.getOperand(0); 1999 Value *Vec = II.getOperand(1); 2000 Value *DivVec = II.getOperand(2); 2001 2002 Value *SplatValue = getSplatValue(DivVec); 2003 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); 2004 if (!SplatConstantInt) 2005 return std::nullopt; 2006 2007 APInt Divisor = SplatConstantInt->getValue(); 2008 const int64_t DivisorValue = Divisor.getSExtValue(); 2009 if (DivisorValue == -1) 2010 return std::nullopt; 2011 if (DivisorValue == 1) 2012 IC.replaceInstUsesWith(II, Vec); 2013 2014 if (Divisor.isPowerOf2()) { 2015 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 2016 auto ASRD = IC.Builder.CreateIntrinsic( 2017 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 2018 return IC.replaceInstUsesWith(II, ASRD); 2019 } 2020 if (Divisor.isNegatedPowerOf2()) { 2021 Divisor.negate(); 2022 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); 2023 auto ASRD = IC.Builder.CreateIntrinsic( 2024 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); 2025 auto NEG = IC.Builder.CreateIntrinsic( 2026 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD}); 2027 return IC.replaceInstUsesWith(II, NEG); 2028 } 2029 2030 return std::nullopt; 2031 } 2032 2033 bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { 2034 size_t VecSize = Vec.size(); 2035 if (VecSize == 1) 2036 return true; 2037 if (!isPowerOf2_64(VecSize)) 2038 return false; 2039 size_t HalfVecSize = VecSize / 2; 2040 2041 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; 2042 RHS != Vec.end(); LHS++, RHS++) { 2043 if (*LHS != nullptr && *RHS != nullptr) { 2044 if (*LHS == *RHS) 2045 continue; 2046 else 2047 return false; 2048 } 2049 if (!AllowPoison) 2050 return false; 2051 if (*LHS == nullptr && *RHS != nullptr) 2052 *LHS = *RHS; 2053 } 2054 2055 Vec.resize(HalfVecSize); 2056 SimplifyValuePattern(Vec, AllowPoison); 2057 return true; 2058 } 2059 2060 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) 2061 // to dupqlane(f64(C)) where C is A concatenated with B 2062 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, 2063 IntrinsicInst &II) { 2064 Value *CurrentInsertElt = nullptr, *Default = nullptr; 2065 if (!match(II.getOperand(0), 2066 m_Intrinsic<Intrinsic::vector_insert>( 2067 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || 2068 !isa<FixedVectorType>(CurrentInsertElt->getType())) 2069 return std::nullopt; 2070 auto IIScalableTy = cast<ScalableVectorType>(II.getType()); 2071 2072 // Insert the scalars into a container ordered by InsertElement index 2073 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); 2074 while (auto 
InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { 2075 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); 2076 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); 2077 CurrentInsertElt = InsertElt->getOperand(0); 2078 } 2079 2080 bool AllowPoison = 2081 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); 2082 if (!SimplifyValuePattern(Elts, AllowPoison)) 2083 return std::nullopt; 2084 2085 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) 2086 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); 2087 for (size_t I = 0; I < Elts.size(); I++) { 2088 if (Elts[I] == nullptr) 2089 continue; 2090 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I], 2091 IC.Builder.getInt64(I)); 2092 } 2093 if (InsertEltChain == nullptr) 2094 return std::nullopt; 2095 2096 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 2097 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector 2098 // be bitcast to a type wide enough to fit the sequence, be splatted, and then 2099 // be narrowed back to the original type. 2100 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); 2101 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * 2102 IIScalableTy->getMinNumElements() / 2103 PatternWidth; 2104 2105 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth); 2106 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); 2107 auto *WideShuffleMaskTy = 2108 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount); 2109 2110 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0)); 2111 auto InsertSubvector = IC.Builder.CreateInsertVector( 2112 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); 2113 auto WideBitcast = 2114 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); 2115 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); 2116 auto WideShuffle = IC.Builder.CreateShuffleVector( 2117 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); 2118 auto NarrowBitcast = 2119 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); 2120 2121 return IC.replaceInstUsesWith(II, NarrowBitcast); 2122 } 2123 2124 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, 2125 IntrinsicInst &II) { 2126 Value *A = II.getArgOperand(0); 2127 Value *B = II.getArgOperand(1); 2128 if (A == B) 2129 return IC.replaceInstUsesWith(II, A); 2130 2131 return std::nullopt; 2132 } 2133 2134 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, 2135 IntrinsicInst &II) { 2136 Value *Pred = II.getOperand(0); 2137 Value *Vec = II.getOperand(1); 2138 Value *Shift = II.getOperand(2); 2139 2140 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 
2141 Value *AbsPred, *MergedValue; 2142 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( 2143 m_Value(MergedValue), m_Value(AbsPred), m_Value())) && 2144 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( 2145 m_Value(MergedValue), m_Value(AbsPred), m_Value()))) 2146 2147 return std::nullopt; 2148 2149 // Transform is valid if any of the following are true: 2150 // * The ABS merge value is an undef or non-negative 2151 // * The ABS predicate is all active 2152 // * The ABS predicate and the SRSHL predicates are the same 2153 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && 2154 AbsPred != Pred && !isAllActivePredicate(AbsPred)) 2155 return std::nullopt; 2156 2157 // Only valid when the shift amount is non-negative, otherwise the rounding 2158 // behaviour of SRSHL cannot be ignored. 2159 if (!match(Shift, m_NonNegative())) 2160 return std::nullopt; 2161 2162 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, 2163 {II.getType()}, {Pred, Vec, Shift}); 2164 2165 return IC.replaceInstUsesWith(II, LSL); 2166 } 2167 2168 static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, 2169 IntrinsicInst &II) { 2170 Value *Vec = II.getOperand(0); 2171 2172 if (getSplatValue(Vec) == II.getOperand(1)) 2173 return IC.replaceInstUsesWith(II, Vec); 2174 2175 return std::nullopt; 2176 } 2177 2178 static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, 2179 IntrinsicInst &II) { 2180 // If this barrier is post-dominated by identical one we can remove it 2181 auto *NI = II.getNextNonDebugInstruction(); 2182 unsigned LookaheadThreshold = DMBLookaheadThreshold; 2183 auto CanSkipOver = [](Instruction *I) { 2184 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); 2185 }; 2186 while (LookaheadThreshold-- && CanSkipOver(NI)) { 2187 auto *NIBB = NI->getParent(); 2188 NI = NI->getNextNonDebugInstruction(); 2189 if (!NI) { 2190 if (auto *SuccBB = NIBB->getUniqueSuccessor()) 2191 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); 2192 else 2193 break; 2194 } 2195 } 2196 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI); 2197 if (NextII && II.isIdenticalTo(NextII)) 2198 return IC.eraseInstFromFunction(II); 2199 2200 return std::nullopt; 2201 } 2202 2203 std::optional<Instruction *> 2204 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, 2205 IntrinsicInst &II) const { 2206 Intrinsic::ID IID = II.getIntrinsicID(); 2207 switch (IID) { 2208 default: 2209 break; 2210 case Intrinsic::aarch64_dmb: 2211 return instCombineDMB(IC, II); 2212 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: 2213 case Intrinsic::aarch64_sve_fcvt_f16f32: 2214 case Intrinsic::aarch64_sve_fcvt_f16f64: 2215 case Intrinsic::aarch64_sve_fcvt_f32f16: 2216 case Intrinsic::aarch64_sve_fcvt_f32f64: 2217 case Intrinsic::aarch64_sve_fcvt_f64f16: 2218 case Intrinsic::aarch64_sve_fcvt_f64f32: 2219 case Intrinsic::aarch64_sve_fcvtlt_f32f16: 2220 case Intrinsic::aarch64_sve_fcvtlt_f64f32: 2221 case Intrinsic::aarch64_sve_fcvtx_f32f64: 2222 case Intrinsic::aarch64_sve_fcvtzs: 2223 case Intrinsic::aarch64_sve_fcvtzs_i32f16: 2224 case Intrinsic::aarch64_sve_fcvtzs_i32f64: 2225 case Intrinsic::aarch64_sve_fcvtzs_i64f16: 2226 case Intrinsic::aarch64_sve_fcvtzs_i64f32: 2227 case Intrinsic::aarch64_sve_fcvtzu: 2228 case Intrinsic::aarch64_sve_fcvtzu_i32f16: 2229 case Intrinsic::aarch64_sve_fcvtzu_i32f64: 2230 case Intrinsic::aarch64_sve_fcvtzu_i64f16: 2231 case Intrinsic::aarch64_sve_fcvtzu_i64f32: 2232 case Intrinsic::aarch64_sve_scvtf: 2233 case Intrinsic::aarch64_sve_scvtf_f16i32: 
2234 case Intrinsic::aarch64_sve_scvtf_f16i64: 2235 case Intrinsic::aarch64_sve_scvtf_f32i64: 2236 case Intrinsic::aarch64_sve_scvtf_f64i32: 2237 case Intrinsic::aarch64_sve_ucvtf: 2238 case Intrinsic::aarch64_sve_ucvtf_f16i32: 2239 case Intrinsic::aarch64_sve_ucvtf_f16i64: 2240 case Intrinsic::aarch64_sve_ucvtf_f32i64: 2241 case Intrinsic::aarch64_sve_ucvtf_f64i32: 2242 return instCombineSVEAllOrNoActiveUnary(IC, II); 2243 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2: 2244 case Intrinsic::aarch64_sve_fcvtnt_f16f32: 2245 case Intrinsic::aarch64_sve_fcvtnt_f32f64: 2246 case Intrinsic::aarch64_sve_fcvtxnt_f32f64: 2247 return instCombineSVENoActiveReplace(IC, II, true); 2248 case Intrinsic::aarch64_sve_st1_scatter: 2249 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 2250 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 2251 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 2252 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 2253 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 2254 case Intrinsic::aarch64_sve_st1dq: 2255 case Intrinsic::aarch64_sve_st1q_scatter_index: 2256 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: 2257 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: 2258 case Intrinsic::aarch64_sve_st1wq: 2259 case Intrinsic::aarch64_sve_stnt1: 2260 case Intrinsic::aarch64_sve_stnt1_scatter: 2261 case Intrinsic::aarch64_sve_stnt1_scatter_index: 2262 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 2263 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 2264 return instCombineSVENoActiveUnaryErase(IC, II, 1); 2265 case Intrinsic::aarch64_sve_st2: 2266 case Intrinsic::aarch64_sve_st2q: 2267 return instCombineSVENoActiveUnaryErase(IC, II, 2); 2268 case Intrinsic::aarch64_sve_st3: 2269 case Intrinsic::aarch64_sve_st3q: 2270 return instCombineSVENoActiveUnaryErase(IC, II, 3); 2271 case Intrinsic::aarch64_sve_st4: 2272 case Intrinsic::aarch64_sve_st4q: 2273 return instCombineSVENoActiveUnaryErase(IC, II, 4); 2274 case Intrinsic::aarch64_sve_addqv: 2275 case Intrinsic::aarch64_sve_and_z: 2276 case Intrinsic::aarch64_sve_bic_z: 2277 case Intrinsic::aarch64_sve_brka_z: 2278 case Intrinsic::aarch64_sve_brkb_z: 2279 case Intrinsic::aarch64_sve_brkn_z: 2280 case Intrinsic::aarch64_sve_brkpa_z: 2281 case Intrinsic::aarch64_sve_brkpb_z: 2282 case Intrinsic::aarch64_sve_cntp: 2283 case Intrinsic::aarch64_sve_compact: 2284 case Intrinsic::aarch64_sve_eor_z: 2285 case Intrinsic::aarch64_sve_eorv: 2286 case Intrinsic::aarch64_sve_eorqv: 2287 case Intrinsic::aarch64_sve_nand_z: 2288 case Intrinsic::aarch64_sve_nor_z: 2289 case Intrinsic::aarch64_sve_orn_z: 2290 case Intrinsic::aarch64_sve_orr_z: 2291 case Intrinsic::aarch64_sve_orv: 2292 case Intrinsic::aarch64_sve_orqv: 2293 case Intrinsic::aarch64_sve_pnext: 2294 case Intrinsic::aarch64_sve_rdffr_z: 2295 case Intrinsic::aarch64_sve_saddv: 2296 case Intrinsic::aarch64_sve_uaddv: 2297 case Intrinsic::aarch64_sve_umaxv: 2298 case Intrinsic::aarch64_sve_umaxqv: 2299 case Intrinsic::aarch64_sve_cmpeq: 2300 case Intrinsic::aarch64_sve_cmpeq_wide: 2301 case Intrinsic::aarch64_sve_cmpge: 2302 case Intrinsic::aarch64_sve_cmpge_wide: 2303 case Intrinsic::aarch64_sve_cmpgt: 2304 case Intrinsic::aarch64_sve_cmpgt_wide: 2305 case Intrinsic::aarch64_sve_cmphi: 2306 case Intrinsic::aarch64_sve_cmphi_wide: 2307 case Intrinsic::aarch64_sve_cmphs: 2308 case Intrinsic::aarch64_sve_cmphs_wide: 2309 case Intrinsic::aarch64_sve_cmple_wide: 2310 case Intrinsic::aarch64_sve_cmplo_wide: 2311 case Intrinsic::aarch64_sve_cmpls_wide: 2312 case 
Intrinsic::aarch64_sve_cmplt_wide: 2313 case Intrinsic::aarch64_sve_facge: 2314 case Intrinsic::aarch64_sve_facgt: 2315 case Intrinsic::aarch64_sve_fcmpeq: 2316 case Intrinsic::aarch64_sve_fcmpge: 2317 case Intrinsic::aarch64_sve_fcmpgt: 2318 case Intrinsic::aarch64_sve_fcmpne: 2319 case Intrinsic::aarch64_sve_fcmpuo: 2320 case Intrinsic::aarch64_sve_ld1_gather: 2321 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 2322 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 2323 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 2324 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 2325 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 2326 case Intrinsic::aarch64_sve_ld1q_gather_index: 2327 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: 2328 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: 2329 case Intrinsic::aarch64_sve_ld1ro: 2330 case Intrinsic::aarch64_sve_ld1rq: 2331 case Intrinsic::aarch64_sve_ld1udq: 2332 case Intrinsic::aarch64_sve_ld1uwq: 2333 case Intrinsic::aarch64_sve_ld2_sret: 2334 case Intrinsic::aarch64_sve_ld2q_sret: 2335 case Intrinsic::aarch64_sve_ld3_sret: 2336 case Intrinsic::aarch64_sve_ld3q_sret: 2337 case Intrinsic::aarch64_sve_ld4_sret: 2338 case Intrinsic::aarch64_sve_ld4q_sret: 2339 case Intrinsic::aarch64_sve_ldff1: 2340 case Intrinsic::aarch64_sve_ldff1_gather: 2341 case Intrinsic::aarch64_sve_ldff1_gather_index: 2342 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 2343 case Intrinsic::aarch64_sve_ldff1_gather_sxtw: 2344 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 2345 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 2346 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 2347 case Intrinsic::aarch64_sve_ldnf1: 2348 case Intrinsic::aarch64_sve_ldnt1: 2349 case Intrinsic::aarch64_sve_ldnt1_gather: 2350 case Intrinsic::aarch64_sve_ldnt1_gather_index: 2351 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 2352 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 2353 return instCombineSVENoActiveZero(IC, II); 2354 case Intrinsic::aarch64_sve_prf: 2355 case Intrinsic::aarch64_sve_prfb_gather_index: 2356 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: 2357 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: 2358 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: 2359 case Intrinsic::aarch64_sve_prfd_gather_index: 2360 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: 2361 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: 2362 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: 2363 case Intrinsic::aarch64_sve_prfh_gather_index: 2364 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: 2365 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: 2366 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: 2367 case Intrinsic::aarch64_sve_prfw_gather_index: 2368 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: 2369 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: 2370 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: 2371 return instCombineSVENoActiveUnaryErase(IC, II, 0); 2372 case Intrinsic::aarch64_neon_fmaxnm: 2373 case Intrinsic::aarch64_neon_fminnm: 2374 return instCombineMaxMinNM(IC, II); 2375 case Intrinsic::aarch64_sve_convert_from_svbool: 2376 return instCombineConvertFromSVBool(IC, II); 2377 case Intrinsic::aarch64_sve_dup: 2378 return instCombineSVEDup(IC, II); 2379 case Intrinsic::aarch64_sve_dup_x: 2380 return instCombineSVEDupX(IC, II); 2381 case Intrinsic::aarch64_sve_cmpne: 2382 case Intrinsic::aarch64_sve_cmpne_wide: 2383 return instCombineSVECmpNE(IC, II); 2384 case Intrinsic::aarch64_sve_rdffr: 2385 
return instCombineRDFFR(IC, II); 2386 case Intrinsic::aarch64_sve_lasta: 2387 case Intrinsic::aarch64_sve_lastb: 2388 return instCombineSVELast(IC, II); 2389 case Intrinsic::aarch64_sve_clasta_n: 2390 case Intrinsic::aarch64_sve_clastb_n: 2391 return instCombineSVECondLast(IC, II); 2392 case Intrinsic::aarch64_sve_cntd: 2393 return instCombineSVECntElts(IC, II, 2); 2394 case Intrinsic::aarch64_sve_cntw: 2395 return instCombineSVECntElts(IC, II, 4); 2396 case Intrinsic::aarch64_sve_cnth: 2397 return instCombineSVECntElts(IC, II, 8); 2398 case Intrinsic::aarch64_sve_cntb: 2399 return instCombineSVECntElts(IC, II, 16); 2400 case Intrinsic::aarch64_sve_ptest_any: 2401 case Intrinsic::aarch64_sve_ptest_first: 2402 case Intrinsic::aarch64_sve_ptest_last: 2403 return instCombineSVEPTest(IC, II); 2404 case Intrinsic::aarch64_sve_fabd: 2405 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u); 2406 case Intrinsic::aarch64_sve_fadd: 2407 return instCombineSVEVectorFAdd(IC, II); 2408 case Intrinsic::aarch64_sve_fadd_u: 2409 return instCombineSVEVectorFAddU(IC, II); 2410 case Intrinsic::aarch64_sve_fdiv: 2411 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u); 2412 case Intrinsic::aarch64_sve_fmax: 2413 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u); 2414 case Intrinsic::aarch64_sve_fmaxnm: 2415 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u); 2416 case Intrinsic::aarch64_sve_fmin: 2417 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u); 2418 case Intrinsic::aarch64_sve_fminnm: 2419 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u); 2420 case Intrinsic::aarch64_sve_fmla: 2421 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u); 2422 case Intrinsic::aarch64_sve_fmls: 2423 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u); 2424 case Intrinsic::aarch64_sve_fmul: 2425 if (auto II_U = 2426 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u)) 2427 return II_U; 2428 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); 2429 case Intrinsic::aarch64_sve_fmul_u: 2430 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); 2431 case Intrinsic::aarch64_sve_fmulx: 2432 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u); 2433 case Intrinsic::aarch64_sve_fnmla: 2434 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u); 2435 case Intrinsic::aarch64_sve_fnmls: 2436 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u); 2437 case Intrinsic::aarch64_sve_fsub: 2438 return instCombineSVEVectorFSub(IC, II); 2439 case Intrinsic::aarch64_sve_fsub_u: 2440 return instCombineSVEVectorFSubU(IC, II); 2441 case Intrinsic::aarch64_sve_add: 2442 return instCombineSVEVectorAdd(IC, II); 2443 case Intrinsic::aarch64_sve_add_u: 2444 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 2445 Intrinsic::aarch64_sve_mla_u>( 2446 IC, II, true); 2447 case Intrinsic::aarch64_sve_mla: 2448 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u); 2449 case Intrinsic::aarch64_sve_mls: 2450 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u); 2451 case Intrinsic::aarch64_sve_mul: 2452 if (auto II_U = 2453 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u)) 2454 return II_U; 2455 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 2456 case 
Intrinsic::aarch64_sve_mul_u: 2457 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); 2458 case Intrinsic::aarch64_sve_sabd: 2459 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u); 2460 case Intrinsic::aarch64_sve_smax: 2461 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u); 2462 case Intrinsic::aarch64_sve_smin: 2463 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u); 2464 case Intrinsic::aarch64_sve_smulh: 2465 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u); 2466 case Intrinsic::aarch64_sve_sub: 2467 return instCombineSVEVectorSub(IC, II); 2468 case Intrinsic::aarch64_sve_sub_u: 2469 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, 2470 Intrinsic::aarch64_sve_mls_u>( 2471 IC, II, true); 2472 case Intrinsic::aarch64_sve_uabd: 2473 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u); 2474 case Intrinsic::aarch64_sve_umax: 2475 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u); 2476 case Intrinsic::aarch64_sve_umin: 2477 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u); 2478 case Intrinsic::aarch64_sve_umulh: 2479 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u); 2480 case Intrinsic::aarch64_sve_asr: 2481 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u); 2482 case Intrinsic::aarch64_sve_lsl: 2483 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u); 2484 case Intrinsic::aarch64_sve_lsr: 2485 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u); 2486 case Intrinsic::aarch64_sve_and: 2487 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u); 2488 case Intrinsic::aarch64_sve_bic: 2489 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u); 2490 case Intrinsic::aarch64_sve_eor: 2491 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u); 2492 case Intrinsic::aarch64_sve_orr: 2493 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u); 2494 case Intrinsic::aarch64_sve_sqsub: 2495 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u); 2496 case Intrinsic::aarch64_sve_uqsub: 2497 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u); 2498 case Intrinsic::aarch64_sve_tbl: 2499 return instCombineSVETBL(IC, II); 2500 case Intrinsic::aarch64_sve_uunpkhi: 2501 case Intrinsic::aarch64_sve_uunpklo: 2502 case Intrinsic::aarch64_sve_sunpkhi: 2503 case Intrinsic::aarch64_sve_sunpklo: 2504 return instCombineSVEUnpack(IC, II); 2505 case Intrinsic::aarch64_sve_uzp1: 2506 return instCombineSVEUzp1(IC, II); 2507 case Intrinsic::aarch64_sve_zip1: 2508 case Intrinsic::aarch64_sve_zip2: 2509 return instCombineSVEZip(IC, II); 2510 case Intrinsic::aarch64_sve_ld1_gather_index: 2511 return instCombineLD1GatherIndex(IC, II); 2512 case Intrinsic::aarch64_sve_st1_scatter_index: 2513 return instCombineST1ScatterIndex(IC, II); 2514 case Intrinsic::aarch64_sve_ld1: 2515 return instCombineSVELD1(IC, II, DL); 2516 case Intrinsic::aarch64_sve_st1: 2517 return instCombineSVEST1(IC, II, DL); 2518 case Intrinsic::aarch64_sve_sdiv: 2519 return instCombineSVESDIV(IC, II); 2520 case Intrinsic::aarch64_sve_sel: 2521 return instCombineSVESel(IC, II); 2522 case Intrinsic::aarch64_sve_srshl: 2523 return instCombineSVESrshl(IC, II); 2524 case Intrinsic::aarch64_sve_dupq_lane: 2525 return instCombineSVEDupqLane(IC, II); 2526 case 
Intrinsic::aarch64_sve_insr: 2527 return instCombineSVEInsr(IC, II); 2528 } 2529 2530 return std::nullopt; 2531 } 2532 2533 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( 2534 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 2535 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 2536 std::function<void(Instruction *, unsigned, APInt, APInt &)> 2537 SimplifyAndSetOp) const { 2538 switch (II.getIntrinsicID()) { 2539 default: 2540 break; 2541 case Intrinsic::aarch64_neon_fcvtxn: 2542 case Intrinsic::aarch64_neon_rshrn: 2543 case Intrinsic::aarch64_neon_sqrshrn: 2544 case Intrinsic::aarch64_neon_sqrshrun: 2545 case Intrinsic::aarch64_neon_sqshrn: 2546 case Intrinsic::aarch64_neon_sqshrun: 2547 case Intrinsic::aarch64_neon_sqxtn: 2548 case Intrinsic::aarch64_neon_sqxtun: 2549 case Intrinsic::aarch64_neon_uqrshrn: 2550 case Intrinsic::aarch64_neon_uqshrn: 2551 case Intrinsic::aarch64_neon_uqxtn: 2552 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); 2553 break; 2554 } 2555 2556 return std::nullopt; 2557 } 2558 2559 bool AArch64TTIImpl::enableScalableVectorization() const { 2560 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 2561 EnableScalableAutovecInStreamingMode); 2562 } 2563 2564 TypeSize 2565 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 2566 switch (K) { 2567 case TargetTransformInfo::RGK_Scalar: 2568 return TypeSize::getFixed(64); 2569 case TargetTransformInfo::RGK_FixedWidthVector: 2570 if (ST->useSVEForFixedLengthVectors() && 2571 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) 2572 return TypeSize::getFixed( 2573 std::max(ST->getMinSVEVectorSizeInBits(), 128u)); 2574 else if (ST->isNeonAvailable()) 2575 return TypeSize::getFixed(128); 2576 else 2577 return TypeSize::getFixed(0); 2578 case TargetTransformInfo::RGK_ScalableVector: 2579 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && 2580 EnableScalableAutovecInStreamingMode)) 2581 return TypeSize::getScalable(128); 2582 else 2583 return TypeSize::getScalable(0); 2584 } 2585 llvm_unreachable("Unsupported register kind"); 2586 } 2587 2588 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, 2589 ArrayRef<const Value *> Args, 2590 Type *SrcOverrideTy) { 2591 // A helper that returns a vector type from the given type. The number of 2592 // elements in type Ty determines the vector width. 2593 auto toVectorTy = [&](Type *ArgTy) { 2594 return VectorType::get(ArgTy->getScalarType(), 2595 cast<VectorType>(DstTy)->getElementCount()); 2596 }; 2597 2598 // Exit early if DstTy is not a vector type whose elements are one of [i16, 2599 // i32, i64]. SVE doesn't generally have the same set of instructions to 2600 // perform an extend with the add/sub/mul. There are SMULLB style 2601 // instructions, but they operate on top/bottom, requiring some sort of lane 2602 // interleaving to be used with zext/sext. 2603 unsigned DstEltSize = DstTy->getScalarSizeInBits(); 2604 if (!useNeonVector(DstTy) || Args.size() != 2 || 2605 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) 2606 return false; 2607 2608 // Determine if the operation has a widening variant. We consider both the 2609 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the 2610 // instructions. 2611 // 2612 // TODO: Add additional widening operations (e.g., shl, etc.) once we 2613 // verify that their extending operands are eliminated during code 2614 // generation. 
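  // For example (illustrative IR), the zext below folds into a single uaddw
  // and is therefore considered free by the cast cost model:
  //   %w = zext <8 x i8> %b to <8 x i16>
  //   %r = add <8 x i16> %a, %w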
  Type *SrcTy = SrcOverrideTy;
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    // The second operand needs to be an extend
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
    } else
      return false;
    break;
  case Instruction::Mul: { // SMULL(2), UMULL(2)
    // Both operands need to be extends of the same type.
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
      // If one of the operands is a Zext and the other has enough zero bits to
      // be treated as unsigned, we can still generate a umull, meaning the
      // zext is free.
      KnownBits Known =
          computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
      if (Args[0]->getType()->getScalarSizeInBits() -
              Known.Zero.countLeadingOnes() >
          DstTy->getScalarSizeInBits() / 2)
        return false;
      if (!SrcTy)
        SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
                                           DstTy->getScalarSizeInBits() / 2));
    } else
      return false;
    break;
  }
  default:
    return false;
  }

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = getTypeLegalizationCost(DstTy);
  if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  assert(SrcTy && "Expected some SrcTy");
  auto SrcTyL = getTypeLegalizationCost(SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  InstructionCost NumDstEls =
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
  InstructionCost NumSrcEls =
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}

// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
// %y = (zext i8 -> i16)
// trunc i16 (lshr (add %x, %y), 1) -> i8
//
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                                        Type *Src) {
  // The source should be a legal vector type.
  if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
      (Src->isScalableTy() && !ST->hasSVE2()))
    return false;

  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
    return false;

  // Look for trunc/lshr/add before trying to match the pattern.
2697 const Instruction *Add = ExtUser; 2698 auto *AddUser = 2699 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 2700 if (AddUser && AddUser->getOpcode() == Instruction::Add) 2701 Add = AddUser; 2702 2703 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); 2704 if (!Shr || Shr->getOpcode() != Instruction::LShr) 2705 return false; 2706 2707 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser()); 2708 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || 2709 Src->getScalarSizeInBits() != 2710 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits()) 2711 return false; 2712 2713 // Try to match the whole pattern. Ext could be either the first or second 2714 // m_ZExtOrSExt matched. 2715 Instruction *Ex1, *Ex2; 2716 if (!(match(Add, m_c_Add(m_Instruction(Ex1), 2717 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1)))))) 2718 return false; 2719 2720 // Ensure both extends are of the same type 2721 if (match(Ex1, m_ZExtOrSExt(m_Value())) && 2722 Ex1->getOpcode() == Ex2->getOpcode()) 2723 return true; 2724 2725 return false; 2726 } 2727 2728 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 2729 Type *Src, 2730 TTI::CastContextHint CCH, 2731 TTI::TargetCostKind CostKind, 2732 const Instruction *I) { 2733 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2734 assert(ISD && "Invalid opcode"); 2735 // If the cast is observable, and it is used by a widening instruction (e.g., 2736 // uaddl, saddw, etc.), it may be free. 2737 if (I && I->hasOneUser()) { 2738 auto *SingleUser = cast<Instruction>(*I->user_begin()); 2739 SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); 2740 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { 2741 // For adds only count the second operand as free if both operands are 2742 // extends but not the same operation. (i.e both operands are not free in 2743 // add(sext, zext)). 2744 if (SingleUser->getOpcode() == Instruction::Add) { 2745 if (I == SingleUser->getOperand(1) || 2746 (isa<CastInst>(SingleUser->getOperand(1)) && 2747 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) 2748 return 0; 2749 } else // Others are free so long as isWideningInstruction returned true. 2750 return 0; 2751 } 2752 2753 // The cast will be free for the s/urhadd instructions 2754 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && 2755 isExtPartOfAvgExpr(SingleUser, Dst, Src)) 2756 return 0; 2757 } 2758 2759 // TODO: Allow non-throughput costs that aren't binary. 2760 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 2761 if (CostKind != TTI::TCK_RecipThroughput) 2762 return Cost == 0 ? 
0 : 1; 2763 return Cost; 2764 }; 2765 2766 EVT SrcTy = TLI->getValueType(DL, Src); 2767 EVT DstTy = TLI->getValueType(DL, Dst); 2768 2769 if (!SrcTy.isSimple() || !DstTy.isSimple()) 2770 return AdjustCost( 2771 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 2772 2773 static const TypeConversionCostTblEntry BF16Tbl[] = { 2774 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt 2775 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt 2776 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn 2777 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2 2778 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn 2779 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn 2780 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn 2781 }; 2782 2783 if (ST->hasBF16()) 2784 if (const auto *Entry = ConvertCostTableLookup( 2785 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 2786 return AdjustCost(Entry->Cost); 2787 2788 static const TypeConversionCostTblEntry ConversionTbl[] = { 2789 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn 2790 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn 2791 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn 2792 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn 2793 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 2794 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn 2795 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn 2796 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 2797 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn 2798 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn 2799 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn 2800 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 2801 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 2802 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 2803 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 2804 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 2805 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 2806 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 2807 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 2808 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 2809 2810 // Truncations on nxvmiN 2811 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2}, 2812 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2}, 2813 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2}, 2814 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2}, 2815 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2}, 2816 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2}, 2817 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2}, 2818 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5}, 2819 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2}, 2820 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2}, 2821 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5}, 2822 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11}, 2823 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2}, 2824 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0}, 2825 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0}, 2826 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0}, 2827 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0}, 2828 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0}, 2829 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0}, 2830 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0}, 2831 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0}, 2832 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1}, 2833 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0}, 2834 {ISD::TRUNCATE, MVT::nxv4i16, 
MVT::nxv4i64, 1}, 2835 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1}, 2836 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0}, 2837 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1}, 2838 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3}, 2839 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1}, 2840 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3}, 2841 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1}, 2842 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3}, 2843 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7}, 2844 2845 // The number of shll instructions for the extension. 2846 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3}, 2847 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3}, 2848 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, 2849 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, 2850 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3}, 2851 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3}, 2852 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, 2853 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, 2854 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7}, 2855 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7}, 2856 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6}, 2857 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6}, 2858 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, 2859 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, 2860 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6}, 2861 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6}, 2862 2863 // FP Ext and trunc 2864 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt 2865 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl 2866 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2 2867 // FP16 2868 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt 2869 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt 2870 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl 2871 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2 2872 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl 2873 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl 2874 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl 2875 // BF16 (uses shift) 2876 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl 2877 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt 2878 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll 2879 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2 2880 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl 2881 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2 2882 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2 2883 // FP Ext and trunc 2884 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt 2885 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn 2886 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2 2887 // FP16 2888 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt 2889 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt 2890 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn 2891 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2 2892 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn 2893 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn 2894 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn 2895 // BF16 (more complex, with +bf16 is handled above) 2896 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns 2897 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above 2898 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8}, 2899 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8}, 2900 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15}, 2901 {ISD::FP_ROUND, 
MVT::v2bf16, MVT::v2f64, 9}, 2902 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10}, 2903 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19}, 2904 2905 // LowerVectorINT_TO_FP: 2906 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, 2907 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, 2908 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1}, 2909 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, 2910 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, 2911 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1}, 2912 2913 // Complex: to v2f32 2914 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, 2915 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3}, 2916 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2}, 2917 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, 2918 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3}, 2919 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2}, 2920 2921 // Complex: to v4f32 2922 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4}, 2923 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, 2924 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, 2925 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, 2926 2927 // Complex: to v8f32 2928 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, 2929 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4}, 2930 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, 2931 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4}, 2932 2933 // Complex: to v16f32 2934 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21}, 2935 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21}, 2936 2937 // Complex: to v2f64 2938 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4}, 2939 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4}, 2940 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2}, 2941 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4}, 2942 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4}, 2943 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2}, 2944 2945 // Complex: to v4f64 2946 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4}, 2947 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4}, 2948 2949 // LowerVectorFP_TO_INT 2950 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1}, 2951 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1}, 2952 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1}, 2953 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1}, 2954 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1}, 2955 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1}, 2956 2957 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 2958 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2}, 2959 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1}, 2960 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1}, 2961 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2}, 2962 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1}, 2963 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1}, 2964 2965 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 2966 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2}, 2967 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2}, 2968 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2}, 2969 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2}, 2970 2971 // Complex, from nxv2f32. 2972 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1}, 2973 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1}, 2974 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1}, 2975 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1}, 2976 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1}, 2977 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1}, 2978 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1}, 2979 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1}, 2980 2981 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. 
2982 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2}, 2983 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2}, 2984 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2}, 2985 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2}, 2986 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2}, 2987 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2}, 2988 2989 // Complex, from nxv2f64. 2990 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1}, 2991 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1}, 2992 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1}, 2993 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1}, 2994 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1}, 2995 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1}, 2996 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1}, 2997 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1}, 2998 2999 // Complex, from nxv4f32. 3000 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4}, 3001 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1}, 3002 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1}, 3003 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1}, 3004 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4}, 3005 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1}, 3006 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1}, 3007 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1}, 3008 3009 // Complex, from nxv8f64. Illegal -> illegal conversions not required. 3010 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7}, 3011 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7}, 3012 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7}, 3013 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7}, 3014 3015 // Complex, from nxv4f64. Illegal -> illegal conversions not required. 3016 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3}, 3017 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3}, 3018 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3}, 3019 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3}, 3020 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3}, 3021 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3}, 3022 3023 // Complex, from nxv8f32. Illegal -> illegal conversions not required. 3024 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3}, 3025 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3}, 3026 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3}, 3027 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3}, 3028 3029 // Complex, from nxv8f16. 3030 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10}, 3031 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4}, 3032 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1}, 3033 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1}, 3034 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10}, 3035 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4}, 3036 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1}, 3037 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1}, 3038 3039 // Complex, from nxv4f16. 3040 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4}, 3041 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1}, 3042 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1}, 3043 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1}, 3044 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4}, 3045 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1}, 3046 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1}, 3047 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1}, 3048 3049 // Complex, from nxv2f16. 
3050 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1}, 3051 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1}, 3052 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1}, 3053 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1}, 3054 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1}, 3055 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1}, 3056 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1}, 3057 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1}, 3058 3059 // Truncate from nxvmf32 to nxvmf16. 3060 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1}, 3061 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1}, 3062 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3}, 3063 3064 // Truncate from nxvmf64 to nxvmf16. 3065 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1}, 3066 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3}, 3067 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7}, 3068 3069 // Truncate from nxvmf64 to nxvmf32. 3070 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1}, 3071 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3}, 3072 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6}, 3073 3074 // Extend from nxvmf16 to nxvmf32. 3075 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, 3076 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, 3077 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, 3078 3079 // Extend from nxvmf16 to nxvmf64. 3080 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, 3081 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, 3082 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, 3083 3084 // Extend from nxvmf32 to nxvmf64. 3085 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, 3086 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, 3087 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, 3088 3089 // Bitcasts from float to integer 3090 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0}, 3091 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0}, 3092 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0}, 3093 3094 // Bitcasts from integer to float 3095 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0}, 3096 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0}, 3097 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0}, 3098 3099 // Add cost for extending to illegal -too wide- scalable vectors. 3100 // zero/sign extend are implemented by multiple unpack operations, 3101 // where each operation has a cost of 1. 3102 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, 3103 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, 3104 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, 3105 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, 3106 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, 3107 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, 3108 3109 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, 3110 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, 3111 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, 3112 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, 3113 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, 3114 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, 3115 }; 3116 3117 // We have to estimate a cost of fixed length operation upon 3118 // SVE registers(operations) with the number of registers required 3119 // for a fixed type to be represented upon SVE registers. 3120 EVT WiderTy = SrcTy.bitsGT(DstTy) ? 
SrcTy : DstTy; 3121 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && 3122 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && 3123 ST->useSVEForFixedLengthVectors(WiderTy)) { 3124 std::pair<InstructionCost, MVT> LT = 3125 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext())); 3126 unsigned NumElements = 3127 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits(); 3128 return AdjustCost( 3129 LT.first * 3130 getCastInstrCost( 3131 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements), 3132 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH, 3133 CostKind, I)); 3134 } 3135 3136 if (const auto *Entry = ConvertCostTableLookup( 3137 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 3138 return AdjustCost(Entry->Cost); 3139 3140 static const TypeConversionCostTblEntry FP16Tbl[] = { 3141 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs 3142 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, 3143 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs 3144 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, 3145 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs 3146 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, 3147 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn 3148 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, 3149 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs 3150 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, 3151 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs 3152 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, 3153 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn 3154 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, 3155 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs 3156 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, 3157 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs 3158 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, 3159 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf 3160 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf 3161 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf 3162 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf 3163 }; 3164 3165 if (ST->hasFullFP16()) 3166 if (const auto *Entry = ConvertCostTableLookup( 3167 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 3168 return AdjustCost(Entry->Cost); 3169 3170 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 3171 CCH == TTI::CastContextHint::Masked && 3172 ST->isSVEorStreamingSVEAvailable() && 3173 TLI->getTypeAction(Src->getContext(), SrcTy) == 3174 TargetLowering::TypePromoteInteger && 3175 TLI->getTypeAction(Dst->getContext(), DstTy) == 3176 TargetLowering::TypeSplitVector) { 3177 // The standard behaviour in the backend for these cases is to split the 3178 // extend up into two parts: 3179 // 1. Perform an extending load or masked load up to the legal type. 3180 // 2. Extend the loaded data to the final type. 
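// For example (an illustrative sketch, not taken from the cost tables): a
// masked zero-extending load from <vscale x 8 x i8> to <vscale x 8 x i64>
// would be costed as a masked extend up to the promoted legal type (assumed
// here to be <vscale x 8 x i16>) plus a normal <vscale x 8 x i16> ->
// <vscale x 8 x i64> extend, mirroring the two steps described above.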
3181 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 3182 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext()); 3183 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( 3184 Opcode, LegalTy, Src, CCH, CostKind, I); 3185 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( 3186 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I); 3187 return Part1 + Part2; 3188 } 3189 3190 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, 3191 // but we also want to include the TTI::CastContextHint::Masked case too. 3192 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && 3193 CCH == TTI::CastContextHint::Masked && 3194 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy)) 3195 CCH = TTI::CastContextHint::Normal; 3196 3197 return AdjustCost( 3198 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 3199 } 3200 3201 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, 3202 Type *Dst, 3203 VectorType *VecTy, 3204 unsigned Index) { 3205 3206 // Make sure we were given a valid extend opcode. 3207 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 3208 "Invalid opcode"); 3209 3210 // We are extending an element we extract from a vector, so the source type 3211 // of the extend is the element type of the vector. 3212 auto *Src = VecTy->getElementType(); 3213 3214 // Sign- and zero-extends are for integer types only. 3215 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); 3216 3217 // Get the cost for the extract. We compute the cost (if any) for the extend 3218 // below. 3219 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3220 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, 3221 CostKind, Index, nullptr, nullptr); 3222 3223 // Legalize the types. 3224 auto VecLT = getTypeLegalizationCost(VecTy); 3225 auto DstVT = TLI->getValueType(DL, Dst); 3226 auto SrcVT = TLI->getValueType(DL, Src); 3227 3228 // If the resulting type is still a vector and the destination type is legal, 3229 // we may get the extension for free. If not, get the default cost for the 3230 // extend. 3231 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) 3232 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 3233 CostKind); 3234 3235 // The destination type should be larger than the element type. If not, get 3236 // the default cost for the extend. 3237 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) 3238 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 3239 CostKind); 3240 3241 switch (Opcode) { 3242 default: 3243 llvm_unreachable("Opcode should be either SExt or ZExt"); 3244 3245 // For sign-extends, we only need a smov, which performs the extension 3246 // automatically. 3247 case Instruction::SExt: 3248 return Cost; 3249 3250 // For zero-extends, the extend is performed automatically by a umov unless 3251 // the destination type is i64 and the element type is i8 or i16. 3252 case Instruction::ZExt: 3253 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) 3254 return Cost; 3255 } 3256 3257 // If we are unable to perform the extend for free, get the default cost. 
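// For example, under this model zero-extending an extracted i16 lane to i32
// is folded into the umov and costs nothing extra, whereas zero-extending it
// to i64 falls through to here and pays the full cast cost on top of the
// extract (an illustration of the checks above, not an additional rule).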
3258 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, 3259 CostKind); 3260 } 3261 3262 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, 3263 TTI::TargetCostKind CostKind, 3264 const Instruction *I) { 3265 if (CostKind != TTI::TCK_RecipThroughput) 3266 return Opcode == Instruction::PHI ? 0 : 1; 3267 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); 3268 // Branches are assumed to be predicted. 3269 return 0; 3270 } 3271 3272 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( 3273 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse, 3274 const Instruction *I, Value *Scalar, 3275 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) { 3276 assert(Val->isVectorTy() && "This must be a vector type"); 3277 3278 if (Index != -1U) { 3279 // Legalize the type. 3280 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); 3281 3282 // This type is legalized to a scalar type. 3283 if (!LT.second.isVector()) 3284 return 0; 3285 3286 // The type may be split. For fixed-width vectors we can normalize the 3287 // index to the new type. 3288 if (LT.second.isFixedLengthVector()) { 3289 unsigned Width = LT.second.getVectorNumElements(); 3290 Index = Index % Width; 3291 } 3292 3293 // The element at index zero is already inside the vector. 3294 // - For a physical (HasRealUse==true) insert-element or extract-element 3295 // instruction that extracts integers, an explicit FPR -> GPR move is 3296 // needed. So it has non-zero cost. 3297 // - For the rest of cases (virtual instruction or element type is float), 3298 // consider the instruction free. 3299 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) 3300 return 0; 3301 3302 // This is recognising a LD1 single-element structure to one lane of one 3303 // register instruction. I.e., if this is an `insertelement` instruction, 3304 // and its second operand is a load, then we will generate a LD1, which 3305 // are expensive instructions. 3306 if (I && dyn_cast<LoadInst>(I->getOperand(1))) 3307 return ST->getVectorInsertExtractBaseCost() + 1; 3308 3309 // i1 inserts and extract will include an extra cset or cmp of the vector 3310 // value. Increase the cost by 1 to account. 3311 if (Val->getScalarSizeInBits() == 1) 3312 return ST->getVectorInsertExtractBaseCost() + 1; 3313 3314 // FIXME: 3315 // If the extract-element and insert-element instructions could be 3316 // simplified away (e.g., could be combined into users by looking at use-def 3317 // context), they have no cost. This is not done in the first place for 3318 // compile-time considerations. 3319 } 3320 3321 // In case of Neon, if there exists extractelement from lane != 0 such that 3322 // 1. extractelement does not necessitate a move from vector_reg -> GPR. 3323 // 2. extractelement result feeds into fmul. 3324 // 3. Other operand of fmul is an extractelement from lane 0 or lane 3325 // equivalent to 0. 3326 // then the extractelement can be merged with fmul in the backend and it 3327 // incurs no cost. 3328 // e.g. 3329 // define double @foo(<2 x double> %a) { 3330 // %1 = extractelement <2 x double> %a, i32 0 3331 // %2 = extractelement <2 x double> %a, i32 1 3332 // %res = fmul double %1, %2 3333 // ret double %res 3334 // } 3335 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1] 3336 auto ExtractCanFuseWithFmul = [&]() { 3337 // We bail out if the extract is from lane 0. 
3338 if (Index == 0) 3339 return false; 3340 3341 // Check if the scalar element type of the vector operand of ExtractElement 3342 // instruction is one of the allowed types. 3343 auto IsAllowedScalarTy = [&](const Type *T) { 3344 return T->isFloatTy() || T->isDoubleTy() || 3345 (T->isHalfTy() && ST->hasFullFP16()); 3346 }; 3347 3348 // Check if the extractelement user is scalar fmul. 3349 auto IsUserFMulScalarTy = [](const Value *EEUser) { 3350 // Check if the user is scalar fmul. 3351 const auto *BO = dyn_cast<BinaryOperator>(EEUser); 3352 return BO && BO->getOpcode() == BinaryOperator::FMul && 3353 !BO->getType()->isVectorTy(); 3354 }; 3355 3356 // Check if the extract index is from lane 0 or lane equivalent to 0 for a 3357 // certain scalar type and a certain vector register width. 3358 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) { 3359 auto RegWidth = 3360 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 3361 .getFixedValue(); 3362 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0); 3363 }; 3364 3365 // Check if the type constraints on input vector type and result scalar type 3366 // of extractelement instruction are satisfied. 3367 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType())) 3368 return false; 3369 3370 if (Scalar) { 3371 DenseMap<User *, unsigned> UserToExtractIdx; 3372 for (auto *U : Scalar->users()) { 3373 if (!IsUserFMulScalarTy(U)) 3374 return false; 3375 // Recording entry for the user is important. Index value is not 3376 // important. 3377 UserToExtractIdx[U]; 3378 } 3379 if (UserToExtractIdx.empty()) 3380 return false; 3381 for (auto &[S, U, L] : ScalarUserAndIdx) { 3382 for (auto *U : S->users()) { 3383 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) { 3384 auto *FMul = cast<BinaryOperator>(U); 3385 auto *Op0 = FMul->getOperand(0); 3386 auto *Op1 = FMul->getOperand(1); 3387 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) { 3388 UserToExtractIdx[U] = L; 3389 break; 3390 } 3391 } 3392 } 3393 } 3394 for (auto &[U, L] : UserToExtractIdx) { 3395 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) && 3396 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits())) 3397 return false; 3398 } 3399 } else { 3400 const auto *EE = cast<ExtractElementInst>(I); 3401 3402 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand()); 3403 if (!IdxOp) 3404 return false; 3405 3406 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) { 3407 if (!IsUserFMulScalarTy(U)) 3408 return false; 3409 3410 // Check if the other operand of extractelement is also extractelement 3411 // from lane equivalent to 0. 3412 const auto *BO = cast<BinaryOperator>(U); 3413 const auto *OtherEE = dyn_cast<ExtractElementInst>( 3414 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0)); 3415 if (OtherEE) { 3416 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand()); 3417 if (!IdxOp) 3418 return false; 3419 return IsExtractLaneEquivalentToZero( 3420 cast<ConstantInt>(OtherEE->getIndexOperand()) 3421 ->getValue() 3422 .getZExtValue(), 3423 OtherEE->getType()->getScalarSizeInBits()); 3424 } 3425 return true; 3426 }); 3427 } 3428 return true; 3429 }; 3430 3431 if (Opcode == Instruction::ExtractElement && (I || Scalar) && 3432 ExtractCanFuseWithFmul()) 3433 return 0; 3434 3435 // All other insert/extracts cost this much. 
3436 return ST->getVectorInsertExtractBaseCost();
3437 }
3438 
3439 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3440 TTI::TargetCostKind CostKind,
3441 unsigned Index, Value *Op0,
3442 Value *Op1) {
3443 bool HasRealUse =
3444 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3445 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3446 }
3447 
3448 InstructionCost AArch64TTIImpl::getVectorInstrCost(
3449 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3450 Value *Scalar,
3451 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3452 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3453 ScalarUserAndIdx);
3454 }
3455 
3456 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3457 Type *Val,
3458 TTI::TargetCostKind CostKind,
3459 unsigned Index) {
3460 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3461 true /* HasRealUse */, &I);
3462 }
3463 
3464 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3465 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3466 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
3467 if (isa<ScalableVectorType>(Ty))
3468 return InstructionCost::getInvalid();
3469 if (Ty->getElementType()->isFloatingPointTy())
3470 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3471 CostKind);
3472 return DemandedElts.popcount() * (Insert + Extract) *
3473 ST->getVectorInsertExtractBaseCost();
3474 }
3475 
3476 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3477 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3478 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3479 ArrayRef<const Value *> Args,
3480 const Instruction *CxtI) {
3481 
3482 // The code-generator is currently not able to handle scalable vectors
3483 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3484 // it. This change will be removed when code-generation for these types is
3485 // sufficiently reliable.
3486 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3487 if (VTy->getElementCount() == ElementCount::getScalable(1))
3488 return InstructionCost::getInvalid();
3489 
3490 // TODO: Handle more cost kinds.
3491 if (CostKind != TTI::TCK_RecipThroughput)
3492 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3493 Op2Info, Args, CxtI);
3494 
3495 // Legalize the type.
3496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3497 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3498 
3499 switch (ISD) {
3500 default:
3501 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3502 Op2Info);
3503 case ISD::SDIV:
3504 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3505 // On AArch64, scalar signed division by a power-of-two constant is
3506 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3507 // The OperandValue properties may not be the same as those of the
3508 // previous operation; conservatively assume OP_None.
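// As a rough illustration (assumed typical codegen, not derived from this
// file): 'sdiv i32 %x, 4' usually becomes something like
//   add w8, w0, #3
//   cmp w0, #0
//   csel w8, w8, w0, lt
//   asr w0, w8, #2
// which is why the cost below is modelled as ADD + CMP + SELECT + SRA.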
3509 InstructionCost Cost = getArithmeticInstrCost( 3510 Instruction::Add, Ty, CostKind, 3511 Op1Info.getNoProps(), Op2Info.getNoProps()); 3512 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, 3513 Op1Info.getNoProps(), Op2Info.getNoProps()); 3514 Cost += getArithmeticInstrCost( 3515 Instruction::Select, Ty, CostKind, 3516 Op1Info.getNoProps(), Op2Info.getNoProps()); 3517 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, 3518 Op1Info.getNoProps(), Op2Info.getNoProps()); 3519 return Cost; 3520 } 3521 [[fallthrough]]; 3522 case ISD::UDIV: { 3523 auto VT = TLI->getValueType(DL, Ty); 3524 if (Op2Info.isConstant() && Op2Info.isUniform()) { 3525 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { 3526 // Vector signed division by constant are expanded to the 3527 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division 3528 // to MULHS + SUB + SRL + ADD + SRL. 3529 InstructionCost MulCost = getArithmeticInstrCost( 3530 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3531 InstructionCost AddCost = getArithmeticInstrCost( 3532 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3533 InstructionCost ShrCost = getArithmeticInstrCost( 3534 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); 3535 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; 3536 } 3537 } 3538 3539 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are 3540 // emitted by the backend even when those functions are not declared in the 3541 // module. 3542 if (!VT.isVector() && VT.getSizeInBits() > 64) 3543 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); 3544 3545 InstructionCost Cost = BaseT::getArithmeticInstrCost( 3546 Opcode, Ty, CostKind, Op1Info, Op2Info); 3547 if (Ty->isVectorTy()) { 3548 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { 3549 // SDIV/UDIV operations are lowered using SVE, then we can have less 3550 // costs. 3551 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) 3552 ->getPrimitiveSizeInBits() 3553 .getFixedValue() < 128) { 3554 EVT VT = TLI->getValueType(DL, Ty); 3555 static const CostTblEntry DivTbl[]{ 3556 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, 3557 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, 3558 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, 3559 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, 3560 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, 3561 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; 3562 3563 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); 3564 if (nullptr != Entry) 3565 return Entry->Cost; 3566 } 3567 // For 8/16-bit elements, the cost is higher because the type 3568 // requires promotion and possibly splitting: 3569 if (LT.second.getScalarType() == MVT::i8) 3570 Cost *= 8; 3571 else if (LT.second.getScalarType() == MVT::i16) 3572 Cost *= 4; 3573 return Cost; 3574 } else { 3575 // If one of the operands is a uniform constant then the cost for each 3576 // element is Cost for insertion, extraction and division. 
3577 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
3578 // operation with scalar type
3579 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3580 (Op2Info.isConstant() && Op2Info.isUniform())) {
3581 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3582 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3583 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3584 return (4 + DivCost) * VTy->getNumElements();
3585 }
3586 }
3587 // On AArch64, without SVE, vector divisions are expanded
3588 // into scalar divisions of each pair of elements.
3589 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3590 CostKind, Op1Info, Op2Info);
3591 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3592 Op1Info, Op2Info);
3593 }
3594 
3595 // TODO: if one of the arguments is scalar, then it's not necessary to
3596 // double the cost of handling the vector elements.
3597 Cost += Cost;
3598 }
3599 return Cost;
3600 }
3601 case ISD::MUL:
3602 // When SVE is available, we can lower the v2i64 operation using
3603 // the SVE mul instruction, which has a lower cost.
3604 if (LT.second == MVT::v2i64 && ST->hasSVE())
3605 return LT.first;
3606 
3607 // When SVE is not available, there is no MUL.2d instruction,
3608 // which means mul <2 x i64> is expensive as elements are extracted
3609 // from the vectors and the muls scalarized.
3610 // As getScalarizationOverhead is a bit too pessimistic, we
3611 // estimate the cost for a i64 vector directly here, which is:
3612 // - four 2-cost i64 extracts,
3613 // - two 2-cost i64 inserts, and
3614 // - two 1-cost muls.
3615 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
3616 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3617 // need to scalarize so the cost can be cheaper (smull or umull).
3618 
3619 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3620 return LT.first;
3621 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
3622 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
3623 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
3624 nullptr, nullptr) *
3625 2 +
3626 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
3627 nullptr, nullptr));
3628 case ISD::ADD:
3629 case ISD::XOR:
3630 case ISD::OR:
3631 case ISD::AND:
3632 case ISD::SRL:
3633 case ISD::SRA:
3634 case ISD::SHL:
3635 // These nodes are marked as 'custom' for combining purposes only.
3636 // We know that they are legal. See LowerAdd in ISelLowering.
3637 return LT.first;
3638 
3639 case ISD::FNEG:
3640 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3641 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3642 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3643 CxtI &&
3644 ((CxtI->hasOneUse() &&
3645 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3646 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3647 return 0;
3648 [[fallthrough]];
3649 case ISD::FADD:
3650 case ISD::FSUB:
3651 // Increase the cost for half and bfloat types if not architecturally
3652 // supported.
3653 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3654 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3655 return 2 * LT.first;
3656 if (!Ty->getScalarType()->isFP128Ty())
3657 return LT.first;
3658 [[fallthrough]];
3659 case ISD::FMUL:
3660 case ISD::FDIV:
3661 // These nodes are marked as 'custom' just to lower them to SVE.
3662 // We know said lowering will incur no additional cost. 3663 if (!Ty->getScalarType()->isFP128Ty()) 3664 return 2 * LT.first; 3665 3666 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 3667 Op2Info); 3668 case ISD::FREM: 3669 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when 3670 // those functions are not declared in the module. 3671 if (!Ty->isVectorTy()) 3672 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); 3673 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 3674 Op2Info); 3675 } 3676 } 3677 3678 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, 3679 ScalarEvolution *SE, 3680 const SCEV *Ptr) { 3681 // Address computations in vectorized code with non-consecutive addresses will 3682 // likely result in more instructions compared to scalar code where the 3683 // computation can more often be merged into the index mode. The resulting 3684 // extra micro-ops can significantly decrease throughput. 3685 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; 3686 int MaxMergeDistance = 64; 3687 3688 if (Ty->isVectorTy() && SE && 3689 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 3690 return NumVectorInstToHideOverhead; 3691 3692 // In many cases the address computation is not merged into the instruction 3693 // addressing mode. 3694 return 1; 3695 } 3696 3697 InstructionCost AArch64TTIImpl::getCmpSelInstrCost( 3698 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, 3699 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, 3700 TTI::OperandValueInfo Op2Info, const Instruction *I) { 3701 // TODO: Handle other cost kinds. 3702 if (CostKind != TTI::TCK_RecipThroughput) 3703 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 3704 Op1Info, Op2Info, I); 3705 3706 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3707 // We don't lower some vector selects well that are wider than the register 3708 // width. 3709 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { 3710 // We would need this many instructions to hide the scalarization happening. 3711 const int AmortizationCost = 20; 3712 3713 // If VecPred is not set, check if we can get a predicate from the context 3714 // instruction, if its type matches the requested ValTy. 3715 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { 3716 CmpPredicate CurrentPred; 3717 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), 3718 m_Value()))) 3719 VecPred = CurrentPred; 3720 } 3721 // Check if we have a compare/select chain that can be lowered using 3722 // a (F)CMxx & BFI pair. 
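// For instance (a hedged sketch of the expected lowering, not a guarantee):
// 'select (fcmp ogt a, b), x, y' on <4 x float> can map to an fcmgt that
// builds the mask followed by a bsl/bif that merges x and y, which is why
// these predicates get the cheap LT.first cost below.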
3723 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3724 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3725 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3726 VecPred == CmpInst::FCMP_UNE) {
3727 static const auto ValidMinMaxTys = {
3728 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3729 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3730 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3731 
3732 auto LT = getTypeLegalizationCost(ValTy);
3733 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3734 (ST->hasFullFP16() &&
3735 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3736 return LT.first;
3737 }
3738 
3739 static const TypeConversionCostTblEntry
3740 VectorSelectTbl[] = {
3741 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3742 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3743 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3744 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3745 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3746 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3747 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3748 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3749 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3750 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3751 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3752 };
3753 
3754 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3755 EVT SelValTy = TLI->getValueType(DL, ValTy);
3756 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3757 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3758 SelCondTy.getSimpleVT(),
3759 SelValTy.getSimpleVT()))
3760 return Entry->Cost;
3761 }
3762 }
3763 
3764 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3765 auto LT = getTypeLegalizationCost(ValTy);
3766 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3767 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3768 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3769 }
3770 
3771 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3772 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3773 // be profitable.
3774 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3775 ICmpInst::isEquality(VecPred) &&
3776 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3777 match(I->getOperand(1), m_Zero()) &&
3778 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3779 return 0;
3780 
3781 // The base case handles scalable vectors fine for now, since it treats the
3782 // cost as 1 * legalization cost.
3783 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3784 Op1Info, Op2Info, I);
3785 }
3786 
3787 AArch64TTIImpl::TTI::MemCmpExpansionOptions
3788 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3789 TTI::MemCmpExpansionOptions Options;
3790 if (ST->requiresStrictAlign()) {
3791 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3792 // a bunch of instructions when strict align is enabled.
3793 return Options;
3794 }
3795 Options.AllowOverlappingLoads = true;
3796 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3797 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3798 // TODO: Though vector loads usually perform well on AArch64, in some targets
3799 // they may wake up the FP unit, which raises the power consumption. Perhaps
3800 // they could be used with no holds barred (-O3).
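// Illustrative note (an assumption about how MemCmpExpansion consumes these
// options, not a statement of its implementation): with overlapping loads
// allowed and load sizes of 8/4/2/1 bytes, a 15-byte memcmp can be expanded
// as an 8-byte load plus an overlapping 8-byte load per buffer instead of a
// libcall, and AllowedTailExpansions below additionally permits odd 3/5/6
// byte tails to be stitched together from two smaller loads.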
3801 Options.LoadSizes = {8, 4, 2, 1}; 3802 Options.AllowedTailExpansions = {3, 5, 6}; 3803 return Options; 3804 } 3805 3806 bool AArch64TTIImpl::prefersVectorizedAddressing() const { 3807 return ST->hasSVE(); 3808 } 3809 3810 InstructionCost 3811 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 3812 Align Alignment, unsigned AddressSpace, 3813 TTI::TargetCostKind CostKind) { 3814 if (useNeonVector(Src)) 3815 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 3816 CostKind); 3817 auto LT = getTypeLegalizationCost(Src); 3818 if (!LT.first.isValid()) 3819 return InstructionCost::getInvalid(); 3820 3821 // Return an invalid cost for element types that we are unable to lower. 3822 auto *VT = cast<VectorType>(Src); 3823 if (VT->getElementType()->isIntegerTy(1)) 3824 return InstructionCost::getInvalid(); 3825 3826 // The code-generator is currently not able to handle scalable vectors 3827 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3828 // it. This change will be removed when code-generation for these types is 3829 // sufficiently reliable. 3830 if (VT->getElementCount() == ElementCount::getScalable(1)) 3831 return InstructionCost::getInvalid(); 3832 3833 return LT.first; 3834 } 3835 3836 // This function returns gather/scatter overhead either from 3837 // user-provided value or specialized values per-target from \p ST. 3838 static unsigned getSVEGatherScatterOverhead(unsigned Opcode, 3839 const AArch64Subtarget *ST) { 3840 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 3841 "Should be called on only load or stores."); 3842 switch (Opcode) { 3843 case Instruction::Load: 3844 if (SVEGatherOverhead.getNumOccurrences() > 0) 3845 return SVEGatherOverhead; 3846 return ST->getGatherOverhead(); 3847 break; 3848 case Instruction::Store: 3849 if (SVEScatterOverhead.getNumOccurrences() > 0) 3850 return SVEScatterOverhead; 3851 return ST->getScatterOverhead(); 3852 break; 3853 default: 3854 llvm_unreachable("Shouldn't have reached here"); 3855 } 3856 } 3857 3858 InstructionCost AArch64TTIImpl::getGatherScatterOpCost( 3859 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 3860 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 3861 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy)) 3862 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 3863 Alignment, CostKind, I); 3864 auto *VT = cast<VectorType>(DataTy); 3865 auto LT = getTypeLegalizationCost(DataTy); 3866 if (!LT.first.isValid()) 3867 return InstructionCost::getInvalid(); 3868 3869 // Return an invalid cost for element types that we are unable to lower. 3870 if (!LT.second.isVector() || 3871 !isElementTypeLegalForScalableVector(VT->getElementType()) || 3872 VT->getElementType()->isIntegerTy(1)) 3873 return InstructionCost::getInvalid(); 3874 3875 // The code-generator is currently not able to handle scalable vectors 3876 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 3877 // it. This change will be removed when code-generation for these types is 3878 // sufficiently reliable. 3879 if (VT->getElementCount() == ElementCount::getScalable(1)) 3880 return InstructionCost::getInvalid(); 3881 3882 ElementCount LegalVF = LT.second.getVectorElementCount(); 3883 InstructionCost MemOpCost = 3884 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, 3885 {TTI::OK_AnyValue, TTI::OP_None}, I); 3886 // Add on an overhead cost for using gathers/scatters. 
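// Worked example (illustrative only): for a <vscale x 4 x i32> gather, the
// per-element load cost is scaled by the gather overhead above and then
// multiplied by the maximum number of i32 elements per vector (which depends
// on the vscale upper bound), so the result grows with the widest possible
// vector length rather than staying per-instruction.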
3887 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3888 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3889 }
3890 
3891 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3892 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3893 }
3894 
3895 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3896 MaybeAlign Alignment,
3897 unsigned AddressSpace,
3898 TTI::TargetCostKind CostKind,
3899 TTI::OperandValueInfo OpInfo,
3900 const Instruction *I) {
3901 EVT VT = TLI->getValueType(DL, Ty, true);
3902 // Type legalization can't handle structs
3903 if (VT == MVT::Other)
3904 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3905 CostKind);
3906 
3907 auto LT = getTypeLegalizationCost(Ty);
3908 if (!LT.first.isValid())
3909 return InstructionCost::getInvalid();
3910 
3911 // The code-generator is currently not able to handle scalable vectors
3912 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3913 // it. This change will be removed when code-generation for these types is
3914 // sufficiently reliable.
3915 // We also only support full register predicate loads and stores.
3916 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3917 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3918 (VTy->getElementType()->isIntegerTy(1) &&
3919 !VTy->getElementCount().isKnownMultipleOf(
3920 ElementCount::getScalable(16))))
3921 return InstructionCost::getInvalid();
3922 
3923 // TODO: consider latency as well for TCK_SizeAndLatency.
3924 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3925 return LT.first;
3926 
3927 if (CostKind != TTI::TCK_RecipThroughput)
3928 return 1;
3929 
3930 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3931 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3932 // Unaligned stores are extremely inefficient. We don't split all
3933 // unaligned 128-bit stores because of the negative impact that has been
3934 // shown in practice on inlined block copy code.
3935 // We make such stores expensive so that we will only vectorize if there
3936 // are 6 other instructions getting vectorized.
3937 const int AmortizationCost = 6;
3938 
3939 return LT.first * 2 * AmortizationCost;
3940 }
3941 
3942 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3943 if (Ty->isPtrOrPtrVectorTy())
3944 return LT.first;
3945 
3946 if (useNeonVector(Ty)) {
3947 // Check truncating stores and extending loads.
3948 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3949 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3950 if (VT == MVT::v4i8)
3951 return 2;
3952 // Otherwise we need to scalarize.
3953 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3954 }
3955 EVT EltVT = VT.getVectorElementType();
3956 unsigned EltSize = EltVT.getScalarSizeInBits();
3957 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3958 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3959 *Alignment != Align(1))
3960 return LT.first;
3961 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3962 // widening to v4i8, which produces suboptimal results.
3963 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3964 return LT.first;
3965 
3966 // Check non-power-of-2 loads/stores for legal vector element types with
3967 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3968 // operations on smaller power-of-2 ops, including ld1/st1.
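// For example (tracing the decomposition loop below by hand): a v7i16 store
// splits into v4i16 + v2i16 + v1i16 pieces, so the loop accumulates a cost
// of 3 for it.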
3969 LLVMContext &C = Ty->getContext(); 3970 InstructionCost Cost(0); 3971 SmallVector<EVT> TypeWorklist; 3972 TypeWorklist.push_back(VT); 3973 while (!TypeWorklist.empty()) { 3974 EVT CurrVT = TypeWorklist.pop_back_val(); 3975 unsigned CurrNumElements = CurrVT.getVectorNumElements(); 3976 if (isPowerOf2_32(CurrNumElements)) { 3977 Cost += 1; 3978 continue; 3979 } 3980 3981 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2; 3982 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2)); 3983 TypeWorklist.push_back( 3984 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2)); 3985 } 3986 return Cost; 3987 } 3988 3989 return LT.first; 3990 } 3991 3992 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( 3993 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 3994 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 3995 bool UseMaskForCond, bool UseMaskForGaps) { 3996 assert(Factor >= 2 && "Invalid interleave factor"); 3997 auto *VecVTy = cast<VectorType>(VecTy); 3998 3999 if (VecTy->isScalableTy() && !ST->hasSVE()) 4000 return InstructionCost::getInvalid(); 4001 4002 // Vectorization for masked interleaved accesses is only enabled for scalable 4003 // VF. 4004 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) 4005 return InstructionCost::getInvalid(); 4006 4007 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { 4008 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); 4009 auto *SubVecTy = 4010 VectorType::get(VecVTy->getElementType(), 4011 VecVTy->getElementCount().divideCoefficientBy(Factor)); 4012 4013 // ldN/stN only support legal vector types of size 64 or 128 in bits. 4014 // Accesses having vector types that are a multiple of 128 bits can be 4015 // matched to more than one ldN/stN instruction. 4016 bool UseScalable; 4017 if (MinElts % Factor == 0 && 4018 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) 4019 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); 4020 } 4021 4022 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 4023 Alignment, AddressSpace, CostKind, 4024 UseMaskForCond, UseMaskForGaps); 4025 } 4026 4027 InstructionCost 4028 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { 4029 InstructionCost Cost = 0; 4030 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4031 for (auto *I : Tys) { 4032 if (!I->isVectorTy()) 4033 continue; 4034 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == 4035 128) 4036 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + 4037 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); 4038 } 4039 return Cost; 4040 } 4041 4042 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { 4043 return ST->getMaxInterleaveFactor(); 4044 } 4045 4046 // For Falkor, we want to avoid having too many strided loads in a loop since 4047 // that can exhaust the HW prefetcher resources. We adjust the unroller 4048 // MaxCount preference below to attempt to ensure unrolling doesn't create too 4049 // many strided loads. 4050 static void 4051 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, 4052 TargetTransformInfo::UnrollingPreferences &UP) { 4053 enum { MaxStridedLoads = 7 }; 4054 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { 4055 int StridedLoads = 0; 4056 // FIXME? We could make this more precise by looking at the CFG and 4057 // e.g. 
not counting loads in each side of an if-then-else diamond.
4058 for (const auto BB : L->blocks()) {
4059 for (auto &I : *BB) {
4060 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4061 if (!LMemI)
4062 continue;
4063 
4064 Value *PtrValue = LMemI->getPointerOperand();
4065 if (L->isLoopInvariant(PtrValue))
4066 continue;
4067 
4068 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4069 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4070 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4071 continue;
4072 
4073 // FIXME? We could take pairing of unrolled load copies into account
4074 // by looking at the AddRec, but we would probably have to limit this
4075 // to loops with no stores or other memory optimization barriers.
4076 ++StridedLoads;
4077 // We've seen enough strided loads that seeing more won't make a
4078 // difference.
4079 if (StridedLoads > MaxStridedLoads / 2)
4080 return StridedLoads;
4081 }
4082 }
4083 return StridedLoads;
4084 };
4085 
4086 int StridedLoads = countStridedLoads(L, SE);
4087 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4088 << " strided loads\n");
4089 // Pick the largest power of 2 unroll count that won't result in too many
4090 // strided loads.
4091 if (StridedLoads) {
4092 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4093 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4094 << UP.MaxCount << '\n');
4095 }
4096 }
4097 
4098 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4099 /// OOO engine's wide instruction window and various predictors.
4100 static void
4101 getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4102 TargetTransformInfo::UnrollingPreferences &UP,
4103 AArch64TTIImpl &TTI) {
4104 // Limit loops with structure that is highly likely to benefit from runtime
4105 // unrolling; that is, we exclude outer loops, loops with multiple exits and
4106 // many blocks (i.e. likely with complex control flow). Note that the
4107 // heuristics here may be overly conservative and we err on the side of
4108 // avoiding runtime unrolling rather than unroll excessively. They are all
4109 // subject to further refinement.
4110 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4111 return;
4112 
4113 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4114 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4115 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4116 SE.getSmallConstantMaxTripCount(L) <= 32))
4117 return;
4118 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4119 return;
4120 
4121 int64_t Size = 0;
4122 for (auto *BB : L->getBlocks()) {
4123 for (auto &I : *BB) {
4124 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4125 return;
4126 SmallVector<const Value *, 4> Operands(I.operand_values());
4127 Size +=
4128 *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4129 }
4130 }
4131 
4132 // Limit to loops with trip counts that are cheap to expand.
4133 UP.SCEVExpansionBudget = 1;
4134 
4135 // Try to unroll small, single block loops, if they have load/store
4136 // dependencies, to expose more parallel memory access streams.
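// For example (a rough walk-through of the search below, using its
// 16-instruction fetch-window figure): a single-block body of about 6
// instructions ends up with an unroll count of 8, since 8 * 6 == 48 both
// fills whole fetch lines and stays within the 48-instruction budget.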
4137 BasicBlock *Header = L->getHeader(); 4138 if (Header == L->getLoopLatch()) { 4139 if (Size > 8) 4140 return; 4141 4142 SmallPtrSet<Value *, 8> LoadedValues; 4143 SmallVector<StoreInst *> Stores; 4144 for (auto *BB : L->blocks()) { 4145 for (auto &I : *BB) { 4146 Value *Ptr = getLoadStorePointerOperand(&I); 4147 if (!Ptr) 4148 continue; 4149 const SCEV *PtrSCEV = SE.getSCEV(Ptr); 4150 if (SE.isLoopInvariant(PtrSCEV, L)) 4151 continue; 4152 if (isa<LoadInst>(&I)) 4153 LoadedValues.insert(&I); 4154 else 4155 Stores.push_back(cast<StoreInst>(&I)); 4156 } 4157 } 4158 4159 // Try to find an unroll count that maximizes the use of the instruction 4160 // window, i.e. trying to fetch as many instructions per cycle as possible. 4161 unsigned MaxInstsPerLine = 16; 4162 unsigned UC = 1; 4163 unsigned BestUC = 1; 4164 unsigned SizeWithBestUC = BestUC * Size; 4165 while (UC <= 8) { 4166 unsigned SizeWithUC = UC * Size; 4167 if (SizeWithUC > 48) 4168 break; 4169 if ((SizeWithUC % MaxInstsPerLine) == 0 || 4170 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { 4171 BestUC = UC; 4172 SizeWithBestUC = BestUC * Size; 4173 } 4174 UC++; 4175 } 4176 4177 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) { 4178 return LoadedValues.contains(SI->getOperand(0)); 4179 })) 4180 return; 4181 4182 UP.Runtime = true; 4183 UP.DefaultUnrollRuntimeCount = BestUC; 4184 return; 4185 } 4186 4187 // Try to runtime-unroll loops with early-continues depending on loop-varying 4188 // loads; this helps with branch-prediction for the early-continues. 4189 auto *Term = dyn_cast<BranchInst>(Header->getTerminator()); 4190 auto *Latch = L->getLoopLatch(); 4191 SmallVector<BasicBlock *> Preds(predecessors(Latch)); 4192 if (!Term || !Term->isConditional() || Preds.size() == 1 || 4193 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) || 4194 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); })) 4195 return; 4196 4197 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad = 4198 [&](Instruction *I, unsigned Depth) -> bool { 4199 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8) 4200 return false; 4201 4202 if (isa<LoadInst>(I)) 4203 return true; 4204 4205 return any_of(I->operands(), [&](Value *V) { 4206 auto *I = dyn_cast<Instruction>(V); 4207 return I && DependsOnLoopLoad(I, Depth + 1); 4208 }); 4209 }; 4210 CmpPredicate Pred; 4211 Instruction *I; 4212 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(), 4213 m_Value())) && 4214 DependsOnLoopLoad(I, 0)) { 4215 UP.Runtime = true; 4216 } 4217 } 4218 4219 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 4220 TTI::UnrollingPreferences &UP, 4221 OptimizationRemarkEmitter *ORE) { 4222 // Enable partial unrolling and runtime unrolling. 4223 BaseT::getUnrollingPreferences(L, SE, UP, ORE); 4224 4225 UP.UpperBound = true; 4226 4227 // For inner loop, it is more likely to be a hot one, and the runtime check 4228 // can be promoted out from LICM pass, so the overhead is less, let's try 4229 // a larger threshold to unroll more loops. 4230 if (L->getLoopDepth() > 1) 4231 UP.PartialThreshold *= 2; 4232 4233 // Disable partial & runtime unrolling on -Os. 4234 UP.PartialOptSizeThreshold = 0; 4235 4236 // Apply subtarget-specific unrolling preferences. 
4237 switch (ST->getProcFamily()) { 4238 case AArch64Subtarget::AppleA14: 4239 case AArch64Subtarget::AppleA15: 4240 case AArch64Subtarget::AppleA16: 4241 case AArch64Subtarget::AppleM4: 4242 getAppleRuntimeUnrollPreferences(L, SE, UP, *this); 4243 break; 4244 case AArch64Subtarget::Falkor: 4245 if (EnableFalkorHWPFUnrollFix) 4246 getFalkorUnrollingPreferences(L, SE, UP); 4247 break; 4248 default: 4249 break; 4250 } 4251 4252 // Scan the loop: don't unroll loops with calls as this could prevent 4253 // inlining. Don't unroll vector loops either, as they don't benefit much from 4254 // unrolling. 4255 for (auto *BB : L->getBlocks()) { 4256 for (auto &I : *BB) { 4257 // Don't unroll vectorised loop. 4258 if (I.getType()->isVectorTy()) 4259 return; 4260 4261 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 4262 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 4263 if (!isLoweredToCall(F)) 4264 continue; 4265 } 4266 return; 4267 } 4268 } 4269 } 4270 4271 // Enable runtime unrolling for in-order models 4272 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by 4273 // checking for that case, we can ensure that the default behaviour is 4274 // unchanged 4275 if (ST->getProcFamily() != AArch64Subtarget::Others && 4276 !ST->getSchedModel().isOutOfOrder()) { 4277 UP.Runtime = true; 4278 UP.Partial = true; 4279 UP.UnrollRemainder = true; 4280 UP.DefaultUnrollRuntimeCount = 4; 4281 4282 UP.UnrollAndJam = true; 4283 UP.UnrollAndJamInnerLoopThreshold = 60; 4284 } 4285 } 4286 4287 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 4288 TTI::PeelingPreferences &PP) { 4289 BaseT::getPeelingPreferences(L, SE, PP); 4290 } 4291 4292 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, 4293 Type *ExpectedType) { 4294 switch (Inst->getIntrinsicID()) { 4295 default: 4296 return nullptr; 4297 case Intrinsic::aarch64_neon_st2: 4298 case Intrinsic::aarch64_neon_st3: 4299 case Intrinsic::aarch64_neon_st4: { 4300 // Create a struct type 4301 StructType *ST = dyn_cast<StructType>(ExpectedType); 4302 if (!ST) 4303 return nullptr; 4304 unsigned NumElts = Inst->arg_size() - 1; 4305 if (ST->getNumElements() != NumElts) 4306 return nullptr; 4307 for (unsigned i = 0, e = NumElts; i != e; ++i) { 4308 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) 4309 return nullptr; 4310 } 4311 Value *Res = PoisonValue::get(ExpectedType); 4312 IRBuilder<> Builder(Inst); 4313 for (unsigned i = 0, e = NumElts; i != e; ++i) { 4314 Value *L = Inst->getArgOperand(i); 4315 Res = Builder.CreateInsertValue(Res, L, i); 4316 } 4317 return Res; 4318 } 4319 case Intrinsic::aarch64_neon_ld2: 4320 case Intrinsic::aarch64_neon_ld3: 4321 case Intrinsic::aarch64_neon_ld4: 4322 if (Inst->getType() == ExpectedType) 4323 return Inst; 4324 return nullptr; 4325 } 4326 } 4327 4328 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, 4329 MemIntrinsicInfo &Info) { 4330 switch (Inst->getIntrinsicID()) { 4331 default: 4332 break; 4333 case Intrinsic::aarch64_neon_ld2: 4334 case Intrinsic::aarch64_neon_ld3: 4335 case Intrinsic::aarch64_neon_ld4: 4336 Info.ReadMem = true; 4337 Info.WriteMem = false; 4338 Info.PtrVal = Inst->getArgOperand(0); 4339 break; 4340 case Intrinsic::aarch64_neon_st2: 4341 case Intrinsic::aarch64_neon_st3: 4342 case Intrinsic::aarch64_neon_st4: 4343 Info.ReadMem = false; 4344 Info.WriteMem = true; 4345 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); 4346 break; 4347 } 4348 4349 switch (Inst->getIntrinsicID()) { 4350 default: 
4351 return false;
4352 case Intrinsic::aarch64_neon_ld2:
4353 case Intrinsic::aarch64_neon_st2:
4354 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4355 break;
4356 case Intrinsic::aarch64_neon_ld3:
4357 case Intrinsic::aarch64_neon_st3:
4358 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4359 break;
4360 case Intrinsic::aarch64_neon_ld4:
4361 case Intrinsic::aarch64_neon_st4:
4362 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4363 break;
4364 }
4365 return true;
4366 }
4367
4368 /// See if \p I should be considered for address type promotion. We check if \p
4369 /// I is a sext with the right type that is used in memory accesses. If it is
4370 /// used in a "complex" getelementptr, we allow it to be promoted without
4371 /// finding other sext instructions that sign-extended the same initial value.
4372 /// A getelementptr is considered "complex" if it has more than 2 operands.
4373 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
4374 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4375 bool Considerable = false;
4376 AllowPromotionWithoutCommonHeader = false;
4377 if (!isa<SExtInst>(&I))
4378 return false;
4379 Type *ConsideredSExtType =
4380 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4381 if (I.getType() != ConsideredSExtType)
4382 return false;
4383 // See if the sext is the one with the right type and used in at least one
4384 // GetElementPtrInst.
4385 for (const User *U : I.users()) {
4386 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4387 Considerable = true;
4388 // A getelementptr is considered "complex" if it has more than 2
4389 // operands. We will promote an SExt used in such a complex GEP, as we
4390 // expect some of the computation to be merged if it is done on 64 bits.
4391 if (GEPInst->getNumOperands() > 2) {
4392 AllowPromotionWithoutCommonHeader = true;
4393 break;
4394 }
4395 }
4396 }
4397 return Considerable;
4398 }
4399
4400 bool AArch64TTIImpl::isLegalToVectorizeReduction(
4401 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4402 if (!VF.isScalable())
4403 return true;
4404
4405 Type *Ty = RdxDesc.getRecurrenceType();
4406 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
4407 return false;
4408
4409 switch (RdxDesc.getRecurrenceKind()) {
4410 case RecurKind::Add:
4411 case RecurKind::FAdd:
4412 case RecurKind::And:
4413 case RecurKind::Or:
4414 case RecurKind::Xor:
4415 case RecurKind::SMin:
4416 case RecurKind::SMax:
4417 case RecurKind::UMin:
4418 case RecurKind::UMax:
4419 case RecurKind::FMin:
4420 case RecurKind::FMax:
4421 case RecurKind::FMulAdd:
4422 case RecurKind::IAnyOf:
4423 case RecurKind::FAnyOf:
4424 return true;
4425 default:
4426 return false;
4427 }
4428 }
4429
4430 InstructionCost
4431 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
4432 FastMathFlags FMF,
4433 TTI::TargetCostKind CostKind) {
4434 // The code-generator is currently not able to handle scalable vectors
4435 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4436 // it. This change will be removed when code-generation for these types is
4437 // sufficiently reliable.
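// (Illustrative examples: <vscale x 1 x i32> is such a type and is rejected
// below. For a legal fixed-width case such as a v32i8 umin reduction, the
// type legalizes to two v16i8 halves (LT.first == 2), so the cost modelled
// below is one v16i8 umin to combine the halves plus the flat cost of 2 for
// the final horizontal reduction; exact numbers depend on
// getIntrinsicInstrCost.)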
4438 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) 4439 if (VTy->getElementCount() == ElementCount::getScalable(1)) 4440 return InstructionCost::getInvalid(); 4441 4442 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 4443 4444 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) 4445 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 4446 4447 InstructionCost LegalizationCost = 0; 4448 if (LT.first > 1) { 4449 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); 4450 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); 4451 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); 4452 } 4453 4454 return LegalizationCost + /*Cost of horizontal reduction*/ 2; 4455 } 4456 4457 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( 4458 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { 4459 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 4460 InstructionCost LegalizationCost = 0; 4461 if (LT.first > 1) { 4462 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); 4463 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); 4464 LegalizationCost *= LT.first - 1; 4465 } 4466 4467 int ISD = TLI->InstructionOpcodeToISD(Opcode); 4468 assert(ISD && "Invalid opcode"); 4469 // Add the final reduction cost for the legal horizontal reduction 4470 switch (ISD) { 4471 case ISD::ADD: 4472 case ISD::AND: 4473 case ISD::OR: 4474 case ISD::XOR: 4475 case ISD::FADD: 4476 return LegalizationCost + 2; 4477 default: 4478 return InstructionCost::getInvalid(); 4479 } 4480 } 4481 4482 InstructionCost 4483 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 4484 std::optional<FastMathFlags> FMF, 4485 TTI::TargetCostKind CostKind) { 4486 // The code-generator is currently not able to handle scalable vectors 4487 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting 4488 // it. This change will be removed when code-generation for these types is 4489 // sufficiently reliable. 4490 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy)) 4491 if (VTy->getElementCount() == ElementCount::getScalable(1)) 4492 return InstructionCost::getInvalid(); 4493 4494 if (TTI::requiresOrderedReduction(FMF)) { 4495 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { 4496 InstructionCost BaseCost = 4497 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 4498 // Add on extra cost to reflect the extra overhead on some CPUs. We still 4499 // end up vectorizing for more computationally intensive loops. 4500 return BaseCost + FixedVTy->getNumElements(); 4501 } 4502 4503 if (Opcode != Instruction::FAdd) 4504 return InstructionCost::getInvalid(); 4505 4506 auto *VTy = cast<ScalableVectorType>(ValTy); 4507 InstructionCost Cost = 4508 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); 4509 Cost *= getMaxNumElements(VTy->getElementCount()); 4510 return Cost; 4511 } 4512 4513 if (isa<ScalableVectorType>(ValTy)) 4514 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); 4515 4516 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 4517 MVT MTy = LT.second; 4518 int ISD = TLI->InstructionOpcodeToISD(Opcode); 4519 assert(ISD && "Invalid opcode"); 4520 4521 // Horizontal adds can use the 'addv' instruction. We model the cost of these 4522 // instructions as twice a normal vector add, plus 1 for each legalization 4523 // step (LT.first). 
This is the only arithmetic vector reduction operation for 4524 // which we have an instruction. 4525 // OR, XOR and AND costs should match the codegen from: 4526 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll 4527 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll 4528 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll 4529 static const CostTblEntry CostTblNoPairwise[]{ 4530 {ISD::ADD, MVT::v8i8, 2}, 4531 {ISD::ADD, MVT::v16i8, 2}, 4532 {ISD::ADD, MVT::v4i16, 2}, 4533 {ISD::ADD, MVT::v8i16, 2}, 4534 {ISD::ADD, MVT::v4i32, 2}, 4535 {ISD::ADD, MVT::v2i64, 2}, 4536 {ISD::OR, MVT::v8i8, 15}, 4537 {ISD::OR, MVT::v16i8, 17}, 4538 {ISD::OR, MVT::v4i16, 7}, 4539 {ISD::OR, MVT::v8i16, 9}, 4540 {ISD::OR, MVT::v2i32, 3}, 4541 {ISD::OR, MVT::v4i32, 5}, 4542 {ISD::OR, MVT::v2i64, 3}, 4543 {ISD::XOR, MVT::v8i8, 15}, 4544 {ISD::XOR, MVT::v16i8, 17}, 4545 {ISD::XOR, MVT::v4i16, 7}, 4546 {ISD::XOR, MVT::v8i16, 9}, 4547 {ISD::XOR, MVT::v2i32, 3}, 4548 {ISD::XOR, MVT::v4i32, 5}, 4549 {ISD::XOR, MVT::v2i64, 3}, 4550 {ISD::AND, MVT::v8i8, 15}, 4551 {ISD::AND, MVT::v16i8, 17}, 4552 {ISD::AND, MVT::v4i16, 7}, 4553 {ISD::AND, MVT::v8i16, 9}, 4554 {ISD::AND, MVT::v2i32, 3}, 4555 {ISD::AND, MVT::v4i32, 5}, 4556 {ISD::AND, MVT::v2i64, 3}, 4557 }; 4558 switch (ISD) { 4559 default: 4560 break; 4561 case ISD::FADD: 4562 if (Type *EltTy = ValTy->getScalarType(); 4563 // FIXME: For half types without fullfp16 support, this could extend and 4564 // use a fp32 faddp reduction but current codegen unrolls. 4565 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || 4566 (EltTy->isHalfTy() && ST->hasFullFP16()))) { 4567 const unsigned NElts = MTy.getVectorNumElements(); 4568 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && 4569 isPowerOf2_32(NElts)) 4570 // Reduction corresponding to series of fadd instructions is lowered to 4571 // series of faddp instructions. faddp has latency/throughput that 4572 // matches fadd instruction and hence, every faddp instruction can be 4573 // considered to have a relative cost = 1 with 4574 // CostKind = TCK_RecipThroughput. 4575 // An faddp will pairwise add vector elements, so the size of input 4576 // vector reduces by half every time, requiring 4577 // #(faddp instructions) = log2_32(NElts). 4578 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); 4579 } 4580 break; 4581 case ISD::ADD: 4582 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) 4583 return (LT.first - 1) + Entry->Cost; 4584 break; 4585 case ISD::XOR: 4586 case ISD::AND: 4587 case ISD::OR: 4588 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); 4589 if (!Entry) 4590 break; 4591 auto *ValVTy = cast<FixedVectorType>(ValTy); 4592 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && 4593 isPowerOf2_32(ValVTy->getNumElements())) { 4594 InstructionCost ExtraCost = 0; 4595 if (LT.first != 1) { 4596 // Type needs to be split, so there is an extra cost of LT.first - 1 4597 // arithmetic ops. 4598 auto *Ty = FixedVectorType::get(ValTy->getElementType(), 4599 MTy.getVectorNumElements()); 4600 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 4601 ExtraCost *= LT.first - 1; 4602 } 4603 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov 4604 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 
2 : Entry->Cost;
4605 return Cost + ExtraCost;
4606 }
4607 break;
4608 }
4609 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4610 }
4611
4612 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
4613 static const CostTblEntry ShuffleTbl[] = {
4614 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4615 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4616 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4617 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4618 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4619 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4620 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4621 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4622 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4623 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4624 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4625 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4626 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4627 };
4628
4629 // The code-generator is currently not able to handle scalable vectors
4630 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4631 // it. This change will be removed when code-generation for these types is
4632 // sufficiently reliable.
4633 if (Tp->getElementCount() == ElementCount::getScalable(1))
4634 return InstructionCost::getInvalid();
4635
4636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4637 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4638 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4639 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4640 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4641 : LT.second;
4642 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4643 InstructionCost LegalizationCost = 0;
4644 if (Index < 0) {
4645 LegalizationCost =
4646 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4647 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4648 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4649 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4650 }
4651
4652 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
4653 // The cost is computed on the promoted type.
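// Illustrative example (assuming the usual predicate promotion rules): an
// nxv16i1 splice is costed as a zext to nxv16i8, the nxv16i8 splice itself,
// and a trunc back to nxv16i1.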
4654 if (LT.second.getScalarType() == MVT::i1) { 4655 LegalizationCost += 4656 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, 4657 TTI::CastContextHint::None, CostKind) + 4658 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, 4659 TTI::CastContextHint::None, CostKind); 4660 } 4661 const auto *Entry = 4662 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); 4663 assert(Entry && "Illegal Type for Splice"); 4664 LegalizationCost += Entry->Cost; 4665 return LegalizationCost * LT.first; 4666 } 4667 4668 InstructionCost AArch64TTIImpl::getPartialReductionCost( 4669 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, 4670 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, 4671 TTI::PartialReductionExtendKind OpBExtend, 4672 std::optional<unsigned> BinOp) const { 4673 InstructionCost Invalid = InstructionCost::getInvalid(); 4674 InstructionCost Cost(TTI::TCC_Basic); 4675 4676 if (Opcode != Instruction::Add) 4677 return Invalid; 4678 4679 if (InputTypeA != InputTypeB) 4680 return Invalid; 4681 4682 EVT InputEVT = EVT::getEVT(InputTypeA); 4683 EVT AccumEVT = EVT::getEVT(AccumType); 4684 4685 if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) 4686 return Invalid; 4687 if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) 4688 return Invalid; 4689 4690 if (InputEVT == MVT::i8) { 4691 switch (VF.getKnownMinValue()) { 4692 default: 4693 return Invalid; 4694 case 8: 4695 if (AccumEVT == MVT::i32) 4696 Cost *= 2; 4697 else if (AccumEVT != MVT::i64) 4698 return Invalid; 4699 break; 4700 case 16: 4701 if (AccumEVT == MVT::i64) 4702 Cost *= 2; 4703 else if (AccumEVT != MVT::i32) 4704 return Invalid; 4705 break; 4706 } 4707 } else if (InputEVT == MVT::i16) { 4708 // FIXME: Allow i32 accumulator but increase cost, as we would extend 4709 // it to i64. 4710 if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) 4711 return Invalid; 4712 } else 4713 return Invalid; 4714 4715 // AArch64 supports lowering mixed extensions to a usdot but only if the 4716 // i8mm or sve/streaming features are available. 4717 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || 4718 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && 4719 !ST->isSVEorStreamingSVEAvailable())) 4720 return Invalid; 4721 4722 if (!BinOp || *BinOp != Instruction::Mul) 4723 return Invalid; 4724 4725 return Cost; 4726 } 4727 4728 InstructionCost AArch64TTIImpl::getShuffleCost( 4729 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, 4730 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, 4731 ArrayRef<const Value *> Args, const Instruction *CxtI) { 4732 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 4733 4734 // If we have a Mask, and the LT is being legalized somehow, split the Mask 4735 // into smaller vectors and sum the cost of each shuffle. 4736 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && 4737 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && 4738 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { 4739 4740 // Check for LD3/LD4 instructions, which are represented in llvm IR as 4741 // deinterleaving-shuffle(load). The shuffle cost could potentially be free, 4742 // but we model it with a cost of LT.first so that LD3/LD4 have a higher 4743 // cost than just the load. 
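// An illustrative IR shape this matches (values are hypothetical):
//   %wide = load <12 x i32>, ptr %p
//   %v0 = shufflevector <12 x i32> %wide, <12 x i32> poison,
//                       <4 x i32> <i32 0, i32 3, i32 6, i32 9>
// which is expected to lower to an ld3, making the shuffle itself nearly free.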
4744 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && 4745 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) || 4746 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))) 4747 return std::max<InstructionCost>(1, LT.first / 4); 4748 4749 // Check for ST3/ST4 instructions, which are represented in llvm IR as 4750 // store(interleaving-shuffle). The shuffle cost could potentially be free, 4751 // but we model it with a cost of LT.first so that ST3/ST4 have a higher 4752 // cost than just the store. 4753 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && 4754 (ShuffleVectorInst::isInterleaveMask( 4755 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) || 4756 ShuffleVectorInst::isInterleaveMask( 4757 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2))) 4758 return LT.first; 4759 4760 unsigned TpNumElts = Mask.size(); 4761 unsigned LTNumElts = LT.second.getVectorNumElements(); 4762 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; 4763 VectorType *NTp = 4764 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); 4765 InstructionCost Cost; 4766 for (unsigned N = 0; N < NumVecs; N++) { 4767 SmallVector<int> NMask; 4768 // Split the existing mask into chunks of size LTNumElts. Track the source 4769 // sub-vectors to ensure the result has at most 2 inputs. 4770 unsigned Source1, Source2; 4771 unsigned NumSources = 0; 4772 for (unsigned E = 0; E < LTNumElts; E++) { 4773 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] 4774 : PoisonMaskElem; 4775 if (MaskElt < 0) { 4776 NMask.push_back(PoisonMaskElem); 4777 continue; 4778 } 4779 4780 // Calculate which source from the input this comes from and whether it 4781 // is new to us. 4782 unsigned Source = MaskElt / LTNumElts; 4783 if (NumSources == 0) { 4784 Source1 = Source; 4785 NumSources = 1; 4786 } else if (NumSources == 1 && Source != Source1) { 4787 Source2 = Source; 4788 NumSources = 2; 4789 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { 4790 NumSources++; 4791 } 4792 4793 // Add to the new mask. For the NumSources>2 case these are not correct, 4794 // but are only used for the modular lane number. 4795 if (Source == Source1) 4796 NMask.push_back(MaskElt % LTNumElts); 4797 else if (Source == Source2) 4798 NMask.push_back(MaskElt % LTNumElts + LTNumElts); 4799 else 4800 NMask.push_back(MaskElt % LTNumElts); 4801 } 4802 // If the sub-mask has at most 2 input sub-vectors then re-cost it using 4803 // getShuffleCost. If not then cost it using the worst case as the number 4804 // of element moves into a new vector. 4805 if (NumSources <= 2) 4806 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc 4807 : TTI::SK_PermuteTwoSrc, 4808 NTp, NMask, CostKind, 0, nullptr, Args, CxtI); 4809 else 4810 Cost += LTNumElts; 4811 } 4812 return Cost; 4813 } 4814 4815 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); 4816 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; 4817 // A subvector extract can be implemented with an ext (or trivial extract, if 4818 // from lane 0). This currently only handles low or high extracts to prevent 4819 // SLP vectorizer regressions. 
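// For example (illustrative): extracting the low <2 x i32> half of a
// <4 x i32> vector is just a subregister read (cost 0), while extracting the
// high half takes a single instruction (cost 1).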
4820 if (IsExtractSubvector && LT.second.isFixedLengthVector()) { 4821 if (LT.second.is128BitVector() && 4822 cast<FixedVectorType>(SubTp)->getNumElements() == 4823 LT.second.getVectorNumElements() / 2) { 4824 if (Index == 0) 4825 return 0; 4826 if (Index == (int)LT.second.getVectorNumElements() / 2) 4827 return 1; 4828 } 4829 Kind = TTI::SK_PermuteSingleSrc; 4830 } 4831 4832 // Check for broadcast loads, which are supported by the LD1R instruction. 4833 // In terms of code-size, the shuffle vector is free when a load + dup get 4834 // folded into a LD1R. That's what we check and return here. For performance 4835 // and reciprocal throughput, a LD1R is not completely free. In this case, we 4836 // return the cost for the broadcast below (i.e. 1 for most/all types), so 4837 // that we model the load + dup sequence slightly higher because LD1R is a 4838 // high latency instruction. 4839 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { 4840 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); 4841 if (IsLoad && LT.second.isVector() && 4842 isLegalBroadcastLoad(Tp->getElementType(), 4843 LT.second.getVectorElementCount())) 4844 return 0; 4845 } 4846 4847 // If we have 4 elements for the shuffle and a Mask, get the cost straight 4848 // from the perfect shuffle tables. 4849 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && 4850 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && 4851 all_of(Mask, [](int E) { return E < 8; })) 4852 return getPerfectShuffleCost(Mask); 4853 4854 // Check for identity masks, which we can treat as free. 4855 if (!Mask.empty() && LT.second.isFixedLengthVector() && 4856 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && 4857 all_of(enumerate(Mask), [](const auto &M) { 4858 return M.value() < 0 || M.value() == (int)M.index(); 4859 })) 4860 return 0; 4861 4862 // Check for other shuffles that are not SK_ kinds but we have native 4863 // instructions for, for example ZIP and UZP. 4864 unsigned Unused; 4865 if (LT.second.isFixedLengthVector() && 4866 LT.second.getVectorNumElements() == Mask.size() && 4867 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && 4868 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) || 4869 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) || 4870 // Check for non-zero lane splats 4871 all_of(drop_begin(Mask), 4872 [&Mask](int M) { return M < 0 || M == Mask[0]; }))) 4873 return 1; 4874 4875 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || 4876 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || 4877 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { 4878 static const CostTblEntry ShuffleTbl[] = { 4879 // Broadcast shuffle kinds can be performed with 'dup'. 4880 {TTI::SK_Broadcast, MVT::v8i8, 1}, 4881 {TTI::SK_Broadcast, MVT::v16i8, 1}, 4882 {TTI::SK_Broadcast, MVT::v4i16, 1}, 4883 {TTI::SK_Broadcast, MVT::v8i16, 1}, 4884 {TTI::SK_Broadcast, MVT::v2i32, 1}, 4885 {TTI::SK_Broadcast, MVT::v4i32, 1}, 4886 {TTI::SK_Broadcast, MVT::v2i64, 1}, 4887 {TTI::SK_Broadcast, MVT::v4f16, 1}, 4888 {TTI::SK_Broadcast, MVT::v8f16, 1}, 4889 {TTI::SK_Broadcast, MVT::v2f32, 1}, 4890 {TTI::SK_Broadcast, MVT::v4f32, 1}, 4891 {TTI::SK_Broadcast, MVT::v2f64, 1}, 4892 // Transpose shuffle kinds can be performed with 'trn1/trn2' and 4893 // 'zip1/zip2' instructions. 
4894 {TTI::SK_Transpose, MVT::v8i8, 1}, 4895 {TTI::SK_Transpose, MVT::v16i8, 1}, 4896 {TTI::SK_Transpose, MVT::v4i16, 1}, 4897 {TTI::SK_Transpose, MVT::v8i16, 1}, 4898 {TTI::SK_Transpose, MVT::v2i32, 1}, 4899 {TTI::SK_Transpose, MVT::v4i32, 1}, 4900 {TTI::SK_Transpose, MVT::v2i64, 1}, 4901 {TTI::SK_Transpose, MVT::v4f16, 1}, 4902 {TTI::SK_Transpose, MVT::v8f16, 1}, 4903 {TTI::SK_Transpose, MVT::v2f32, 1}, 4904 {TTI::SK_Transpose, MVT::v4f32, 1}, 4905 {TTI::SK_Transpose, MVT::v2f64, 1}, 4906 // Select shuffle kinds. 4907 // TODO: handle vXi8/vXi16. 4908 {TTI::SK_Select, MVT::v2i32, 1}, // mov. 4909 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). 4910 {TTI::SK_Select, MVT::v2i64, 1}, // mov. 4911 {TTI::SK_Select, MVT::v2f32, 1}, // mov. 4912 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). 4913 {TTI::SK_Select, MVT::v2f64, 1}, // mov. 4914 // PermuteSingleSrc shuffle kinds. 4915 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. 4916 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. 4917 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. 4918 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. 4919 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. 4920 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. 4921 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. 4922 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. 4923 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same 4924 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl 4925 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl 4926 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl 4927 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl 4928 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl 4929 // Reverse can be lowered with `rev`. 4930 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 4931 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT 4932 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT 4933 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 4934 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT 4935 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT 4936 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT 4937 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT 4938 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT 4939 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 4940 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 4941 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 4942 // Splice can all be lowered as `ext`. 
4943 {TTI::SK_Splice, MVT::v2i32, 1}, 4944 {TTI::SK_Splice, MVT::v4i32, 1}, 4945 {TTI::SK_Splice, MVT::v2i64, 1}, 4946 {TTI::SK_Splice, MVT::v2f32, 1}, 4947 {TTI::SK_Splice, MVT::v4f32, 1}, 4948 {TTI::SK_Splice, MVT::v2f64, 1}, 4949 {TTI::SK_Splice, MVT::v8f16, 1}, 4950 {TTI::SK_Splice, MVT::v8bf16, 1}, 4951 {TTI::SK_Splice, MVT::v8i16, 1}, 4952 {TTI::SK_Splice, MVT::v16i8, 1}, 4953 {TTI::SK_Splice, MVT::v4bf16, 1}, 4954 {TTI::SK_Splice, MVT::v4f16, 1}, 4955 {TTI::SK_Splice, MVT::v4i16, 1}, 4956 {TTI::SK_Splice, MVT::v8i8, 1}, 4957 // Broadcast shuffle kinds for scalable vectors 4958 {TTI::SK_Broadcast, MVT::nxv16i8, 1}, 4959 {TTI::SK_Broadcast, MVT::nxv8i16, 1}, 4960 {TTI::SK_Broadcast, MVT::nxv4i32, 1}, 4961 {TTI::SK_Broadcast, MVT::nxv2i64, 1}, 4962 {TTI::SK_Broadcast, MVT::nxv2f16, 1}, 4963 {TTI::SK_Broadcast, MVT::nxv4f16, 1}, 4964 {TTI::SK_Broadcast, MVT::nxv8f16, 1}, 4965 {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, 4966 {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, 4967 {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, 4968 {TTI::SK_Broadcast, MVT::nxv2f32, 1}, 4969 {TTI::SK_Broadcast, MVT::nxv4f32, 1}, 4970 {TTI::SK_Broadcast, MVT::nxv2f64, 1}, 4971 {TTI::SK_Broadcast, MVT::nxv16i1, 1}, 4972 {TTI::SK_Broadcast, MVT::nxv8i1, 1}, 4973 {TTI::SK_Broadcast, MVT::nxv4i1, 1}, 4974 {TTI::SK_Broadcast, MVT::nxv2i1, 1}, 4975 // Handle the cases for vector.reverse with scalable vectors 4976 {TTI::SK_Reverse, MVT::nxv16i8, 1}, 4977 {TTI::SK_Reverse, MVT::nxv8i16, 1}, 4978 {TTI::SK_Reverse, MVT::nxv4i32, 1}, 4979 {TTI::SK_Reverse, MVT::nxv2i64, 1}, 4980 {TTI::SK_Reverse, MVT::nxv2f16, 1}, 4981 {TTI::SK_Reverse, MVT::nxv4f16, 1}, 4982 {TTI::SK_Reverse, MVT::nxv8f16, 1}, 4983 {TTI::SK_Reverse, MVT::nxv2bf16, 1}, 4984 {TTI::SK_Reverse, MVT::nxv4bf16, 1}, 4985 {TTI::SK_Reverse, MVT::nxv8bf16, 1}, 4986 {TTI::SK_Reverse, MVT::nxv2f32, 1}, 4987 {TTI::SK_Reverse, MVT::nxv4f32, 1}, 4988 {TTI::SK_Reverse, MVT::nxv2f64, 1}, 4989 {TTI::SK_Reverse, MVT::nxv16i1, 1}, 4990 {TTI::SK_Reverse, MVT::nxv8i1, 1}, 4991 {TTI::SK_Reverse, MVT::nxv4i1, 1}, 4992 {TTI::SK_Reverse, MVT::nxv2i1, 1}, 4993 }; 4994 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) 4995 return LT.first * Entry->Cost; 4996 } 4997 4998 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) 4999 return getSpliceCost(Tp, Index); 5000 5001 // Inserting a subvector can often be done with either a D, S or H register 5002 // move, so long as the inserted vector is "aligned". 5003 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && 5004 LT.second.getSizeInBits() <= 128 && SubTp) { 5005 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); 5006 if (SubLT.second.isVector()) { 5007 int NumElts = LT.second.getVectorNumElements(); 5008 int NumSubElts = SubLT.second.getVectorNumElements(); 5009 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) 5010 return SubLT.first; 5011 } 5012 } 5013 5014 // Restore optimal kind. 5015 if (IsExtractSubvector) 5016 Kind = TTI::SK_ExtractSubvector; 5017 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, 5018 CxtI); 5019 } 5020 5021 static bool containsDecreasingPointers(Loop *TheLoop, 5022 PredicatedScalarEvolution *PSE) { 5023 const auto &Strides = DenseMap<Value *, const SCEV *>(); 5024 for (BasicBlock *BB : TheLoop->blocks()) { 5025 // Scan the instructions in the block and look for addresses that are 5026 // consecutive and decreasing. 
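// For example (illustrative), a loop that walks an array backwards, e.g.
//   for (i = n - 1; i >= 0; --i) sum += a[i];
// accesses memory with a negative stride; the caller accounts for the cost of
// reversing the loop predicate via TailFoldingOpts::Reverse.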
5027 for (Instruction &I : *BB) { 5028 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { 5029 Value *Ptr = getLoadStorePointerOperand(&I); 5030 Type *AccessTy = getLoadStoreType(&I); 5031 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, 5032 /*ShouldCheckWrap=*/false) 5033 .value_or(0) < 0) 5034 return true; 5035 } 5036 } 5037 } 5038 return false; 5039 } 5040 5041 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const { 5042 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences()) 5043 return SVEPreferFixedOverScalableIfEqualCost; 5044 return ST->useFixedOverScalableIfEqualCost(); 5045 } 5046 5047 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const { 5048 return ST->getEpilogueVectorizationMinVF(); 5049 } 5050 5051 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 5052 if (!ST->hasSVE()) 5053 return false; 5054 5055 // We don't currently support vectorisation with interleaving for SVE - with 5056 // such loops we're better off not using tail-folding. This gives us a chance 5057 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. 5058 if (TFI->IAI->hasGroups()) 5059 return false; 5060 5061 TailFoldingOpts Required = TailFoldingOpts::Disabled; 5062 if (TFI->LVL->getReductionVars().size()) 5063 Required |= TailFoldingOpts::Reductions; 5064 if (TFI->LVL->getFixedOrderRecurrences().size()) 5065 Required |= TailFoldingOpts::Recurrences; 5066 5067 // We call this to discover whether any load/store pointers in the loop have 5068 // negative strides. This will require extra work to reverse the loop 5069 // predicate, which may be expensive. 5070 if (containsDecreasingPointers(TFI->LVL->getLoop(), 5071 TFI->LVL->getPredicatedScalarEvolution())) 5072 Required |= TailFoldingOpts::Reverse; 5073 if (Required == TailFoldingOpts::Disabled) 5074 Required |= TailFoldingOpts::Simple; 5075 5076 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(), 5077 Required)) 5078 return false; 5079 5080 // Don't tail-fold for tight loops where we would be better off interleaving 5081 // with an unpredicated loop. 5082 unsigned NumInsns = 0; 5083 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { 5084 NumInsns += BB->sizeWithoutDebug(); 5085 } 5086 5087 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch. 5088 return NumInsns >= SVETailFoldInsnThreshold; 5089 } 5090 5091 InstructionCost 5092 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 5093 StackOffset BaseOffset, bool HasBaseReg, 5094 int64_t Scale, unsigned AddrSpace) const { 5095 // Scaling factors are not free at all. 5096 // Operands | Rt Latency 5097 // ------------------------------------------- 5098 // Rt, [Xn, Xm] | 4 5099 // ------------------------------------------- 5100 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 5101 // Rt, [Xn, Wm, <extend> #imm] | 5102 TargetLoweringBase::AddrMode AM; 5103 AM.BaseGV = BaseGV; 5104 AM.BaseOffs = BaseOffset.getFixed(); 5105 AM.HasBaseReg = HasBaseReg; 5106 AM.Scale = Scale; 5107 AM.ScalableOffset = BaseOffset.getScalable(); 5108 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) 5109 // Scale represents reg2 * scale, thus account for 1 if 5110 // it is not equal to 0 or 1. 5111 return AM.Scale != 0 && AM.Scale != 1; 5112 return -1; 5113 } 5114 5115 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { 5116 if (EnableOrLikeSelectOpt) { 5117 // For the binary operators (e.g. 
or) we need to be more careful than with
5118 // selects; here we only transform them if they are already at a natural
5119 // break point in the code: the end of a block with an unconditional
5120 // terminator.
5121 if (I->getOpcode() == Instruction::Or &&
5122 isa<BranchInst>(I->getNextNode()) &&
5123 cast<BranchInst>(I->getNextNode())->isUnconditional())
5124 return true;
5125
5126 if (I->getOpcode() == Instruction::Add ||
5127 I->getOpcode() == Instruction::Sub)
5128 return true;
5129 }
5130 return BaseT::shouldTreatInstructionLikeSelect(I);
5131 }
5132
5133 bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5134 const TargetTransformInfo::LSRCost &C2) {
5135 // The AArch64-specific part here is adding the number of instructions to the
5136 // comparison (though not as the first consideration, as some targets do)
5137 // along with changing the priority of the base additions.
5138 // TODO: Maybe a more nuanced tradeoff between instruction count
5139 // and number of registers? To be investigated at a later date.
5140 if (EnableLSRCostOpt)
5141 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5142 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5143 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5144 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5145
5146 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
5147 }
5148
5149 static bool isSplatShuffle(Value *V) {
5150 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5151 return all_equal(Shuf->getShuffleMask());
5152 return false;
5153 }
5154
5155 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5156 /// or upper half of the vector elements.
5157 static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5158 bool AllowSplat = false) {
5159 // Scalable types can't be extract shuffle vectors.
5160 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5161 return false;
5162
5163 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5164 auto *FullTy = FullV->getType();
5165 auto *HalfTy = HalfV->getType();
5166 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5167 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5168 };
5169
5170 auto extractHalf = [](Value *FullV, Value *HalfV) {
5171 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5172 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5173 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5174 };
5175
5176 ArrayRef<int> M1, M2;
5177 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5178 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5179 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5180 return false;
5181
5182 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5183 // it is not checked as an extract below.
5184 if (AllowSplat && isSplatShuffle(Op1))
5185 S1Op1 = nullptr;
5186 if (AllowSplat && isSplatShuffle(Op2))
5187 S2Op1 = nullptr;
5188
5189 // Check that the operands are half as wide as the result and we extract
5190 // half of the elements of the input vectors.
5191 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5192 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5193 return false;
5194
5195 // Check that the mask extracts either the lower or upper half of the vector
5196 // elements.
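// For example (illustrative), with <8 x i16> shuffle inputs the masks
// <0, 1, 2, 3> and <4, 5, 6, 7> extract the low and high halves respectively;
// any other starting offset is rejected below.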
5197 int M1Start = 0; 5198 int M2Start = 0; 5199 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; 5200 if ((S1Op1 && 5201 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || 5202 (S2Op1 && 5203 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) 5204 return false; 5205 5206 if ((M1Start != 0 && M1Start != (NumElements / 2)) || 5207 (M2Start != 0 && M2Start != (NumElements / 2))) 5208 return false; 5209 if (S1Op1 && S2Op1 && M1Start != M2Start) 5210 return false; 5211 5212 return true; 5213 } 5214 5215 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 5216 /// of the vector elements. 5217 static bool areExtractExts(Value *Ext1, Value *Ext2) { 5218 auto areExtDoubled = [](Instruction *Ext) { 5219 return Ext->getType()->getScalarSizeInBits() == 5220 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 5221 }; 5222 5223 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 5224 !match(Ext2, m_ZExtOrSExt(m_Value())) || 5225 !areExtDoubled(cast<Instruction>(Ext1)) || 5226 !areExtDoubled(cast<Instruction>(Ext2))) 5227 return false; 5228 5229 return true; 5230 } 5231 5232 /// Check if Op could be used with vmull_high_p64 intrinsic. 5233 static bool isOperandOfVmullHighP64(Value *Op) { 5234 Value *VectorOperand = nullptr; 5235 ConstantInt *ElementIndex = nullptr; 5236 return match(Op, m_ExtractElt(m_Value(VectorOperand), 5237 m_ConstantInt(ElementIndex))) && 5238 ElementIndex->getValue() == 1 && 5239 isa<FixedVectorType>(VectorOperand->getType()) && 5240 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; 5241 } 5242 5243 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. 5244 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { 5245 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); 5246 } 5247 5248 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) { 5249 // Restrict ourselves to the form CodeGenPrepare typically constructs. 5250 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs); 5251 if (!GEP || GEP->getNumOperands() != 2) 5252 return false; 5253 5254 Value *Base = GEP->getOperand(0); 5255 Value *Offsets = GEP->getOperand(1); 5256 5257 // We only care about scalar_base+vector_offsets. 5258 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) 5259 return false; 5260 5261 // Sink extends that would allow us to use 32-bit offset vectors. 5262 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) { 5263 auto *OffsetsInst = cast<Instruction>(Offsets); 5264 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && 5265 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) 5266 Ops.push_back(&GEP->getOperandUse(1)); 5267 } 5268 5269 // Sink the GEP. 
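// Illustrative IR for the pattern handled here (names hypothetical):
//   %wide = sext <vscale x 4 x i32> %idx to <vscale x 4 x i64>
//   %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %wide
//   %g = call ... @llvm.masked.gather(... %ptrs, ...)
// Sinking the sext and the GEP next to the gather lets isel use a scalar-base
// plus extended 32-bit vector-offset addressing mode.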
5270 return true; 5271 } 5272 5273 /// We want to sink following cases: 5274 /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; 5275 /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); 5276 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) { 5277 if (match(Op, m_VScale())) 5278 return true; 5279 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || 5280 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { 5281 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0)); 5282 return true; 5283 } 5284 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || 5285 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { 5286 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0); 5287 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0)); 5288 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0)); 5289 return true; 5290 } 5291 return false; 5292 } 5293 5294 /// Check if sinking \p I's operands to I's basic block is profitable, because 5295 /// the operands can be folded into a target instruction, e.g. 5296 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 5297 bool AArch64TTIImpl::isProfitableToSinkOperands( 5298 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 5299 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 5300 switch (II->getIntrinsicID()) { 5301 case Intrinsic::aarch64_neon_smull: 5302 case Intrinsic::aarch64_neon_umull: 5303 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), 5304 /*AllowSplat=*/true)) { 5305 Ops.push_back(&II->getOperandUse(0)); 5306 Ops.push_back(&II->getOperandUse(1)); 5307 return true; 5308 } 5309 [[fallthrough]]; 5310 5311 case Intrinsic::fma: 5312 case Intrinsic::fmuladd: 5313 if (isa<VectorType>(I->getType()) && 5314 cast<VectorType>(I->getType())->getElementType()->isHalfTy() && 5315 !ST->hasFullFP16()) 5316 return false; 5317 [[fallthrough]]; 5318 case Intrinsic::aarch64_neon_sqdmull: 5319 case Intrinsic::aarch64_neon_sqdmulh: 5320 case Intrinsic::aarch64_neon_sqrdmulh: 5321 // Sink splats for index lane variants 5322 if (isSplatShuffle(II->getOperand(0))) 5323 Ops.push_back(&II->getOperandUse(0)); 5324 if (isSplatShuffle(II->getOperand(1))) 5325 Ops.push_back(&II->getOperandUse(1)); 5326 return !Ops.empty(); 5327 case Intrinsic::aarch64_neon_fmlal: 5328 case Intrinsic::aarch64_neon_fmlal2: 5329 case Intrinsic::aarch64_neon_fmlsl: 5330 case Intrinsic::aarch64_neon_fmlsl2: 5331 // Sink splats for index lane variants 5332 if (isSplatShuffle(II->getOperand(1))) 5333 Ops.push_back(&II->getOperandUse(1)); 5334 if (isSplatShuffle(II->getOperand(2))) 5335 Ops.push_back(&II->getOperandUse(2)); 5336 return !Ops.empty(); 5337 case Intrinsic::aarch64_sve_ptest_first: 5338 case Intrinsic::aarch64_sve_ptest_last: 5339 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0))) 5340 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) 5341 Ops.push_back(&II->getOperandUse(0)); 5342 return !Ops.empty(); 5343 case Intrinsic::aarch64_sme_write_horiz: 5344 case Intrinsic::aarch64_sme_write_vert: 5345 case Intrinsic::aarch64_sme_writeq_horiz: 5346 case Intrinsic::aarch64_sme_writeq_vert: { 5347 auto *Idx = dyn_cast<Instruction>(II->getOperand(1)); 5348 if (!Idx || Idx->getOpcode() != Instruction::Add) 5349 return false; 5350 Ops.push_back(&II->getOperandUse(1)); 5351 return true; 5352 } 5353 case Intrinsic::aarch64_sme_read_horiz: 5354 case Intrinsic::aarch64_sme_read_vert: 5355 case Intrinsic::aarch64_sme_readq_horiz: 5356 case Intrinsic::aarch64_sme_readq_vert: 5357 case 
Intrinsic::aarch64_sme_ld1b_vert: 5358 case Intrinsic::aarch64_sme_ld1h_vert: 5359 case Intrinsic::aarch64_sme_ld1w_vert: 5360 case Intrinsic::aarch64_sme_ld1d_vert: 5361 case Intrinsic::aarch64_sme_ld1q_vert: 5362 case Intrinsic::aarch64_sme_st1b_vert: 5363 case Intrinsic::aarch64_sme_st1h_vert: 5364 case Intrinsic::aarch64_sme_st1w_vert: 5365 case Intrinsic::aarch64_sme_st1d_vert: 5366 case Intrinsic::aarch64_sme_st1q_vert: 5367 case Intrinsic::aarch64_sme_ld1b_horiz: 5368 case Intrinsic::aarch64_sme_ld1h_horiz: 5369 case Intrinsic::aarch64_sme_ld1w_horiz: 5370 case Intrinsic::aarch64_sme_ld1d_horiz: 5371 case Intrinsic::aarch64_sme_ld1q_horiz: 5372 case Intrinsic::aarch64_sme_st1b_horiz: 5373 case Intrinsic::aarch64_sme_st1h_horiz: 5374 case Intrinsic::aarch64_sme_st1w_horiz: 5375 case Intrinsic::aarch64_sme_st1d_horiz: 5376 case Intrinsic::aarch64_sme_st1q_horiz: { 5377 auto *Idx = dyn_cast<Instruction>(II->getOperand(3)); 5378 if (!Idx || Idx->getOpcode() != Instruction::Add) 5379 return false; 5380 Ops.push_back(&II->getOperandUse(3)); 5381 return true; 5382 } 5383 case Intrinsic::aarch64_neon_pmull: 5384 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) 5385 return false; 5386 Ops.push_back(&II->getOperandUse(0)); 5387 Ops.push_back(&II->getOperandUse(1)); 5388 return true; 5389 case Intrinsic::aarch64_neon_pmull64: 5390 if (!areOperandsOfVmullHighP64(II->getArgOperand(0), 5391 II->getArgOperand(1))) 5392 return false; 5393 Ops.push_back(&II->getArgOperandUse(0)); 5394 Ops.push_back(&II->getArgOperandUse(1)); 5395 return true; 5396 case Intrinsic::masked_gather: 5397 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) 5398 return false; 5399 Ops.push_back(&II->getArgOperandUse(0)); 5400 return true; 5401 case Intrinsic::masked_scatter: 5402 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) 5403 return false; 5404 Ops.push_back(&II->getArgOperandUse(1)); 5405 return true; 5406 default: 5407 return false; 5408 } 5409 } 5410 5411 auto ShouldSinkCondition = [](Value *Cond) -> bool { 5412 auto *II = dyn_cast<IntrinsicInst>(Cond); 5413 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && 5414 isa<ScalableVectorType>(II->getOperand(0)->getType()); 5415 }; 5416 5417 switch (I->getOpcode()) { 5418 case Instruction::GetElementPtr: 5419 case Instruction::Add: 5420 case Instruction::Sub: 5421 // Sink vscales closer to uses for better isel 5422 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { 5423 if (shouldSinkVScale(I->getOperand(Op), Ops)) { 5424 Ops.push_back(&I->getOperandUse(Op)); 5425 return true; 5426 } 5427 } 5428 break; 5429 case Instruction::Select: { 5430 if (!ShouldSinkCondition(I->getOperand(0))) 5431 return false; 5432 5433 Ops.push_back(&I->getOperandUse(0)); 5434 return true; 5435 } 5436 case Instruction::Br: { 5437 if (cast<BranchInst>(I)->isUnconditional()) 5438 return false; 5439 5440 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition())) 5441 return false; 5442 5443 Ops.push_back(&I->getOperandUse(0)); 5444 return true; 5445 } 5446 default: 5447 break; 5448 } 5449 5450 if (!I->getType()->isVectorTy()) 5451 return false; 5452 5453 switch (I->getOpcode()) { 5454 case Instruction::Sub: 5455 case Instruction::Add: { 5456 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 5457 return false; 5458 5459 // If the exts' operands extract either the lower or upper elements, we 5460 // can sink them too. 
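// Illustrative IR (hypothetical values) for the add case:
//   %hi0 = shufflevector <16 x i8> %a, <16 x i8> poison,
//                        <8 x i32> <i32 8, i32 9, i32 10, i32 11,
//                                   i32 12, i32 13, i32 14, i32 15>
//   %s0  = sext <8 x i8> %hi0 to <8 x i16>
//   (and likewise %s1 from %b)
//   %r   = add <8 x i16> %s0, %s1
// Sinking the shuffles and the sexts next to the add lets it select as saddl2.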
5461 auto Ext1 = cast<Instruction>(I->getOperand(0)); 5462 auto Ext2 = cast<Instruction>(I->getOperand(1)); 5463 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { 5464 Ops.push_back(&Ext1->getOperandUse(0)); 5465 Ops.push_back(&Ext2->getOperandUse(0)); 5466 } 5467 5468 Ops.push_back(&I->getOperandUse(0)); 5469 Ops.push_back(&I->getOperandUse(1)); 5470 5471 return true; 5472 } 5473 case Instruction::Or: { 5474 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> 5475 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) 5476 if (ST->hasNEON()) { 5477 Instruction *OtherAnd, *IA, *IB; 5478 Value *MaskValue; 5479 // MainAnd refers to And instruction that has 'Not' as one of its operands 5480 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), 5481 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), 5482 m_Instruction(IA)))))) { 5483 if (match(OtherAnd, 5484 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { 5485 Instruction *MainAnd = I->getOperand(0) == OtherAnd 5486 ? cast<Instruction>(I->getOperand(1)) 5487 : cast<Instruction>(I->getOperand(0)); 5488 5489 // Both Ands should be in same basic block as Or 5490 if (I->getParent() != MainAnd->getParent() || 5491 I->getParent() != OtherAnd->getParent()) 5492 return false; 5493 5494 // Non-mask operands of both Ands should also be in same basic block 5495 if (I->getParent() != IA->getParent() || 5496 I->getParent() != IB->getParent()) 5497 return false; 5498 5499 Ops.push_back( 5500 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); 5501 Ops.push_back(&I->getOperandUse(0)); 5502 Ops.push_back(&I->getOperandUse(1)); 5503 5504 return true; 5505 } 5506 } 5507 } 5508 5509 return false; 5510 } 5511 case Instruction::Mul: { 5512 auto ShouldSinkSplatForIndexedVariant = [](Value *V) { 5513 auto *Ty = cast<VectorType>(V->getType()); 5514 // For SVE the lane-indexing is within 128-bits, so we can't fold splats. 5515 if (Ty->isScalableTy()) 5516 return false; 5517 5518 // Indexed variants of Mul exist for i16 and i32 element types only. 5519 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32; 5520 }; 5521 5522 int NumZExts = 0, NumSExts = 0; 5523 for (auto &Op : I->operands()) { 5524 // Make sure we are not already sinking this operand 5525 if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) 5526 continue; 5527 5528 if (match(&Op, m_ZExtOrSExt(m_Value()))) { 5529 auto *Ext = cast<Instruction>(Op); 5530 auto *ExtOp = Ext->getOperand(0); 5531 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp)) 5532 Ops.push_back(&Ext->getOperandUse(0)); 5533 Ops.push_back(&Op); 5534 5535 if (isa<SExtInst>(Ext)) 5536 NumSExts++; 5537 else 5538 NumZExts++; 5539 5540 continue; 5541 } 5542 5543 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); 5544 if (!Shuffle) 5545 continue; 5546 5547 // If the Shuffle is a splat and the operand is a zext/sext, sinking the 5548 // operand and the s/zext can help create indexed s/umull. This is 5549 // especially useful to prevent i64 mul being scalarized. 
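// Illustrative IR (hypothetical values; assumes the other mul operand is also
// recognised as an extended <2 x i32> value):
//   %z = zext <2 x i32> %v to <2 x i64>
//   %s = shufflevector <2 x i64> %z, <2 x i64> poison, <2 x i32> zeroinitializer
//   %m = mul <2 x i64> %s, %w
// Sinking %z and %s next to the mul helps isel form an indexed umull rather
// than scalarizing the 64-bit multiply.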
5550 if (isSplatShuffle(Shuffle) && 5551 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { 5552 Ops.push_back(&Shuffle->getOperandUse(0)); 5553 Ops.push_back(&Op); 5554 if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) 5555 NumSExts++; 5556 else 5557 NumZExts++; 5558 continue; 5559 } 5560 5561 Value *ShuffleOperand = Shuffle->getOperand(0); 5562 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); 5563 if (!Insert) 5564 continue; 5565 5566 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); 5567 if (!OperandInstr) 5568 continue; 5569 5570 ConstantInt *ElementConstant = 5571 dyn_cast<ConstantInt>(Insert->getOperand(2)); 5572 // Check that the insertelement is inserting into element 0 5573 if (!ElementConstant || !ElementConstant->isZero()) 5574 continue; 5575 5576 unsigned Opcode = OperandInstr->getOpcode(); 5577 if (Opcode == Instruction::SExt) 5578 NumSExts++; 5579 else if (Opcode == Instruction::ZExt) 5580 NumZExts++; 5581 else { 5582 // If we find that the top bits are known 0, then we can sink and allow 5583 // the backend to generate a umull. 5584 unsigned Bitwidth = I->getType()->getScalarSizeInBits(); 5585 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); 5586 const DataLayout &DL = I->getDataLayout(); 5587 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) 5588 continue; 5589 NumZExts++; 5590 } 5591 5592 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking 5593 // the And, just to hoist it again back to the load. 5594 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value()))) 5595 Ops.push_back(&Insert->getOperandUse(1)); 5596 Ops.push_back(&Shuffle->getOperandUse(0)); 5597 Ops.push_back(&Op); 5598 } 5599 5600 // It is profitable to sink if we found two of the same type of extends. 5601 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2)) 5602 return true; 5603 5604 // Otherwise, see if we should sink splats for indexed variants. 5605 if (!ShouldSinkSplatForIndexedVariant(I)) 5606 return false; 5607 5608 Ops.clear(); 5609 if (isSplatShuffle(I->getOperand(0))) 5610 Ops.push_back(&I->getOperandUse(0)); 5611 if (isSplatShuffle(I->getOperand(1))) 5612 Ops.push_back(&I->getOperandUse(1)); 5613 5614 return !Ops.empty(); 5615 } 5616 case Instruction::FMul: { 5617 // For SVE the lane-indexing is within 128-bits, so we can't fold splats. 5618 if (I->getType()->isScalableTy()) 5619 return false; 5620 5621 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() && 5622 !ST->hasFullFP16()) 5623 return false; 5624 5625 // Sink splats for index lane variants 5626 if (isSplatShuffle(I->getOperand(0))) 5627 Ops.push_back(&I->getOperandUse(0)); 5628 if (isSplatShuffle(I->getOperand(1))) 5629 Ops.push_back(&I->getOperandUse(1)); 5630 return !Ops.empty(); 5631 } 5632 default: 5633 return false; 5634 } 5635 return false; 5636 } 5637