1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 19 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 20 #include "llvm/ADT/DenseMap.h" 21 #include "llvm/ADT/DenseSet.h" 22 #include "llvm/ADT/PriorityQueue.h" 23 #include "llvm/ADT/STLExtras.h" 24 #include "llvm/ADT/ScopeExit.h" 25 #include "llvm/ADT/SetOperations.h" 26 #include "llvm/ADT/SetVector.h" 27 #include "llvm/ADT/SmallBitVector.h" 28 #include "llvm/ADT/SmallPtrSet.h" 29 #include "llvm/ADT/SmallSet.h" 30 #include "llvm/ADT/SmallString.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/ADT/iterator.h" 33 #include "llvm/ADT/iterator_range.h" 34 #include "llvm/Analysis/AliasAnalysis.h" 35 #include "llvm/Analysis/AssumptionCache.h" 36 #include "llvm/Analysis/CodeMetrics.h" 37 #include "llvm/Analysis/ConstantFolding.h" 38 #include "llvm/Analysis/DemandedBits.h" 39 #include "llvm/Analysis/GlobalsModRef.h" 40 #include "llvm/Analysis/IVDescriptors.h" 41 #include "llvm/Analysis/LoopAccessAnalysis.h" 42 #include "llvm/Analysis/LoopInfo.h" 43 #include "llvm/Analysis/MemoryLocation.h" 44 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 45 #include "llvm/Analysis/ScalarEvolution.h" 46 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 47 #include "llvm/Analysis/TargetLibraryInfo.h" 48 #include "llvm/Analysis/TargetTransformInfo.h" 49 #include "llvm/Analysis/ValueTracking.h" 50 #include "llvm/Analysis/VectorUtils.h" 51 #include "llvm/IR/Attributes.h" 52 #include "llvm/IR/BasicBlock.h" 53 #include "llvm/IR/Constant.h" 54 #include "llvm/IR/Constants.h" 55 #include "llvm/IR/DataLayout.h" 56 #include "llvm/IR/DerivedTypes.h" 57 #include "llvm/IR/Dominators.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/IRBuilder.h" 60 #include "llvm/IR/InstrTypes.h" 61 #include "llvm/IR/Instruction.h" 62 #include "llvm/IR/Instructions.h" 63 #include "llvm/IR/IntrinsicInst.h" 64 #include "llvm/IR/Intrinsics.h" 65 #include "llvm/IR/Module.h" 66 #include "llvm/IR/Operator.h" 67 #include "llvm/IR/PatternMatch.h" 68 #include "llvm/IR/Type.h" 69 #include "llvm/IR/Use.h" 70 #include "llvm/IR/User.h" 71 #include "llvm/IR/Value.h" 72 #include "llvm/IR/ValueHandle.h" 73 #ifdef EXPENSIVE_CHECKS 74 #include "llvm/IR/Verifier.h" 75 #endif 76 #include "llvm/Pass.h" 77 #include "llvm/Support/Casting.h" 78 #include "llvm/Support/CommandLine.h" 79 #include "llvm/Support/Compiler.h" 80 #include "llvm/Support/DOTGraphTraits.h" 81 #include "llvm/Support/Debug.h" 82 #include "llvm/Support/DebugCounter.h" 83 #include "llvm/Support/ErrorHandling.h" 84 #include "llvm/Support/GraphWriter.h" 85 #include "llvm/Support/InstructionCost.h" 86 #include "llvm/Support/KnownBits.h" 87 #include 
"llvm/Support/MathExtras.h" 88 #include "llvm/Support/raw_ostream.h" 89 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 90 #include "llvm/Transforms/Utils/Local.h" 91 #include "llvm/Transforms/Utils/LoopUtils.h" 92 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 93 #include <algorithm> 94 #include <cassert> 95 #include <cstdint> 96 #include <iterator> 97 #include <memory> 98 #include <optional> 99 #include <set> 100 #include <string> 101 #include <tuple> 102 #include <utility> 103 104 using namespace llvm; 105 using namespace llvm::PatternMatch; 106 using namespace slpvectorizer; 107 using namespace std::placeholders; 108 109 #define SV_NAME "slp-vectorizer" 110 #define DEBUG_TYPE "SLP" 111 112 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 113 114 DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized", 115 "Controls which SLP graphs should be vectorized."); 116 117 static cl::opt<bool> 118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, 119 cl::desc("Run the SLP vectorization passes")); 120 121 static cl::opt<bool> 122 SLPReVec("slp-revec", cl::init(false), cl::Hidden, 123 cl::desc("Enable vectorization for wider vector utilization")); 124 125 static cl::opt<int> 126 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 127 cl::desc("Only vectorize if you gain more than this " 128 "number ")); 129 130 static cl::opt<bool> SLPSkipEarlyProfitabilityCheck( 131 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden, 132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on " 133 "heuristics and makes vectorization decision via cost modeling.")); 134 135 static cl::opt<bool> 136 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, 137 cl::desc("Attempt to vectorize horizontal reductions")); 138 139 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 141 cl::desc( 142 "Attempt to vectorize horizontal reductions feeding into a store")); 143 144 static cl::opt<int> 145 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, 146 cl::desc("Attempt to vectorize for this register size in bits")); 147 148 static cl::opt<unsigned> 149 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, 150 cl::desc("Maximum SLP vectorization factor (0=unlimited)")); 151 152 /// Limits the size of scheduling regions in a block. 153 /// It avoid long compile times for _very_ large blocks where vector 154 /// instructions are spread over a wide range. 155 /// This limit is way higher than needed by real-world functions. 156 static cl::opt<int> 157 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, 158 cl::desc("Limit the size of the SLP scheduling region per block")); 159 160 static cl::opt<int> MinVectorRegSizeOption( 161 "slp-min-reg-size", cl::init(128), cl::Hidden, 162 cl::desc("Attempt to vectorize for this register size in bits")); 163 164 static cl::opt<unsigned> RecursionMaxDepth( 165 "slp-recursion-max-depth", cl::init(12), cl::Hidden, 166 cl::desc("Limit the recursion depth when building a vectorizable tree")); 167 168 static cl::opt<unsigned> MinTreeSize( 169 "slp-min-tree-size", cl::init(3), cl::Hidden, 170 cl::desc("Only vectorize small trees if they are fully vectorizable")); 171 172 // The maximum depth that the look-ahead score heuristic will explore. 173 // The higher this value, the higher the compilation time overhead. 
174 static cl::opt<int> LookAheadMaxDepth( 175 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, 176 cl::desc("The maximum look-ahead depth for operand reordering scores")); 177 178 // The maximum depth that the look-ahead score heuristic will explore 179 // when it probing among candidates for vectorization tree roots. 180 // The higher this value, the higher the compilation time overhead but unlike 181 // similar limit for operands ordering this is less frequently used, hence 182 // impact of higher value is less noticeable. 183 static cl::opt<int> RootLookAheadMaxDepth( 184 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, 185 cl::desc("The maximum look-ahead depth for searching best rooting option")); 186 187 static cl::opt<unsigned> MinProfitableStridedLoads( 188 "slp-min-strided-loads", cl::init(2), cl::Hidden, 189 cl::desc("The minimum number of loads, which should be considered strided, " 190 "if the stride is > 1 or is runtime value")); 191 192 static cl::opt<unsigned> MaxProfitableLoadStride( 193 "slp-max-stride", cl::init(8), cl::Hidden, 194 cl::desc("The maximum stride, considered to be profitable.")); 195 196 static cl::opt<bool> 197 ViewSLPTree("view-slp-tree", cl::Hidden, 198 cl::desc("Display the SLP trees with Graphviz")); 199 200 static cl::opt<bool> VectorizeNonPowerOf2( 201 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, 202 cl::desc("Try to vectorize with non-power-of-2 number of elements.")); 203 204 // Limit the number of alias checks. The limit is chosen so that 205 // it has no negative effect on the llvm benchmarks. 206 static const unsigned AliasedCheckLimit = 10; 207 208 // Limit of the number of uses for potentially transformed instructions/values, 209 // used in checks to avoid compile-time explode. 210 static constexpr int UsesLimit = 64; 211 212 // Another limit for the alias checks: The maximum distance between load/store 213 // instructions where alias checks are done. 214 // This limit is useful for very large basic blocks. 215 static const unsigned MaxMemDepDistance = 160; 216 217 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling 218 /// regions to be handled. 219 static const int MinScheduleRegionSize = 16; 220 221 /// Maximum allowed number of operands in the PHI nodes. 222 static const unsigned MaxPHINumOperands = 128; 223 224 /// Predicate for the element types that the SLP vectorizer supports. 225 /// 226 /// The most important thing to filter here are types which are invalid in LLVM 227 /// vectors. We also filter target specific types which have absolutely no 228 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just 229 /// avoids spending time checking the cost model and realizing that they will 230 /// be inevitably scalarized. 231 static bool isValidElementType(Type *Ty) { 232 // TODO: Support ScalableVectorType. 233 if (SLPReVec && isa<FixedVectorType>(Ty)) 234 Ty = Ty->getScalarType(); 235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && 236 !Ty->isPPC_FP128Ty(); 237 } 238 239 /// Returns the type of the given value/instruction \p V. If it is store, 240 /// returns the type of its value operand, for Cmp - the types of the compare 241 /// operands and for insertelement - the type os the inserted operand. 242 /// Otherwise, just the type of the value is returned. 
243 static Type *getValueType(Value *V) { 244 if (auto *SI = dyn_cast<StoreInst>(V)) 245 return SI->getValueOperand()->getType(); 246 if (auto *CI = dyn_cast<CmpInst>(V)) 247 return CI->getOperand(0)->getType(); 248 if (auto *IE = dyn_cast<InsertElementInst>(V)) 249 return IE->getOperand(1)->getType(); 250 return V->getType(); 251 } 252 253 /// \returns the number of elements for Ty. 254 static unsigned getNumElements(Type *Ty) { 255 assert(!isa<ScalableVectorType>(Ty) && 256 "ScalableVectorType is not supported."); 257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) 258 return VecTy->getNumElements(); 259 return 1; 260 } 261 262 /// \returns the vector type of ScalarTy based on vectorization factor. 263 static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { 264 return FixedVectorType::get(ScalarTy->getScalarType(), 265 VF * getNumElements(ScalarTy)); 266 } 267 268 /// Returns the number of elements of the given type \p Ty, not less than \p Sz, 269 /// which forms type, which splits by \p TTI into whole vector types during 270 /// legalization. 271 static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, 272 Type *Ty, unsigned Sz) { 273 if (!isValidElementType(Ty)) 274 return bit_ceil(Sz); 275 // Find the number of elements, which forms full vectors. 276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); 277 if (NumParts == 0 || NumParts >= Sz) 278 return bit_ceil(Sz); 279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts; 280 } 281 282 /// Returns the number of elements of the given type \p Ty, not greater than \p 283 /// Sz, which forms type, which splits by \p TTI into whole vector types during 284 /// legalization. 285 static unsigned 286 getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, 287 unsigned Sz) { 288 if (!isValidElementType(Ty)) 289 return bit_floor(Sz); 290 // Find the number of elements, which forms full vectors. 291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); 292 if (NumParts == 0 || NumParts >= Sz) 293 return bit_floor(Sz); 294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); 295 if (RegVF > Sz) 296 return bit_floor(Sz); 297 return (Sz / RegVF) * RegVF; 298 } 299 300 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, 301 SmallVectorImpl<int> &Mask) { 302 // The ShuffleBuilder implementation use shufflevector to splat an "element". 303 // But the element have different meaning for SLP (scalar) and REVEC 304 // (vector). We need to expand Mask into masks which shufflevector can use 305 // directly. 306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements); 307 for (unsigned I : seq<unsigned>(Mask.size())) 308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice( 309 I * VecTyNumElements, VecTyNumElements))) 310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem 311 : Mask[I] * VecTyNumElements + J; 312 Mask.swap(NewMask); 313 } 314 315 /// \returns the number of groups of shufflevector 316 /// A group has the following features 317 /// 1. All of value in a group are shufflevector. 318 /// 2. The mask of all shufflevector is isExtractSubvectorMask. 319 /// 3. The mask of all shufflevector uses all of the elements of the source. 
320 /// e.g., it is 1 group (%0) 321 /// %1 = shufflevector <16 x i8> %0, <16 x i8> poison, 322 /// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 323 /// %2 = shufflevector <16 x i8> %0, <16 x i8> poison, 324 /// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 325 /// it is 2 groups (%3 and %4) 326 /// %5 = shufflevector <8 x i16> %3, <8 x i16> poison, 327 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 328 /// %6 = shufflevector <8 x i16> %3, <8 x i16> poison, 329 /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> 330 /// %7 = shufflevector <8 x i16> %4, <8 x i16> poison, 331 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 332 /// %8 = shufflevector <8 x i16> %4, <8 x i16> poison, 333 /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> 334 /// it is 0 group 335 /// %12 = shufflevector <8 x i16> %10, <8 x i16> poison, 336 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 337 /// %13 = shufflevector <8 x i16> %11, <8 x i16> poison, 338 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 339 static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) { 340 if (VL.empty()) 341 return 0; 342 if (!all_of(VL, IsaPred<ShuffleVectorInst>)) 343 return 0; 344 auto *SV = cast<ShuffleVectorInst>(VL.front()); 345 unsigned SVNumElements = 346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); 347 unsigned ShuffleMaskSize = SV->getShuffleMask().size(); 348 if (SVNumElements % ShuffleMaskSize != 0) 349 return 0; 350 unsigned GroupSize = SVNumElements / ShuffleMaskSize; 351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0) 352 return 0; 353 unsigned NumGroup = 0; 354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) { 355 auto *SV = cast<ShuffleVectorInst>(VL[I]); 356 Value *Src = SV->getOperand(0); 357 ArrayRef<Value *> Group = VL.slice(I, GroupSize); 358 SmallBitVector ExpectedIndex(GroupSize); 359 if (!all_of(Group, [&](Value *V) { 360 auto *SV = cast<ShuffleVectorInst>(V); 361 // From the same source. 362 if (SV->getOperand(0) != Src) 363 return false; 364 int Index; 365 if (!SV->isExtractSubvectorMask(Index)) 366 return false; 367 ExpectedIndex.set(Index / ShuffleMaskSize); 368 return true; 369 })) 370 return 0; 371 if (!ExpectedIndex.all()) 372 return 0; 373 ++NumGroup; 374 } 375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups"); 376 return NumGroup; 377 } 378 379 /// \returns a shufflevector mask which is used to vectorize shufflevectors 380 /// e.g., 381 /// %5 = shufflevector <8 x i16> %3, <8 x i16> poison, 382 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 383 /// %6 = shufflevector <8 x i16> %3, <8 x i16> poison, 384 /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> 385 /// %7 = shufflevector <8 x i16> %4, <8 x i16> poison, 386 /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> 387 /// %8 = shufflevector <8 x i16> %4, <8 x i16> poison, 388 /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> 389 /// the result is 390 /// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31> 391 static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) { 392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage."); 393 auto *SV = cast<ShuffleVectorInst>(VL.front()); 394 unsigned SVNumElements = 395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); 396 SmallVector<int> Mask; 397 unsigned AccumulateLength = 0; 398 for (Value *V : VL) { 399 auto *SV = cast<ShuffleVectorInst>(V); 400 for (int M : SV->getShuffleMask()) 401 Mask.push_back(M == PoisonMaskElem ? 
PoisonMaskElem 402 : AccumulateLength + M); 403 AccumulateLength += SVNumElements; 404 } 405 return Mask; 406 } 407 408 /// \returns True if the value is a constant (but not globals/constant 409 /// expressions). 410 static bool isConstant(Value *V) { 411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V); 412 } 413 414 /// Checks if \p V is one of vector-like instructions, i.e. undef, 415 /// insertelement/extractelement with constant indices for fixed vector type or 416 /// extractvalue instruction. 417 static bool isVectorLikeInstWithConstOps(Value *V) { 418 if (!isa<InsertElementInst, ExtractElementInst>(V) && 419 !isa<ExtractValueInst, UndefValue>(V)) 420 return false; 421 auto *I = dyn_cast<Instruction>(V); 422 if (!I || isa<ExtractValueInst>(I)) 423 return true; 424 if (!isa<FixedVectorType>(I->getOperand(0)->getType())) 425 return false; 426 if (isa<ExtractElementInst>(I)) 427 return isConstant(I->getOperand(1)); 428 assert(isa<InsertElementInst>(V) && "Expected only insertelement."); 429 return isConstant(I->getOperand(2)); 430 } 431 432 /// Returns power-of-2 number of elements in a single register (part), given the 433 /// total number of elements \p Size and number of registers (parts) \p 434 /// NumParts. 435 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) { 436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts))); 437 } 438 439 /// Returns correct remaining number of elements, considering total amount \p 440 /// Size, (power-of-2 number) of elements in a single register \p PartNumElems 441 /// and current register (part) \p Part. 442 static unsigned getNumElems(unsigned Size, unsigned PartNumElems, 443 unsigned Part) { 444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems); 445 } 446 447 #if !defined(NDEBUG) 448 /// Print a short descriptor of the instruction bundle suitable for debug output. 449 static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) { 450 std::string Result; 451 raw_string_ostream OS(Result); 452 if (Idx >= 0) 453 OS << "Idx: " << Idx << ", "; 454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; 455 return Result; 456 } 457 #endif 458 459 /// \returns true if all of the instructions in \p VL are in the same block or 460 /// false otherwise. 461 static bool allSameBlock(ArrayRef<Value *> VL) { 462 auto *It = find_if(VL, IsaPred<Instruction>); 463 if (It == VL.end()) 464 return false; 465 Instruction *I0 = cast<Instruction>(*It); 466 if (all_of(VL, isVectorLikeInstWithConstOps)) 467 return true; 468 469 BasicBlock *BB = I0->getParent(); 470 for (Value *V : iterator_range(It, VL.end())) { 471 if (isa<PoisonValue>(V)) 472 continue; 473 auto *II = dyn_cast<Instruction>(V); 474 if (!II) 475 return false; 476 477 if (BB != II->getParent()) 478 return false; 479 } 480 return true; 481 } 482 483 /// \returns True if all of the values in \p VL are constants (but not 484 /// globals/constant expressions). 485 static bool allConstant(ArrayRef<Value *> VL) { 486 // Constant expressions and globals can't be vectorized like normal integer/FP 487 // constants. 488 return all_of(VL, isConstant); 489 } 490 491 /// \returns True if all of the values in \p VL are identical or some of them 492 /// are UndefValue. 
493 static bool isSplat(ArrayRef<Value *> VL) { 494 Value *FirstNonUndef = nullptr; 495 for (Value *V : VL) { 496 if (isa<UndefValue>(V)) 497 continue; 498 if (!FirstNonUndef) { 499 FirstNonUndef = V; 500 continue; 501 } 502 if (V != FirstNonUndef) 503 return false; 504 } 505 return FirstNonUndef != nullptr; 506 } 507 508 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 509 static bool isCommutative(Instruction *I) { 510 if (auto *Cmp = dyn_cast<CmpInst>(I)) 511 return Cmp->isCommutative(); 512 if (auto *BO = dyn_cast<BinaryOperator>(I)) 513 return BO->isCommutative() || 514 (BO->getOpcode() == Instruction::Sub && 515 !BO->hasNUsesOrMore(UsesLimit) && 516 all_of( 517 BO->uses(), 518 [](const Use &U) { 519 // Commutative, if icmp eq/ne sub, 0 520 CmpPredicate Pred; 521 if (match(U.getUser(), 522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) && 523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) 524 return true; 525 // Commutative, if abs(sub nsw, true) or abs(sub, false). 526 ConstantInt *Flag; 527 return match(U.getUser(), 528 m_Intrinsic<Intrinsic::abs>( 529 m_Specific(U.get()), m_ConstantInt(Flag))) && 530 (!cast<Instruction>(U.get())->hasNoSignedWrap() || 531 Flag->isOne()); 532 })) || 533 (BO->getOpcode() == Instruction::FSub && 534 !BO->hasNUsesOrMore(UsesLimit) && 535 all_of(BO->uses(), [](const Use &U) { 536 return match(U.getUser(), 537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get()))); 538 })); 539 return I->isCommutative(); 540 } 541 542 template <typename T> 543 static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, 544 unsigned Offset) { 545 static_assert(std::is_same_v<T, InsertElementInst> || 546 std::is_same_v<T, ExtractElementInst>, 547 "unsupported T"); 548 int Index = Offset; 549 if (const auto *IE = dyn_cast<T>(Inst)) { 550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType()); 551 if (!VT) 552 return std::nullopt; 553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); 554 if (!CI) 555 return std::nullopt; 556 if (CI->getValue().uge(VT->getNumElements())) 557 return std::nullopt; 558 Index *= VT->getNumElements(); 559 Index += CI->getZExtValue(); 560 return Index; 561 } 562 return std::nullopt; 563 } 564 565 /// \returns inserting or extracting index of InsertElement, ExtractElement or 566 /// InsertValue instruction, using Offset as base offset for index. 567 /// \returns std::nullopt if the index is not an immediate. 568 static std::optional<unsigned> getElementIndex(const Value *Inst, 569 unsigned Offset = 0) { 570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset)) 571 return Index; 572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset)) 573 return Index; 574 575 int Index = Offset; 576 577 const auto *IV = dyn_cast<InsertValueInst>(Inst); 578 if (!IV) 579 return std::nullopt; 580 581 Type *CurrentType = IV->getType(); 582 for (unsigned I : IV->indices()) { 583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) { 584 Index *= ST->getNumElements(); 585 CurrentType = ST->getElementType(I); 586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { 587 Index *= AT->getNumElements(); 588 CurrentType = AT->getElementType(); 589 } else { 590 return std::nullopt; 591 } 592 Index += I; 593 } 594 return Index; 595 } 596 597 namespace { 598 /// Specifies the way the mask should be analyzed for undefs/poisonous elements 599 /// in the shuffle mask. 
600 enum class UseMask { 601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors, 602 ///< check for the mask elements for the first argument (mask 603 ///< indices are in range [0:VF)). 604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check 605 ///< for the mask elements for the second argument (mask indices 606 ///< are in range [VF:2*VF)) 607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for 608 ///< future shuffle elements and mark them as ones as being used 609 ///< in future. Non-undef elements are considered as unused since 610 ///< they're already marked as used in the mask. 611 }; 612 } // namespace 613 614 /// Prepares a use bitset for the given mask either for the first argument or 615 /// for the second. 616 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask, 617 UseMask MaskArg) { 618 SmallBitVector UseMask(VF, true); 619 for (auto [Idx, Value] : enumerate(Mask)) { 620 if (Value == PoisonMaskElem) { 621 if (MaskArg == UseMask::UndefsAsMask) 622 UseMask.reset(Idx); 623 continue; 624 } 625 if (MaskArg == UseMask::FirstArg && Value < VF) 626 UseMask.reset(Value); 627 else if (MaskArg == UseMask::SecondArg && Value >= VF) 628 UseMask.reset(Value - VF); 629 } 630 return UseMask; 631 } 632 633 /// Checks if the given value is actually an undefined constant vector. 634 /// Also, if the \p UseMask is not empty, tries to check if the non-masked 635 /// elements actually mask the insertelement buildvector, if any. 636 template <bool IsPoisonOnly = false> 637 static SmallBitVector isUndefVector(const Value *V, 638 const SmallBitVector &UseMask = {}) { 639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true); 640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>; 641 if (isa<T>(V)) 642 return Res; 643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType()); 644 if (!VecTy) 645 return Res.reset(); 646 auto *C = dyn_cast<Constant>(V); 647 if (!C) { 648 if (!UseMask.empty()) { 649 const Value *Base = V; 650 while (auto *II = dyn_cast<InsertElementInst>(Base)) { 651 Base = II->getOperand(0); 652 if (isa<T>(II->getOperand(1))) 653 continue; 654 std::optional<unsigned> Idx = getElementIndex(II); 655 if (!Idx) { 656 Res.reset(); 657 return Res; 658 } 659 if (*Idx < UseMask.size() && !UseMask.test(*Idx)) 660 Res.reset(*Idx); 661 } 662 // TODO: Add analysis for shuffles here too. 
663 if (V == Base) { 664 Res.reset(); 665 } else { 666 SmallBitVector SubMask(UseMask.size(), false); 667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask); 668 } 669 } else { 670 Res.reset(); 671 } 672 return Res; 673 } 674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { 675 if (Constant *Elem = C->getAggregateElement(I)) 676 if (!isa<T>(Elem) && 677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I)))) 678 Res.reset(I); 679 } 680 return Res; 681 } 682 683 /// Checks if the vector of instructions can be represented as a shuffle, like: 684 /// %x0 = extractelement <4 x i8> %x, i32 0 685 /// %x3 = extractelement <4 x i8> %x, i32 3 686 /// %y1 = extractelement <4 x i8> %y, i32 1 687 /// %y2 = extractelement <4 x i8> %y, i32 2 688 /// %x0x0 = mul i8 %x0, %x0 689 /// %x3x3 = mul i8 %x3, %x3 690 /// %y1y1 = mul i8 %y1, %y1 691 /// %y2y2 = mul i8 %y2, %y2 692 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 693 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 694 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 695 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 696 /// ret <4 x i8> %ins4 697 /// can be transformed into: 698 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, 699 /// i32 6> 700 /// %2 = mul <4 x i8> %1, %1 701 /// ret <4 x i8> %2 702 /// Mask will return the Shuffle Mask equivalent to the extracted elements. 703 /// TODO: Can we split off and reuse the shuffle mask detection from 704 /// ShuffleVectorInst/getShuffleCost? 705 static std::optional<TargetTransformInfo::ShuffleKind> 706 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, 707 AssumptionCache *AC) { 708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>); 709 if (It == VL.end()) 710 return std::nullopt; 711 unsigned Size = 712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) { 713 auto *EI = dyn_cast<ExtractElementInst>(V); 714 if (!EI) 715 return S; 716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); 717 if (!VTy) 718 return S; 719 return std::max(S, VTy->getNumElements()); 720 }); 721 722 Value *Vec1 = nullptr; 723 Value *Vec2 = nullptr; 724 bool HasNonUndefVec = any_of(VL, [&](Value *V) { 725 auto *EE = dyn_cast<ExtractElementInst>(V); 726 if (!EE) 727 return false; 728 Value *Vec = EE->getVectorOperand(); 729 if (isa<UndefValue>(Vec)) 730 return false; 731 return isGuaranteedNotToBePoison(Vec, AC); 732 }); 733 enum ShuffleMode { Unknown, Select, Permute }; 734 ShuffleMode CommonShuffleMode = Unknown; 735 Mask.assign(VL.size(), PoisonMaskElem); 736 for (unsigned I = 0, E = VL.size(); I < E; ++I) { 737 // Undef can be represented as an undef element in a vector. 738 if (isa<UndefValue>(VL[I])) 739 continue; 740 auto *EI = cast<ExtractElementInst>(VL[I]); 741 if (isa<ScalableVectorType>(EI->getVectorOperandType())) 742 return std::nullopt; 743 auto *Vec = EI->getVectorOperand(); 744 // We can extractelement from undef or poison vector. 745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all()) 746 continue; 747 // All vector operands must have the same number of vector elements. 748 if (isa<UndefValue>(Vec)) { 749 Mask[I] = I; 750 } else { 751 if (isa<UndefValue>(EI->getIndexOperand())) 752 continue; 753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); 754 if (!Idx) 755 return std::nullopt; 756 // Undefined behavior if Idx is negative or >= Size. 
757 if (Idx->getValue().uge(Size)) 758 continue; 759 unsigned IntIdx = Idx->getValue().getZExtValue(); 760 Mask[I] = IntIdx; 761 } 762 if (isUndefVector(Vec).all() && HasNonUndefVec) 763 continue; 764 // For correct shuffling we have to have at most 2 different vector operands 765 // in all extractelement instructions. 766 if (!Vec1 || Vec1 == Vec) { 767 Vec1 = Vec; 768 } else if (!Vec2 || Vec2 == Vec) { 769 Vec2 = Vec; 770 Mask[I] += Size; 771 } else { 772 return std::nullopt; 773 } 774 if (CommonShuffleMode == Permute) 775 continue; 776 // If the extract index is not the same as the operation number, it is a 777 // permutation. 778 if (Mask[I] % Size != I) { 779 CommonShuffleMode = Permute; 780 continue; 781 } 782 CommonShuffleMode = Select; 783 } 784 // If we're not crossing lanes in different vectors, consider it as blending. 785 if (CommonShuffleMode == Select && Vec2) 786 return TargetTransformInfo::SK_Select; 787 // If Vec2 was never used, we have a permutation of a single vector, otherwise 788 // we have permutation of 2 vectors. 789 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc 790 : TargetTransformInfo::SK_PermuteSingleSrc; 791 } 792 793 /// \returns True if Extract{Value,Element} instruction extracts element Idx. 794 static std::optional<unsigned> getExtractIndex(Instruction *E) { 795 unsigned Opcode = E->getOpcode(); 796 assert((Opcode == Instruction::ExtractElement || 797 Opcode == Instruction::ExtractValue) && 798 "Expected extractelement or extractvalue instruction."); 799 if (Opcode == Instruction::ExtractElement) { 800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 801 if (!CI) 802 return std::nullopt; 803 return CI->getZExtValue(); 804 } 805 auto *EI = cast<ExtractValueInst>(E); 806 if (EI->getNumIndices() != 1) 807 return std::nullopt; 808 return *EI->idx_begin(); 809 } 810 811 namespace { 812 813 /// Main data required for vectorization of instructions. 814 class InstructionsState { 815 /// The main/alternate instruction. MainOp is also VL0. 816 Instruction *MainOp = nullptr; 817 Instruction *AltOp = nullptr; 818 819 public: 820 Instruction *getMainOp() const { 821 assert(valid() && "InstructionsState is invalid."); 822 return MainOp; 823 } 824 825 Instruction *getAltOp() const { 826 assert(valid() && "InstructionsState is invalid."); 827 return AltOp; 828 } 829 830 /// The main/alternate opcodes for the list of instructions. 831 unsigned getOpcode() const { return getMainOp()->getOpcode(); } 832 833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); } 834 835 /// Some of the instructions in the list have alternate opcodes. 836 bool isAltShuffle() const { return getMainOp() != getAltOp(); } 837 838 bool isOpcodeOrAlt(Instruction *I) const { 839 unsigned CheckedOpcode = I->getOpcode(); 840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; 841 } 842 843 /// Checks if the current state is valid, i.e. has non-null MainOp 844 bool valid() const { return MainOp && AltOp; } 845 846 explicit operator bool() const { return valid(); } 847 848 InstructionsState() = delete; 849 InstructionsState(Instruction *MainOp, Instruction *AltOp) 850 : MainOp(MainOp), AltOp(AltOp) {} 851 static InstructionsState invalid() { return {nullptr, nullptr}; } 852 }; 853 854 } // end anonymous namespace 855 856 /// \returns true if \p Opcode is allowed as part of the main/alternate 857 /// instruction for SLP vectorization. 
858 /// 859 /// Example of unsupported opcode is SDIV that can potentially cause UB if the 860 /// "shuffled out" lane would result in division by zero. 861 static bool isValidForAlternation(unsigned Opcode) { 862 if (Instruction::isIntDivRem(Opcode)) 863 return false; 864 865 return true; 866 } 867 868 static InstructionsState getSameOpcode(ArrayRef<Value *> VL, 869 const TargetLibraryInfo &TLI); 870 871 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. 872 /// compatible instructions or constants, or just some other regular values. 873 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, 874 Value *Op1, const TargetLibraryInfo &TLI) { 875 return (isConstant(BaseOp0) && isConstant(Op0)) || 876 (isConstant(BaseOp1) && isConstant(Op1)) || 877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && 878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || 879 BaseOp0 == Op0 || BaseOp1 == Op1 || 880 getSameOpcode({BaseOp0, Op0}, TLI) || 881 getSameOpcode({BaseOp1, Op1}, TLI); 882 } 883 884 /// \returns true if a compare instruction \p CI has similar "look" and 885 /// same predicate as \p BaseCI, "as is" or with its operands and predicate 886 /// swapped, false otherwise. 887 static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, 888 const TargetLibraryInfo &TLI) { 889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && 890 "Assessing comparisons of different types?"); 891 CmpInst::Predicate BasePred = BaseCI->getPredicate(); 892 CmpInst::Predicate Pred = CI->getPredicate(); 893 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred); 894 895 Value *BaseOp0 = BaseCI->getOperand(0); 896 Value *BaseOp1 = BaseCI->getOperand(1); 897 Value *Op0 = CI->getOperand(0); 898 Value *Op1 = CI->getOperand(1); 899 900 return (BasePred == Pred && 901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) || 902 (BasePred == SwappedPred && 903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI)); 904 } 905 906 /// \returns analysis of the Instructions in \p VL described in 907 /// InstructionsState, the Opcode that we suppose the whole list 908 /// could be vectorized even if its structure is diverse. 909 static InstructionsState getSameOpcode(ArrayRef<Value *> VL, 910 const TargetLibraryInfo &TLI) { 911 // Make sure these are all Instructions. 912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>)) 913 return InstructionsState::invalid(); 914 915 auto *It = find_if(VL, IsaPred<Instruction>); 916 if (It == VL.end()) 917 return InstructionsState::invalid(); 918 919 Instruction *MainOp = cast<Instruction>(*It); 920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>); 921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) || 922 (VL.size() == 2 && InstCnt < 2)) 923 return InstructionsState::invalid(); 924 925 bool IsCastOp = isa<CastInst>(MainOp); 926 bool IsBinOp = isa<BinaryOperator>(MainOp); 927 bool IsCmpOp = isa<CmpInst>(MainOp); 928 CmpInst::Predicate BasePred = IsCmpOp ? 
cast<CmpInst>(MainOp)->getPredicate() 929 : CmpInst::BAD_ICMP_PREDICATE; 930 Instruction *AltOp = MainOp; 931 unsigned Opcode = MainOp->getOpcode(); 932 unsigned AltOpcode = Opcode; 933 934 bool SwappedPredsCompatible = IsCmpOp && [&]() { 935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds; 936 UniquePreds.insert(BasePred); 937 UniqueNonSwappedPreds.insert(BasePred); 938 for (Value *V : VL) { 939 auto *I = dyn_cast<CmpInst>(V); 940 if (!I) 941 return false; 942 CmpInst::Predicate CurrentPred = I->getPredicate(); 943 CmpInst::Predicate SwappedCurrentPred = 944 CmpInst::getSwappedPredicate(CurrentPred); 945 UniqueNonSwappedPreds.insert(CurrentPred); 946 if (!UniquePreds.contains(CurrentPred) && 947 !UniquePreds.contains(SwappedCurrentPred)) 948 UniquePreds.insert(CurrentPred); 949 } 950 // Total number of predicates > 2, but if consider swapped predicates 951 // compatible only 2, consider swappable predicates as compatible opcodes, 952 // not alternate. 953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2; 954 }(); 955 // Check for one alternate opcode from another BinaryOperator. 956 // TODO - generalize to support all operators (types, calls etc.). 957 Intrinsic::ID BaseID = 0; 958 SmallVector<VFInfo> BaseMappings; 959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) { 960 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI); 961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase); 962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) 963 return InstructionsState::invalid(); 964 } 965 bool AnyPoison = InstCnt != VL.size(); 966 // Check MainOp too to be sure that it matches the requirements for the 967 // instructions. 968 for (Value *V : iterator_range(It, VL.end())) { 969 auto *I = dyn_cast<Instruction>(V); 970 if (!I) 971 continue; 972 973 // Cannot combine poison and divisions. 974 // TODO: do some smart analysis of the CallInsts to exclude divide-like 975 // intrinsics/functions only. 976 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I))) 977 return InstructionsState::invalid(); 978 unsigned InstOpcode = I->getOpcode(); 979 if (IsBinOp && isa<BinaryOperator>(I)) { 980 if (InstOpcode == Opcode || InstOpcode == AltOpcode) 981 continue; 982 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && 983 isValidForAlternation(Opcode)) { 984 AltOpcode = InstOpcode; 985 AltOp = I; 986 continue; 987 } 988 } else if (IsCastOp && isa<CastInst>(I)) { 989 Value *Op0 = MainOp->getOperand(0); 990 Type *Ty0 = Op0->getType(); 991 Value *Op1 = I->getOperand(0); 992 Type *Ty1 = Op1->getType(); 993 if (Ty0 == Ty1) { 994 if (InstOpcode == Opcode || InstOpcode == AltOpcode) 995 continue; 996 if (Opcode == AltOpcode) { 997 assert(isValidForAlternation(Opcode) && 998 isValidForAlternation(InstOpcode) && 999 "Cast isn't safe for alternation, logic needs to be updated!"); 1000 AltOpcode = InstOpcode; 1001 AltOp = I; 1002 continue; 1003 } 1004 } 1005 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) { 1006 auto *BaseInst = cast<CmpInst>(MainOp); 1007 Type *Ty0 = BaseInst->getOperand(0)->getType(); 1008 Type *Ty1 = Inst->getOperand(0)->getType(); 1009 if (Ty0 == Ty1) { 1010 assert(InstOpcode == Opcode && "Expected same CmpInst opcode."); 1011 assert(InstOpcode == AltOpcode && 1012 "Alternate instructions are only supported by BinaryOperator " 1013 "and CastInst."); 1014 // Check for compatible operands. If the corresponding operands are not 1015 // compatible - need to perform alternate vectorization. 
1016 CmpInst::Predicate CurrentPred = Inst->getPredicate(); 1017 CmpInst::Predicate SwappedCurrentPred = 1018 CmpInst::getSwappedPredicate(CurrentPred); 1019 1020 if ((VL.size() == 2 || SwappedPredsCompatible) && 1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) 1022 continue; 1023 1024 if (isCmpSameOrSwapped(BaseInst, Inst, TLI)) 1025 continue; 1026 auto *AltInst = cast<CmpInst>(AltOp); 1027 if (MainOp != AltOp) { 1028 if (isCmpSameOrSwapped(AltInst, Inst, TLI)) 1029 continue; 1030 } else if (BasePred != CurrentPred) { 1031 assert( 1032 isValidForAlternation(InstOpcode) && 1033 "CmpInst isn't safe for alternation, logic needs to be updated!"); 1034 AltOp = I; 1035 continue; 1036 } 1037 CmpInst::Predicate AltPred = AltInst->getPredicate(); 1038 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || 1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred) 1040 continue; 1041 } 1042 } else if (InstOpcode == Opcode) { 1043 assert(InstOpcode == AltOpcode && 1044 "Alternate instructions are only supported by BinaryOperator and " 1045 "CastInst."); 1046 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { 1047 if (Gep->getNumOperands() != 2 || 1048 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType()) 1049 return InstructionsState::invalid(); 1050 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) { 1051 if (!isVectorLikeInstWithConstOps(EI)) 1052 return InstructionsState::invalid(); 1053 } else if (auto *LI = dyn_cast<LoadInst>(I)) { 1054 auto *BaseLI = cast<LoadInst>(MainOp); 1055 if (!LI->isSimple() || !BaseLI->isSimple()) 1056 return InstructionsState::invalid(); 1057 } else if (auto *Call = dyn_cast<CallInst>(I)) { 1058 auto *CallBase = cast<CallInst>(MainOp); 1059 if (Call->getCalledFunction() != CallBase->getCalledFunction()) 1060 return InstructionsState::invalid(); 1061 if (Call->hasOperandBundles() && 1062 (!CallBase->hasOperandBundles() || 1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), 1064 Call->op_begin() + Call->getBundleOperandsEndIndex(), 1065 CallBase->op_begin() + 1066 CallBase->getBundleOperandsStartIndex()))) 1067 return InstructionsState::invalid(); 1068 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI); 1069 if (ID != BaseID) 1070 return InstructionsState::invalid(); 1071 if (!ID) { 1072 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call); 1073 if (Mappings.size() != BaseMappings.size() || 1074 Mappings.front().ISA != BaseMappings.front().ISA || 1075 Mappings.front().ScalarName != BaseMappings.front().ScalarName || 1076 Mappings.front().VectorName != BaseMappings.front().VectorName || 1077 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || 1078 Mappings.front().Shape.Parameters != 1079 BaseMappings.front().Shape.Parameters) 1080 return InstructionsState::invalid(); 1081 } 1082 } 1083 continue; 1084 } 1085 return InstructionsState::invalid(); 1086 } 1087 1088 return InstructionsState(MainOp, AltOp); 1089 } 1090 1091 /// \returns true if all of the values in \p VL have the same type or false 1092 /// otherwise. 1093 static bool allSameType(ArrayRef<Value *> VL) { 1094 Type *Ty = VL.front()->getType(); 1095 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; }); 1096 } 1097 1098 /// \returns True if in-tree use also needs extract. This refers to 1099 /// possible scalar operand in vectorized instruction. 
1100 static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 1101 TargetLibraryInfo *TLI, 1102 const TargetTransformInfo *TTI) { 1103 if (!UserInst) 1104 return false; 1105 unsigned Opcode = UserInst->getOpcode(); 1106 switch (Opcode) { 1107 case Instruction::Load: { 1108 LoadInst *LI = cast<LoadInst>(UserInst); 1109 return (LI->getPointerOperand() == Scalar); 1110 } 1111 case Instruction::Store: { 1112 StoreInst *SI = cast<StoreInst>(UserInst); 1113 return (SI->getPointerOperand() == Scalar); 1114 } 1115 case Instruction::Call: { 1116 CallInst *CI = cast<CallInst>(UserInst); 1117 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 1118 return any_of(enumerate(CI->args()), [&](auto &&Arg) { 1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) && 1120 Arg.value().get() == Scalar; 1121 }); 1122 } 1123 default: 1124 return false; 1125 } 1126 } 1127 1128 /// \returns the AA location that is being access by the instruction. 1129 static MemoryLocation getLocation(Instruction *I) { 1130 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1131 return MemoryLocation::get(SI); 1132 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1133 return MemoryLocation::get(LI); 1134 return MemoryLocation(); 1135 } 1136 1137 /// \returns True if the instruction is not a volatile or atomic load/store. 1138 static bool isSimple(Instruction *I) { 1139 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1140 return LI->isSimple(); 1141 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 1142 return SI->isSimple(); 1143 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 1144 return !MI->isVolatile(); 1145 return true; 1146 } 1147 1148 /// Shuffles \p Mask in accordance with the given \p SubMask. 1149 /// \param ExtendingManyInputs Supports reshuffling of the mask with not only 1150 /// one but two input vectors. 1151 static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, 1152 bool ExtendingManyInputs = false) { 1153 if (SubMask.empty()) 1154 return; 1155 assert( 1156 (!ExtendingManyInputs || SubMask.size() > Mask.size() || 1157 // Check if input scalars were extended to match the size of other node. 1158 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) && 1159 "SubMask with many inputs support must be larger than the mask."); 1160 if (Mask.empty()) { 1161 Mask.append(SubMask.begin(), SubMask.end()); 1162 return; 1163 } 1164 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem); 1165 int TermValue = std::min(Mask.size(), SubMask.size()); 1166 for (int I = 0, E = SubMask.size(); I < E; ++I) { 1167 if (SubMask[I] == PoisonMaskElem || 1168 (!ExtendingManyInputs && 1169 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue))) 1170 continue; 1171 NewMask[I] = Mask[SubMask[I]]; 1172 } 1173 Mask.swap(NewMask); 1174 } 1175 1176 /// Order may have elements assigned special value (size) which is out of 1177 /// bounds. Such indices only appear on places which correspond to undef values 1178 /// (see canReuseExtract for details) and used in order to avoid undef values 1179 /// have effect on operands ordering. 1180 /// The first loop below simply finds all unused indices and then the next loop 1181 /// nest assigns these indices for undef values positions. 
1182 /// As an example below Order has two undef positions and they have assigned 1183 /// values 3 and 7 respectively: 1184 /// before: 6 9 5 4 9 2 1 0 1185 /// after: 6 3 5 4 7 2 1 0 1186 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) { 1187 const unsigned Sz = Order.size(); 1188 SmallBitVector UnusedIndices(Sz, /*t=*/true); 1189 SmallBitVector MaskedIndices(Sz); 1190 for (unsigned I = 0; I < Sz; ++I) { 1191 if (Order[I] < Sz) 1192 UnusedIndices.reset(Order[I]); 1193 else 1194 MaskedIndices.set(I); 1195 } 1196 if (MaskedIndices.none()) 1197 return; 1198 assert(UnusedIndices.count() == MaskedIndices.count() && 1199 "Non-synced masked/available indices."); 1200 int Idx = UnusedIndices.find_first(); 1201 int MIdx = MaskedIndices.find_first(); 1202 while (MIdx >= 0) { 1203 assert(Idx >= 0 && "Indices must be synced."); 1204 Order[MIdx] = Idx; 1205 Idx = UnusedIndices.find_next(Idx); 1206 MIdx = MaskedIndices.find_next(MIdx); 1207 } 1208 } 1209 1210 /// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1211 /// Opcode1. 1212 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0, 1213 unsigned Opcode1) { 1214 Type *ScalarTy = VL[0]->getType(); 1215 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 1216 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false); 1217 for (unsigned Lane : seq<unsigned>(VL.size())) { 1218 if (isa<PoisonValue>(VL[Lane])) 1219 continue; 1220 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1) 1221 OpcodeMask.set(Lane * ScalarTyNumElements, 1222 Lane * ScalarTyNumElements + ScalarTyNumElements); 1223 } 1224 return OpcodeMask; 1225 } 1226 1227 namespace llvm { 1228 1229 static void inversePermutation(ArrayRef<unsigned> Indices, 1230 SmallVectorImpl<int> &Mask) { 1231 Mask.clear(); 1232 const unsigned E = Indices.size(); 1233 Mask.resize(E, PoisonMaskElem); 1234 for (unsigned I = 0; I < E; ++I) 1235 Mask[Indices[I]] = I; 1236 } 1237 1238 /// Reorders the list of scalars in accordance with the given \p Mask. 1239 static void reorderScalars(SmallVectorImpl<Value *> &Scalars, 1240 ArrayRef<int> Mask) { 1241 assert(!Mask.empty() && "Expected non-empty mask."); 1242 SmallVector<Value *> Prev(Scalars.size(), 1243 PoisonValue::get(Scalars.front()->getType())); 1244 Prev.swap(Scalars); 1245 for (unsigned I = 0, E = Prev.size(); I < E; ++I) 1246 if (Mask[I] != PoisonMaskElem) 1247 Scalars[Mask[I]] = Prev[I]; 1248 } 1249 1250 /// Checks if the provided value does not require scheduling. It does not 1251 /// require scheduling if this is not an instruction or it is an instruction 1252 /// that does not read/write memory and all operands are either not instructions 1253 /// or phi nodes or instructions from different blocks. 1254 static bool areAllOperandsNonInsts(Value *V) { 1255 auto *I = dyn_cast<Instruction>(V); 1256 if (!I) 1257 return true; 1258 return !mayHaveNonDefUseDependency(*I) && 1259 all_of(I->operands(), [I](Value *V) { 1260 auto *IO = dyn_cast<Instruction>(V); 1261 if (!IO) 1262 return true; 1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent(); 1264 }); 1265 } 1266 1267 /// Checks if the provided value does not require scheduling. It does not 1268 /// require scheduling if this is not an instruction or it is an instruction 1269 /// that does not read/write memory and all users are phi nodes or instructions 1270 /// from the different blocks. 
1271 static bool isUsedOutsideBlock(Value *V) { 1272 auto *I = dyn_cast<Instruction>(V); 1273 if (!I) 1274 return true; 1275 // Limits the number of uses to save compile time. 1276 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && 1277 all_of(I->users(), [I](User *U) { 1278 auto *IU = dyn_cast<Instruction>(U); 1279 if (!IU) 1280 return true; 1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU); 1282 }); 1283 } 1284 1285 /// Checks if the specified value does not require scheduling. It does not 1286 /// require scheduling if all operands and all users do not need to be scheduled 1287 /// in the current basic block. 1288 static bool doesNotNeedToBeScheduled(Value *V) { 1289 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); 1290 } 1291 1292 /// Checks if the specified array of instructions does not require scheduling. 1293 /// It is so if all either instructions have operands that do not require 1294 /// scheduling or their users do not require scheduling since they are phis or 1295 /// in other basic blocks. 1296 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) { 1297 return !VL.empty() && 1298 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); 1299 } 1300 1301 /// Returns true if widened type of \p Ty elements with size \p Sz represents 1302 /// full vector type, i.e. adding extra element results in extra parts upon type 1303 /// legalization. 1304 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, 1305 unsigned Sz) { 1306 if (Sz <= 1) 1307 return false; 1308 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty)) 1309 return false; 1310 if (has_single_bit(Sz)) 1311 return true; 1312 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); 1313 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) && 1314 Sz % NumParts == 0; 1315 } 1316 1317 /// Returns number of parts, the type \p VecTy will be split at the codegen 1318 /// phase. If the type is going to be scalarized or does not uses whole 1319 /// registers, returns 1. 1320 static unsigned 1321 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, 1322 const unsigned Limit = std::numeric_limits<unsigned>::max()) { 1323 unsigned NumParts = TTI.getNumberOfParts(VecTy); 1324 if (NumParts == 0 || NumParts >= Limit) 1325 return 1; 1326 unsigned Sz = getNumElements(VecTy); 1327 if (NumParts >= Sz || Sz % NumParts != 0 || 1328 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts)) 1329 return 1; 1330 return NumParts; 1331 } 1332 1333 namespace slpvectorizer { 1334 1335 /// Bottom Up SLP Vectorizer. 1336 class BoUpSLP { 1337 struct TreeEntry; 1338 struct ScheduleData; 1339 class ShuffleCostEstimator; 1340 class ShuffleInstructionBuilder; 1341 1342 public: 1343 /// Tracks the state we can represent the loads in the given sequence. 
1344 enum class LoadsState { 1345 Gather, 1346 Vectorize, 1347 ScatterVectorize, 1348 StridedVectorize 1349 }; 1350 1351 using ValueList = SmallVector<Value *, 8>; 1352 using InstrList = SmallVector<Instruction *, 16>; 1353 using ValueSet = SmallPtrSet<Value *, 16>; 1354 using StoreList = SmallVector<StoreInst *, 8>; 1355 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>; 1356 using OrdersType = SmallVector<unsigned, 4>; 1357 1358 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, 1359 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, 1360 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, 1361 const DataLayout *DL, OptimizationRemarkEmitter *ORE) 1362 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt), 1363 AC(AC), DB(DB), DL(DL), ORE(ORE), 1364 Builder(Se->getContext(), TargetFolder(*DL)) { 1365 CodeMetrics::collectEphemeralValues(F, AC, EphValues); 1366 // Use the vector register size specified by the target unless overridden 1367 // by a command-line option. 1368 // TODO: It would be better to limit the vectorization factor based on 1369 // data type rather than just register size. For example, x86 AVX has 1370 // 256-bit registers, but it does not support integer operations 1371 // at that width (that requires AVX2). 1372 if (MaxVectorRegSizeOption.getNumOccurrences()) 1373 MaxVecRegSize = MaxVectorRegSizeOption; 1374 else 1375 MaxVecRegSize = 1376 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 1377 .getFixedValue(); 1378 1379 if (MinVectorRegSizeOption.getNumOccurrences()) 1380 MinVecRegSize = MinVectorRegSizeOption; 1381 else 1382 MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); 1383 } 1384 1385 /// Vectorize the tree that starts with the elements in \p VL. 1386 /// Returns the vectorized root. 1387 Value *vectorizeTree(); 1388 1389 /// Vectorize the tree but with the list of externally used values \p 1390 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1391 /// generated extractvalue instructions. 1392 Value * 1393 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, 1394 Instruction *ReductionRoot = nullptr); 1395 1396 /// \returns the cost incurred by unwanted spills and fills, caused by 1397 /// holding live values over call sites. 1398 InstructionCost getSpillCost() const; 1399 1400 /// \returns the vectorization cost of the subtree that starts at \p VL. 1401 /// A negative number means that this is profitable. 1402 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {}); 1403 1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1406 void buildTree(ArrayRef<Value *> Roots, 1407 const SmallDenseSet<Value *> &UserIgnoreLst); 1408 1409 /// Construct a vectorizable tree that starts at \p Roots. 1410 void buildTree(ArrayRef<Value *> Roots); 1411 1412 /// Returns whether the root node has in-tree uses. 1413 bool doesRootHaveInTreeUses() const { 1414 return !VectorizableTree.empty() && 1415 !VectorizableTree.front()->UserTreeIndices.empty(); 1416 } 1417 1418 /// Return the scalars of the root node. 1419 ArrayRef<Value *> getRootNodeScalars() const { 1420 assert(!VectorizableTree.empty() && "No graph to get the first node from"); 1421 return VectorizableTree.front()->Scalars; 1422 } 1423 1424 /// Returns the type/is-signed info for the root node in the graph without 1425 /// casting. 
1426 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const { 1427 const TreeEntry &Root = *VectorizableTree.front().get(); 1428 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() || 1429 !Root.Scalars.front()->getType()->isIntegerTy()) 1430 return std::nullopt; 1431 auto It = MinBWs.find(&Root); 1432 if (It != MinBWs.end()) 1433 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(), 1434 It->second.first), 1435 It->second.second); 1436 if (Root.getOpcode() == Instruction::ZExt || 1437 Root.getOpcode() == Instruction::SExt) 1438 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(), 1439 Root.getOpcode() == Instruction::SExt); 1440 return std::nullopt; 1441 } 1442 1443 /// Checks if the root graph node can be emitted with narrower bitwidth at 1444 /// codegen and returns it signedness, if so. 1445 bool isSignedMinBitwidthRootNode() const { 1446 return MinBWs.at(VectorizableTree.front().get()).second; 1447 } 1448 1449 /// Returns reduction type after minbitdth analysis. 1450 FixedVectorType *getReductionType() const { 1451 if (ReductionBitWidth == 0 || 1452 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() || 1453 ReductionBitWidth >= 1454 DL->getTypeSizeInBits( 1455 VectorizableTree.front()->Scalars.front()->getType())) 1456 return getWidenedType( 1457 VectorizableTree.front()->Scalars.front()->getType(), 1458 VectorizableTree.front()->getVectorFactor()); 1459 return getWidenedType( 1460 IntegerType::get( 1461 VectorizableTree.front()->Scalars.front()->getContext(), 1462 ReductionBitWidth), 1463 VectorizableTree.front()->getVectorFactor()); 1464 } 1465 1466 /// Builds external uses of the vectorized scalars, i.e. the list of 1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1468 /// ExternallyUsedValues contains additional list of external uses to handle 1469 /// vectorization of reductions. 1470 void 1471 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); 1472 1473 /// Transforms graph nodes to target specific representations, if profitable. 1474 void transformNodes(); 1475 1476 /// Clear the internal data structures that are created by 'buildTree'. 1477 void deleteTree() { 1478 VectorizableTree.clear(); 1479 ScalarToTreeEntries.clear(); 1480 MustGather.clear(); 1481 NonScheduledFirst.clear(); 1482 EntryToLastInstruction.clear(); 1483 LoadEntriesToVectorize.clear(); 1484 IsGraphTransformMode = false; 1485 GatheredLoadsEntriesFirst.reset(); 1486 ExternalUses.clear(); 1487 ExternalUsesAsOriginalScalar.clear(); 1488 for (auto &Iter : BlocksSchedules) { 1489 BlockScheduling *BS = Iter.second.get(); 1490 BS->clear(); 1491 } 1492 MinBWs.clear(); 1493 ReductionBitWidth = 0; 1494 BaseGraphSize = 1; 1495 CastMaxMinBWSizes.reset(); 1496 ExtraBitWidthNodes.clear(); 1497 InstrElementSize.clear(); 1498 UserIgnoreList = nullptr; 1499 PostponedGathers.clear(); 1500 ValueToGatherNodes.clear(); 1501 } 1502 1503 unsigned getTreeSize() const { return VectorizableTree.size(); } 1504 1505 /// Returns the base graph size, before any transformations. 1506 unsigned getCanonicalGraphSize() const { return BaseGraphSize; } 1507 1508 /// Perform LICM and CSE on the newly generated gather sequences. 1509 void optimizeGatherSequence(); 1510 1511 /// Does this non-empty order represent an identity order? Identity 1512 /// should be represented as an empty order, so this is used to 1513 /// decide if we can canonicalize a computed order. 
Undef elements
1514 /// (represented as Order.size()) are ignored.
1515 bool isIdentityOrder(ArrayRef<unsigned> Order) const {
1516 assert(!Order.empty() && "expected non-empty order");
1517 const unsigned Sz = Order.size();
1518 return all_of(enumerate(Order), [&](const auto &P) {
1519 return P.value() == P.index() || P.value() == Sz;
1520 });
1521 }
1522
1523 /// Checks if the specified gather tree entry \p TE can be represented as a
1524 /// shuffled vector entry + (possibly) permutation with other gathers. It
1525 /// implements the checks only for possibly ordered scalars (Loads,
1526 /// ExtractElement, ExtractValue), which can be part of the graph.
1527 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1528
1529 /// Sort loads into increasing pointer offsets to allow greater clustering.
1530 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1531
1532 /// Gets reordering data for the given tree entry. If the entry is vectorized,
1533 /// just return ReorderIndices; otherwise check if the scalars can be
1534 /// reordered and return the most optimal order.
1535 /// \return std::nullopt if ordering is not important, an empty order if the
1536 /// identity order is important, or the actual order otherwise.
1537 /// \param TopToBottom If true, include the order of vectorized stores and
1538 /// insertelement nodes, otherwise skip them.
1539 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1540 bool TopToBottom);
1541
1542 /// Reorders the current graph to the most profitable order starting from the
1543 /// root node to the leaf nodes. The best order is chosen only from the nodes
1544 /// of the same size (vectorization factor). Smaller nodes are considered
1545 /// parts of a subgraph with a smaller VF and are reordered independently. We
1546 /// can do this because we still need to extend smaller nodes to the wider VF
1547 /// and we can merge the reordering shuffles with the widening shuffles.
1548 void reorderTopToBottom();
1549
1550 /// Reorders the current graph to the most profitable order starting from
1551 /// leaves to the root. It allows rotating small subgraphs and reduces the
1552 /// number of reshuffles if the leaf nodes use the same order. In this case we
1553 /// can merge the orders and just shuffle the user node instead of shuffling
1554 /// its operands. Plus, even if the leaf nodes have different orders, it
1555 /// allows sinking the reordering in the graph closer to the root node and
1556 /// merging it later during analysis.
1557 void reorderBottomToTop(bool IgnoreReorder = false);
1558
1559 /// \return The vector element size in bits to use when vectorizing the
1560 /// expression tree ending at \p V. If V is a store, the size is the width of
1561 /// the stored value. Otherwise, the size is the width of the largest loaded
1562 /// value reaching V. This method is used by the vectorizer to calculate
1563 /// vectorization factors.
1564 unsigned getVectorElementSize(Value *V);
1565
1566 /// Compute the minimum type sizes required to represent the entries in a
1567 /// vectorizable tree.
1568 void computeMinimumValueSizes();
1569
1570 // \returns the maximum vector register size as set by TTI or overridden by cl::opt.
1571 unsigned getMaxVecRegSize() const {
1572 return MaxVecRegSize;
1573 }
1574
1575 // \returns the minimum vector register size as set by TTI or overridden by cl::opt.
1576 unsigned getMinVecRegSize() const {
1577 return MinVecRegSize;
1578 }
1579
1580 unsigned getMinVF(unsigned Sz) const {
1581 return std::max(2U, getMinVecRegSize() / Sz);
1582 }
1583
1584 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1585 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1586 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1587 return MaxVF ? MaxVF : UINT_MAX;
1588 }
1589
1590 /// Check if a homogeneous aggregate is isomorphic to some VectorType.
1591 /// Accepts homogeneous multidimensional aggregates of scalars/vectors like
1592 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1593 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1594 ///
1595 /// \returns the number of elements in the vector if the isomorphism exists, 0 otherwise.
1596 unsigned canMapToVector(Type *T) const;
1597
1598 /// \returns True if the VectorizableTree is both tiny and not fully
1599 /// vectorizable. We do not vectorize such trees.
1600 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1601
1602 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1603 /// This may happen if all gather nodes are loads that cannot be
1604 /// "clusterized". In this case even the subgraphs cannot be vectorized more
1605 /// effectively than the base graph.
1606 bool isTreeNotExtendable() const;
1607
1608 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1609 /// can be load combined in the backend. Load combining may not be allowed in
1610 /// the IR optimizer, so we do not want to alter the pattern. For example,
1611 /// partially transforming a scalar bswap() pattern into vector code is
1612 /// effectively impossible for the backend to undo.
1613 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1614 /// may not be necessary.
1615 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1616
1617 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1618 /// can be load combined in the backend. Load combining may not be allowed in
1619 /// the IR optimizer, so we do not want to alter the pattern. For example,
1620 /// partially transforming a scalar bswap() pattern into vector code is
1621 /// effectively impossible for the backend to undo.
1622 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1623 /// may not be necessary.
1624 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1625
1626 /// Checks if the given array of loads can be represented as a vectorized
1627 /// load, a scatter, or just a simple gather.
1628 /// \param VL list of loads.
1629 /// \param VL0 main load value.
1630 /// \param Order returned order of load instructions.
1631 /// \param PointerOps returned list of pointer operands.
1632 /// \param BestVF returned best vector factor, if the recursive check found a
1633 /// better vectorization sequence than a masked gather.
1634 /// \param TryRecursiveCheck used to check if a long masked gather can be
1635 /// represented as a series of loads/insert-subvector operations, if profitable.
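/// As a rough illustration (not an exhaustive specification): consecutive
/// loads such as A[0],A[1],A[2],A[3] are expected to map to
/// LoadsState::Vectorize, loads with a constant non-unit stride may map to
/// LoadsState::StridedVectorize, pointers that are only reachable via a
/// masked gather map to LoadsState::ScatterVectorize, and anything else
/// falls back to LoadsState::Gather.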
1636 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, 1637 SmallVectorImpl<unsigned> &Order, 1638 SmallVectorImpl<Value *> &PointerOps, 1639 unsigned *BestVF = nullptr, 1640 bool TryRecursiveCheck = true) const; 1641 1642 /// Registers non-vectorizable sequence of loads 1643 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) { 1644 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL)); 1645 } 1646 1647 /// Checks if the given loads sequence is known as not vectorizable 1648 template <typename T> 1649 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const { 1650 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL)); 1651 } 1652 1653 OptimizationRemarkEmitter *getORE() { return ORE; } 1654 1655 /// This structure holds any data we need about the edges being traversed 1656 /// during buildTree_rec(). We keep track of: 1657 /// (i) the user TreeEntry index, and 1658 /// (ii) the index of the edge. 1659 struct EdgeInfo { 1660 EdgeInfo() = default; 1661 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) 1662 : UserTE(UserTE), EdgeIdx(EdgeIdx) {} 1663 /// The user TreeEntry. 1664 TreeEntry *UserTE = nullptr; 1665 /// The operand index of the use. 1666 unsigned EdgeIdx = UINT_MAX; 1667 #ifndef NDEBUG 1668 friend inline raw_ostream &operator<<(raw_ostream &OS, 1669 const BoUpSLP::EdgeInfo &EI) { 1670 EI.dump(OS); 1671 return OS; 1672 } 1673 /// Debug print. 1674 void dump(raw_ostream &OS) const { 1675 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") 1676 << " EdgeIdx:" << EdgeIdx << "}"; 1677 } 1678 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } 1679 #endif 1680 bool operator == (const EdgeInfo &Other) const { 1681 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; 1682 } 1683 }; 1684 1685 /// A helper class used for scoring candidates for two consecutive lanes. 1686 class LookAheadHeuristics { 1687 const TargetLibraryInfo &TLI; 1688 const DataLayout &DL; 1689 ScalarEvolution &SE; 1690 const BoUpSLP &R; 1691 int NumLanes; // Total number of lanes (aka vectorization factor). 1692 int MaxLevel; // The maximum recursion depth for accumulating score. 1693 1694 public: 1695 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, 1696 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, 1697 int MaxLevel) 1698 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes), 1699 MaxLevel(MaxLevel) {} 1700 1701 // The hard-coded scores listed here are not very important, though it shall 1702 // be higher for better matches to improve the resulting cost. When 1703 // computing the scores of matching one sub-tree with another, we are 1704 // basically counting the number of values that are matching. So even if all 1705 // scores are set to 1, we would still get a decent matching result. 1706 // However, sometimes we have to break ties. For example we may have to 1707 // choose between matching loads vs matching opcodes. This is what these 1708 // scores are helping us with: they provide the order of preference. Also, 1709 // this is important if the scalar is externally used or used in another 1710 // tree entry node in the different lane. 1711 1712 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1713 static const int ScoreConsecutiveLoads = 4; 1714 /// The same load multiple times. 
This should have a better score than
1715 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1716 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1717 /// a vector load and 1.0 for a broadcast.
1718 static const int ScoreSplatLoads = 3;
1719 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1720 static const int ScoreReversedLoads = 3;
1721 /// A load candidate for a masked gather.
1722 static const int ScoreMaskedGatherCandidate = 1;
1723 /// ExtractElementInst from the same vector and consecutive indexes.
1724 static const int ScoreConsecutiveExtracts = 4;
1725 /// ExtractElementInst from the same vector and reversed indices.
1726 static const int ScoreReversedExtracts = 3;
1727 /// Constants.
1728 static const int ScoreConstants = 2;
1729 /// Instructions with the same opcode.
1730 static const int ScoreSameOpcode = 2;
1731 /// Instructions with alternate opcodes (e.g., add + sub).
1732 static const int ScoreAltOpcodes = 1;
1733 /// Identical instructions (a.k.a. splat or broadcast).
1734 static const int ScoreSplat = 1;
1735 /// Matching with an undef is preferable to failing.
1736 static const int ScoreUndef = 1;
1737 /// Score for failing to find a decent match.
1738 static const int ScoreFail = 0;
1739 /// Score if all users are vectorized.
1740 static const int ScoreAllUserVectorized = 1;
1741
1742 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1743 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1744 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1745 /// MainAltOps.
1746 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1747 ArrayRef<Value *> MainAltOps) const {
1748 if (!isValidElementType(V1->getType()) ||
1749 !isValidElementType(V2->getType()))
1750 return LookAheadHeuristics::ScoreFail;
1751
1752 if (V1 == V2) {
1753 if (isa<LoadInst>(V1)) {
1754 // Returns true if the users of V1 and V2 won't need to be extracted.
1755 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1756 // Bail out if we have too many uses to save compilation time.
1757 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1758 return false;
1759
1760 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1761 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1762 return U == U1 || U == U2 || R.isVectorized(U);
1763 });
1764 };
1765 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1766 };
1767 // A broadcast of a load can be cheaper on some targets.
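// For example (a sketch of the intent, not a target guarantee): when
// TTI->isLegalBroadcastLoad() reports that a broadcast load is legal for
// this element type and lane count, the target can usually fold the load
// and the splat into one instruction, so ScoreSplatLoads is preferred over
// the generic ScoreSplat returned below.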
1768 if (R.TTI->isLegalBroadcastLoad(V1->getType(), 1769 ElementCount::getFixed(NumLanes)) && 1770 ((int)V1->getNumUses() == NumLanes || 1771 AllUsersAreInternal(V1, V2))) 1772 return LookAheadHeuristics::ScoreSplatLoads; 1773 } 1774 return LookAheadHeuristics::ScoreSplat; 1775 } 1776 1777 auto CheckSameEntryOrFail = [&]() { 1778 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) { 1779 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end()); 1780 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2); 1781 !TEs2.empty() && 1782 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); })) 1783 return LookAheadHeuristics::ScoreSplatLoads; 1784 } 1785 return LookAheadHeuristics::ScoreFail; 1786 }; 1787 1788 auto *LI1 = dyn_cast<LoadInst>(V1); 1789 auto *LI2 = dyn_cast<LoadInst>(V2); 1790 if (LI1 && LI2) { 1791 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || 1792 !LI2->isSimple()) 1793 return CheckSameEntryOrFail(); 1794 1795 std::optional<int> Dist = getPointersDiff( 1796 LI1->getType(), LI1->getPointerOperand(), LI2->getType(), 1797 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); 1798 if (!Dist || *Dist == 0) { 1799 if (getUnderlyingObject(LI1->getPointerOperand()) == 1800 getUnderlyingObject(LI2->getPointerOperand()) && 1801 R.TTI->isLegalMaskedGather( 1802 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign())) 1803 return LookAheadHeuristics::ScoreMaskedGatherCandidate; 1804 return CheckSameEntryOrFail(); 1805 } 1806 // The distance is too large - still may be profitable to use masked 1807 // loads/gathers. 1808 if (std::abs(*Dist) > NumLanes / 2) 1809 return LookAheadHeuristics::ScoreMaskedGatherCandidate; 1810 // This still will detect consecutive loads, but we might have "holes" 1811 // in some cases. It is ok for non-power-2 vectorization and may produce 1812 // better results. It should not affect current vectorization. 1813 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads 1814 : LookAheadHeuristics::ScoreReversedLoads; 1815 } 1816 1817 auto *C1 = dyn_cast<Constant>(V1); 1818 auto *C2 = dyn_cast<Constant>(V2); 1819 if (C1 && C2) 1820 return LookAheadHeuristics::ScoreConstants; 1821 1822 // Extracts from consecutive indexes of the same vector better score as 1823 // the extracts could be optimized away. 1824 Value *EV1; 1825 ConstantInt *Ex1Idx; 1826 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { 1827 // Undefs are always profitable for extractelements. 1828 // Compiler can easily combine poison and extractelement <non-poison> or 1829 // undef and extractelement <poison>. But combining undef + 1830 // extractelement <non-poison-but-may-produce-poison> requires some 1831 // extra operations. 1832 if (isa<UndefValue>(V2)) 1833 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all()) 1834 ? LookAheadHeuristics::ScoreConsecutiveExtracts 1835 : LookAheadHeuristics::ScoreSameOpcode; 1836 Value *EV2 = nullptr; 1837 ConstantInt *Ex2Idx = nullptr; 1838 if (match(V2, 1839 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), 1840 m_Undef())))) { 1841 // Undefs are always profitable for extractelements. 
1842 if (!Ex2Idx) 1843 return LookAheadHeuristics::ScoreConsecutiveExtracts; 1844 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType()) 1845 return LookAheadHeuristics::ScoreConsecutiveExtracts; 1846 if (EV2 == EV1) { 1847 int Idx1 = Ex1Idx->getZExtValue(); 1848 int Idx2 = Ex2Idx->getZExtValue(); 1849 int Dist = Idx2 - Idx1; 1850 // The distance is too large - still may be profitable to use 1851 // shuffles. 1852 if (std::abs(Dist) == 0) 1853 return LookAheadHeuristics::ScoreSplat; 1854 if (std::abs(Dist) > NumLanes / 2) 1855 return LookAheadHeuristics::ScoreSameOpcode; 1856 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts 1857 : LookAheadHeuristics::ScoreReversedExtracts; 1858 } 1859 return LookAheadHeuristics::ScoreAltOpcodes; 1860 } 1861 return CheckSameEntryOrFail(); 1862 } 1863 1864 auto *I1 = dyn_cast<Instruction>(V1); 1865 auto *I2 = dyn_cast<Instruction>(V2); 1866 if (I1 && I2) { 1867 if (I1->getParent() != I2->getParent()) 1868 return CheckSameEntryOrFail(); 1869 SmallVector<Value *, 4> Ops(MainAltOps); 1870 Ops.push_back(I1); 1871 Ops.push_back(I2); 1872 InstructionsState S = getSameOpcode(Ops, TLI); 1873 // Note: Only consider instructions with <= 2 operands to avoid 1874 // complexity explosion. 1875 if (S && 1876 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || 1877 !S.isAltShuffle()) && 1878 all_of(Ops, [&S](Value *V) { 1879 return isa<PoisonValue>(V) || 1880 cast<Instruction>(V)->getNumOperands() == 1881 S.getMainOp()->getNumOperands(); 1882 })) 1883 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes 1884 : LookAheadHeuristics::ScoreSameOpcode; 1885 } 1886 1887 if (I1 && isa<PoisonValue>(V2)) 1888 return LookAheadHeuristics::ScoreSameOpcode; 1889 1890 if (isa<UndefValue>(V2)) 1891 return LookAheadHeuristics::ScoreUndef; 1892 1893 return CheckSameEntryOrFail(); 1894 } 1895 1896 /// Go through the operands of \p LHS and \p RHS recursively until 1897 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are 1898 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands 1899 /// of \p U1 and \p U2), except at the beginning of the recursion where 1900 /// these are set to nullptr. 1901 /// 1902 /// For example: 1903 /// \verbatim 1904 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] 1905 /// \ / \ / \ / \ / 1906 /// + + + + 1907 /// G1 G2 G3 G4 1908 /// \endverbatim 1909 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at 1910 /// each level recursively, accumulating the score. It starts from matching 1911 /// the additions at level 0, then moves on to the loads (level 1). The 1912 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and 1913 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while 1914 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. 1915 /// Please note that the order of the operands does not matter, as we 1916 /// evaluate the score of all profitable combinations of operands. In 1917 /// other words the score of G1 and G4 is the same as G1 and G2. This 1918 /// heuristic is based on ideas described in: 1919 /// Look-ahead SLP: Auto-vectorization in the presence of commutative 1920 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 1921 /// Luís F. W. Góes 1922 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, 1923 Instruction *U2, int CurrLevel, 1924 ArrayRef<Value *> MainAltOps) const { 1925 1926 // Get the shallow score of V1 and V2. 
1927 int ShallowScoreAtThisLevel = 1928 getShallowScore(LHS, RHS, U1, U2, MainAltOps); 1929 1930 // If reached MaxLevel, 1931 // or if V1 and V2 are not instructions, 1932 // or if they are SPLAT, 1933 // or if they are not consecutive, 1934 // or if profitable to vectorize loads or extractelements, early return 1935 // the current cost. 1936 auto *I1 = dyn_cast<Instruction>(LHS); 1937 auto *I2 = dyn_cast<Instruction>(RHS); 1938 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || 1939 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || 1940 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || 1941 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || 1942 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && 1943 ShallowScoreAtThisLevel)) 1944 return ShallowScoreAtThisLevel; 1945 assert(I1 && I2 && "Should have early exited."); 1946 1947 // Contains the I2 operand indexes that got matched with I1 operands. 1948 SmallSet<unsigned, 4> Op2Used; 1949 1950 // Recursion towards the operands of I1 and I2. We are trying all possible 1951 // operand pairs, and keeping track of the best score. 1952 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); 1953 OpIdx1 != NumOperands1; ++OpIdx1) { 1954 // Try to pair op1I with the best operand of I2. 1955 int MaxTmpScore = 0; 1956 unsigned MaxOpIdx2 = 0; 1957 bool FoundBest = false; 1958 // If I2 is commutative try all combinations. 1959 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; 1960 unsigned ToIdx = isCommutative(I2) 1961 ? I2->getNumOperands() 1962 : std::min(I2->getNumOperands(), OpIdx1 + 1); 1963 assert(FromIdx <= ToIdx && "Bad index"); 1964 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { 1965 // Skip operands already paired with OpIdx1. 1966 if (Op2Used.count(OpIdx2)) 1967 continue; 1968 // Recursively calculate the cost at each level 1969 int TmpScore = 1970 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), 1971 I1, I2, CurrLevel + 1, {}); 1972 // Look for the best score. 1973 if (TmpScore > LookAheadHeuristics::ScoreFail && 1974 TmpScore > MaxTmpScore) { 1975 MaxTmpScore = TmpScore; 1976 MaxOpIdx2 = OpIdx2; 1977 FoundBest = true; 1978 } 1979 } 1980 if (FoundBest) { 1981 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1982 Op2Used.insert(MaxOpIdx2); 1983 ShallowScoreAtThisLevel += MaxTmpScore; 1984 } 1985 } 1986 return ShallowScoreAtThisLevel; 1987 } 1988 }; 1989 /// A helper data structure to hold the operands of a vector of instructions. 1990 /// This supports a fixed vector length for all operand vectors. 1991 class VLOperands { 1992 /// For each operand we need (i) the value, and (ii) the opcode that it 1993 /// would be attached to if the expression was in a left-linearized form. 1994 /// This is required to avoid illegal operand reordering. 1995 /// For example: 1996 /// \verbatim 1997 /// 0 Op1 1998 /// |/ 1999 /// Op1 Op2 Linearized + Op2 2000 /// \ / ----------> |/ 2001 /// - - 2002 /// 2003 /// Op1 - Op2 (0 + Op1) - Op2 2004 /// \endverbatim 2005 /// 2006 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 2007 /// 2008 /// Another way to think of this is to track all the operations across the 2009 /// path from the operand all the way to the root of the tree and to 2010 /// calculate the operation that corresponds to this path. For example, the 2011 /// path from Op2 to the root crosses the RHS of the '-', therefore the 2012 /// corresponding operation is a '-' (which matches the one in the 2013 /// linearized tree, as shown above). 
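/// As another small example: in (A + B) - C, the path from C to the root
/// crosses the RHS of the '-', so the operation attached to C is '-', while
/// A and B are attached to '+'.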
2014 /// 2015 /// For lack of a better term, we refer to this operation as Accumulated 2016 /// Path Operation (APO). 2017 struct OperandData { 2018 OperandData() = default; 2019 OperandData(Value *V, bool APO, bool IsUsed) 2020 : V(V), APO(APO), IsUsed(IsUsed) {} 2021 /// The operand value. 2022 Value *V = nullptr; 2023 /// TreeEntries only allow a single opcode, or an alternate sequence of 2024 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 2025 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 2026 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 2027 /// (e.g., Add/Mul) 2028 bool APO = false; 2029 /// Helper data for the reordering function. 2030 bool IsUsed = false; 2031 }; 2032 2033 /// During operand reordering, we are trying to select the operand at lane 2034 /// that matches best with the operand at the neighboring lane. Our 2035 /// selection is based on the type of value we are looking for. For example, 2036 /// if the neighboring lane has a load, we need to look for a load that is 2037 /// accessing a consecutive address. These strategies are summarized in the 2038 /// 'ReorderingMode' enumerator. 2039 enum class ReorderingMode { 2040 Load, ///< Matching loads to consecutive memory addresses 2041 Opcode, ///< Matching instructions based on opcode (same or alternate) 2042 Constant, ///< Matching constants 2043 Splat, ///< Matching the same instruction multiple times (broadcast) 2044 Failed, ///< We failed to create a vectorizable group 2045 }; 2046 2047 using OperandDataVec = SmallVector<OperandData, 2>; 2048 2049 /// A vector of operand vectors. 2050 SmallVector<OperandDataVec, 4> OpsVec; 2051 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] 2052 /// is not IntrinsicInst, ArgSize is User::getNumOperands. 2053 unsigned ArgSize = 0; 2054 2055 const TargetLibraryInfo &TLI; 2056 const DataLayout &DL; 2057 ScalarEvolution &SE; 2058 const BoUpSLP &R; 2059 const Loop *L = nullptr; 2060 2061 /// \returns the operand data at \p OpIdx and \p Lane. 2062 OperandData &getData(unsigned OpIdx, unsigned Lane) { 2063 return OpsVec[OpIdx][Lane]; 2064 } 2065 2066 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 2067 const OperandData &getData(unsigned OpIdx, unsigned Lane) const { 2068 return OpsVec[OpIdx][Lane]; 2069 } 2070 2071 /// Clears the used flag for all entries. 2072 void clearUsed() { 2073 for (unsigned OpIdx = 0, NumOperands = getNumOperands(); 2074 OpIdx != NumOperands; ++OpIdx) 2075 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; 2076 ++Lane) 2077 OpsVec[OpIdx][Lane].IsUsed = false; 2078 } 2079 2080 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 2081 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { 2082 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); 2083 } 2084 2085 /// \param Lane lane of the operands under analysis. 2086 /// \param OpIdx operand index in \p Lane lane we're looking the best 2087 /// candidate for. 2088 /// \param Idx operand index of the current candidate value. 2089 /// \returns The additional score due to possible broadcasting of the 2090 /// elements in the lane. It is more profitable to have power-of-2 unique 2091 /// elements in the lane, it will be vectorized with higher probability 2092 /// after removing duplicates. Currently the SLP vectorizer supports only 2093 /// vectorization of the power-of-2 number of unique scalars. 
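/// As a sketch of the intent (not the exact formula computed below): with 4
/// lanes, a candidate that keeps the number of unique scalars for this
/// operand at 2 or 4 (a power of two) tends to score better than one that
/// leaves 3 unique scalars.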
2094 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx, 2095 const SmallBitVector &UsedLanes) const { 2096 Value *IdxLaneV = getData(Idx, Lane).V; 2097 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V || 2098 isa<ExtractElementInst>(IdxLaneV)) 2099 return 0; 2100 SmallDenseMap<Value *, unsigned, 4> Uniques; 2101 for (unsigned Ln : seq<unsigned>(getNumLanes())) { 2102 if (Ln == Lane) 2103 continue; 2104 Value *OpIdxLnV = getData(OpIdx, Ln).V; 2105 if (!isa<Instruction>(OpIdxLnV)) 2106 return 0; 2107 Uniques.try_emplace(OpIdxLnV, Ln); 2108 } 2109 unsigned UniquesCount = Uniques.size(); 2110 auto IdxIt = Uniques.find(IdxLaneV); 2111 unsigned UniquesCntWithIdxLaneV = 2112 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1; 2113 Value *OpIdxLaneV = getData(OpIdx, Lane).V; 2114 auto OpIdxIt = Uniques.find(OpIdxLaneV); 2115 unsigned UniquesCntWithOpIdxLaneV = 2116 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1; 2117 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) 2118 return 0; 2119 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) - 2120 UniquesCntWithOpIdxLaneV, 2121 UniquesCntWithOpIdxLaneV - 2122 bit_floor(UniquesCntWithOpIdxLaneV)) - 2123 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second)) 2124 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV) 2125 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); 2126 } 2127 2128 /// \param Lane lane of the operands under analysis. 2129 /// \param OpIdx operand index in \p Lane lane we're looking the best 2130 /// candidate for. 2131 /// \param Idx operand index of the current candidate value. 2132 /// \returns The additional score for the scalar which users are all 2133 /// vectorized. 2134 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { 2135 Value *IdxLaneV = getData(Idx, Lane).V; 2136 Value *OpIdxLaneV = getData(OpIdx, Lane).V; 2137 // Do not care about number of uses for vector-like instructions 2138 // (extractelement/extractvalue with constant indices), they are extracts 2139 // themselves and already externally used. Vectorization of such 2140 // instructions does not add extra extractelement instruction, just may 2141 // remove it. 2142 if (isVectorLikeInstWithConstOps(IdxLaneV) && 2143 isVectorLikeInstWithConstOps(OpIdxLaneV)) 2144 return LookAheadHeuristics::ScoreAllUserVectorized; 2145 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); 2146 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) 2147 return 0; 2148 return R.areAllUsersVectorized(IdxLaneI) 2149 ? LookAheadHeuristics::ScoreAllUserVectorized 2150 : 0; 2151 } 2152 2153 /// Score scaling factor for fully compatible instructions but with 2154 /// different number of external uses. Allows better selection of the 2155 /// instructions with less external uses. 2156 static const int ScoreScaleFactor = 10; 2157 2158 /// \Returns the look-ahead score, which tells us how much the sub-trees 2159 /// rooted at \p LHS and \p RHS match, the more they match the higher the 2160 /// score. This helps break ties in an informed way when we cannot decide on 2161 /// the order of the operands by just considering the immediate 2162 /// predecessors. 
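/// Roughly, the value computed below is
/// (LookAheadScore + SplatScore) * ScoreScaleFactor + ExternalUseScore,
/// with the result forced to 0 when a negative splat score cancels out the
/// look-ahead score (see the implementation that follows).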
2163 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, 2164 int Lane, unsigned OpIdx, unsigned Idx, 2165 bool &IsUsed, const SmallBitVector &UsedLanes) { 2166 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(), 2167 LookAheadMaxDepth); 2168 // Keep track of the instruction stack as we recurse into the operands 2169 // during the look-ahead score exploration. 2170 int Score = 2171 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, 2172 /*CurrLevel=*/1, MainAltOps); 2173 if (Score) { 2174 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes); 2175 if (Score <= -SplatScore) { 2176 // Failed score. 2177 Score = 0; 2178 } else { 2179 Score += SplatScore; 2180 // Scale score to see the difference between different operands 2181 // and similar operands but all vectorized/not all vectorized 2182 // uses. It does not affect actual selection of the best 2183 // compatible operand in general, just allows to select the 2184 // operand with all vectorized uses. 2185 Score *= ScoreScaleFactor; 2186 Score += getExternalUseScore(Lane, OpIdx, Idx); 2187 IsUsed = true; 2188 } 2189 } 2190 return Score; 2191 } 2192 2193 /// Best defined scores per lanes between the passes. Used to choose the 2194 /// best operand (with the highest score) between the passes. 2195 /// The key - {Operand Index, Lane}. 2196 /// The value - the best score between the passes for the lane and the 2197 /// operand. 2198 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> 2199 BestScoresPerLanes; 2200 2201 // Search all operands in Ops[*][Lane] for the one that matches best 2202 // Ops[OpIdx][LastLane] and return its opreand index. 2203 // If no good match can be found, return std::nullopt. 2204 std::optional<unsigned> 2205 getBestOperand(unsigned OpIdx, int Lane, int LastLane, 2206 ArrayRef<ReorderingMode> ReorderingModes, 2207 ArrayRef<Value *> MainAltOps, 2208 const SmallBitVector &UsedLanes) { 2209 unsigned NumOperands = getNumOperands(); 2210 2211 // The operand of the previous lane at OpIdx. 2212 Value *OpLastLane = getData(OpIdx, LastLane).V; 2213 2214 // Our strategy mode for OpIdx. 2215 ReorderingMode RMode = ReorderingModes[OpIdx]; 2216 if (RMode == ReorderingMode::Failed) 2217 return std::nullopt; 2218 2219 // The linearized opcode of the operand at OpIdx, Lane. 2220 bool OpIdxAPO = getData(OpIdx, Lane).APO; 2221 2222 // The best operand index and its score. 2223 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we 2224 // are using the score to differentiate between the two. 2225 struct BestOpData { 2226 std::optional<unsigned> Idx; 2227 unsigned Score = 0; 2228 } BestOp; 2229 BestOp.Score = 2230 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0) 2231 .first->second; 2232 2233 // Track if the operand must be marked as used. If the operand is set to 2234 // Score 1 explicitly (because of non power-of-2 unique scalars, we may 2235 // want to reestimate the operands again on the following iterations). 2236 bool IsUsed = RMode == ReorderingMode::Splat || 2237 RMode == ReorderingMode::Constant || 2238 RMode == ReorderingMode::Load; 2239 // Iterate through all unused operands and look for the best. 2240 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { 2241 // Get the operand at Idx and Lane. 2242 OperandData &OpData = getData(Idx, Lane); 2243 Value *Op = OpData.V; 2244 bool OpAPO = OpData.APO; 2245 2246 // Skip already selected operands. 
2247 if (OpData.IsUsed) 2248 continue; 2249 2250 // Skip if we are trying to move the operand to a position with a 2251 // different opcode in the linearized tree form. This would break the 2252 // semantics. 2253 if (OpAPO != OpIdxAPO) 2254 continue; 2255 2256 // Look for an operand that matches the current mode. 2257 switch (RMode) { 2258 case ReorderingMode::Load: 2259 case ReorderingMode::Opcode: { 2260 bool LeftToRight = Lane > LastLane; 2261 Value *OpLeft = (LeftToRight) ? OpLastLane : Op; 2262 Value *OpRight = (LeftToRight) ? Op : OpLastLane; 2263 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, 2264 OpIdx, Idx, IsUsed, UsedLanes); 2265 if (Score > static_cast<int>(BestOp.Score) || 2266 (Score > 0 && Score == static_cast<int>(BestOp.Score) && 2267 Idx == OpIdx)) { 2268 BestOp.Idx = Idx; 2269 BestOp.Score = Score; 2270 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; 2271 } 2272 break; 2273 } 2274 case ReorderingMode::Constant: 2275 if (isa<Constant>(Op) || 2276 (!BestOp.Score && L && L->isLoopInvariant(Op))) { 2277 BestOp.Idx = Idx; 2278 if (isa<Constant>(Op)) { 2279 BestOp.Score = LookAheadHeuristics::ScoreConstants; 2280 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = 2281 LookAheadHeuristics::ScoreConstants; 2282 } 2283 if (isa<UndefValue>(Op) || !isa<Constant>(Op)) 2284 IsUsed = false; 2285 } 2286 break; 2287 case ReorderingMode::Splat: 2288 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) { 2289 IsUsed = Op == OpLastLane; 2290 if (Op == OpLastLane) { 2291 BestOp.Score = LookAheadHeuristics::ScoreSplat; 2292 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = 2293 LookAheadHeuristics::ScoreSplat; 2294 } 2295 BestOp.Idx = Idx; 2296 } 2297 break; 2298 case ReorderingMode::Failed: 2299 llvm_unreachable("Not expected Failed reordering mode."); 2300 } 2301 } 2302 2303 if (BestOp.Idx) { 2304 getData(*BestOp.Idx, Lane).IsUsed = IsUsed; 2305 return BestOp.Idx; 2306 } 2307 // If we could not find a good match return std::nullopt. 2308 return std::nullopt; 2309 } 2310 2311 /// Helper for reorderOperandVecs. 2312 /// \returns the lane that we should start reordering from. This is the one 2313 /// which has the least number of operands that can freely move about or 2314 /// less profitable because it already has the most optimal set of operands. 2315 unsigned getBestLaneToStartReordering() const { 2316 unsigned Min = UINT_MAX; 2317 unsigned SameOpNumber = 0; 2318 // std::pair<unsigned, unsigned> is used to implement a simple voting 2319 // algorithm and choose the lane with the least number of operands that 2320 // can freely move about or less profitable because it already has the 2321 // most optimal set of operands. The first unsigned is a counter for 2322 // voting, the second unsigned is the counter of lanes with instructions 2323 // with same/alternate opcodes and same parent basic block. 2324 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; 2325 // Try to be closer to the original results, if we have multiple lanes 2326 // with same cost. If 2 lanes have the same cost, use the one with the 2327 // highest index. 2328 for (int I = getNumLanes(); I > 0; --I) { 2329 unsigned Lane = I - 1; 2330 OperandsOrderData NumFreeOpsHash = 2331 getMaxNumOperandsThatCanBeReordered(Lane); 2332 // Compare the number of operands that can move and choose the one with 2333 // the least number. 
2334 if (NumFreeOpsHash.NumOfAPOs < Min) { 2335 Min = NumFreeOpsHash.NumOfAPOs; 2336 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 2337 HashMap.clear(); 2338 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 2339 } else if (NumFreeOpsHash.NumOfAPOs == Min && 2340 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { 2341 // Select the most optimal lane in terms of number of operands that 2342 // should be moved around. 2343 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 2344 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 2345 } else if (NumFreeOpsHash.NumOfAPOs == Min && 2346 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { 2347 auto [It, Inserted] = 2348 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane); 2349 if (!Inserted) 2350 ++It->second.first; 2351 } 2352 } 2353 // Select the lane with the minimum counter. 2354 unsigned BestLane = 0; 2355 unsigned CntMin = UINT_MAX; 2356 for (const auto &Data : reverse(HashMap)) { 2357 if (Data.second.first < CntMin) { 2358 CntMin = Data.second.first; 2359 BestLane = Data.second.second; 2360 } 2361 } 2362 return BestLane; 2363 } 2364 2365 /// Data structure that helps to reorder operands. 2366 struct OperandsOrderData { 2367 /// The best number of operands with the same APOs, which can be 2368 /// reordered. 2369 unsigned NumOfAPOs = UINT_MAX; 2370 /// Number of operands with the same/alternate instruction opcode and 2371 /// parent. 2372 unsigned NumOpsWithSameOpcodeParent = 0; 2373 /// Hash for the actual operands ordering. 2374 /// Used to count operands, actually their position id and opcode 2375 /// value. It is used in the voting mechanism to find the lane with the 2376 /// least number of operands that can freely move about or less profitable 2377 /// because it already has the most optimal set of operands. Can be 2378 /// replaced with SmallVector<unsigned> instead but hash code is faster 2379 /// and requires less memory. 2380 unsigned Hash = 0; 2381 }; 2382 /// \returns the maximum number of operands that are allowed to be reordered 2383 /// for \p Lane and the number of compatible instructions(with the same 2384 /// parent/opcode). This is used as a heuristic for selecting the first lane 2385 /// to start operand reordering. 2386 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { 2387 unsigned CntTrue = 0; 2388 unsigned NumOperands = getNumOperands(); 2389 // Operands with the same APO can be reordered. We therefore need to count 2390 // how many of them we have for each APO, like this: Cnt[APO] = x. 2391 // Since we only have two APOs, namely true and false, we can avoid using 2392 // a map. Instead we can simply count the number of operands that 2393 // correspond to one of them (in this case the 'true' APO), and calculate 2394 // the other by subtracting it from the total number of operands. 2395 // Operands with the same instruction opcode and parent are more 2396 // profitable since we don't need to move them in many cases, with a high 2397 // probability such lane already can be vectorized effectively. 2398 bool AllUndefs = true; 2399 unsigned NumOpsWithSameOpcodeParent = 0; 2400 Instruction *OpcodeI = nullptr; 2401 BasicBlock *Parent = nullptr; 2402 unsigned Hash = 0; 2403 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2404 const OperandData &OpData = getData(OpIdx, Lane); 2405 if (OpData.APO) 2406 ++CntTrue; 2407 // Use Boyer-Moore majority voting for finding the majority opcode and 2408 // the number of times it occurs. 
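// For example (illustrative): for lane operands (add, add, load, add) the
// surviving candidate is the 'add' opcode with a counter of 2; as in
// classic Boyer-Moore voting, the counter is a lower bound, not an exact
// frequency.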
2409 if (auto *I = dyn_cast<Instruction>(OpData.V)) { 2410 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) || 2411 I->getParent() != Parent) { 2412 if (NumOpsWithSameOpcodeParent == 0) { 2413 NumOpsWithSameOpcodeParent = 1; 2414 OpcodeI = I; 2415 Parent = I->getParent(); 2416 } else { 2417 --NumOpsWithSameOpcodeParent; 2418 } 2419 } else { 2420 ++NumOpsWithSameOpcodeParent; 2421 } 2422 } 2423 Hash = hash_combine( 2424 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); 2425 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); 2426 } 2427 if (AllUndefs) 2428 return {}; 2429 OperandsOrderData Data; 2430 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); 2431 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; 2432 Data.Hash = Hash; 2433 return Data; 2434 } 2435 2436 /// Go through the instructions in VL and append their operands. 2437 void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) { 2438 assert(!VL.empty() && "Bad VL"); 2439 assert((empty() || VL.size() == getNumLanes()) && 2440 "Expected same number of lanes"); 2441 assert(S.valid() && "InstructionsState is invalid."); 2442 // IntrinsicInst::isCommutative returns true if swapping the first "two" 2443 // arguments to the intrinsic produces the same result. 2444 constexpr unsigned IntrinsicNumOperands = 2; 2445 Instruction *MainOp = S.getMainOp(); 2446 unsigned NumOperands = MainOp->getNumOperands(); 2447 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands; 2448 OpsVec.resize(NumOperands); 2449 unsigned NumLanes = VL.size(); 2450 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2451 OpsVec[OpIdx].resize(NumLanes); 2452 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 2453 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) && 2454 "Expected instruction or poison value"); 2455 // Our tree has just 3 nodes: the root and two operands. 2456 // It is therefore trivial to get the APO. We only need to check the 2457 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or 2458 // RHS operand. The LHS operand of both add and sub is never attached 2459 // to an inversese operation in the linearized form, therefore its APO 2460 // is false. The RHS is true only if VL[Lane] is an inverse operation. 2461 2462 // Since operand reordering is performed on groups of commutative 2463 // operations or alternating sequences (e.g., +, -), we can safely 2464 // tell the inverse operations by checking commutativity. 2465 if (isa<PoisonValue>(VL[Lane])) { 2466 if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) { 2467 if (OpIdx == 0) { 2468 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false}; 2469 continue; 2470 } 2471 } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) { 2472 if (OpIdx == 0) { 2473 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false}; 2474 continue; 2475 } 2476 } 2477 OpsVec[OpIdx][Lane] = { 2478 PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true, 2479 false}; 2480 continue; 2481 } 2482 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); 2483 bool APO = (OpIdx == 0) ? false : IsInverseOperation; 2484 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), 2485 APO, false}; 2486 } 2487 } 2488 } 2489 2490 /// \returns the number of operands. 2491 unsigned getNumOperands() const { return ArgSize; } 2492 2493 /// \returns the number of lanes. 2494 unsigned getNumLanes() const { return OpsVec[0].size(); } 2495 2496 /// \returns the operand value at \p OpIdx and \p Lane. 
2497 Value *getValue(unsigned OpIdx, unsigned Lane) const { 2498 return getData(OpIdx, Lane).V; 2499 } 2500 2501 /// \returns true if the data structure is empty. 2502 bool empty() const { return OpsVec.empty(); } 2503 2504 /// Clears the data. 2505 void clear() { OpsVec.clear(); } 2506 2507 /// \Returns true if there are enough operands identical to \p Op to fill 2508 /// the whole vector (it is mixed with constants or loop invariant values). 2509 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. 2510 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { 2511 assert(Op == getValue(OpIdx, Lane) && 2512 "Op is expected to be getValue(OpIdx, Lane)."); 2513 // Small number of loads - try load matching. 2514 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2) 2515 return false; 2516 bool OpAPO = getData(OpIdx, Lane).APO; 2517 bool IsInvariant = L && L->isLoopInvariant(Op); 2518 unsigned Cnt = 0; 2519 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { 2520 if (Ln == Lane) 2521 continue; 2522 // This is set to true if we found a candidate for broadcast at Lane. 2523 bool FoundCandidate = false; 2524 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { 2525 OperandData &Data = getData(OpI, Ln); 2526 if (Data.APO != OpAPO || Data.IsUsed) 2527 continue; 2528 Value *OpILane = getValue(OpI, Lane); 2529 bool IsConstantOp = isa<Constant>(OpILane); 2530 // Consider the broadcast candidate if: 2531 // 1. Same value is found in one of the operands. 2532 if (Data.V == Op || 2533 // 2. The operand in the given lane is not constant but there is a 2534 // constant operand in another lane (which can be moved to the 2535 // given lane). In this case we can represent it as a simple 2536 // permutation of constant and broadcast. 2537 (!IsConstantOp && 2538 ((Lns > 2 && isa<Constant>(Data.V)) || 2539 // 2.1. If we have only 2 lanes, need to check that value in the 2540 // next lane does not build same opcode sequence. 2541 (Lns == 2 && 2542 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) && 2543 isa<Constant>(Data.V)))) || 2544 // 3. The operand in the current lane is loop invariant (can be 2545 // hoisted out) and another operand is also a loop invariant 2546 // (though not a constant). In this case the whole vector can be 2547 // hoisted out. 2548 // FIXME: need to teach the cost model about this case for better 2549 // estimation. 2550 (IsInvariant && !isa<Constant>(Data.V) && 2551 !getSameOpcode({Op, Data.V}, TLI) && 2552 L->isLoopInvariant(Data.V))) { 2553 FoundCandidate = true; 2554 Data.IsUsed = Data.V == Op; 2555 if (Data.V == Op) 2556 ++Cnt; 2557 break; 2558 } 2559 } 2560 if (!FoundCandidate) 2561 return false; 2562 } 2563 return getNumLanes() == 2 || Cnt > 1; 2564 } 2565 2566 /// Checks if there is at least single compatible operand in lanes other 2567 /// than \p Lane, compatible with the operand \p Op. 
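/// For example (a sketch): with lanes (a0 + b0) and (a1 + b1), where a0 and
/// a1 are loads from the same basic block, operand a0 in lane 0 has a
/// compatible counterpart a1 in lane 1, so this is expected to return true.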
2568 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const { 2569 assert(Op == getValue(OpIdx, Lane) && 2570 "Op is expected to be getValue(OpIdx, Lane)."); 2571 bool OpAPO = getData(OpIdx, Lane).APO; 2572 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { 2573 if (Ln == Lane) 2574 continue; 2575 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) { 2576 const OperandData &Data = getData(OpI, Ln); 2577 if (Data.APO != OpAPO || Data.IsUsed) 2578 return true; 2579 Value *OpILn = getValue(OpI, Ln); 2580 return (L && L->isLoopInvariant(OpILn)) || 2581 (getSameOpcode({Op, OpILn}, TLI) && 2582 allSameBlock({Op, OpILn})); 2583 })) 2584 return true; 2585 } 2586 return false; 2587 } 2588 2589 public: 2590 /// Initialize with all the operands of the instruction vector \p RootVL. 2591 VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S, 2592 const BoUpSLP &R) 2593 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), 2594 L(R.LI->getLoopFor(S.getMainOp()->getParent())) { 2595 // Append all the operands of RootVL. 2596 appendOperandsOfVL(RootVL, S); 2597 } 2598 2599 /// \Returns a value vector with the operands across all lanes for the 2600 /// opearnd at \p OpIdx. 2601 ValueList getVL(unsigned OpIdx) const { 2602 ValueList OpVL(OpsVec[OpIdx].size()); 2603 assert(OpsVec[OpIdx].size() == getNumLanes() && 2604 "Expected same num of lanes across all operands"); 2605 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) 2606 OpVL[Lane] = OpsVec[OpIdx][Lane].V; 2607 return OpVL; 2608 } 2609 2610 // Performs operand reordering for 2 or more operands. 2611 // The original operands are in OrigOps[OpIdx][Lane]. 2612 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. 2613 void reorder() { 2614 unsigned NumOperands = getNumOperands(); 2615 unsigned NumLanes = getNumLanes(); 2616 // Each operand has its own mode. We are using this mode to help us select 2617 // the instructions for each lane, so that they match best with the ones 2618 // we have selected so far. 2619 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); 2620 2621 // This is a greedy single-pass algorithm. We are going over each lane 2622 // once and deciding on the best order right away with no back-tracking. 2623 // However, in order to increase its effectiveness, we start with the lane 2624 // that has operands that can move the least. For example, given the 2625 // following lanes: 2626 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd 2627 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st 2628 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd 2629 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th 2630 // we will start at Lane 1, since the operands of the subtraction cannot 2631 // be reordered. Then we will visit the rest of the lanes in a circular 2632 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. 2633 2634 // Find the first lane that we will start our search from. 2635 unsigned FirstLane = getBestLaneToStartReordering(); 2636 2637 // Initialize the modes. 2638 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2639 Value *OpLane0 = getValue(OpIdx, FirstLane); 2640 // Keep track if we have instructions with all the same opcode on one 2641 // side. 2642 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) { 2643 // Check if OpLane0 should be broadcast. 
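// As a sketch of the mode selection below: a value repeated across most
// lanes, or one without a compatible counterpart in the other lanes, is
// handled as ReorderingMode::Splat; loads are matched by consecutive
// addresses via ReorderingMode::Load; other instructions fall back to
// ReorderingMode::Opcode.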
2644 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) || 2645 !canBeVectorized(OpILane0, OpIdx, FirstLane)) 2646 ReorderingModes[OpIdx] = ReorderingMode::Splat; 2647 else if (isa<LoadInst>(OpILane0)) 2648 ReorderingModes[OpIdx] = ReorderingMode::Load; 2649 else 2650 ReorderingModes[OpIdx] = ReorderingMode::Opcode; 2651 } else if (isa<Constant>(OpLane0)) { 2652 ReorderingModes[OpIdx] = ReorderingMode::Constant; 2653 } else if (isa<Argument>(OpLane0)) { 2654 // Our best hope is a Splat. It may save some cost in some cases. 2655 ReorderingModes[OpIdx] = ReorderingMode::Splat; 2656 } else { 2657 llvm_unreachable("Unexpected value kind."); 2658 } 2659 } 2660 2661 // Check that we don't have same operands. No need to reorder if operands 2662 // are just perfect diamond or shuffled diamond match. Do not do it only 2663 // for possible broadcasts or non-power of 2 number of scalars (just for 2664 // now). 2665 auto &&SkipReordering = [this]() { 2666 SmallPtrSet<Value *, 4> UniqueValues; 2667 ArrayRef<OperandData> Op0 = OpsVec.front(); 2668 for (const OperandData &Data : Op0) 2669 UniqueValues.insert(Data.V); 2670 for (ArrayRef<OperandData> Op : 2671 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) { 2672 if (any_of(Op, [&UniqueValues](const OperandData &Data) { 2673 return !UniqueValues.contains(Data.V); 2674 })) 2675 return false; 2676 } 2677 // TODO: Check if we can remove a check for non-power-2 number of 2678 // scalars after full support of non-power-2 vectorization. 2679 return UniqueValues.size() != 2 && 2680 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(), 2681 UniqueValues.size()); 2682 }; 2683 2684 // If the initial strategy fails for any of the operand indexes, then we 2685 // perform reordering again in a second pass. This helps avoid assigning 2686 // high priority to the failed strategy, and should improve reordering for 2687 // the non-failed operand indexes. 2688 for (int Pass = 0; Pass != 2; ++Pass) { 2689 // Check if no need to reorder operands since they're are perfect or 2690 // shuffled diamond match. 2691 // Need to do it to avoid extra external use cost counting for 2692 // shuffled matches, which may cause regressions. 2693 if (SkipReordering()) 2694 break; 2695 // Skip the second pass if the first pass did not fail. 2696 bool StrategyFailed = false; 2697 // Mark all operand data as free to use. 2698 clearUsed(); 2699 // We keep the original operand order for the FirstLane, so reorder the 2700 // rest of the lanes. We are visiting the nodes in a circular fashion, 2701 // using FirstLane as the center point and increasing the radius 2702 // distance. 2703 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands); 2704 for (unsigned I = 0; I < NumOperands; ++I) 2705 MainAltOps[I].push_back(getData(I, FirstLane).V); 2706 2707 SmallBitVector UsedLanes(NumLanes); 2708 UsedLanes.set(FirstLane); 2709 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { 2710 // Visit the lane on the right and then the lane on the left. 2711 for (int Direction : {+1, -1}) { 2712 int Lane = FirstLane + Direction * Distance; 2713 if (Lane < 0 || Lane >= (int)NumLanes) 2714 continue; 2715 UsedLanes.set(Lane); 2716 int LastLane = Lane - Direction; 2717 assert(LastLane >= 0 && LastLane < (int)NumLanes && 2718 "Out of bounds"); 2719 // Look for a good match for each operand. 2720 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2721 // Search for the operand that matches SortedOps[OpIdx][Lane-1]. 
2722 std::optional<unsigned> BestIdx = 2723 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes, 2724 MainAltOps[OpIdx], UsedLanes); 2725 // By not selecting a value, we allow the operands that follow to 2726 // select a better matching value. We will get a non-null value in 2727 // the next run of getBestOperand(). 2728 if (BestIdx) { 2729 // Swap the current operand with the one returned by 2730 // getBestOperand(). 2731 swap(OpIdx, *BestIdx, Lane); 2732 } else { 2733 // Enable the second pass. 2734 StrategyFailed = true; 2735 } 2736 // Try to get the alternate opcode and follow it during analysis. 2737 if (MainAltOps[OpIdx].size() != 2) { 2738 OperandData &AltOp = getData(OpIdx, Lane); 2739 InstructionsState OpS = 2740 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); 2741 if (OpS && OpS.isAltShuffle()) 2742 MainAltOps[OpIdx].push_back(AltOp.V); 2743 } 2744 } 2745 } 2746 } 2747 // Skip second pass if the strategy did not fail. 2748 if (!StrategyFailed) 2749 break; 2750 } 2751 } 2752 2753 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2754 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { 2755 switch (RMode) { 2756 case ReorderingMode::Load: 2757 return "Load"; 2758 case ReorderingMode::Opcode: 2759 return "Opcode"; 2760 case ReorderingMode::Constant: 2761 return "Constant"; 2762 case ReorderingMode::Splat: 2763 return "Splat"; 2764 case ReorderingMode::Failed: 2765 return "Failed"; 2766 } 2767 llvm_unreachable("Unimplemented Reordering Type"); 2768 } 2769 2770 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, 2771 raw_ostream &OS) { 2772 return OS << getModeStr(RMode); 2773 } 2774 2775 /// Debug print. 2776 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { 2777 printMode(RMode, dbgs()); 2778 } 2779 2780 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { 2781 return printMode(RMode, OS); 2782 } 2783 2784 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { 2785 const unsigned Indent = 2; 2786 unsigned Cnt = 0; 2787 for (const OperandDataVec &OpDataVec : OpsVec) { 2788 OS << "Operand " << Cnt++ << "\n"; 2789 for (const OperandData &OpData : OpDataVec) { 2790 OS.indent(Indent) << "{"; 2791 if (Value *V = OpData.V) 2792 OS << *V; 2793 else 2794 OS << "null"; 2795 OS << ", APO:" << OpData.APO << "}\n"; 2796 } 2797 OS << "\n"; 2798 } 2799 return OS; 2800 } 2801 2802 /// Debug print. 2803 LLVM_DUMP_METHOD void dump() const { print(dbgs()); } 2804 #endif 2805 }; 2806 2807 /// Evaluate each pair in \p Candidates and return index into \p Candidates 2808 /// for a pair which have highest score deemed to have best chance to form 2809 /// root of profitable tree to vectorize. Return std::nullopt if no candidate 2810 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit 2811 /// of the cost, considered to be good enough score. 
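/// For example (illustrative): given candidates {{A[0], A[1]}, {A[0], X}},
/// where A[0] and A[1] are consecutive loads and X is an unrelated value,
/// the first pair is expected to score ScoreConsecutiveLoads and win, so
/// index 0 is returned.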
2812 std::optional<int> 2813 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, 2814 int Limit = LookAheadHeuristics::ScoreFail) const { 2815 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, 2816 RootLookAheadMaxDepth); 2817 int BestScore = Limit; 2818 std::optional<int> Index; 2819 for (int I : seq<int>(0, Candidates.size())) { 2820 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, 2821 Candidates[I].second, 2822 /*U1=*/nullptr, /*U2=*/nullptr, 2823 /*CurrLevel=*/1, {}); 2824 if (Score > BestScore) { 2825 BestScore = Score; 2826 Index = I; 2827 } 2828 } 2829 return Index; 2830 } 2831 2832 /// Checks if the instruction is marked for deletion. 2833 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } 2834 2835 /// Removes an instruction from its block and eventually deletes it. 2836 /// It's like Instruction::eraseFromParent() except that the actual deletion 2837 /// is delayed until BoUpSLP is destructed. 2838 void eraseInstruction(Instruction *I) { 2839 DeletedInstructions.insert(I); 2840 } 2841 2842 /// Remove instructions from the parent function and clear the operands of \p 2843 /// DeadVals instructions, marking for deletion trivially dead operands. 2844 template <typename T> 2845 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) { 2846 SmallVector<WeakTrackingVH> DeadInsts; 2847 for (T *V : DeadVals) { 2848 auto *I = cast<Instruction>(V); 2849 DeletedInstructions.insert(I); 2850 } 2851 DenseSet<Value *> Processed; 2852 for (T *V : DeadVals) { 2853 if (!V || !Processed.insert(V).second) 2854 continue; 2855 auto *I = cast<Instruction>(V); 2856 salvageDebugInfo(*I); 2857 ArrayRef<TreeEntry *> Entries = getTreeEntries(I); 2858 for (Use &U : I->operands()) { 2859 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get()); 2860 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() && 2861 wouldInstructionBeTriviallyDead(OpI, TLI) && 2862 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) { 2863 return Entry->VectorizedValue == OpI; 2864 }))) 2865 DeadInsts.push_back(OpI); 2866 } 2867 I->dropAllReferences(); 2868 } 2869 for (T *V : DeadVals) { 2870 auto *I = cast<Instruction>(V); 2871 if (!I->getParent()) 2872 continue; 2873 assert((I->use_empty() || all_of(I->uses(), 2874 [&](Use &U) { 2875 return isDeleted( 2876 cast<Instruction>(U.getUser())); 2877 })) && 2878 "trying to erase instruction with users."); 2879 I->removeFromParent(); 2880 SE->forgetValue(I); 2881 } 2882 // Process the dead instruction list until empty. 2883 while (!DeadInsts.empty()) { 2884 Value *V = DeadInsts.pop_back_val(); 2885 Instruction *VI = cast_or_null<Instruction>(V); 2886 if (!VI || !VI->getParent()) 2887 continue; 2888 assert(isInstructionTriviallyDead(VI, TLI) && 2889 "Live instruction found in dead worklist!"); 2890 assert(VI->use_empty() && "Instructions with uses are not dead."); 2891 2892 // Don't lose the debug info while deleting the instructions. 2893 salvageDebugInfo(*VI); 2894 2895 // Null out all of the instruction's operands to see if any operand 2896 // becomes dead as we go. 2897 for (Use &OpU : VI->operands()) { 2898 Value *OpV = OpU.get(); 2899 if (!OpV) 2900 continue; 2901 OpU.set(nullptr); 2902 2903 if (!OpV->use_empty()) 2904 continue; 2905 2906 // If the operand is an instruction that became dead as we nulled out 2907 // the operand, and if it is 'trivially' dead, delete it in a future 2908 // loop iteration. 
2909 if (auto *OpI = dyn_cast<Instruction>(OpV))
2910 if (!DeletedInstructions.contains(OpI) &&
2911 isInstructionTriviallyDead(OpI, TLI))
2912 DeadInsts.push_back(OpI);
2913 }
2914
2915 VI->removeFromParent();
2916 DeletedInstructions.insert(VI);
2917 SE->forgetValue(VI);
2918 }
2919 }
2920
2921 /// Checks if the instruction was already analyzed for being possible
2922 /// reduction root.
2923 bool isAnalyzedReductionRoot(Instruction *I) const {
2924 return AnalyzedReductionsRoots.count(I);
2925 }
2926 /// Register given instruction as already analyzed for being possible
2927 /// reduction root.
2928 void analyzedReductionRoot(Instruction *I) {
2929 AnalyzedReductionsRoots.insert(I);
2930 }
2931 /// Checks if the provided list of reduced values was already checked for
2932 /// vectorization.
2933 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2934 return AnalyzedReductionVals.contains(hash_value(VL));
2935 }
2936 /// Adds the list of reduced values to the list of values already checked
2937 /// for vectorization.
2938 void analyzedReductionVals(ArrayRef<Value *> VL) {
2939 AnalyzedReductionVals.insert(hash_value(VL));
2940 }
2941 /// Clear the list of the analyzed reduction root instructions.
2942 void clearReductionData() {
2943 AnalyzedReductionsRoots.clear();
2944 AnalyzedReductionVals.clear();
2945 AnalyzedMinBWVals.clear();
2946 }
2947 /// Checks if any of the given values is gathered in one of the nodes.
2948 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2949 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2950 }
2951 /// Checks if the given value is gathered in one of the nodes.
2952 bool isGathered(const Value *V) const {
2953 return MustGather.contains(V);
2954 }
2955 /// Checks if the specified value was not scheduled.
2956 bool isNotScheduled(const Value *V) const {
2957 return NonScheduledFirst.contains(V);
2958 }
2959
2960 /// Check if the value is vectorized in the tree.
2961 bool isVectorized(Value *V) const {
2962 assert(V && "V cannot be nullptr.");
2963 return ScalarToTreeEntries.contains(V);
2964 }
2965
2966 ~BoUpSLP();
2967
2968 private:
2969 /// Determine if a node \p E can be demoted to a smaller type with a
2970 /// truncation. We collect the entries that will be demoted in ToDemote.
2971 /// \param E Node for analysis
2972 /// \param ToDemote indices of the nodes to be demoted.
2973 bool collectValuesToDemote(
2974 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2975 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
2976 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2977 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2978
2979 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2980 /// reordering (i.e. the operands can be reordered because they have only one
2981 /// user and are reorderable).
2982 /// \param ReorderableGathers List of all gather nodes that require reordering
2983 /// (e.g., gather of extractelements or partially vectorizable loads).
2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2985 /// reordering, subset of \p NonVectorized.
2986 bool
2987 canReorderOperands(TreeEntry *UserTE,
2988 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2989 ArrayRef<TreeEntry *> ReorderableGathers,
2990 SmallVectorImpl<TreeEntry *> &GatherOps);
2991
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars
2993 /// and reorders it per given \p Mask.
2994 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const; 2995 2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 2997 /// if any. If it is not vectorized (gather node), returns nullptr. 2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { 2999 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); 3000 TreeEntry *TE = nullptr; 3001 const auto *It = find_if(VL, [&](Value *V) { 3002 for (TreeEntry *E : getTreeEntries(V)) { 3003 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) { 3004 TE = E; 3005 return true; 3006 } 3007 } 3008 return false; 3009 }); 3010 if (It != VL.end()) { 3011 assert(TE->isSame(VL) && "Expected same scalars."); 3012 return TE; 3013 } 3014 return nullptr; 3015 } 3016 3017 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 3018 /// if any. If it is not vectorized (gather node), returns nullptr. 3019 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE, 3020 unsigned OpIdx) const { 3021 return const_cast<BoUpSLP *>(this)->getVectorizedOperand( 3022 const_cast<TreeEntry *>(UserTE), OpIdx); 3023 } 3024 3025 /// Checks if all users of \p I are the part of the vectorization tree. 3026 bool areAllUsersVectorized( 3027 Instruction *I, 3028 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const; 3029 3030 /// Return information about the vector formed for the specified index 3031 /// of a vector of (the same) instruction. 3032 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops); 3033 3034 /// \ returns the graph entry for the \p Idx operand of the \p E entry. 3035 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; 3036 3037 /// Gets the root instruction for the given node. If the node is a strided 3038 /// load/store node with the reverse order, the root instruction is the last 3039 /// one. 3040 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const; 3041 3042 /// \returns Cast context for the given graph node. 3043 TargetTransformInfo::CastContextHint 3044 getCastContextHint(const TreeEntry &TE) const; 3045 3046 /// \returns the cost of the vectorizable entry. 3047 InstructionCost getEntryCost(const TreeEntry *E, 3048 ArrayRef<Value *> VectorizedVals, 3049 SmallPtrSetImpl<Value *> &CheckedExtracts); 3050 3051 /// This is the recursive part of buildTree. 3052 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, 3053 const EdgeInfo &EI, unsigned InterleaveFactor = 0); 3054 3055 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 3056 /// be vectorized to use the original vector (or aggregate "bitcast" to a 3057 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 3058 /// returns false, setting \p CurrentOrder to either an empty vector or a 3059 /// non-identity permutation that allows to reuse extract instructions. 3060 /// \param ResizeAllowed indicates whether it is allowed to handle subvector 3061 /// extract order. 3062 bool canReuseExtract(ArrayRef<Value *> VL, 3063 SmallVectorImpl<unsigned> &CurrentOrder, 3064 bool ResizeAllowed = false) const; 3065 3066 /// Vectorize a single entry in the tree. 3067 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3068 /// avoid issues with def-use order. 3069 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); 3070 3071 /// Returns vectorized operand node, that matches the order of the scalars 3072 /// operand number \p NodeIdx in entry \p E. 
3073 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx); 3074 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, 3075 unsigned NodeIdx) const { 3076 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx); 3077 } 3078 3079 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry 3080 /// \p E. 3081 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3082 /// avoid issues with def-use order. 3083 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs); 3084 3085 /// Create a new vector from a list of scalar values. Produces a sequence 3086 /// which exploits values reused across lanes, and arranges the inserts 3087 /// for ease of later optimization. 3088 template <typename BVTy, typename ResTy, typename... Args> 3089 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params); 3090 3091 /// Create a new vector from a list of scalar values. Produces a sequence 3092 /// which exploits values reused across lanes, and arranges the inserts 3093 /// for ease of later optimization. 3094 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy, 3095 bool PostponedPHIs); 3096 3097 /// Returns the instruction in the bundle, which can be used as a base point 3098 /// for scheduling. Usually it is the last instruction in the bundle, except 3099 /// for the case when all operands are external (in this case, it is the first 3100 /// instruction in the list). 3101 Instruction &getLastInstructionInBundle(const TreeEntry *E); 3102 3103 /// Tries to find extractelement instructions with constant indices from fixed 3104 /// vector type and gather such instructions into a bunch, which highly likely 3105 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3106 /// was successful, the matched scalars are replaced by poison values in \p VL 3107 /// for future analysis. 3108 std::optional<TargetTransformInfo::ShuffleKind> 3109 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, 3110 SmallVectorImpl<int> &Mask) const; 3111 3112 /// Tries to find extractelement instructions with constant indices from fixed 3113 /// vector type and gather such instructions into a bunch, which highly likely 3114 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3115 /// was successful, the matched scalars are replaced by poison values in \p VL 3116 /// for future analysis. 3117 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 3118 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, 3119 SmallVectorImpl<int> &Mask, 3120 unsigned NumParts) const; 3121 3122 /// Checks if the gathered \p VL can be represented as a single register 3123 /// shuffle(s) of previous tree entries. 3124 /// \param TE Tree entry checked for permutation. 3125 /// \param VL List of scalars (a subset of the TE scalar), checked for 3126 /// permutations. Must form single-register vector. 3127 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3128 /// commands to build the mask using the original vector value, without 3129 /// relying on the potential reordering. 3130 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 3131 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 
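// Illustrative example (hypothetical values): suppose one tree entry
// vectorized {a0, a1, a2, a3} and another vectorized {b0, b1, b2, b3}. A
// gather of VL = {a2, b1, a0, b3} can then be expressed as a two-source
// permute of those existing vectors with mask {2, 5, 0, 7}, so this query
// would report TTI::SK_PermuteTwoSrc rather than forcing an insertelement
// sequence; if all scalars came from a single entry, SK_PermuteSingleSrc
// would be reported instead.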
3132 std::optional<TargetTransformInfo::ShuffleKind> 3133 isGatherShuffledSingleRegisterEntry( 3134 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, 3135 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, 3136 bool ForOrder); 3137 3138 /// Checks if the gathered \p VL can be represented as multi-register 3139 /// shuffle(s) of previous tree entries. 3140 /// \param TE Tree entry checked for permutation. 3141 /// \param VL List of scalars (a subset of the TE scalar), checked for 3142 /// permutations. 3143 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3144 /// commands to build the mask using the original vector value, without 3145 /// relying on the potential reordering. 3146 /// \returns per-register series of ShuffleKind, if gathered values can be 3147 /// represented as shuffles of previous tree entries. \p Mask is filled with 3148 /// the shuffle mask (also on per-register base). 3149 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 3150 isGatherShuffledEntry( 3151 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, 3152 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, 3153 unsigned NumParts, bool ForOrder = false); 3154 3155 /// \returns the cost of gathering (inserting) the values in \p VL into a 3156 /// vector. 3157 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 3158 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, 3159 Type *ScalarTy) const; 3160 3161 /// Set the Builder insert point to one after the last instruction in 3162 /// the bundle 3163 void setInsertPointAfterBundle(const TreeEntry *E); 3164 3165 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 3166 /// specified, the starting vector value is poison. 3167 Value * 3168 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy, 3169 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle); 3170 3171 /// \returns whether the VectorizableTree is fully vectorizable and will 3172 /// be beneficial even the tree height is tiny. 3173 bool isFullyVectorizableTinyTree(bool ForReduction) const; 3174 3175 /// Run through the list of all gathered loads in the graph and try to find 3176 /// vector loads/masked gathers instead of regular gathers. Later these loads 3177 /// are reshufled to build final gathered nodes. 3178 void tryToVectorizeGatheredLoads( 3179 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>, 3180 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 3181 8> &GatheredLoads); 3182 3183 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 3184 /// users of \p TE and collects the stores. It returns the map from the store 3185 /// pointers to the collected stores. 3186 SmallVector<SmallVector<StoreInst *>> 3187 collectUserStores(const BoUpSLP::TreeEntry *TE) const; 3188 3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 3190 /// stores in \p StoresVec can form a vector instruction. If so it returns 3191 /// true and populates \p ReorderIndices with the shuffle indices of the 3192 /// stores when compared to the sorted vector. 3193 bool canFormVector(ArrayRef<StoreInst *> StoresVec, 3194 OrdersType &ReorderIndices) const; 3195 3196 /// Iterates through the users of \p TE, looking for scalar stores that can be 3197 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 3198 /// their order and builds an order index vector for each store bundle. 
It 3199 /// returns all these order vectors found. 3200 /// We run this after the tree has formed, otherwise we may come across user 3201 /// instructions that are not yet in the tree. 3202 SmallVector<OrdersType, 1> 3203 findExternalStoreUsersReorderIndices(TreeEntry *TE) const; 3204 3205 /// Tries to reorder the gathering node for better vectorization 3206 /// opportunities. 3207 void reorderGatherNode(TreeEntry &TE); 3208 3209 struct TreeEntry { 3210 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; 3211 TreeEntry(VecTreeTy &Container) : Container(Container) {} 3212 3213 /// \returns Common mask for reorder indices and reused scalars. 3214 SmallVector<int> getCommonMask() const { 3215 SmallVector<int> Mask; 3216 inversePermutation(ReorderIndices, Mask); 3217 ::addMask(Mask, ReuseShuffleIndices); 3218 return Mask; 3219 } 3220 3221 /// \returns true if the scalars in VL are equal to this entry. 3222 bool isSame(ArrayRef<Value *> VL) const { 3223 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { 3224 if (Mask.size() != VL.size() && VL.size() == Scalars.size()) 3225 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 3226 return VL.size() == Mask.size() && 3227 std::equal(VL.begin(), VL.end(), Mask.begin(), 3228 [Scalars](Value *V, int Idx) { 3229 return (isa<UndefValue>(V) && 3230 Idx == PoisonMaskElem) || 3231 (Idx != PoisonMaskElem && V == Scalars[Idx]); 3232 }); 3233 }; 3234 if (!ReorderIndices.empty()) { 3235 // TODO: implement matching if the nodes are just reordered, still can 3236 // treat the vector as the same if the list of scalars matches VL 3237 // directly, without reordering. 3238 SmallVector<int> Mask; 3239 inversePermutation(ReorderIndices, Mask); 3240 if (VL.size() == Scalars.size()) 3241 return IsSame(Scalars, Mask); 3242 if (VL.size() == ReuseShuffleIndices.size()) { 3243 ::addMask(Mask, ReuseShuffleIndices); 3244 return IsSame(Scalars, Mask); 3245 } 3246 return false; 3247 } 3248 return IsSame(Scalars, ReuseShuffleIndices); 3249 } 3250 3251 bool isOperandGatherNode(const EdgeInfo &UserEI) const { 3252 return isGather() && !UserTreeIndices.empty() && 3253 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && 3254 UserTreeIndices.front().UserTE == UserEI.UserTE; 3255 } 3256 3257 /// \returns true if current entry has same operands as \p TE. 3258 bool hasEqualOperands(const TreeEntry &TE) const { 3259 if (TE.getNumOperands() != getNumOperands()) 3260 return false; 3261 SmallBitVector Used(getNumOperands()); 3262 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { 3263 unsigned PrevCount = Used.count(); 3264 for (unsigned K = 0; K < E; ++K) { 3265 if (Used.test(K)) 3266 continue; 3267 if (getOperand(K) == TE.getOperand(I)) { 3268 Used.set(K); 3269 break; 3270 } 3271 } 3272 // Check if we actually found the matching operand. 3273 if (PrevCount == Used.count()) 3274 return false; 3275 } 3276 return true; 3277 } 3278 3279 /// \return Final vectorization factor for the node. Defined by the total 3280 /// number of vectorized scalars, including those, used several times in the 3281 /// entry and counted in the \a ReuseShuffleIndices, if any. 3282 unsigned getVectorFactor() const { 3283 if (!ReuseShuffleIndices.empty()) 3284 return ReuseShuffleIndices.size(); 3285 return Scalars.size(); 3286 }; 3287 3288 /// Checks if the current node is a gather node. 3289 bool isGather() const { return State == NeedToGather; } 3290 3291 /// A vector of scalars. 3292 ValueList Scalars; 3293 3294 /// The Scalars are vectorized into this value. 
It is initialized to Null. 3295 WeakTrackingVH VectorizedValue = nullptr; 3296 3297 /// New vector phi instructions emitted for the vectorized phi nodes. 3298 PHINode *PHI = nullptr; 3299 3300 /// Do we need to gather this sequence or vectorize it 3301 /// (either with vector instruction or with scatter/gather 3302 /// intrinsics for store/load)? 3303 enum EntryState { 3304 Vectorize, ///< The node is regularly vectorized. 3305 ScatterVectorize, ///< Masked scatter/gather node. 3306 StridedVectorize, ///< Strided loads (and stores) 3307 NeedToGather, ///< Gather/buildvector node. 3308 CombinedVectorize, ///< Vectorized node, combined with its user into more 3309 ///< complex node like select/cmp to minmax, mul/add to 3310 ///< fma, etc. Must be used for the following nodes in 3311 ///< the pattern, not the very first one. 3312 }; 3313 EntryState State; 3314 3315 /// List of combined opcodes supported by the vectorizer. 3316 enum CombinedOpcode { 3317 NotCombinedOp = -1, 3318 MinMax = Instruction::OtherOpsEnd + 1, 3319 }; 3320 CombinedOpcode CombinedOp = NotCombinedOp; 3321 3322 /// Does this sequence require some shuffling? 3323 SmallVector<int, 4> ReuseShuffleIndices; 3324 3325 /// Does this entry require reordering? 3326 SmallVector<unsigned, 4> ReorderIndices; 3327 3328 /// Points back to the VectorizableTree. 3329 /// 3330 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 3331 /// to be a pointer and needs to be able to initialize the child iterator. 3332 /// Thus we need a reference back to the container to translate the indices 3333 /// to entries. 3334 VecTreeTy &Container; 3335 3336 /// The TreeEntry index containing the user of this entry. We can actually 3337 /// have multiple users so the data structure is not truly a tree. 3338 SmallVector<EdgeInfo, 1> UserTreeIndices; 3339 3340 /// The index of this treeEntry in VectorizableTree. 3341 unsigned Idx = 0; 3342 3343 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from 3344 /// other nodes as a series of insertvector instructions. 3345 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices; 3346 3347 private: 3348 /// The operands of each instruction in each lane Operands[op_index][lane]. 3349 /// Note: This helps avoid the replication of the code that performs the 3350 /// reordering of operands during buildTree_rec() and vectorizeTree(). 3351 SmallVector<ValueList, 2> Operands; 3352 3353 /// MainOp and AltOp are recorded inside. S should be obtained from 3354 /// newTreeEntry. 3355 InstructionsState S = InstructionsState::invalid(); 3356 3357 /// Interleaving factor for interleaved loads Vectorize nodes. 3358 unsigned InterleaveFactor = 0; 3359 3360 public: 3361 /// Returns interleave factor for interleave nodes. 3362 unsigned getInterleaveFactor() const { return InterleaveFactor; } 3363 /// Sets interleaving factor for the interleaving nodes. 3364 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } 3365 3366 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 3367 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { 3368 if (Operands.size() < OpIdx + 1) 3369 Operands.resize(OpIdx + 1); 3370 assert(Operands[OpIdx].empty() && "Already resized?"); 3371 assert(OpVL.size() <= Scalars.size() && 3372 "Number of operands is greater than the number of scalars."); 3373 Operands[OpIdx].resize(OpVL.size()); 3374 copy(OpVL, Operands[OpIdx].begin()); 3375 } 3376 3377 /// Set this bundle's operand from Scalars. 
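// Illustrative note (assumed example): for a bundle of two adds
// {x0 + y0, x1 + y1}, the per-operand lists are Operands[0] = {x0, x1} and
// Operands[1] = {y0, y1}, i.e. one ValueList per operand index and one
// element per lane. A builder could populate them directly:
//   setOperand(0, {x0, x1});
//   setOperand(1, {y0, y1});
// or simply use the overload below to derive all operand lists from Scalars.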
3378 void setOperand(const BoUpSLP &R, bool RequireReorder = false) { 3379 VLOperands Ops(Scalars, S, R); 3380 if (RequireReorder) 3381 Ops.reorder(); 3382 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) 3383 setOperand(I, Ops.getVL(I)); 3384 } 3385 3386 /// Reorders operands of the node to the given mask \p Mask. 3387 void reorderOperands(ArrayRef<int> Mask) { 3388 for (ValueList &Operand : Operands) 3389 reorderScalars(Operand, Mask); 3390 } 3391 3392 /// \returns the \p OpIdx operand of this TreeEntry. 3393 ValueList &getOperand(unsigned OpIdx) { 3394 assert(OpIdx < Operands.size() && "Off bounds"); 3395 return Operands[OpIdx]; 3396 } 3397 3398 /// \returns the \p OpIdx operand of this TreeEntry. 3399 ArrayRef<Value *> getOperand(unsigned OpIdx) const { 3400 assert(OpIdx < Operands.size() && "Off bounds"); 3401 return Operands[OpIdx]; 3402 } 3403 3404 /// \returns the number of operands. 3405 unsigned getNumOperands() const { return Operands.size(); } 3406 3407 /// \return the single \p OpIdx operand. 3408 Value *getSingleOperand(unsigned OpIdx) const { 3409 assert(OpIdx < Operands.size() && "Off bounds"); 3410 assert(!Operands[OpIdx].empty() && "No operand available"); 3411 return Operands[OpIdx][0]; 3412 } 3413 3414 /// Some of the instructions in the list have alternate opcodes. 3415 bool isAltShuffle() const { return S.isAltShuffle(); } 3416 3417 bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } 3418 3419 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3420 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3421 /// \p OpValue. 3422 Value *isOneOf(Value *Op) const { 3423 auto *I = dyn_cast<Instruction>(Op); 3424 if (I && isOpcodeOrAlt(I)) 3425 return Op; 3426 return S.getMainOp(); 3427 } 3428 3429 void setOperations(const InstructionsState &S) { 3430 assert(S && "InstructionsState is invalid."); 3431 this->S = S; 3432 } 3433 3434 Instruction *getMainOp() const { return S.getMainOp(); } 3435 3436 Instruction *getAltOp() const { return S.getAltOp(); } 3437 3438 /// The main/alternate opcodes for the list of instructions. 3439 unsigned getOpcode() const { return S.getOpcode(); } 3440 3441 unsigned getAltOpcode() const { return S.getAltOpcode(); } 3442 3443 bool hasState() const { return S.valid(); } 3444 3445 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3446 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 3447 int findLaneForValue(Value *V) const { 3448 unsigned FoundLane = getVectorFactor(); 3449 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End; 3450 std::advance(It, 1)) { 3451 if (*It != V) 3452 continue; 3453 FoundLane = std::distance(Scalars.begin(), It); 3454 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 3455 if (!ReorderIndices.empty()) 3456 FoundLane = ReorderIndices[FoundLane]; 3457 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 3458 if (ReuseShuffleIndices.empty()) 3459 break; 3460 if (auto *RIt = find(ReuseShuffleIndices, FoundLane); 3461 RIt != ReuseShuffleIndices.end()) { 3462 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt); 3463 break; 3464 } 3465 } 3466 assert(FoundLane < getVectorFactor() && "Unable to find given value."); 3467 return FoundLane; 3468 } 3469 3470 /// Build a shuffle mask for graph entry which represents a merge of main 3471 /// and alternate operations. 
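// Illustrative example (assumed convention; see the definition of
// buildAltOpShuffleMask for the authoritative behavior): for an alternate
// add/sub node with Scalars = {a0+b0, a1-b1, a2+b2, a3-b3}, the main vector
// holds the adds and the alternate vector holds the subs. With IsAltOp
// matching the subs, the produced mask is roughly {0, 4+1, 2, 4+3} =
// {0, 5, 2, 7}: even lanes from the main vector, odd lanes from the
// alternate one.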
3472 void 3473 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, 3474 SmallVectorImpl<int> &Mask, 3475 SmallVectorImpl<Value *> *OpScalars = nullptr, 3476 SmallVectorImpl<Value *> *AltScalars = nullptr) const; 3477 3478 /// Return true if this is a non-power-of-2 node. 3479 bool isNonPowOf2Vec() const { 3480 bool IsNonPowerOf2 = !has_single_bit(Scalars.size()); 3481 return IsNonPowerOf2; 3482 } 3483 3484 /// Return true if this is a node, which tries to vectorize number of 3485 /// elements, forming whole vectors. 3486 bool 3487 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const { 3488 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2( 3489 TTI, getValueType(Scalars.front()), Scalars.size()); 3490 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && 3491 "Reshuffling not supported with non-power-of-2 vectors yet."); 3492 return IsNonPowerOf2; 3493 } 3494 3495 Value *getOrdered(unsigned Idx) const { 3496 assert(isGather() && "Must be used only for buildvectors/gathers."); 3497 if (ReorderIndices.empty()) 3498 return Scalars[Idx]; 3499 SmallVector<int> Mask; 3500 inversePermutation(ReorderIndices, Mask); 3501 return Scalars[Mask[Idx]]; 3502 } 3503 3504 #ifndef NDEBUG 3505 /// Debug printer. 3506 LLVM_DUMP_METHOD void dump() const { 3507 dbgs() << Idx << ".\n"; 3508 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { 3509 dbgs() << "Operand " << OpI << ":\n"; 3510 for (const Value *V : Operands[OpI]) 3511 dbgs().indent(2) << *V << "\n"; 3512 } 3513 dbgs() << "Scalars: \n"; 3514 for (Value *V : Scalars) 3515 dbgs().indent(2) << *V << "\n"; 3516 dbgs() << "State: "; 3517 switch (State) { 3518 case Vectorize: 3519 if (InterleaveFactor > 0) { 3520 dbgs() << "Vectorize with interleave factor " << InterleaveFactor 3521 << "\n"; 3522 } else { 3523 dbgs() << "Vectorize\n"; 3524 } 3525 break; 3526 case ScatterVectorize: 3527 dbgs() << "ScatterVectorize\n"; 3528 break; 3529 case StridedVectorize: 3530 dbgs() << "StridedVectorize\n"; 3531 break; 3532 case NeedToGather: 3533 dbgs() << "NeedToGather\n"; 3534 break; 3535 case CombinedVectorize: 3536 dbgs() << "CombinedVectorize\n"; 3537 break; 3538 } 3539 if (S) { 3540 dbgs() << "MainOp: " << *S.getMainOp() << "\n"; 3541 dbgs() << "AltOp: " << *S.getAltOp() << "\n"; 3542 } else { 3543 dbgs() << "MainOp: NULL\n"; 3544 dbgs() << "AltOp: NULL\n"; 3545 } 3546 dbgs() << "VectorizedValue: "; 3547 if (VectorizedValue) 3548 dbgs() << *VectorizedValue << "\n"; 3549 else 3550 dbgs() << "NULL\n"; 3551 dbgs() << "ReuseShuffleIndices: "; 3552 if (ReuseShuffleIndices.empty()) 3553 dbgs() << "Empty"; 3554 else 3555 for (int ReuseIdx : ReuseShuffleIndices) 3556 dbgs() << ReuseIdx << ", "; 3557 dbgs() << "\n"; 3558 dbgs() << "ReorderIndices: "; 3559 for (unsigned ReorderIdx : ReorderIndices) 3560 dbgs() << ReorderIdx << ", "; 3561 dbgs() << "\n"; 3562 dbgs() << "UserTreeIndices: "; 3563 for (const auto &EInfo : UserTreeIndices) 3564 dbgs() << EInfo << ", "; 3565 dbgs() << "\n"; 3566 if (!CombinedEntriesWithIndices.empty()) { 3567 dbgs() << "Combined entries: "; 3568 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) { 3569 dbgs() << "Entry index " << P.first << " with offset " << P.second; 3570 }); 3571 dbgs() << "\n"; 3572 } 3573 } 3574 #endif 3575 }; 3576 3577 #ifndef NDEBUG 3578 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, 3579 InstructionCost VecCost, InstructionCost ScalarCost, 3580 StringRef Banner) const { 3581 dbgs() << "SLP: " << Banner << ":\n"; 3582 
E->dump(); 3583 dbgs() << "SLP: Costs:\n"; 3584 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; 3585 dbgs() << "SLP: VectorCost = " << VecCost << "\n"; 3586 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; 3587 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3588 << ReuseShuffleCost + VecCost - ScalarCost << "\n"; 3589 } 3590 #endif 3591 3592 /// Create a new VectorizableTree entry. 3593 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 3594 std::optional<ScheduleData *> Bundle, 3595 const InstructionsState &S, 3596 const EdgeInfo &UserTreeIdx, 3597 ArrayRef<int> ReuseShuffleIndices = {}, 3598 ArrayRef<unsigned> ReorderIndices = {}, 3599 unsigned InterleaveFactor = 0) { 3600 TreeEntry::EntryState EntryState = 3601 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; 3602 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, 3603 ReuseShuffleIndices, ReorderIndices); 3604 if (E && InterleaveFactor > 0) 3605 E->setInterleave(InterleaveFactor); 3606 return E; 3607 } 3608 3609 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 3610 TreeEntry::EntryState EntryState, 3611 std::optional<ScheduleData *> Bundle, 3612 const InstructionsState &S, 3613 const EdgeInfo &UserTreeIdx, 3614 ArrayRef<int> ReuseShuffleIndices = {}, 3615 ArrayRef<unsigned> ReorderIndices = {}) { 3616 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || 3617 (Bundle && EntryState != TreeEntry::NeedToGather)) && 3618 "Need to vectorize gather entry?"); 3619 // Gathered loads still gathered? Do not create entry, use the original one. 3620 if (GatheredLoadsEntriesFirst.has_value() && 3621 EntryState == TreeEntry::NeedToGather && S && 3622 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && 3623 !UserTreeIdx.UserTE) 3624 return nullptr; 3625 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); 3626 TreeEntry *Last = VectorizableTree.back().get(); 3627 Last->Idx = VectorizableTree.size() - 1; 3628 Last->State = EntryState; 3629 // FIXME: Remove once support for ReuseShuffleIndices has been implemented 3630 // for non-power-of-two vectors. 3631 assert( 3632 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) || 3633 ReuseShuffleIndices.empty()) && 3634 "Reshuffling scalars not yet supported for nodes with padding"); 3635 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), 3636 ReuseShuffleIndices.end()); 3637 if (ReorderIndices.empty()) { 3638 Last->Scalars.assign(VL.begin(), VL.end()); 3639 if (S) 3640 Last->setOperations(S); 3641 } else { 3642 // Reorder scalars and build final mask. 
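// For example (illustrative values): with VL = {a, b, c, d} and
// ReorderIndices = {2, 0, 1, 3}, the transform below produces
// Scalars = {VL[2], VL[0], VL[1], VL[3]} = {c, a, b, d}; any out-of-range
// index becomes an undef placeholder.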
3643 Last->Scalars.assign(VL.size(), nullptr); 3644 transform(ReorderIndices, Last->Scalars.begin(), 3645 [VL](unsigned Idx) -> Value * { 3646 if (Idx >= VL.size()) 3647 return UndefValue::get(VL.front()->getType()); 3648 return VL[Idx]; 3649 }); 3650 InstructionsState S = getSameOpcode(Last->Scalars, *TLI); 3651 if (S) 3652 Last->setOperations(S); 3653 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); 3654 } 3655 if (!Last->isGather()) { 3656 SmallPtrSet<Value *, 4> Processed; 3657 for (Value *V : VL) { 3658 if (isa<PoisonValue>(V)) 3659 continue; 3660 auto It = ScalarToTreeEntries.find(V); 3661 assert( 3662 (It == ScalarToTreeEntries.end() || 3663 (It->getSecond().size() == 1 && It->getSecond().front() == Last) || 3664 doesNotNeedToBeScheduled(V)) && 3665 "Scalar already in tree!"); 3666 if (It == ScalarToTreeEntries.end()) { 3667 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last); 3668 (void)Processed.insert(V); 3669 } else if (Processed.insert(V).second) { 3670 assert(!is_contained(It->getSecond(), Last) && 3671 "Value already associated with the node."); 3672 It->getSecond().push_back(Last); 3673 } 3674 } 3675 // Update the scheduler bundle to point to this TreeEntry. 3676 ScheduleData *BundleMember = *Bundle; 3677 assert((BundleMember || isa<PHINode>(S.getMainOp()) || 3678 isVectorLikeInstWithConstOps(S.getMainOp()) || 3679 doesNotNeedToSchedule(VL)) && 3680 "Bundle and VL out of sync"); 3681 if (BundleMember) { 3682 for (Value *V : VL) { 3683 if (doesNotNeedToBeScheduled(V)) 3684 continue; 3685 if (!BundleMember) 3686 continue; 3687 BundleMember->TE = Last; 3688 BundleMember = BundleMember->NextInBundle; 3689 } 3690 } 3691 assert(!BundleMember && "Bundle and VL out of sync"); 3692 } else { 3693 // Build a map for gathered scalars to the nodes where they are used. 3694 bool AllConstsOrCasts = true; 3695 for (Value *V : VL) 3696 if (!isConstant(V)) { 3697 auto *I = dyn_cast<CastInst>(V); 3698 AllConstsOrCasts &= I && I->getType()->isIntegerTy(); 3699 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || 3700 !UserTreeIdx.UserTE->isGather()) 3701 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); 3702 } 3703 if (AllConstsOrCasts) 3704 CastMaxMinBWSizes = 3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1); 3706 MustGather.insert(VL.begin(), VL.end()); 3707 } 3708 3709 if (UserTreeIdx.UserTE) 3710 Last->UserTreeIndices.push_back(UserTreeIdx); 3711 return Last; 3712 } 3713 3714 /// -- Vectorization State -- 3715 /// Holds all of the tree entries. 3716 TreeEntry::VecTreeTy VectorizableTree; 3717 3718 #ifndef NDEBUG 3719 /// Debug printer. 3720 LLVM_DUMP_METHOD void dumpVectorizableTree() const { 3721 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { 3722 VectorizableTree[Id]->dump(); 3723 dbgs() << "\n"; 3724 } 3725 } 3726 #endif 3727 3728 /// Get list of vector entries, associated with the value \p V. 3729 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const { 3730 assert(V && "V cannot be nullptr."); 3731 auto It = ScalarToTreeEntries.find(V); 3732 if (It == ScalarToTreeEntries.end()) 3733 return {}; 3734 return It->getSecond(); 3735 } 3736 3737 /// Returns first vector node for value \p V, matching values \p VL. 
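// Illustrative usage (hypothetical caller): because a scalar can belong to
// several nodes, callers usually ask for the entry that matches the exact
// value list, optionally with the same vector factor:
//   if (TreeEntry *TE =
//           getSameValuesTreeEntry(VL.front(), VL, /*SameVF=*/true))
//     ... reuse TE instead of building a new node for VL ...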
3738 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL, 3739 bool SameVF = false) const { 3740 assert(V && "V cannot be nullptr."); 3741 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V)) 3742 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL)) 3743 return TE; 3744 return nullptr; 3745 } 3746 3747 /// Check that the operand node of alternate node does not generate 3748 /// buildvector sequence. If it is, then probably not worth it to build 3749 /// alternate shuffle, if number of buildvector operands + alternate 3750 /// instruction > than the number of buildvector instructions. 3751 /// \param S the instructions state of the analyzed values. 3752 /// \param VL list of the instructions with alternate opcodes. 3753 bool areAltOperandsProfitable(const InstructionsState &S, 3754 ArrayRef<Value *> VL) const; 3755 3756 /// Checks if the specified list of the instructions/values can be vectorized 3757 /// and fills required data before actual scheduling of the instructions. 3758 TreeEntry::EntryState 3759 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL, 3760 bool IsScatterVectorizeUserTE, 3761 OrdersType &CurrentOrder, 3762 SmallVectorImpl<Value *> &PointerOps); 3763 3764 /// Maps a specific scalar to its tree entry(ies). 3765 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries; 3766 3767 /// Maps a value to the proposed vectorizable size. 3768 SmallDenseMap<Value *, unsigned> InstrElementSize; 3769 3770 /// A list of scalars that we found that we need to keep as scalars. 3771 ValueSet MustGather; 3772 3773 /// A set of first non-schedulable values. 3774 ValueSet NonScheduledFirst; 3775 3776 /// A map between the vectorized entries and the last instructions in the 3777 /// bundles. The bundles are built in use order, not in the def order of the 3778 /// instructions. So, we cannot rely directly on the last instruction in the 3779 /// bundle being the last instruction in the program order during 3780 /// vectorization process since the basic blocks are affected, need to 3781 /// pre-gather them before. 3782 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction; 3783 3784 /// List of gather nodes, depending on other gather/vector nodes, which should 3785 /// be emitted after the vector instruction emission process to correctly 3786 /// handle order of the vector instructions and shuffles. 3787 SetVector<const TreeEntry *> PostponedGathers; 3788 3789 using ValueToGatherNodesMap = 3790 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>; 3791 ValueToGatherNodesMap ValueToGatherNodes; 3792 3793 /// A list of the load entries (node indices), which can be vectorized using 3794 /// strided or masked gather approach, but attempted to be represented as 3795 /// contiguous loads. 3796 SetVector<unsigned> LoadEntriesToVectorize; 3797 3798 /// true if graph nodes transforming mode is on. 3799 bool IsGraphTransformMode = false; 3800 3801 /// The index of the first gathered load entry in the VectorizeTree. 3802 std::optional<unsigned> GatheredLoadsEntriesFirst; 3803 3804 /// This POD struct describes one external user in the vectorized tree. 3805 struct ExternalUser { 3806 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L) 3807 : Scalar(S), User(U), E(E), Lane(L) {} 3808 3809 /// Which scalar in our function. 3810 Value *Scalar = nullptr; 3811 3812 /// Which user that uses the scalar. 3813 llvm::User *User = nullptr; 3814 3815 /// Vector node, the value is part of. 
3816 const TreeEntry &E;
3817
3818 /// Which lane does the scalar belong to.
3819 int Lane;
3820 };
3821 using UserList = SmallVector<ExternalUser, 16>;
3822
3823 /// Checks if two instructions may access the same memory.
3824 ///
3825 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3826 /// is invariant in the calling loop.
3827 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3828 Instruction *Inst2) {
3829 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3830 return true;
3831 // First check if the result is already in the cache.
3832 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3833 auto It = AliasCache.find(Key);
3834 if (It != AliasCache.end())
3835 return It->second;
3836 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3837 // Store the result in the cache.
3838 AliasCache.try_emplace(Key, Aliased);
3839 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3840 return Aliased;
3841 }
3842
3843 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3844
3845 /// Cache for alias results.
3846 /// TODO: consider moving this to the AliasAnalysis itself.
3847 DenseMap<AliasCacheKey, bool> AliasCache;
3848
3849 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3850 // globally through SLP because we don't perform any action which
3851 // invalidates capture results.
3852 BatchAAResults BatchAA;
3853
3854 /// Temporary store for deleted instructions. Instructions will be deleted
3855 /// eventually when the BoUpSLP is destructed. The deferral is required to
3856 /// ensure that there are no incorrect collisions in the AliasCache, which
3857 /// can happen if a new instruction is allocated at the same address as a
3858 /// previously deleted instruction.
3859 DenseSet<Instruction *> DeletedInstructions;
3860
3861 /// Set of the instructions already analyzed for reductions.
3862 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3863
3864 /// Set of hashes for the list of reduction values already being analyzed.
3865 DenseSet<size_t> AnalyzedReductionVals;
3866
3867 /// Values already analyzed for minimal bitwidth and found to be
3868 /// non-profitable.
3869 DenseSet<Value *> AnalyzedMinBWVals;
3870
3871 /// A list of values that need to be extracted out of the tree.
3872 /// This list holds pairs of (Internal Scalar : External User). External User
3873 /// can be nullptr, meaning that this Internal Scalar will be used later,
3874 /// after vectorization.
3875 UserList ExternalUses;
3876
3877 /// A list of GEPs which can be replaced by scalar GEPs instead of
3878 /// extractelement instructions.
3879 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3880
3881 /// Values used only by @llvm.assume calls.
3882 SmallPtrSet<const Value *, 32> EphValues;
3883
3884 /// Holds all of the instructions that we gathered, shuffle instructions and
3885 /// extractelements.
3886 SetVector<Instruction *> GatherShuffleExtractSeq;
3887
3888 /// A list of blocks that we are going to CSE.
3889 DenseSet<BasicBlock *> CSEBlocks;
3890
3891 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
3892 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3893
3894 /// Contains all scheduling relevant data for an instruction.
3895 /// A ScheduleData either represents a single instruction or a member of an
3896 /// instruction bundle (= a group of instructions which is combined into a
3897 /// vector instruction).
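// Illustrative note (assumed example): for a vectorized bundle of three
// stores S0, S1, S2, each store gets its own ScheduleData; all three share
// FirstInBundle == SD(S0), and the chain is SD(S0)->NextInBundle == SD(S1),
// SD(S1)->NextInBundle == SD(S2), SD(S2)->NextInBundle == nullptr. Only the
// head SD(S0) is the scheduling entity that can appear in the ready list.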
3898 struct ScheduleData { 3899 // The initial value for the dependency counters. It means that the 3900 // dependencies are not calculated yet. 3901 enum { InvalidDeps = -1 }; 3902 3903 ScheduleData() = default; 3904 3905 void init(int BlockSchedulingRegionID, Instruction *I) { 3906 FirstInBundle = this; 3907 NextInBundle = nullptr; 3908 NextLoadStore = nullptr; 3909 IsScheduled = false; 3910 SchedulingRegionID = BlockSchedulingRegionID; 3911 clearDependencies(); 3912 Inst = I; 3913 TE = nullptr; 3914 } 3915 3916 /// Verify basic self consistency properties 3917 void verify() { 3918 if (hasValidDependencies()) { 3919 assert(UnscheduledDeps <= Dependencies && "invariant"); 3920 } else { 3921 assert(UnscheduledDeps == Dependencies && "invariant"); 3922 } 3923 3924 if (IsScheduled) { 3925 assert(isSchedulingEntity() && 3926 "unexpected scheduled state"); 3927 for (const ScheduleData *BundleMember = this; BundleMember; 3928 BundleMember = BundleMember->NextInBundle) { 3929 assert(BundleMember->hasValidDependencies() && 3930 BundleMember->UnscheduledDeps == 0 && 3931 "unexpected scheduled state"); 3932 assert((BundleMember == this || !BundleMember->IsScheduled) && 3933 "only bundle is marked scheduled"); 3934 } 3935 } 3936 3937 assert(Inst->getParent() == FirstInBundle->Inst->getParent() && 3938 "all bundle members must be in same basic block"); 3939 } 3940 3941 /// Returns true if the dependency information has been calculated. 3942 /// Note that depenendency validity can vary between instructions within 3943 /// a single bundle. 3944 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 3945 3946 /// Returns true for single instructions and for bundle representatives 3947 /// (= the head of a bundle). 3948 bool isSchedulingEntity() const { return FirstInBundle == this; } 3949 3950 /// Returns true if it represents an instruction bundle and not only a 3951 /// single instruction. 3952 bool isPartOfBundle() const { 3953 return NextInBundle != nullptr || FirstInBundle != this || TE; 3954 } 3955 3956 /// Returns true if it is ready for scheduling, i.e. it has no more 3957 /// unscheduled depending instructions/bundles. 3958 bool isReady() const { 3959 assert(isSchedulingEntity() && 3960 "can't consider non-scheduling entity for ready list"); 3961 return unscheduledDepsInBundle() == 0 && !IsScheduled; 3962 } 3963 3964 /// Modifies the number of unscheduled dependencies for this instruction, 3965 /// and returns the number of remaining dependencies for the containing 3966 /// bundle. 3967 int incrementUnscheduledDeps(int Incr) { 3968 assert(hasValidDependencies() && 3969 "increment of unscheduled deps would be meaningless"); 3970 UnscheduledDeps += Incr; 3971 return FirstInBundle->unscheduledDepsInBundle(); 3972 } 3973 3974 /// Sets the number of unscheduled dependencies to the number of 3975 /// dependencies. 3976 void resetUnscheduledDeps() { 3977 UnscheduledDeps = Dependencies; 3978 } 3979 3980 /// Clears all dependency information. 
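// Note on the dependency counters (illustrative numbers): an instruction
// with two users inside the scheduling region and one conflicting memory
// access gets Dependencies = UnscheduledDeps = 3 once its dependencies are
// calculated. Each time one of those dependent instructions is scheduled,
// incrementUnscheduledDeps(-1) is called; when the bundle-wide sum of
// UnscheduledDeps reaches 0, the bundle becomes ready for scheduling.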
3981 void clearDependencies() { 3982 Dependencies = InvalidDeps; 3983 resetUnscheduledDeps(); 3984 MemoryDependencies.clear(); 3985 ControlDependencies.clear(); 3986 } 3987 3988 int unscheduledDepsInBundle() const { 3989 assert(isSchedulingEntity() && "only meaningful on the bundle"); 3990 int Sum = 0; 3991 for (const ScheduleData *BundleMember = this; BundleMember; 3992 BundleMember = BundleMember->NextInBundle) { 3993 if (BundleMember->UnscheduledDeps == InvalidDeps) 3994 return InvalidDeps; 3995 Sum += BundleMember->UnscheduledDeps; 3996 } 3997 return Sum; 3998 } 3999 4000 void dump(raw_ostream &os) const { 4001 if (!isSchedulingEntity()) { 4002 os << "/ " << *Inst; 4003 } else if (NextInBundle) { 4004 os << '[' << *Inst; 4005 ScheduleData *SD = NextInBundle; 4006 while (SD) { 4007 os << ';' << *SD->Inst; 4008 SD = SD->NextInBundle; 4009 } 4010 os << ']'; 4011 } else { 4012 os << *Inst; 4013 } 4014 } 4015 4016 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } 4017 4018 Instruction *Inst = nullptr; 4019 4020 /// The TreeEntry that this instruction corresponds to. 4021 TreeEntry *TE = nullptr; 4022 4023 /// Points to the head in an instruction bundle (and always to this for 4024 /// single instructions). 4025 ScheduleData *FirstInBundle = nullptr; 4026 4027 /// Single linked list of all instructions in a bundle. Null if it is a 4028 /// single instruction. 4029 ScheduleData *NextInBundle = nullptr; 4030 4031 /// Single linked list of all memory instructions (e.g. load, store, call) 4032 /// in the block - until the end of the scheduling region. 4033 ScheduleData *NextLoadStore = nullptr; 4034 4035 /// The dependent memory instructions. 4036 /// This list is derived on demand in calculateDependencies(). 4037 SmallVector<ScheduleData *, 4> MemoryDependencies; 4038 4039 /// List of instructions which this instruction could be control dependent 4040 /// on. Allowing such nodes to be scheduled below this one could introduce 4041 /// a runtime fault which didn't exist in the original program. 4042 /// ex: this is a load or udiv following a readonly call which inf loops 4043 SmallVector<ScheduleData *, 4> ControlDependencies; 4044 4045 /// This ScheduleData is in the current scheduling region if this matches 4046 /// the current SchedulingRegionID of BlockScheduling. 4047 int SchedulingRegionID = 0; 4048 4049 /// Used for getting a "good" final ordering of instructions. 4050 int SchedulingPriority = 0; 4051 4052 /// The number of dependencies. Constitutes of the number of users of the 4053 /// instruction plus the number of dependent memory instructions (if any). 4054 /// This value is calculated on demand. 4055 /// If InvalidDeps, the number of dependencies is not calculated yet. 4056 int Dependencies = InvalidDeps; 4057 4058 /// The number of dependencies minus the number of dependencies of scheduled 4059 /// instructions. As soon as this is zero, the instruction/bundle gets ready 4060 /// for scheduling. 4061 /// Note that this is negative as long as Dependencies is not calculated. 4062 int UnscheduledDeps = InvalidDeps; 4063 4064 /// True if this instruction is scheduled (or considered as scheduled in the 4065 /// dry-run). 
4066 bool IsScheduled = false;
4067 };
4068
4069 #ifndef NDEBUG
4070 friend inline raw_ostream &operator<<(raw_ostream &os,
4071 const BoUpSLP::ScheduleData &SD) {
4072 SD.dump(os);
4073 return os;
4074 }
4075 #endif
4076
4077 friend struct GraphTraits<BoUpSLP *>;
4078 friend struct DOTGraphTraits<BoUpSLP *>;
4079
4080 /// Contains all scheduling data for a basic block.
4081 /// It does not schedule instructions that are not memory read/write
4082 /// instructions and whose operands are either constants, arguments, phis,
4083 /// or instructions from other blocks, or whose users are phis or live in
4084 /// other blocks. The resulting vector instructions can be placed at the
4085 /// beginning of the basic block without scheduling (if their operands do
4086 /// not need to be scheduled) or at the end of the block (if their users are
4087 /// outside of the block). This saves some compile time and memory used by
4088 /// the compiler.
4089 /// ScheduleData is assigned to each instruction between the boundaries of
4090 /// the tree entry, even to those that are not part of the graph. It is
4091 /// required to correctly follow the dependencies between the instructions
4092 /// and to ensure their correct scheduling. ScheduleData is not allocated for
4093 /// instructions that do not require scheduling, like phis, nodes containing
4094 /// only extractelements/insertelements, or nodes whose instructions have
4095 /// uses/operands outside of the block.
4096 struct BlockScheduling {
4097 BlockScheduling(BasicBlock *BB)
4098 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4099
4100 void clear() {
4101 ReadyInsts.clear();
4102 ScheduleStart = nullptr;
4103 ScheduleEnd = nullptr;
4104 FirstLoadStoreInRegion = nullptr;
4105 LastLoadStoreInRegion = nullptr;
4106 RegionHasStackSave = false;
4107
4108 // Reduce the maximum schedule region size by the size of the
4109 // previous scheduling run.
4110 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4111 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4112 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4113 ScheduleRegionSize = 0;
4114
4115 // Make a new scheduling region, i.e. all existing ScheduleData is not
4116 // in the new region yet.
4117 ++SchedulingRegionID;
4118 }
4119
4120 ScheduleData *getScheduleData(Instruction *I) {
4121 if (BB != I->getParent())
4122 // Avoid lookup if it can't possibly be in the map.
4123 return nullptr;
4124 ScheduleData *SD = ScheduleDataMap.lookup(I);
4125 if (SD && isInSchedulingRegion(SD))
4126 return SD;
4127 return nullptr;
4128 }
4129
4130 ScheduleData *getScheduleData(Value *V) {
4131 if (auto *I = dyn_cast<Instruction>(V))
4132 return getScheduleData(I);
4133 return nullptr;
4134 }
4135
4136 bool isInSchedulingRegion(ScheduleData *SD) const {
4137 return SD->SchedulingRegionID == SchedulingRegionID;
4138 }
4139
4140 /// Marks an instruction as scheduled and puts all dependent instructions
4141 /// that become ready into the ready-list.
4142 template <typename ReadyListType>
4143 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4144 SD->IsScheduled = true;
4145 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4146
4147 for (ScheduleData *BundleMember = SD; BundleMember;
4148 BundleMember = BundleMember->NextInBundle) {
4149
4150 // Handle the def-use chain dependencies.
4151
4152 // Decrement the unscheduled counter and insert to ready list if ready.
4153 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { 4154 ScheduleData *OpDef = getScheduleData(I); 4155 if (OpDef && OpDef->hasValidDependencies() && 4156 OpDef->incrementUnscheduledDeps(-1) == 0) { 4157 // There are no more unscheduled dependencies after 4158 // decrementing, so we can put the dependent instruction 4159 // into the ready list. 4160 ScheduleData *DepBundle = OpDef->FirstInBundle; 4161 assert(!DepBundle->IsScheduled && 4162 "already scheduled bundle gets ready"); 4163 ReadyList.insert(DepBundle); 4164 LLVM_DEBUG(dbgs() 4165 << "SLP: gets ready (def): " << *DepBundle << "\n"); 4166 } 4167 }; 4168 4169 // If BundleMember is a vector bundle, its operands may have been 4170 // reordered during buildTree(). We therefore need to get its operands 4171 // through the TreeEntry. 4172 if (TreeEntry *TE = BundleMember->TE) { 4173 // Need to search for the lane since the tree entry can be reordered. 4174 auto *In = BundleMember->Inst; 4175 int Lane = std::distance(TE->Scalars.begin(), 4176 find(TE->Scalars, In)); 4177 assert(Lane >= 0 && "Lane not set"); 4178 4179 // Since vectorization tree is being built recursively this assertion 4180 // ensures that the tree entry has all operands set before reaching 4181 // this code. Couple of exceptions known at the moment are extracts 4182 // where their second (immediate) operand is not added. Since 4183 // immediates do not affect scheduler behavior this is considered 4184 // okay. 4185 assert( 4186 In && 4187 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) || 4188 In->getNumOperands() == TE->getNumOperands()) && 4189 "Missed TreeEntry operands?"); 4190 4191 for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands())) 4192 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) 4193 DecrUnsched(I); 4194 } else { 4195 // If BundleMember is a stand-alone instruction, no operand reordering 4196 // has taken place, so we directly access its operands. 4197 for (Use &U : BundleMember->Inst->operands()) 4198 if (auto *I = dyn_cast<Instruction>(U.get())) 4199 DecrUnsched(I); 4200 } 4201 // Handle the memory dependencies. 4202 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 4203 if (MemoryDepSD->hasValidDependencies() && 4204 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 4205 // There are no more unscheduled dependencies after decrementing, 4206 // so we can put the dependent instruction into the ready list. 4207 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 4208 assert(!DepBundle->IsScheduled && 4209 "already scheduled bundle gets ready"); 4210 ReadyList.insert(DepBundle); 4211 LLVM_DEBUG(dbgs() 4212 << "SLP: gets ready (mem): " << *DepBundle << "\n"); 4213 } 4214 } 4215 // Handle the control dependencies. 4216 for (ScheduleData *DepSD : BundleMember->ControlDependencies) { 4217 if (DepSD->incrementUnscheduledDeps(-1) == 0) { 4218 // There are no more unscheduled dependencies after decrementing, 4219 // so we can put the dependent instruction into the ready list. 4220 ScheduleData *DepBundle = DepSD->FirstInBundle; 4221 assert(!DepBundle->IsScheduled && 4222 "already scheduled bundle gets ready"); 4223 ReadyList.insert(DepBundle); 4224 LLVM_DEBUG(dbgs() 4225 << "SLP: gets ready (ctl): " << *DepBundle << "\n"); 4226 } 4227 } 4228 } 4229 } 4230 4231 /// Verify basic self consistency properties of the data structure. 
4232 void verify() { 4233 if (!ScheduleStart) 4234 return; 4235 4236 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && 4237 ScheduleStart->comesBefore(ScheduleEnd) && 4238 "Not a valid scheduling region?"); 4239 4240 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 4241 auto *SD = getScheduleData(I); 4242 if (!SD) 4243 continue; 4244 assert(isInSchedulingRegion(SD) && 4245 "primary schedule data not in window?"); 4246 assert(isInSchedulingRegion(SD->FirstInBundle) && 4247 "entire bundle in window!"); 4248 SD->verify(); 4249 } 4250 4251 for (auto *SD : ReadyInsts) { 4252 assert(SD->isSchedulingEntity() && SD->isReady() && 4253 "item in ready list not ready?"); 4254 (void)SD; 4255 } 4256 } 4257 4258 /// Put all instructions into the ReadyList which are ready for scheduling. 4259 template <typename ReadyListType> 4260 void initialFillReadyList(ReadyListType &ReadyList) { 4261 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 4262 ScheduleData *SD = getScheduleData(I); 4263 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() && 4264 SD->isReady()) { 4265 ReadyList.insert(SD); 4266 LLVM_DEBUG(dbgs() 4267 << "SLP: initially in ready list: " << *SD << "\n"); 4268 } 4269 } 4270 } 4271 4272 /// Build a bundle from the ScheduleData nodes corresponding to the 4273 /// scalar instruction for each lane. 4274 ScheduleData *buildBundle(ArrayRef<Value *> VL); 4275 4276 /// Checks if a bundle of instructions can be scheduled, i.e. has no 4277 /// cyclic dependencies. This is only a dry-run, no instructions are 4278 /// actually moved at this stage. 4279 /// \returns the scheduling bundle. The returned Optional value is not 4280 /// std::nullopt if \p VL is allowed to be scheduled. 4281 std::optional<ScheduleData *> 4282 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 4283 const InstructionsState &S); 4284 4285 /// Un-bundles a group of instructions. 4286 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); 4287 4288 /// Allocates schedule data chunk. 4289 ScheduleData *allocateScheduleDataChunks(); 4290 4291 /// Extends the scheduling region so that V is inside the region. 4292 /// \returns true if the region size is within the limit. 4293 bool extendSchedulingRegion(Value *V, const InstructionsState &S); 4294 4295 /// Initialize the ScheduleData structures for new instructions in the 4296 /// scheduling region. 4297 void initScheduleData(Instruction *FromI, Instruction *ToI, 4298 ScheduleData *PrevLoadStore, 4299 ScheduleData *NextLoadStore); 4300 4301 /// Updates the dependency information of a bundle and of all instructions/ 4302 /// bundles which depend on the original bundle. 4303 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 4304 BoUpSLP *SLP); 4305 4306 /// Sets all instruction in the scheduling region to un-scheduled. 4307 void resetSchedule(); 4308 4309 BasicBlock *BB; 4310 4311 /// Simple memory allocation for ScheduleData. 4312 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 4313 4314 /// The size of a ScheduleData array in ScheduleDataChunks. 4315 int ChunkSize; 4316 4317 /// The allocator position in the current chunk, which is the last entry 4318 /// of ScheduleDataChunks. 4319 int ChunkPos; 4320 4321 /// Attaches ScheduleData to Instruction. 4322 /// Note that the mapping survives during all vectorization iterations, i.e. 4323 /// ScheduleData structures are recycled. 
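// Illustrative note: getScheduleData() treats an entry whose
// SchedulingRegionID does not match the current region as absent, e.g.
// (BS being some BlockScheduling instance):
//   BS.clear();                               // bumps SchedulingRegionID
//   ScheduleData *SD = BS.getScheduleData(I); // now returns nullptr for I
// so bumping the ID in clear() effectively empties this map without
// deallocating or erasing the recycled ScheduleData objects.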
4324 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; 4325 4326 /// The ready-list for scheduling (only used for the dry-run). 4327 SetVector<ScheduleData *> ReadyInsts; 4328 4329 /// The first instruction of the scheduling region. 4330 Instruction *ScheduleStart = nullptr; 4331 4332 /// The first instruction _after_ the scheduling region. 4333 Instruction *ScheduleEnd = nullptr; 4334 4335 /// The first memory accessing instruction in the scheduling region 4336 /// (can be null). 4337 ScheduleData *FirstLoadStoreInRegion = nullptr; 4338 4339 /// The last memory accessing instruction in the scheduling region 4340 /// (can be null). 4341 ScheduleData *LastLoadStoreInRegion = nullptr; 4342 4343 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling 4344 /// region? Used to optimize the dependence calculation for the 4345 /// common case where there isn't. 4346 bool RegionHasStackSave = false; 4347 4348 /// The current size of the scheduling region. 4349 int ScheduleRegionSize = 0; 4350 4351 /// The maximum size allowed for the scheduling region. 4352 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; 4353 4354 /// The ID of the scheduling region. For a new vectorization iteration this 4355 /// is incremented which "removes" all ScheduleData from the region. 4356 /// Make sure that the initial SchedulingRegionID is greater than the 4357 /// initial SchedulingRegionID in ScheduleData (which is 0). 4358 int SchedulingRegionID = 1; 4359 }; 4360 4361 /// Attaches the BlockScheduling structures to basic blocks. 4362 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 4363 4364 /// Performs the "real" scheduling. Done before vectorization is actually 4365 /// performed in a basic block. 4366 void scheduleBlock(BlockScheduling *BS); 4367 4368 /// List of users to ignore during scheduling and that don't need extracting. 4369 const SmallDenseSet<Value *> *UserIgnoreList = nullptr; 4370 4371 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of 4372 /// sorted SmallVectors of unsigned. 4373 struct OrdersTypeDenseMapInfo { 4374 static OrdersType getEmptyKey() { 4375 OrdersType V; 4376 V.push_back(~1U); 4377 return V; 4378 } 4379 4380 static OrdersType getTombstoneKey() { 4381 OrdersType V; 4382 V.push_back(~2U); 4383 return V; 4384 } 4385 4386 static unsigned getHashValue(const OrdersType &V) { 4387 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); 4388 } 4389 4390 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { 4391 return LHS == RHS; 4392 } 4393 }; 4394 4395 // Analysis and block reference. 4396 Function *F; 4397 ScalarEvolution *SE; 4398 TargetTransformInfo *TTI; 4399 TargetLibraryInfo *TLI; 4400 LoopInfo *LI; 4401 DominatorTree *DT; 4402 AssumptionCache *AC; 4403 DemandedBits *DB; 4404 const DataLayout *DL; 4405 OptimizationRemarkEmitter *ORE; 4406 4407 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. 4408 unsigned MinVecRegSize; // Set by cl::opt (default: 128). 4409 4410 /// Instruction builder to construct the vectorized tree. 4411 IRBuilder<TargetFolder> Builder; 4412 4413 /// A map of scalar integer values to the smallest bit width with which they 4414 /// can legally be represented. The values map to (width, signed) pairs, 4415 /// where "width" indicates the minimum bit width and "signed" is True if the 4416 /// value must be signed-extended, rather than zero-extended, back to its 4417 /// original width. 
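  /// For example (hypothetical): if every user of a vectorized i32 chain only
  /// demands the low 16 bits and the values may be negative, the corresponding
  /// entry would be (16, /*IsSigned=*/true), i.e. the narrowed lanes must be
  /// sign-extended back to i32 whenever an original-width value is required.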
4418 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; 4419 4420 /// Final size of the reduced vector, if the current graph represents the 4421 /// input for the reduction and it was possible to narrow the size of the 4422 /// reduction. 4423 unsigned ReductionBitWidth = 0; 4424 4425 /// Canonical graph size before the transformations. 4426 unsigned BaseGraphSize = 1; 4427 4428 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of 4429 /// type sizes, used in the tree. 4430 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes; 4431 4432 /// Indices of the vectorized nodes, which supposed to be the roots of the new 4433 /// bitwidth analysis attempt, like trunc, IToFP or ICmp. 4434 DenseSet<unsigned> ExtraBitWidthNodes; 4435 }; 4436 4437 } // end namespace slpvectorizer 4438 4439 template <> struct GraphTraits<BoUpSLP *> { 4440 using TreeEntry = BoUpSLP::TreeEntry; 4441 4442 /// NodeRef has to be a pointer per the GraphWriter. 4443 using NodeRef = TreeEntry *; 4444 4445 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; 4446 4447 /// Add the VectorizableTree to the index iterator to be able to return 4448 /// TreeEntry pointers. 4449 struct ChildIteratorType 4450 : public iterator_adaptor_base< 4451 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { 4452 ContainerTy &VectorizableTree; 4453 4454 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, 4455 ContainerTy &VT) 4456 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} 4457 4458 NodeRef operator*() { return I->UserTE; } 4459 }; 4460 4461 static NodeRef getEntryNode(BoUpSLP &R) { 4462 return R.VectorizableTree[0].get(); 4463 } 4464 4465 static ChildIteratorType child_begin(NodeRef N) { 4466 return {N->UserTreeIndices.begin(), N->Container}; 4467 } 4468 4469 static ChildIteratorType child_end(NodeRef N) { 4470 return {N->UserTreeIndices.end(), N->Container}; 4471 } 4472 4473 /// For the node iterator we just need to turn the TreeEntry iterator into a 4474 /// TreeEntry* iterator so that it dereferences to NodeRef. 
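  /// For illustration only (assuming the generic llvm::nodes() range helper
  /// from GraphTraits.h and a BoUpSLP *R), this is what lets generic graph
  /// code walk the vectorizable tree:
  /// \code
  ///   for (BoUpSLP::TreeEntry *TE : nodes(R))
  ///     dbgs() << TE->Idx << "\n";
  /// \endcode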
4475 class nodes_iterator { 4476 using ItTy = ContainerTy::iterator; 4477 ItTy It; 4478 4479 public: 4480 nodes_iterator(const ItTy &It2) : It(It2) {} 4481 NodeRef operator*() { return It->get(); } 4482 nodes_iterator operator++() { 4483 ++It; 4484 return *this; 4485 } 4486 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } 4487 }; 4488 4489 static nodes_iterator nodes_begin(BoUpSLP *R) { 4490 return nodes_iterator(R->VectorizableTree.begin()); 4491 } 4492 4493 static nodes_iterator nodes_end(BoUpSLP *R) { 4494 return nodes_iterator(R->VectorizableTree.end()); 4495 } 4496 4497 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } 4498 }; 4499 4500 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { 4501 using TreeEntry = BoUpSLP::TreeEntry; 4502 4503 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 4504 4505 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { 4506 std::string Str; 4507 raw_string_ostream OS(Str); 4508 OS << Entry->Idx << ".\n"; 4509 if (isSplat(Entry->Scalars)) 4510 OS << "<splat> "; 4511 for (auto *V : Entry->Scalars) { 4512 OS << *V; 4513 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { 4514 return EU.Scalar == V; 4515 })) 4516 OS << " <extract>"; 4517 OS << "\n"; 4518 } 4519 return Str; 4520 } 4521 4522 static std::string getNodeAttributes(const TreeEntry *Entry, 4523 const BoUpSLP *) { 4524 if (Entry->isGather()) 4525 return "color=red"; 4526 if (Entry->State == TreeEntry::ScatterVectorize || 4527 Entry->State == TreeEntry::StridedVectorize) 4528 return "color=blue"; 4529 return ""; 4530 } 4531 }; 4532 4533 } // end namespace llvm 4534 4535 BoUpSLP::~BoUpSLP() { 4536 SmallVector<WeakTrackingVH> DeadInsts; 4537 for (auto *I : DeletedInstructions) { 4538 if (!I->getParent()) { 4539 // Temporarily insert instruction back to erase them from parent and 4540 // memory later. 4541 if (isa<PHINode>(I)) 4542 // Phi nodes must be the very first instructions in the block. 4543 I->insertBefore(F->getEntryBlock(), 4544 F->getEntryBlock().getFirstNonPHIIt()); 4545 else 4546 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator()); 4547 continue; 4548 } 4549 for (Use &U : I->operands()) { 4550 auto *Op = dyn_cast<Instruction>(U.get()); 4551 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && 4552 wouldInstructionBeTriviallyDead(Op, TLI)) 4553 DeadInsts.emplace_back(Op); 4554 } 4555 I->dropAllReferences(); 4556 } 4557 for (auto *I : DeletedInstructions) { 4558 assert(I->use_empty() && 4559 "trying to erase instruction with users."); 4560 I->eraseFromParent(); 4561 } 4562 4563 // Cleanup any dead scalar code feeding the vectorized instructions 4564 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); 4565 4566 #ifdef EXPENSIVE_CHECKS 4567 // If we could guarantee that this call is not extremely slow, we could 4568 // remove the ifdef limitation (see PR47712). 4569 assert(!verifyFunction(*F, &dbgs())); 4570 #endif 4571 } 4572 4573 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses 4574 /// contains original mask for the scalars reused in the node. Procedure 4575 /// transform this mask in accordance with the given \p Mask. 
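/// For illustration (arbitrary values): with Reuses = {1, 0, 3, 2} and
/// Mask = {2, 3, 0, 1}, each original Reuses[I] moves to position Mask[I],
/// producing Reuses = {3, 2, 1, 0}.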
4576 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { 4577 assert(!Mask.empty() && Reuses.size() == Mask.size() && 4578 "Expected non-empty mask."); 4579 SmallVector<int> Prev(Reuses.begin(), Reuses.end()); 4580 Prev.swap(Reuses); 4581 for (unsigned I = 0, E = Prev.size(); I < E; ++I) 4582 if (Mask[I] != PoisonMaskElem) 4583 Reuses[Mask[I]] = Prev[I]; 4584 } 4585 4586 /// Reorders the given \p Order according to the given \p Mask. \p Order - is 4587 /// the original order of the scalars. Procedure transforms the provided order 4588 /// in accordance with the given \p Mask. If the resulting \p Order is just an 4589 /// identity order, \p Order is cleared. 4590 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask, 4591 bool BottomOrder = false) { 4592 assert(!Mask.empty() && "Expected non-empty mask."); 4593 unsigned Sz = Mask.size(); 4594 if (BottomOrder) { 4595 SmallVector<unsigned> PrevOrder; 4596 if (Order.empty()) { 4597 PrevOrder.resize(Sz); 4598 std::iota(PrevOrder.begin(), PrevOrder.end(), 0); 4599 } else { 4600 PrevOrder.swap(Order); 4601 } 4602 Order.assign(Sz, Sz); 4603 for (unsigned I = 0; I < Sz; ++I) 4604 if (Mask[I] != PoisonMaskElem) 4605 Order[I] = PrevOrder[Mask[I]]; 4606 if (all_of(enumerate(Order), [&](const auto &Data) { 4607 return Data.value() == Sz || Data.index() == Data.value(); 4608 })) { 4609 Order.clear(); 4610 return; 4611 } 4612 fixupOrderingIndices(Order); 4613 return; 4614 } 4615 SmallVector<int> MaskOrder; 4616 if (Order.empty()) { 4617 MaskOrder.resize(Sz); 4618 std::iota(MaskOrder.begin(), MaskOrder.end(), 0); 4619 } else { 4620 inversePermutation(Order, MaskOrder); 4621 } 4622 reorderReuses(MaskOrder, Mask); 4623 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) { 4624 Order.clear(); 4625 return; 4626 } 4627 Order.assign(Sz, Sz); 4628 for (unsigned I = 0; I < Sz; ++I) 4629 if (MaskOrder[I] != PoisonMaskElem) 4630 Order[MaskOrder[I]] = I; 4631 fixupOrderingIndices(Order); 4632 } 4633 4634 std::optional<BoUpSLP::OrdersType> 4635 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { 4636 assert(TE.isGather() && "Expected gather node only."); 4637 // Try to find subvector extract/insert patterns and reorder only such 4638 // patterns. 4639 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); 4640 Type *ScalarTy = GatheredScalars.front()->getType(); 4641 int NumScalars = GatheredScalars.size(); 4642 if (!isValidElementType(ScalarTy)) 4643 return std::nullopt; 4644 auto *VecTy = getWidenedType(ScalarTy, NumScalars); 4645 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars); 4646 SmallVector<int> ExtractMask; 4647 SmallVector<int> Mask; 4648 SmallVector<SmallVector<const TreeEntry *>> Entries; 4649 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles = 4650 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); 4651 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles = 4652 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts, 4653 /*ForOrder=*/true); 4654 // No shuffled operands - ignore. 4655 if (GatherShuffles.empty() && ExtractShuffles.empty()) 4656 return std::nullopt; 4657 OrdersType CurrentOrder(NumScalars, NumScalars); 4658 if (GatherShuffles.size() == 1 && 4659 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && 4660 Entries.front().front()->isSame(TE.Scalars)) { 4661 // Perfect match in the graph, will reuse the previously vectorized 4662 // node. Cost is 0. 
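    // (The identity order built below tells the caller that the scalars can be
    // taken from the existing tree entry as-is.)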
4663 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); 4664 return CurrentOrder; 4665 } 4666 auto IsSplatMask = [](ArrayRef<int> Mask) { 4667 int SingleElt = PoisonMaskElem; 4668 return all_of(Mask, [&](int I) { 4669 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) 4670 SingleElt = I; 4671 return I == PoisonMaskElem || I == SingleElt; 4672 }); 4673 }; 4674 // Exclusive broadcast mask - ignore. 4675 if ((ExtractShuffles.empty() && IsSplatMask(Mask) && 4676 (Entries.size() != 1 || 4677 Entries.front().front()->ReorderIndices.empty())) || 4678 (GatherShuffles.empty() && IsSplatMask(ExtractMask))) 4679 return std::nullopt; 4680 SmallBitVector ShuffledSubMasks(NumParts); 4681 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder, 4682 ArrayRef<int> Mask, int PartSz, int NumParts, 4683 function_ref<unsigned(unsigned)> GetVF) { 4684 for (int I : seq<int>(0, NumParts)) { 4685 if (ShuffledSubMasks.test(I)) 4686 continue; 4687 const int VF = GetVF(I); 4688 if (VF == 0) 4689 continue; 4690 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I); 4691 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit); 4692 // Shuffle of at least 2 vectors - ignore. 4693 if (any_of(Slice, [&](int I) { return I != NumScalars; })) { 4694 std::fill(Slice.begin(), Slice.end(), NumScalars); 4695 ShuffledSubMasks.set(I); 4696 continue; 4697 } 4698 // Try to include as much elements from the mask as possible. 4699 int FirstMin = INT_MAX; 4700 int SecondVecFound = false; 4701 for (int K : seq<int>(Limit)) { 4702 int Idx = Mask[I * PartSz + K]; 4703 if (Idx == PoisonMaskElem) { 4704 Value *V = GatheredScalars[I * PartSz + K]; 4705 if (isConstant(V) && !isa<PoisonValue>(V)) { 4706 SecondVecFound = true; 4707 break; 4708 } 4709 continue; 4710 } 4711 if (Idx < VF) { 4712 if (FirstMin > Idx) 4713 FirstMin = Idx; 4714 } else { 4715 SecondVecFound = true; 4716 break; 4717 } 4718 } 4719 FirstMin = (FirstMin / PartSz) * PartSz; 4720 // Shuffle of at least 2 vectors - ignore. 4721 if (SecondVecFound) { 4722 std::fill(Slice.begin(), Slice.end(), NumScalars); 4723 ShuffledSubMasks.set(I); 4724 continue; 4725 } 4726 for (int K : seq<int>(Limit)) { 4727 int Idx = Mask[I * PartSz + K]; 4728 if (Idx == PoisonMaskElem) 4729 continue; 4730 Idx -= FirstMin; 4731 if (Idx >= PartSz) { 4732 SecondVecFound = true; 4733 break; 4734 } 4735 if (CurrentOrder[I * PartSz + Idx] > 4736 static_cast<unsigned>(I * PartSz + K) && 4737 CurrentOrder[I * PartSz + Idx] != 4738 static_cast<unsigned>(I * PartSz + Idx)) 4739 CurrentOrder[I * PartSz + Idx] = I * PartSz + K; 4740 } 4741 // Shuffle of at least 2 vectors - ignore. 
4742 if (SecondVecFound) { 4743 std::fill(Slice.begin(), Slice.end(), NumScalars); 4744 ShuffledSubMasks.set(I); 4745 continue; 4746 } 4747 } 4748 }; 4749 int PartSz = getPartNumElems(NumScalars, NumParts); 4750 if (!ExtractShuffles.empty()) 4751 TransformMaskToOrder( 4752 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { 4753 if (!ExtractShuffles[I]) 4754 return 0U; 4755 unsigned VF = 0; 4756 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I); 4757 for (unsigned Idx : seq<unsigned>(Sz)) { 4758 int K = I * PartSz + Idx; 4759 if (ExtractMask[K] == PoisonMaskElem) 4760 continue; 4761 if (!TE.ReuseShuffleIndices.empty()) 4762 K = TE.ReuseShuffleIndices[K]; 4763 if (K == PoisonMaskElem) 4764 continue; 4765 if (!TE.ReorderIndices.empty()) 4766 K = std::distance(TE.ReorderIndices.begin(), 4767 find(TE.ReorderIndices, K)); 4768 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]); 4769 if (!EI) 4770 continue; 4771 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType()) 4772 ->getElementCount() 4773 .getKnownMinValue()); 4774 } 4775 return VF; 4776 }); 4777 // Check special corner case - single shuffle of the same entry. 4778 if (GatherShuffles.size() == 1 && NumParts != 1) { 4779 if (ShuffledSubMasks.any()) 4780 return std::nullopt; 4781 PartSz = NumScalars; 4782 NumParts = 1; 4783 } 4784 if (!Entries.empty()) 4785 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { 4786 if (!GatherShuffles[I]) 4787 return 0U; 4788 return std::max(Entries[I].front()->getVectorFactor(), 4789 Entries[I].back()->getVectorFactor()); 4790 }); 4791 int NumUndefs = 4792 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); 4793 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) 4794 return std::nullopt; 4795 return std::move(CurrentOrder); 4796 } 4797 4798 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, 4799 const TargetLibraryInfo &TLI, 4800 bool CompareOpcodes = true) { 4801 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) != 4802 getUnderlyingObject(Ptr2, RecursionMaxDepth)) 4803 return false; 4804 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); 4805 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); 4806 return (!GEP1 || GEP1->getNumOperands() == 2) && 4807 (!GEP2 || GEP2->getNumOperands() == 2) && 4808 (((!GEP1 || isConstant(GEP1->getOperand(1))) && 4809 (!GEP2 || isConstant(GEP2->getOperand(1)))) || 4810 !CompareOpcodes || 4811 (GEP1 && GEP2 && 4812 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI))); 4813 } 4814 4815 /// Calculates minimal alignment as a common alignment. 4816 template <typename T> 4817 static Align computeCommonAlignment(ArrayRef<Value *> VL) { 4818 Align CommonAlignment = cast<T>(VL.front())->getAlign(); 4819 for (Value *V : VL.drop_front()) 4820 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign()); 4821 return CommonAlignment; 4822 } 4823 4824 /// Check if \p Order represents reverse order. 4825 static bool isReverseOrder(ArrayRef<unsigned> Order) { 4826 assert(!Order.empty() && 4827 "Order is empty. Please check it before using isReverseOrder."); 4828 unsigned Sz = Order.size(); 4829 return all_of(enumerate(Order), [&](const auto &Pair) { 4830 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value(); 4831 }); 4832 } 4833 4834 /// Checks if the provided list of pointers \p Pointers represents the strided 4835 /// pointers for type ElemTy. If they are not, std::nullopt is returned. 
/// Otherwise, if \p Inst is not specified, an engaged optional holding nullptr
/// is returned to show that the pointers represent strided pointers. If \p Inst
/// is specified, the runtime stride is materialized before the given \p Inst.
/// \returns std::nullopt if the pointers do not form a runtime-strided access;
/// otherwise nullptr or the actual stride value.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the lower/upper pointers among the PointerOps (i.e. the ones with the
  // lowest and highest addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return SE.getUDivExactExpr(Dist, Multiplier);
  };
  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check that all distances are unique
  // multiples of Stride.
4905 using DistOrdPair = std::pair<int64_t, int>; 4906 auto Compare = llvm::less_first(); 4907 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); 4908 int Cnt = 0; 4909 bool IsConsecutive = true; 4910 for (const SCEV *PtrSCEV : SCEVs) { 4911 unsigned Dist = 0; 4912 if (PtrSCEV != PtrSCEVLowest) { 4913 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); 4914 const SCEV *Coeff = TryGetStride(Diff, Stride); 4915 if (!Coeff) 4916 return std::nullopt; 4917 const auto *SC = dyn_cast<SCEVConstant>(Coeff); 4918 if (!SC || isa<SCEVCouldNotCompute>(SC)) 4919 return std::nullopt; 4920 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, 4921 SE.getMulExpr(Stride, SC))) 4922 ->isZero()) 4923 return std::nullopt; 4924 Dist = SC->getAPInt().getZExtValue(); 4925 } 4926 // If the strides are not the same or repeated, we can't vectorize. 4927 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) 4928 return std::nullopt; 4929 auto Res = Offsets.emplace(Dist, Cnt); 4930 if (!Res.second) 4931 return std::nullopt; 4932 // Consecutive order if the inserted element is the last one. 4933 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); 4934 ++Cnt; 4935 } 4936 if (Offsets.size() != SCEVs.size()) 4937 return std::nullopt; 4938 SortedIndices.clear(); 4939 if (!IsConsecutive) { 4940 // Fill SortedIndices array only if it is non-consecutive. 4941 SortedIndices.resize(PointerOps.size()); 4942 Cnt = 0; 4943 for (const std::pair<int64_t, int> &Pair : Offsets) { 4944 SortedIndices[Cnt] = Pair.second; 4945 ++Cnt; 4946 } 4947 } 4948 if (!Inst) 4949 return nullptr; 4950 SCEVExpander Expander(SE, DL, "strided-load-vec"); 4951 return Expander.expandCodeFor(Stride, Stride->getType(), Inst); 4952 } 4953 4954 static std::pair<InstructionCost, InstructionCost> 4955 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, 4956 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, 4957 Type *ScalarTy, VectorType *VecTy); 4958 4959 /// Returns the cost of the shuffle instructions with the given \p Kind, vector 4960 /// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert 4961 /// subvector pattern. 4962 static InstructionCost 4963 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, 4964 VectorType *Tp, ArrayRef<int> Mask = {}, 4965 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, 4966 int Index = 0, VectorType *SubTp = nullptr, 4967 ArrayRef<const Value *> Args = {}) { 4968 if (Kind != TTI::SK_PermuteTwoSrc) 4969 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); 4970 int NumSrcElts = Tp->getElementCount().getKnownMinValue(); 4971 int NumSubElts; 4972 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( 4973 Mask, NumSrcElts, NumSubElts, Index)) { 4974 if (Index + NumSubElts > NumSrcElts && 4975 Index + NumSrcElts <= static_cast<int>(Mask.size())) 4976 return TTI.getShuffleCost( 4977 TTI::SK_InsertSubvector, 4978 getWidenedType(Tp->getElementType(), Mask.size()), Mask, 4979 TTI::TCK_RecipThroughput, Index, Tp); 4980 } 4981 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); 4982 } 4983 4984 /// Correctly creates insert_subvector, checking that the index is multiple of 4985 /// the subvectors length. Otherwise, generates shuffle using \p Generator or 4986 /// using default shuffle. 
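/// For illustration (arbitrary sizes): inserting a 4-element subvector into an
/// 8-element vector at Index 2 cannot use insert_subvector, so the fallback
/// builds the two-source shuffle mask {0, 1, 8, 9, 10, 11, 6, 7}, where lanes
/// 8..11 select the (widened) subvector elements.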
4987 static Value *createInsertVector( 4988 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, 4989 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) { 4990 const unsigned SubVecVF = getNumElements(V->getType()); 4991 if (Index % SubVecVF == 0) { 4992 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, 4993 Builder.getInt64(Index)); 4994 } else { 4995 // Create shuffle, insertvector requires that index is multiple of 4996 // the subvector length. 4997 const unsigned VecVF = getNumElements(Vec->getType()); 4998 SmallVector<int> Mask(VecVF, PoisonMaskElem); 4999 std::iota(Mask.begin(), Mask.end(), 0); 5000 for (unsigned I : seq<unsigned>(SubVecVF)) 5001 Mask[I + Index] = I + VecVF; 5002 if (Generator) { 5003 Vec = Generator(Vec, V, Mask); 5004 } else { 5005 // 1. Resize V to the size of Vec. 5006 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem); 5007 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); 5008 V = Builder.CreateShuffleVector(V, ResizeMask); 5009 Vec = Builder.CreateShuffleVector(Vec, V, Mask); 5010 } 5011 } 5012 return Vec; 5013 } 5014 5015 /// Correctly creates extract_subvector, checking that the index is multiple of 5016 /// the subvectors length. Otherwise, generates shuffle using \p Generator or 5017 /// using default shuffle. 5018 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, 5019 unsigned SubVecVF, unsigned Index) { 5020 if (Index % SubVecVF == 0) { 5021 VectorType *SubVecTy = 5022 getWidenedType(Vec->getType()->getScalarType(), SubVecVF); 5023 return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index)); 5024 } 5025 // Create shuffle, extract_subvector requires that index is multiple of 5026 // the subvector length. 5027 SmallVector<int> Mask(SubVecVF, PoisonMaskElem); 5028 std::iota(Mask.begin(), Mask.end(), Index); 5029 return Builder.CreateShuffleVector(Vec, Mask); 5030 } 5031 5032 BoUpSLP::LoadsState 5033 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, 5034 SmallVectorImpl<unsigned> &Order, 5035 SmallVectorImpl<Value *> &PointerOps, 5036 unsigned *BestVF, bool TryRecursiveCheck) const { 5037 // Check that a vectorized load would load the same memory as a scalar 5038 // load. For example, we don't want to vectorize loads that are smaller 5039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 5041 // from such a struct, we read/write packed bits disagreeing with the 5042 // unvectorized version. 5043 if (BestVF) 5044 *BestVF = 0; 5045 if (areKnownNonVectorizableLoads(VL)) 5046 return LoadsState::Gather; 5047 Type *ScalarTy = VL0->getType(); 5048 5049 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) 5050 return LoadsState::Gather; 5051 5052 // Make sure all loads in the bundle are simple - we can't vectorize 5053 // atomic or volatile loads. 5054 PointerOps.clear(); 5055 const unsigned Sz = VL.size(); 5056 PointerOps.resize(Sz); 5057 auto *POIter = PointerOps.begin(); 5058 for (Value *V : VL) { 5059 auto *L = dyn_cast<LoadInst>(V); 5060 if (!L || !L->isSimple()) 5061 return LoadsState::Gather; 5062 *POIter = L->getPointerOperand(); 5063 ++POIter; 5064 } 5065 5066 Order.clear(); 5067 // Check the order of pointer operands or that all pointers are the same. 
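  // (Roughly: sortPtrAccesses leaves Order empty when the pointers are already
  // in increasing address order; otherwise Order lists the indices of
  // PointerOps from the lowest to the highest address.)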
5068 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); 5069 5070 auto *VecTy = getWidenedType(ScalarTy, Sz); 5071 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); 5072 if (!IsSorted) { 5073 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { 5074 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && 5075 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) 5076 return LoadsState::StridedVectorize; 5077 } 5078 5079 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || 5080 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) 5081 return LoadsState::Gather; 5082 5083 if (!all_of(PointerOps, [&](Value *P) { 5084 return arePointersCompatible(P, PointerOps.front(), *TLI); 5085 })) 5086 return LoadsState::Gather; 5087 5088 } else { 5089 Value *Ptr0; 5090 Value *PtrN; 5091 if (Order.empty()) { 5092 Ptr0 = PointerOps.front(); 5093 PtrN = PointerOps.back(); 5094 } else { 5095 Ptr0 = PointerOps[Order.front()]; 5096 PtrN = PointerOps[Order.back()]; 5097 } 5098 std::optional<int> Diff = 5099 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); 5100 // Check that the sorted loads are consecutive. 5101 if (static_cast<unsigned>(*Diff) == Sz - 1) 5102 return LoadsState::Vectorize; 5103 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || 5104 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) 5105 return LoadsState::Gather; 5106 // Simple check if not a strided access - clear order. 5107 bool IsPossibleStrided = *Diff % (Sz - 1) == 0; 5108 // Try to generate strided load node if: 5109 // 1. Target with strided load support is detected. 5110 // 2. The number of loads is greater than MinProfitableStridedLoads, 5111 // or the potential stride <= MaxProfitableLoadStride and the 5112 // potential stride is power-of-2 (to avoid perf regressions for the very 5113 // small number of loads) and max distance > number of loads, or potential 5114 // stride is -1. 5115 // 3. The loads are ordered, or number of unordered loads <= 5116 // MaxProfitableUnorderedLoads, or loads are in reversed order. 5117 // (this check is to avoid extra costs for very expensive shuffles). 5118 // 4. Any pointer operand is an instruction with the users outside of the 5119 // current graph (for masked gathers extra extractelement instructions 5120 // might be required). 5121 auto IsAnyPointerUsedOutGraph = 5122 IsPossibleStrided && any_of(PointerOps, [&](Value *V) { 5123 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) { 5124 return !isVectorized(U) && !MustGather.contains(U); 5125 }); 5126 }); 5127 const unsigned AbsoluteDiff = std::abs(*Diff); 5128 if (IsPossibleStrided && 5129 (IsAnyPointerUsedOutGraph || 5130 (AbsoluteDiff > Sz && 5131 (Sz > MinProfitableStridedLoads || 5132 (AbsoluteDiff <= MaxProfitableLoadStride * Sz && 5133 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || 5134 *Diff == -(static_cast<int>(Sz) - 1))) { 5135 int Stride = *Diff / static_cast<int>(Sz - 1); 5136 if (*Diff == Stride * static_cast<int>(Sz - 1)) { 5137 Align Alignment = 5138 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) 5139 ->getAlign(); 5140 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) { 5141 // Iterate through all pointers and check if all distances are 5142 // unique multiple of Dist. 
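          // For example, with Sz = 4 loads, *Diff = 6 and Stride = 2, the
          // per-pointer distances from Ptr0 must form exactly {0, 2, 4, 6}.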
          SmallSet<int, 4> Dists;
          for (Value *Ptr : PointerOps) {
            int Dist = 0;
            if (Ptr == PtrN)
              Dist = *Diff;
            else if (Ptr != Ptr0)
              Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
            // If a distance is not a multiple of the stride, or is repeated,
            // we can't vectorize.
            if (((Dist / Stride) * Stride) != Dist ||
                !Dists.insert(Dist).second)
              break;
          }
          if (Dists.size() == Sz)
            return LoadsState::StridedVectorize;
        }
      }
    }
  }
  // Compares the cost of loads + shuffles with the cost of a strided/masked
  // gather load. Returns true if the vectorized + shuffles representation is
  // better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of masked gather GEP. If not a splat, roughly
    // estimate as a buildvector, otherwise estimate as splat.
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    VectorType *PtrVecTy =
        getWidenedType(PointerOps.front()->getType()->getScalarType(),
                       VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
    else
      VectorGEPCost +=
          TTI.getScalarizationOverhead(
              PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
              /*Insert=*/true, /*Extract=*/false, CostKind) +
          ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
    // The cost of scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(
            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                     /*Extract=*/false, CostKind) +
        ScalarLoadsCost;
    // The list of loads is small, or this is already the partial (recursive)
    // check - directly compare masked gather cost and gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors (and not whole registers). The splitting logic here does not
    // cover the original vector if the vector factor is not a power of two.
    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
      return false;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized +
    // shuffles is better than just gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
                              /*TryRecursiveCheck=*/false);
        // Check that the sorted loads are consecutive.
        if (LS == LoadsState::Gather) {
          if (BestVF) {
            DemandedElts.setAllBits();
            break;
          }
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        // If reordering is needed - consider it as a high-cost masked gather
        // for now.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try smaller VF.
        continue;
      // Can be vectorized later as a series of loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost =
            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                         /*Extract=*/false, CostKind) +
            ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
      }
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                ?
0 5280 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), 5281 LI0->getPointerOperand(), 5282 Instruction::GetElementPtr, CostKind, ScalarTy, 5283 SubVecTy) 5284 .second; 5285 if (LS == LoadsState::ScatterVectorize) { 5286 if (static_cast<unsigned>( 5287 count_if(PointerOps, IsaPred<GetElementPtrInst>)) < 5288 PointerOps.size() - 1 || 5289 any_of(PointerOps, [&](Value *V) { 5290 return getUnderlyingObject(V) != 5291 getUnderlyingObject(PointerOps.front()); 5292 })) 5293 VectorGEPCost += TTI.getScalarizationOverhead( 5294 SubVecTy, APInt::getAllOnes(VF), 5295 /*Insert=*/true, /*Extract=*/false, CostKind); 5296 else 5297 VectorGEPCost += 5298 TTI.getScalarizationOverhead( 5299 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0), 5300 /*Insert=*/true, /*Extract=*/false, CostKind) + 5301 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {}, 5302 CostKind); 5303 } 5304 switch (LS) { 5305 case LoadsState::Vectorize: 5306 VecLdCost += 5307 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(), 5308 LI0->getPointerAddressSpace(), CostKind, 5309 TTI::OperandValueInfo()) + 5310 VectorGEPCost; 5311 break; 5312 case LoadsState::StridedVectorize: 5313 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, 5314 LI0->getPointerOperand(), 5315 /*VariableMask=*/false, 5316 CommonAlignment, CostKind) + 5317 VectorGEPCost; 5318 break; 5319 case LoadsState::ScatterVectorize: 5320 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, 5321 LI0->getPointerOperand(), 5322 /*VariableMask=*/false, 5323 CommonAlignment, CostKind) + 5324 VectorGEPCost; 5325 break; 5326 case LoadsState::Gather: 5327 // Gathers are already calculated - ignore. 5328 continue; 5329 } 5330 SmallVector<int> ShuffleMask(VL.size()); 5331 for (int Idx : seq<int>(0, VL.size())) 5332 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; 5333 if (I > 0) 5334 VecLdCost += 5335 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask, 5336 CostKind, I * VF, SubVecTy); 5337 } 5338 // If masked gather cost is higher - better to vectorize, so 5339 // consider it as a gather node. It will be better estimated 5340 // later. 5341 if (MaskedGatherCost >= VecLdCost && 5342 VecLdCost - GatherCost < -SLPCostThreshold) { 5343 if (BestVF) 5344 *BestVF = VF; 5345 return true; 5346 } 5347 } 5348 return MaskedGatherCost - GatherCost >= -SLPCostThreshold; 5349 }; 5350 // TODO: need to improve analysis of the pointers, if not all of them are 5351 // GEPs or have > 2 operands, we end up with a gather node, which just 5352 // increases the cost. 5353 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent()); 5354 bool ProfitableGatherPointers = 5355 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) { 5356 return L->isLoopInvariant(V); 5357 })) <= Sz / 2; 5358 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) { 5359 auto *GEP = dyn_cast<GetElementPtrInst>(P); 5360 return (!GEP && doesNotNeedToBeScheduled(P)) || 5361 (GEP && GEP->getNumOperands() == 2 && 5362 isa<Constant, Instruction>(GEP->getOperand(1))); 5363 })) { 5364 // Check if potential masked gather can be represented as series 5365 // of loads + insertsubvectors. 5366 // If masked gather cost is higher - better to vectorize, so 5367 // consider it as a gather node. It will be better estimated 5368 // later. 
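    // (If CheckForShuffledLoads() returns true, we fall through to
    // LoadsState::Gather below and the split loads are re-estimated when the
    // gather node is costed.)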
5369 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF, 5370 ProfitableGatherPointers)) 5371 return LoadsState::ScatterVectorize; 5372 } 5373 5374 return LoadsState::Gather; 5375 } 5376 5377 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, 5378 ArrayRef<BasicBlock *> BBs, Type *ElemTy, 5379 const DataLayout &DL, ScalarEvolution &SE, 5380 SmallVectorImpl<unsigned> &SortedIndices) { 5381 assert( 5382 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && 5383 "Expected list of pointer operands."); 5384 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 5385 // Ptr into, sort and return the sorted indices with values next to one 5386 // another. 5387 SmallMapVector<std::pair<BasicBlock *, Value *>, 5388 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8> 5389 Bases; 5390 Bases 5391 .try_emplace(std::make_pair( 5392 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth))) 5393 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U); 5394 5395 SortedIndices.clear(); 5396 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) { 5397 auto Key = std::make_pair(BBs[Cnt + 1], 5398 getUnderlyingObject(Ptr, RecursionMaxDepth)); 5399 bool Found = any_of(Bases.try_emplace(Key).first->second, 5400 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) { 5401 std::optional<int> Diff = getPointersDiff( 5402 ElemTy, std::get<0>(Base.front()), ElemTy, 5403 Ptr, DL, SE, 5404 /*StrictCheck=*/true); 5405 if (!Diff) 5406 return false; 5407 5408 Base.emplace_back(Ptr, *Diff, Cnt + 1); 5409 return true; 5410 }); 5411 5412 if (!Found) { 5413 // If we haven't found enough to usefully cluster, return early. 5414 if (Bases.size() > VL.size() / 2 - 1) 5415 return false; 5416 5417 // Not found already - add a new Base 5418 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1); 5419 } 5420 } 5421 5422 if (Bases.size() == VL.size()) 5423 return false; 5424 5425 if (Bases.size() == 1 && (Bases.front().second.size() == 1 || 5426 Bases.front().second.size() == VL.size())) 5427 return false; 5428 5429 // For each of the bases sort the pointers by Offset and check if any of the 5430 // base become consecutively allocated. 5431 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) { 5432 SmallPtrSet<Value *, 13> FirstPointers; 5433 SmallPtrSet<Value *, 13> SecondPointers; 5434 Value *P1 = Ptr1; 5435 Value *P2 = Ptr2; 5436 unsigned Depth = 0; 5437 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) { 5438 if (P1 == P2 || Depth > RecursionMaxDepth) 5439 return false; 5440 FirstPointers.insert(P1); 5441 SecondPointers.insert(P2); 5442 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1); 5443 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1); 5444 ++Depth; 5445 } 5446 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) && 5447 "Unable to find matching root."); 5448 return FirstPointers.contains(P2) && !SecondPointers.contains(P1); 5449 }; 5450 for (auto &Base : Bases) { 5451 for (auto &Vec : Base.second) { 5452 if (Vec.size() > 1) { 5453 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X, 5454 const std::tuple<Value *, int, unsigned> &Y) { 5455 return std::get<1>(X) < std::get<1>(Y); 5456 }); 5457 int InitialOffset = std::get<1>(Vec[0]); 5458 bool AnyConsecutive = 5459 all_of(enumerate(Vec), [InitialOffset](const auto &P) { 5460 return std::get<1>(P.value()) == int(P.index()) + InitialOffset; 5461 }); 5462 // Fill SortedIndices array only if it looks worth-while to sort the 5463 // ptrs. 
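        // (If any base's pointers are not at consecutive offsets after
        // sorting, clustering is abandoned for the whole bundle.)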
5464 if (!AnyConsecutive) 5465 return false; 5466 } 5467 } 5468 stable_sort(Base.second, [&](const auto &V1, const auto &V2) { 5469 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front())); 5470 }); 5471 } 5472 5473 for (auto &T : Bases) 5474 for (const auto &Vec : T.second) 5475 for (const auto &P : Vec) 5476 SortedIndices.push_back(std::get<2>(P)); 5477 5478 assert(SortedIndices.size() == VL.size() && 5479 "Expected SortedIndices to be the size of VL"); 5480 return true; 5481 } 5482 5483 std::optional<BoUpSLP::OrdersType> 5484 BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { 5485 assert(TE.isGather() && "Expected gather node only."); 5486 Type *ScalarTy = TE.Scalars[0]->getType(); 5487 5488 SmallVector<Value *> Ptrs; 5489 Ptrs.reserve(TE.Scalars.size()); 5490 SmallVector<BasicBlock *> BBs; 5491 BBs.reserve(TE.Scalars.size()); 5492 for (Value *V : TE.Scalars) { 5493 auto *L = dyn_cast<LoadInst>(V); 5494 if (!L || !L->isSimple()) 5495 return std::nullopt; 5496 Ptrs.push_back(L->getPointerOperand()); 5497 BBs.push_back(L->getParent()); 5498 } 5499 5500 BoUpSLP::OrdersType Order; 5501 if (!LoadEntriesToVectorize.contains(TE.Idx) && 5502 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order)) 5503 return std::move(Order); 5504 return std::nullopt; 5505 } 5506 5507 /// Check if two insertelement instructions are from the same buildvector. 5508 static bool areTwoInsertFromSameBuildVector( 5509 InsertElementInst *VU, InsertElementInst *V, 5510 function_ref<Value *(InsertElementInst *)> GetBaseOperand) { 5511 // Instructions must be from the same basic blocks. 5512 if (VU->getParent() != V->getParent()) 5513 return false; 5514 // Checks if 2 insertelements are from the same buildvector. 5515 if (VU->getType() != V->getType()) 5516 return false; 5517 // Multiple used inserts are separate nodes. 5518 if (!VU->hasOneUse() && !V->hasOneUse()) 5519 return false; 5520 auto *IE1 = VU; 5521 auto *IE2 = V; 5522 std::optional<unsigned> Idx1 = getElementIndex(IE1); 5523 std::optional<unsigned> Idx2 = getElementIndex(IE2); 5524 if (Idx1 == std::nullopt || Idx2 == std::nullopt) 5525 return false; 5526 // Go through the vector operand of insertelement instructions trying to find 5527 // either VU as the original vector for IE2 or V as the original vector for 5528 // IE1. 5529 SmallBitVector ReusedIdx( 5530 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue()); 5531 bool IsReusedIdx = false; 5532 do { 5533 if (IE2 == VU && !IE1) 5534 return VU->hasOneUse(); 5535 if (IE1 == V && !IE2) 5536 return V->hasOneUse(); 5537 if (IE1 && IE1 != V) { 5538 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2); 5539 IsReusedIdx |= ReusedIdx.test(Idx1); 5540 ReusedIdx.set(Idx1); 5541 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) 5542 IE1 = nullptr; 5543 else 5544 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); 5545 } 5546 if (IE2 && IE2 != VU) { 5547 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1); 5548 IsReusedIdx |= ReusedIdx.test(Idx2); 5549 ReusedIdx.set(Idx2); 5550 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) 5551 IE2 = nullptr; 5552 else 5553 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2)); 5554 } 5555 } while (!IsReusedIdx && (IE1 || IE2)); 5556 return false; 5557 } 5558 5559 std::optional<BoUpSLP::OrdersType> 5560 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { 5561 // No need to reorder if need to shuffle reuses, still need to shuffle the 5562 // node. 
5563 if (!TE.ReuseShuffleIndices.empty()) { 5564 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 5565 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) && 5566 "Reshuffling scalars not yet supported for nodes with padding"); 5567 5568 if (isSplat(TE.Scalars)) 5569 return std::nullopt; 5570 // Check if reuse shuffle indices can be improved by reordering. 5571 // For this, check that reuse mask is "clustered", i.e. each scalar values 5572 // is used once in each submask of size <number_of_scalars>. 5573 // Example: 4 scalar values. 5574 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 5575 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 5576 // element 3 is used twice in the second submask. 5577 unsigned Sz = TE.Scalars.size(); 5578 if (TE.isGather()) { 5579 if (std::optional<OrdersType> CurrentOrder = 5580 findReusedOrderedScalars(TE)) { 5581 SmallVector<int> Mask; 5582 fixupOrderingIndices(*CurrentOrder); 5583 inversePermutation(*CurrentOrder, Mask); 5584 ::addMask(Mask, TE.ReuseShuffleIndices); 5585 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); 5586 unsigned Sz = TE.Scalars.size(); 5587 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { 5588 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz))) 5589 if (Idx != PoisonMaskElem) 5590 Res[Idx + K * Sz] = I + K * Sz; 5591 } 5592 return std::move(Res); 5593 } 5594 } 5595 if (Sz == 2 && TE.getVectorFactor() == 4 && 5596 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(), 5597 2 * TE.getVectorFactor())) == 1) 5598 return std::nullopt; 5599 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, 5600 Sz)) { 5601 SmallVector<int> ReorderMask(Sz, PoisonMaskElem); 5602 if (TE.ReorderIndices.empty()) 5603 std::iota(ReorderMask.begin(), ReorderMask.end(), 0); 5604 else 5605 inversePermutation(TE.ReorderIndices, ReorderMask); 5606 ::addMask(ReorderMask, TE.ReuseShuffleIndices); 5607 unsigned VF = ReorderMask.size(); 5608 OrdersType ResOrder(VF, VF); 5609 unsigned NumParts = divideCeil(VF, Sz); 5610 SmallBitVector UsedVals(NumParts); 5611 for (unsigned I = 0; I < VF; I += Sz) { 5612 int Val = PoisonMaskElem; 5613 unsigned UndefCnt = 0; 5614 unsigned Limit = std::min(Sz, VF - I); 5615 if (any_of(ArrayRef(ReorderMask).slice(I, Limit), 5616 [&](int Idx) { 5617 if (Val == PoisonMaskElem && Idx != PoisonMaskElem) 5618 Val = Idx; 5619 if (Idx == PoisonMaskElem) 5620 ++UndefCnt; 5621 return Idx != PoisonMaskElem && Idx != Val; 5622 }) || 5623 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) || 5624 UndefCnt > Sz / 2) 5625 return std::nullopt; 5626 UsedVals.set(Val); 5627 for (unsigned K = 0; K < NumParts; ++K) { 5628 unsigned Idx = Val + Sz * K; 5629 if (Idx < VF) 5630 ResOrder[Idx] = I + K; 5631 } 5632 } 5633 return std::move(ResOrder); 5634 } 5635 unsigned VF = TE.getVectorFactor(); 5636 // Try build correct order for extractelement instructions. 
5637 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), 5638 TE.ReuseShuffleIndices.end()); 5639 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && 5640 all_of(TE.Scalars, [Sz](Value *V) { 5641 if (isa<PoisonValue>(V)) 5642 return true; 5643 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V)); 5644 return Idx && *Idx < Sz; 5645 })) { 5646 assert(!TE.isAltShuffle() && "Alternate instructions are only supported " 5647 "by BinaryOperator and CastInst."); 5648 SmallVector<int> ReorderMask(Sz, PoisonMaskElem); 5649 if (TE.ReorderIndices.empty()) 5650 std::iota(ReorderMask.begin(), ReorderMask.end(), 0); 5651 else 5652 inversePermutation(TE.ReorderIndices, ReorderMask); 5653 for (unsigned I = 0; I < VF; ++I) { 5654 int &Idx = ReusedMask[I]; 5655 if (Idx == PoisonMaskElem) 5656 continue; 5657 Value *V = TE.Scalars[ReorderMask[Idx]]; 5658 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V)); 5659 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI)); 5660 } 5661 } 5662 // Build the order of the VF size, need to reorder reuses shuffles, they are 5663 // always of VF size. 5664 OrdersType ResOrder(VF); 5665 std::iota(ResOrder.begin(), ResOrder.end(), 0); 5666 auto *It = ResOrder.begin(); 5667 for (unsigned K = 0; K < VF; K += Sz) { 5668 OrdersType CurrentOrder(TE.ReorderIndices); 5669 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)}; 5670 if (SubMask.front() == PoisonMaskElem) 5671 std::iota(SubMask.begin(), SubMask.end(), 0); 5672 reorderOrder(CurrentOrder, SubMask); 5673 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); 5674 std::advance(It, Sz); 5675 } 5676 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) { 5677 return Data.index() == Data.value(); 5678 })) 5679 return std::nullopt; // No need to reorder. 
5680 return std::move(ResOrder); 5681 } 5682 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && 5683 any_of(TE.UserTreeIndices, 5684 [](const EdgeInfo &EI) { 5685 return !Instruction::isBinaryOp(EI.UserTE->getOpcode()); 5686 }) && 5687 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) 5688 return std::nullopt; 5689 if ((TE.State == TreeEntry::Vectorize || 5690 TE.State == TreeEntry::StridedVectorize) && 5691 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || 5692 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) { 5693 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by " 5694 "BinaryOperator and CastInst."); 5695 return TE.ReorderIndices; 5696 } 5697 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { 5698 if (!TE.ReorderIndices.empty()) 5699 return TE.ReorderIndices; 5700 5701 SmallVector<Instruction *> UserBVHead(TE.Scalars.size()); 5702 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) { 5703 if (!V->hasNUsesOrMore(1)) 5704 continue; 5705 auto *II = dyn_cast<InsertElementInst>(*V->user_begin()); 5706 if (!II) 5707 continue; 5708 Instruction *BVHead = nullptr; 5709 BasicBlock *BB = II->getParent(); 5710 while (II && II->hasOneUse() && II->getParent() == BB) { 5711 BVHead = II; 5712 II = dyn_cast<InsertElementInst>(II->getOperand(0)); 5713 } 5714 I = BVHead; 5715 } 5716 5717 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) { 5718 assert(BB1 != BB2 && "Expected different basic blocks."); 5719 auto *NodeA = DT->getNode(BB1); 5720 auto *NodeB = DT->getNode(BB2); 5721 assert(NodeA && "Should only process reachable instructions"); 5722 assert(NodeB && "Should only process reachable instructions"); 5723 assert((NodeA == NodeB) == 5724 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 5725 "Different nodes should have different DFS numbers"); 5726 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); 5727 }; 5728 auto PHICompare = [&](unsigned I1, unsigned I2) { 5729 Value *V1 = TE.Scalars[I1]; 5730 Value *V2 = TE.Scalars[I2]; 5731 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0)) 5732 return false; 5733 if (isa<PoisonValue>(V1)) 5734 return true; 5735 if (isa<PoisonValue>(V2)) 5736 return false; 5737 if (V1->getNumUses() < V2->getNumUses()) 5738 return true; 5739 if (V1->getNumUses() > V2->getNumUses()) 5740 return false; 5741 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin()); 5742 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin()); 5743 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent()) 5744 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(), 5745 FirstUserOfPhi2->getParent()); 5746 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1); 5747 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2); 5748 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1); 5749 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2); 5750 if (IE1 && !IE2) 5751 return true; 5752 if (!IE1 && IE2) 5753 return false; 5754 if (IE1 && IE2) { 5755 if (UserBVHead[I1] && !UserBVHead[I2]) 5756 return true; 5757 if (!UserBVHead[I1]) 5758 return false; 5759 if (UserBVHead[I1] == UserBVHead[I2]) 5760 return getElementIndex(IE1) < getElementIndex(IE2); 5761 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent()) 5762 return CompareByBasicBlocks(UserBVHead[I1]->getParent(), 5763 UserBVHead[I2]->getParent()); 5764 return UserBVHead[I1]->comesBefore(UserBVHead[I2]); 5765 } 5766 if (EE1 && !EE2) 5767 return true; 5768 if 
(!EE1 && EE2) 5769 return false; 5770 if (EE1 && EE2) { 5771 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0)); 5772 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0)); 5773 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0)); 5774 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0)); 5775 if (!Inst2 && !P2) 5776 return Inst1 || P1; 5777 if (EE1->getOperand(0) == EE2->getOperand(0)) 5778 return getElementIndex(EE1) < getElementIndex(EE2); 5779 if (!Inst1 && Inst2) 5780 return false; 5781 if (Inst1 && Inst2) { 5782 if (Inst1->getParent() != Inst2->getParent()) 5783 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent()); 5784 return Inst1->comesBefore(Inst2); 5785 } 5786 if (!P1 && P2) 5787 return false; 5788 assert(P1 && P2 && 5789 "Expected either instructions or arguments vector operands."); 5790 return P1->getArgNo() < P2->getArgNo(); 5791 } 5792 return false; 5793 }; 5794 OrdersType Phis(TE.Scalars.size()); 5795 std::iota(Phis.begin(), Phis.end(), 0); 5796 stable_sort(Phis, PHICompare); 5797 if (isIdentityOrder(Phis)) 5798 return std::nullopt; // No need to reorder. 5799 return std::move(Phis); 5800 } 5801 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && 5802 allSameType(TE.Scalars)) { 5803 // TODO: add analysis of other gather nodes with extractelement 5804 // instructions and other values/instructions, not only undefs. 5805 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || 5806 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) && 5807 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) && 5808 all_of(TE.Scalars, [](Value *V) { 5809 auto *EE = dyn_cast<ExtractElementInst>(V); 5810 return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); 5811 })) { 5812 // Check that gather of extractelements can be represented as 5813 // just a shuffle of a single vector. 5814 OrdersType CurrentOrder; 5815 bool Reuse = 5816 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); 5817 if (Reuse || !CurrentOrder.empty()) 5818 return std::move(CurrentOrder); 5819 } 5820 // If the gather node is <undef, v, .., poison> and 5821 // insertelement poison, v, 0 [+ permute] 5822 // is cheaper than 5823 // insertelement poison, v, n - try to reorder. 5824 // If rotating the whole graph, exclude the permute cost, the whole graph 5825 // might be transformed. 5826 int Sz = TE.Scalars.size(); 5827 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) && 5828 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) { 5829 const auto *It = 5830 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); }); 5831 if (It == TE.Scalars.begin()) 5832 return OrdersType(); 5833 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz); 5834 if (It != TE.Scalars.end()) { 5835 OrdersType Order(Sz, Sz); 5836 unsigned Idx = std::distance(TE.Scalars.begin(), It); 5837 Order[Idx] = 0; 5838 fixupOrderingIndices(Order); 5839 SmallVector<int> Mask; 5840 inversePermutation(Order, Mask); 5841 InstructionCost PermuteCost = 5842 TopToBottom 5843 ? 
                    0
                    : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
        return CurrentOrder;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses, no need to do anything
  // else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
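  // Illustrative example (assuming the node has no extra reorder indices): if
  // the reuse mask ends up as {1, 0, 1, 0} over scalars {a, b}, the scalars
  // are reordered to {b, a} and the reuse mask below becomes the identity
  // clusters {0, 1, 0, 1}; the gathered vector <b, a, b, a> stays the same,
  // only the mask gets simpler.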
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}

static void combineOrders(MutableArrayRef<unsigned> Order,
                          ArrayRef<unsigned> SecondaryOrder) {
  assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
         "Expected same size of orders");
  unsigned Sz = Order.size();
  SmallBitVector UsedIndices(Sz);
  for (unsigned Idx : seq<unsigned>(0, Sz)) {
    if (Order[Idx] != Sz)
      UsedIndices.set(Order[Idx]);
  }
  if (SecondaryOrder.empty()) {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (Order[Idx] == Sz && !UsedIndices.test(Idx))
        Order[Idx] = Idx;
  } else {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
          !UsedIndices.test(SecondaryOrder[Idx]))
        Order[Idx] = SecondaryOrder[Idx];
  }
}

void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(TE.get(),
                                         std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle()) {
      VectorType *VecTy =
          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization, better to reorder them during bottom-to-top stage. If
      // we follow the order here, it causes reordering of the whole graph,
      // though actually it is profitable just to reorder the subgraph that
      // starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // most used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });

    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes, still need to extend and to use a
      // shuffle, just need to merge the reordering shuffle and the reuse
      // shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count number of orders uses.
6052 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, 6053 &PhisToOrders]() -> const OrdersType & { 6054 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) { 6055 auto It = GathersToOrders.find(OpTE); 6056 if (It != GathersToOrders.end()) 6057 return It->second; 6058 } 6059 if (OpTE->hasState() && OpTE->isAltShuffle()) { 6060 auto It = AltShufflesToOrders.find(OpTE); 6061 if (It != AltShufflesToOrders.end()) 6062 return It->second; 6063 } 6064 if (OpTE->State == TreeEntry::Vectorize && 6065 OpTE->getOpcode() == Instruction::PHI) { 6066 auto It = PhisToOrders.find(OpTE); 6067 if (It != PhisToOrders.end()) 6068 return It->second; 6069 } 6070 return OpTE->ReorderIndices; 6071 }(); 6072 // First consider the order of the external scalar users. 6073 auto It = ExternalUserReorderMap.find(OpTE); 6074 if (It != ExternalUserReorderMap.end()) { 6075 const auto &ExternalUserReorderIndices = It->second; 6076 // If the OpTE vector factor != number of scalars - use natural order, 6077 // it is an attempt to reorder node with reused scalars but with 6078 // external uses. 6079 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { 6080 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += 6081 ExternalUserReorderIndices.size(); 6082 } else { 6083 for (const OrdersType &ExtOrder : ExternalUserReorderIndices) 6084 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; 6085 } 6086 // No other useful reorder data in this entry. 6087 if (Order.empty()) 6088 continue; 6089 } 6090 // Stores actually store the mask, not the order, need to invert. 6091 if (OpTE->State == TreeEntry::Vectorize && 6092 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 6093 assert(!OpTE->isAltShuffle() && 6094 "Alternate instructions are only supported by BinaryOperator " 6095 "and CastInst."); 6096 SmallVector<int> Mask; 6097 inversePermutation(Order, Mask); 6098 unsigned E = Order.size(); 6099 OrdersType CurrentOrder(E, E); 6100 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 6101 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); 6102 }); 6103 fixupOrderingIndices(CurrentOrder); 6104 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; 6105 } else { 6106 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; 6107 } 6108 } 6109 if (OrdersUses.empty()) 6110 continue; 6111 // Choose the most used order. 6112 unsigned IdentityCnt = 0; 6113 unsigned FilledIdentityCnt = 0; 6114 OrdersType IdentityOrder(VF, VF); 6115 for (auto &Pair : OrdersUses) { 6116 if (Pair.first.empty() || isIdentityOrder(Pair.first)) { 6117 if (!Pair.first.empty()) 6118 FilledIdentityCnt += Pair.second; 6119 IdentityCnt += Pair.second; 6120 combineOrders(IdentityOrder, Pair.first); 6121 } 6122 } 6123 MutableArrayRef<unsigned> BestOrder = IdentityOrder; 6124 unsigned Cnt = IdentityCnt; 6125 for (auto &Pair : OrdersUses) { 6126 // Prefer identity order. But, if filled identity found (non-empty order) 6127 // with same number of uses, as the new candidate order, we can choose 6128 // this candidate order. 6129 if (Cnt < Pair.second || 6130 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && 6131 Cnt == Pair.second && !BestOrder.empty() && 6132 isIdentityOrder(BestOrder))) { 6133 combineOrders(Pair.first, BestOrder); 6134 BestOrder = Pair.first; 6135 Cnt = Pair.second; 6136 } else { 6137 combineOrders(BestOrder, Pair.first); 6138 } 6139 } 6140 // Set order of the user node. 
6141 if (isIdentityOrder(BestOrder)) 6142 continue; 6143 fixupOrderingIndices(BestOrder); 6144 SmallVector<int> Mask; 6145 inversePermutation(BestOrder, Mask); 6146 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); 6147 unsigned E = BestOrder.size(); 6148 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 6149 return I < E ? static_cast<int>(I) : PoisonMaskElem; 6150 }); 6151 // Do an actual reordering, if profitable. 6152 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 6153 // Just do the reordering for the nodes with the given VF. 6154 if (TE->Scalars.size() != VF) { 6155 if (TE->ReuseShuffleIndices.size() == VF) { 6156 // Need to reorder the reuses masks of the operands with smaller VF to 6157 // be able to find the match between the graph nodes and scalar 6158 // operands of the given node during vectorization/cost estimation. 6159 assert(all_of(TE->UserTreeIndices, 6160 [VF, &TE](const EdgeInfo &EI) { 6161 return EI.UserTE->Scalars.size() == VF || 6162 EI.UserTE->Scalars.size() == 6163 TE->Scalars.size(); 6164 }) && 6165 "All users must be of VF size."); 6166 if (SLPReVec) { 6167 assert(SLPReVec && "Only supported by REVEC."); 6168 // ShuffleVectorInst does not do reorderOperands (and it should not 6169 // because ShuffleVectorInst supports only a limited set of 6170 // patterns). Only do reorderNodeWithReuses if all of the users are 6171 // not ShuffleVectorInst. 6172 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) { 6173 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp()); 6174 })) 6175 continue; 6176 assert(none_of(TE->UserTreeIndices, 6177 [&](const EdgeInfo &EI) { 6178 return isa<ShuffleVectorInst>( 6179 EI.UserTE->getMainOp()); 6180 }) && 6181 "Does not know how to reorder."); 6182 } 6183 // Update ordering of the operands with the smaller VF than the given 6184 // one. 6185 reorderNodeWithReuses(*TE, Mask); 6186 } 6187 continue; 6188 } 6189 if ((TE->State == TreeEntry::Vectorize || 6190 TE->State == TreeEntry::StridedVectorize) && 6191 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, 6192 InsertElementInst>(TE->getMainOp()) || 6193 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) { 6194 assert(!TE->isAltShuffle() && 6195 "Alternate instructions are only supported by BinaryOperator " 6196 "and CastInst."); 6197 // Build correct orders for extract{element,value}, loads and 6198 // stores. 6199 reorderOrder(TE->ReorderIndices, Mask); 6200 if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) 6201 TE->reorderOperands(Mask); 6202 } else { 6203 // Reorder the node and its operands. 6204 TE->reorderOperands(Mask); 6205 assert(TE->ReorderIndices.empty() && 6206 "Expected empty reorder sequence."); 6207 reorderScalars(TE->Scalars, Mask); 6208 } 6209 if (!TE->ReuseShuffleIndices.empty()) { 6210 // Apply reversed order to keep the original ordering of the reused 6211 // elements to avoid extra reorder indices shuffling. 
6212 OrdersType CurrentOrder; 6213 reorderOrder(CurrentOrder, MaskOrder); 6214 SmallVector<int> NewReuses; 6215 inversePermutation(CurrentOrder, NewReuses); 6216 addMask(NewReuses, TE->ReuseShuffleIndices); 6217 TE->ReuseShuffleIndices.swap(NewReuses); 6218 } 6219 } 6220 } 6221 } 6222 6223 bool BoUpSLP::canReorderOperands( 6224 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, 6225 ArrayRef<TreeEntry *> ReorderableGathers, 6226 SmallVectorImpl<TreeEntry *> &GatherOps) { 6227 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { 6228 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { 6229 return OpData.first == I && 6230 (OpData.second->State == TreeEntry::Vectorize || 6231 OpData.second->State == TreeEntry::StridedVectorize); 6232 })) 6233 continue; 6234 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { 6235 // Do not reorder if operand node is used by many user nodes. 6236 if (any_of(TE->UserTreeIndices, 6237 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) 6238 return false; 6239 // Add the node to the list of the ordered nodes with the identity 6240 // order. 6241 Edges.emplace_back(I, TE); 6242 // Add ScatterVectorize nodes to the list of operands, where just 6243 // reordering of the scalars is required. Similar to the gathers, so 6244 // simply add to the list of gathered ops. 6245 // If there are reused scalars, process this node as a regular vectorize 6246 // node, just reorder reuses mask. 6247 if (TE->State != TreeEntry::Vectorize && 6248 TE->State != TreeEntry::StridedVectorize && 6249 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) 6250 GatherOps.push_back(TE); 6251 continue; 6252 } 6253 TreeEntry *Gather = nullptr; 6254 if (count_if(ReorderableGathers, 6255 [&Gather, UserTE, I](TreeEntry *TE) { 6256 assert(TE->State != TreeEntry::Vectorize && 6257 TE->State != TreeEntry::StridedVectorize && 6258 "Only non-vectorized nodes are expected."); 6259 if (any_of(TE->UserTreeIndices, 6260 [UserTE, I](const EdgeInfo &EI) { 6261 return EI.UserTE == UserTE && EI.EdgeIdx == I; 6262 })) { 6263 assert(TE->isSame(UserTE->getOperand(I)) && 6264 "Operand entry does not match operands."); 6265 Gather = TE; 6266 return true; 6267 } 6268 return false; 6269 }) > 1 && 6270 !allConstant(UserTE->getOperand(I))) 6271 return false; 6272 if (Gather) 6273 GatherOps.push_back(Gather); 6274 } 6275 return true; 6276 } 6277 6278 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { 6279 SetVector<TreeEntry *> OrderedEntries; 6280 DenseSet<const TreeEntry *> GathersToOrders; 6281 // Find all reorderable leaf nodes with the given VF. 6282 // Currently the are vectorized loads,extracts without alternate operands + 6283 // some gathering of extracts. 6284 SmallVector<TreeEntry *> NonVectorized; 6285 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 6286 if (TE->State != TreeEntry::Vectorize && 6287 TE->State != TreeEntry::StridedVectorize) 6288 NonVectorized.push_back(TE.get()); 6289 if (std::optional<OrdersType> CurrentOrder = 6290 getReorderingData(*TE, /*TopToBottom=*/false)) { 6291 OrderedEntries.insert(TE.get()); 6292 if (!(TE->State == TreeEntry::Vectorize || 6293 TE->State == TreeEntry::StridedVectorize) || 6294 !TE->ReuseShuffleIndices.empty()) 6295 GathersToOrders.insert(TE.get()); 6296 } 6297 } 6298 6299 // 1. Propagate order to the graph nodes, which use only reordered nodes. 
  // I.e., if the node has operands that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully
        // non-ordered orders.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
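        // E.g. (illustrative): for Order = {2, 0, 1, 3} inversePermutation()
        // produces Mask = {1, 2, 0, 3} (Mask[Order[I]] == I); the transform
        // below then reinterprets that mask as the order that is counted in
        // OrdersUses.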
6376 if (OpTE->State == TreeEntry::Vectorize && 6377 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 6378 assert(!OpTE->isAltShuffle() && 6379 "Alternate instructions are only supported by BinaryOperator " 6380 "and CastInst."); 6381 SmallVector<int> Mask; 6382 inversePermutation(Order, Mask); 6383 unsigned E = Order.size(); 6384 OrdersType CurrentOrder(E, E); 6385 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 6386 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); 6387 }); 6388 fixupOrderingIndices(CurrentOrder); 6389 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += 6390 NumOps; 6391 } else { 6392 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; 6393 } 6394 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); 6395 const auto AllowsReordering = [&](const TreeEntry *TE) { 6396 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || 6397 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || 6398 (IgnoreReorder && TE->Idx == 0)) 6399 return true; 6400 if (TE->isGather()) { 6401 if (GathersToOrders.contains(TE)) 6402 return !getReorderingData(*TE, /*TopToBottom=*/false) 6403 .value_or(OrdersType(1)) 6404 .empty(); 6405 return true; 6406 } 6407 return false; 6408 }; 6409 for (const EdgeInfo &EI : OpTE->UserTreeIndices) { 6410 TreeEntry *UserTE = EI.UserTE; 6411 if (!VisitedUsers.insert(UserTE).second) 6412 continue; 6413 // May reorder user node if it requires reordering, has reused 6414 // scalars, is an alternate op vectorize node or its op nodes require 6415 // reordering. 6416 if (AllowsReordering(UserTE)) 6417 continue; 6418 // Check if users allow reordering. 6419 // Currently look up just 1 level of operands to avoid increase of 6420 // the compile time. 6421 // Profitable to reorder if definitely more operands allow 6422 // reordering rather than those with natural order. 6423 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE]; 6424 if (static_cast<unsigned>(count_if( 6425 Ops, [UserTE, &AllowsReordering]( 6426 const std::pair<unsigned, TreeEntry *> &Op) { 6427 return AllowsReordering(Op.second) && 6428 all_of(Op.second->UserTreeIndices, 6429 [UserTE](const EdgeInfo &EI) { 6430 return EI.UserTE == UserTE; 6431 }); 6432 })) <= Ops.size() / 2) 6433 ++Res.first->second; 6434 } 6435 } 6436 if (OrdersUses.empty()) { 6437 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) 6438 OrderedEntries.remove(Op.second); 6439 continue; 6440 } 6441 // Choose the most used order. 6442 unsigned IdentityCnt = 0; 6443 unsigned VF = Data.second.front().second->getVectorFactor(); 6444 OrdersType IdentityOrder(VF, VF); 6445 for (auto &Pair : OrdersUses) { 6446 if (Pair.first.empty() || isIdentityOrder(Pair.first)) { 6447 IdentityCnt += Pair.second; 6448 combineOrders(IdentityOrder, Pair.first); 6449 } 6450 } 6451 MutableArrayRef<unsigned> BestOrder = IdentityOrder; 6452 unsigned Cnt = IdentityCnt; 6453 for (auto &Pair : OrdersUses) { 6454 // Prefer identity order. But, if filled identity found (non-empty 6455 // order) with same number of uses, as the new candidate order, we can 6456 // choose this candidate order. 6457 if (Cnt < Pair.second) { 6458 combineOrders(Pair.first, BestOrder); 6459 BestOrder = Pair.first; 6460 Cnt = Pair.second; 6461 } else { 6462 combineOrders(BestOrder, Pair.first); 6463 } 6464 } 6465 // Set order of the user node. 
6466 if (isIdentityOrder(BestOrder)) { 6467 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) 6468 OrderedEntries.remove(Op.second); 6469 continue; 6470 } 6471 fixupOrderingIndices(BestOrder); 6472 // Erase operands from OrderedEntries list and adjust their orders. 6473 VisitedOps.clear(); 6474 SmallVector<int> Mask; 6475 inversePermutation(BestOrder, Mask); 6476 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); 6477 unsigned E = BestOrder.size(); 6478 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 6479 return I < E ? static_cast<int>(I) : PoisonMaskElem; 6480 }); 6481 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { 6482 TreeEntry *TE = Op.second; 6483 OrderedEntries.remove(TE); 6484 if (!VisitedOps.insert(TE).second) 6485 continue; 6486 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { 6487 reorderNodeWithReuses(*TE, Mask); 6488 continue; 6489 } 6490 // Gathers are processed separately. 6491 if (TE->State != TreeEntry::Vectorize && 6492 TE->State != TreeEntry::StridedVectorize && 6493 (TE->State != TreeEntry::ScatterVectorize || 6494 TE->ReorderIndices.empty())) 6495 continue; 6496 assert((BestOrder.size() == TE->ReorderIndices.size() || 6497 TE->ReorderIndices.empty()) && 6498 "Non-matching sizes of user/operand entries."); 6499 reorderOrder(TE->ReorderIndices, Mask); 6500 if (IgnoreReorder && TE == VectorizableTree.front().get()) 6501 IgnoreReorder = false; 6502 } 6503 // For gathers just need to reorder its scalars. 6504 for (TreeEntry *Gather : GatherOps) { 6505 assert(Gather->ReorderIndices.empty() && 6506 "Unexpected reordering of gathers."); 6507 if (!Gather->ReuseShuffleIndices.empty()) { 6508 // Just reorder reuses indices. 6509 reorderReuses(Gather->ReuseShuffleIndices, Mask); 6510 continue; 6511 } 6512 reorderScalars(Gather->Scalars, Mask); 6513 OrderedEntries.remove(Gather); 6514 } 6515 // Reorder operands of the user node and set the ordering for the user 6516 // node itself. 6517 if (Data.first->State != TreeEntry::Vectorize || 6518 !isa<ExtractElementInst, ExtractValueInst, LoadInst>( 6519 Data.first->getMainOp()) || 6520 Data.first->isAltShuffle()) 6521 Data.first->reorderOperands(Mask); 6522 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || 6523 Data.first->isAltShuffle() || 6524 Data.first->State == TreeEntry::StridedVectorize) { 6525 reorderScalars(Data.first->Scalars, Mask); 6526 reorderOrder(Data.first->ReorderIndices, MaskOrder, 6527 /*BottomOrder=*/true); 6528 if (Data.first->ReuseShuffleIndices.empty() && 6529 !Data.first->ReorderIndices.empty() && 6530 !Data.first->isAltShuffle()) { 6531 // Insert user node to the list to try to sink reordering deeper in 6532 // the graph. 6533 OrderedEntries.insert(Data.first); 6534 } 6535 } else { 6536 reorderOrder(Data.first->ReorderIndices, Mask); 6537 } 6538 } 6539 } 6540 // If the reordering is unnecessary, just remove the reorder. 
6541 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && 6542 VectorizableTree.front()->ReuseShuffleIndices.empty()) 6543 VectorizableTree.front()->ReorderIndices.clear(); 6544 } 6545 6546 Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { 6547 if ((Entry.getOpcode() == Instruction::Store || 6548 Entry.getOpcode() == Instruction::Load) && 6549 Entry.State == TreeEntry::StridedVectorize && 6550 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices)) 6551 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]); 6552 return dyn_cast<Instruction>(Entry.Scalars.front()); 6553 } 6554 6555 void BoUpSLP::buildExternalUses( 6556 const ExtraValueToDebugLocsMap &ExternallyUsedValues) { 6557 DenseMap<Value *, unsigned> ScalarToExtUses; 6558 // Collect the values that we need to extract from the tree. 6559 for (auto &TEPtr : VectorizableTree) { 6560 TreeEntry *Entry = TEPtr.get(); 6561 6562 // No need to handle users of gathered values. 6563 if (Entry->isGather()) 6564 continue; 6565 6566 // For each lane: 6567 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 6568 Value *Scalar = Entry->Scalars[Lane]; 6569 if (!isa<Instruction>(Scalar)) 6570 continue; 6571 // All uses must be replaced already? No need to do it again. 6572 auto It = ScalarToExtUses.find(Scalar); 6573 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User) 6574 continue; 6575 6576 // Check if the scalar is externally used as an extra arg. 6577 const auto ExtI = ExternallyUsedValues.find(Scalar); 6578 if (ExtI != ExternallyUsedValues.end()) { 6579 int FoundLane = Entry->findLaneForValue(Scalar); 6580 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " 6581 << FoundLane << " from " << *Scalar << ".\n"); 6582 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); 6583 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane); 6584 continue; 6585 } 6586 for (User *U : Scalar->users()) { 6587 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 6588 6589 Instruction *UserInst = dyn_cast<Instruction>(U); 6590 if (!UserInst || isDeleted(UserInst)) 6591 continue; 6592 6593 // Ignore users in the user ignore list. 6594 if (UserIgnoreList && UserIgnoreList->contains(UserInst)) 6595 continue; 6596 6597 // Skip in-tree scalars that become vectors 6598 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U); 6599 !UseEntries.empty()) { 6600 // Some in-tree scalars will remain as scalar in vectorized 6601 // instructions. If that is the case, the one in FoundLane will 6602 // be used. 
6603 if (any_of(UseEntries, [&](TreeEntry *UseEntry) { 6604 return UseEntry->State == TreeEntry::ScatterVectorize || 6605 !doesInTreeUserNeedToExtract( 6606 Scalar, getRootEntryInstruction(*UseEntry), TLI, 6607 TTI); 6608 })) { 6609 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 6610 << ".\n"); 6611 assert(none_of(UseEntries, 6612 [](TreeEntry *UseEntry) { 6613 return UseEntry->isGather(); 6614 }) && 6615 "Bad state"); 6616 continue; 6617 } 6618 U = nullptr; 6619 if (It != ScalarToExtUses.end()) { 6620 ExternalUses[It->second].User = nullptr; 6621 break; 6622 } 6623 } 6624 6625 if (U && Scalar->hasNUsesOrMore(UsesLimit)) 6626 U = nullptr; 6627 int FoundLane = Entry->findLaneForValue(Scalar); 6628 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst 6629 << " from lane " << FoundLane << " from " << *Scalar 6630 << ".\n"); 6631 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first; 6632 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane); 6633 if (!U) 6634 break; 6635 } 6636 } 6637 } 6638 } 6639 6640 SmallVector<SmallVector<StoreInst *>> 6641 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { 6642 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>, 6643 SmallVector<StoreInst *>, 8> 6644 PtrToStoresMap; 6645 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { 6646 Value *V = TE->Scalars[Lane]; 6647 // Don't iterate over the users of constant data. 6648 if (!isa<Instruction>(V)) 6649 continue; 6650 // To save compilation time we don't visit if we have too many users. 6651 if (V->hasNUsesOrMore(UsesLimit)) 6652 break; 6653 6654 // Collect stores per pointer object. 6655 for (User *U : V->users()) { 6656 auto *SI = dyn_cast<StoreInst>(U); 6657 // Test whether we can handle the store. V might be a global, which could 6658 // be used in a different function. 6659 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F || 6660 !isValidElementType(SI->getValueOperand()->getType())) 6661 continue; 6662 // Skip entry if already 6663 if (isVectorized(U)) 6664 continue; 6665 6666 Value *Ptr = 6667 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth); 6668 auto &StoresVec = PtrToStoresMap[{SI->getParent(), 6669 SI->getValueOperand()->getType(), Ptr}]; 6670 // For now just keep one store per pointer object per lane. 6671 // TODO: Extend this to support multiple stores per pointer per lane 6672 if (StoresVec.size() > Lane) 6673 continue; 6674 if (!StoresVec.empty()) { 6675 std::optional<int> Diff = getPointersDiff( 6676 SI->getValueOperand()->getType(), SI->getPointerOperand(), 6677 SI->getValueOperand()->getType(), 6678 StoresVec.front()->getPointerOperand(), *DL, *SE, 6679 /*StrictCheck=*/true); 6680 // We failed to compare the pointers so just abandon this store. 6681 if (!Diff) 6682 continue; 6683 } 6684 StoresVec.push_back(SI); 6685 } 6686 } 6687 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size()); 6688 unsigned I = 0; 6689 for (auto &P : PtrToStoresMap) { 6690 Res[I].swap(P.second); 6691 ++I; 6692 } 6693 return Res; 6694 } 6695 6696 bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, 6697 OrdersType &ReorderIndices) const { 6698 // We check whether the stores in StoreVec can form a vector by sorting them 6699 // and checking whether they are consecutive. 6700 6701 // To avoid calling getPointersDiff() while sorting we create a vector of 6702 // pairs {store, offset from first} and sort this instead. 
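  // Illustrative example: for stores S0..S3 at element offsets {0, 2, 1, 3}
  // from S0, the sorted offsets {0, 1, 2, 3} are consecutive, so the stores
  // can form a vector and ReorderIndices becomes {0, 2, 1, 3} - store Idx is
  // placed at position ReorderIndices[Idx] of the consecutive sequence.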
6703 SmallVector<std::pair<int, unsigned>> StoreOffsetVec; 6704 StoreInst *S0 = StoresVec[0]; 6705 StoreOffsetVec.emplace_back(0, 0); 6706 Type *S0Ty = S0->getValueOperand()->getType(); 6707 Value *S0Ptr = S0->getPointerOperand(); 6708 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) { 6709 StoreInst *SI = StoresVec[Idx]; 6710 std::optional<int> Diff = 6711 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), 6712 SI->getPointerOperand(), *DL, *SE, 6713 /*StrictCheck=*/true); 6714 StoreOffsetVec.emplace_back(*Diff, Idx); 6715 } 6716 6717 // Check if the stores are consecutive by checking if their difference is 1. 6718 if (StoreOffsetVec.size() != StoresVec.size()) 6719 return false; 6720 sort(StoreOffsetVec, 6721 [](const std::pair<int, unsigned> &L, 6722 const std::pair<int, unsigned> &R) { return L.first < R.first; }); 6723 unsigned Idx = 0; 6724 int PrevDist = 0; 6725 for (const auto &P : StoreOffsetVec) { 6726 if (Idx > 0 && P.first != PrevDist + 1) 6727 return false; 6728 PrevDist = P.first; 6729 ++Idx; 6730 } 6731 6732 // Calculate the shuffle indices according to their offset against the sorted 6733 // StoreOffsetVec. 6734 ReorderIndices.assign(StoresVec.size(), 0); 6735 bool IsIdentity = true; 6736 for (auto [I, P] : enumerate(StoreOffsetVec)) { 6737 ReorderIndices[P.second] = I; 6738 IsIdentity &= P.second == I; 6739 } 6740 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in 6741 // reorderTopToBottom() and reorderBottomToTop(), so we are following the 6742 // same convention here. 6743 if (IsIdentity) 6744 ReorderIndices.clear(); 6745 6746 return true; 6747 } 6748 6749 #ifndef NDEBUG 6750 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { 6751 for (unsigned Idx : Order) 6752 dbgs() << Idx << ", "; 6753 dbgs() << "\n"; 6754 } 6755 #endif 6756 6757 SmallVector<BoUpSLP::OrdersType, 1> 6758 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { 6759 unsigned NumLanes = TE->Scalars.size(); 6760 6761 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE); 6762 6763 // Holds the reorder indices for each candidate store vector that is a user of 6764 // the current TreeEntry. 6765 SmallVector<OrdersType, 1> ExternalReorderIndices; 6766 6767 // Now inspect the stores collected per pointer and look for vectorization 6768 // candidates. For each candidate calculate the reorder index vector and push 6769 // it into `ExternalReorderIndices` 6770 for (ArrayRef<StoreInst *> StoresVec : Stores) { 6771 // If we have fewer than NumLanes stores, then we can't form a vector. 6772 if (StoresVec.size() != NumLanes) 6773 continue; 6774 6775 // If the stores are not consecutive then abandon this StoresVec. 6776 OrdersType ReorderIndices; 6777 if (!canFormVector(StoresVec, ReorderIndices)) 6778 continue; 6779 6780 // We now know that the scalars in StoresVec can form a vector instruction, 6781 // so set the reorder indices. 
6782 ExternalReorderIndices.push_back(ReorderIndices); 6783 } 6784 return ExternalReorderIndices; 6785 } 6786 6787 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 6788 const SmallDenseSet<Value *> &UserIgnoreLst) { 6789 deleteTree(); 6790 UserIgnoreList = &UserIgnoreLst; 6791 if (!allSameType(Roots)) 6792 return; 6793 buildTree_rec(Roots, 0, EdgeInfo()); 6794 } 6795 6796 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { 6797 deleteTree(); 6798 if (!allSameType(Roots)) 6799 return; 6800 buildTree_rec(Roots, 0, EdgeInfo()); 6801 } 6802 6803 /// Tries to find subvector of loads and builds new vector of only loads if can 6804 /// be profitable. 6805 static void gatherPossiblyVectorizableLoads( 6806 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL, 6807 ScalarEvolution &SE, const TargetTransformInfo &TTI, 6808 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads, 6809 bool AddNew = true) { 6810 if (VL.empty()) 6811 return; 6812 Type *ScalarTy = getValueType(VL.front()); 6813 if (!isValidElementType(ScalarTy)) 6814 return; 6815 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads; 6816 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad; 6817 for (Value *V : VL) { 6818 auto *LI = dyn_cast<LoadInst>(V); 6819 if (!LI) 6820 continue; 6821 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple()) 6822 continue; 6823 bool IsFound = false; 6824 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) { 6825 assert(LI->getParent() == Data.front().first->getParent() && 6826 LI->getType() == Data.front().first->getType() && 6827 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) == 6828 getUnderlyingObject(Data.front().first->getPointerOperand(), 6829 RecursionMaxDepth) && 6830 "Expected loads with the same type, same parent and same " 6831 "underlying pointer."); 6832 std::optional<int> Dist = getPointersDiff( 6833 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), 6834 Data.front().first->getPointerOperand(), DL, SE, 6835 /*StrictCheck=*/true); 6836 if (!Dist) 6837 continue; 6838 auto It = Map.find(*Dist); 6839 if (It != Map.end() && It->second != LI) 6840 continue; 6841 if (It == Map.end()) { 6842 Data.emplace_back(LI, *Dist); 6843 Map.try_emplace(*Dist, LI); 6844 } 6845 IsFound = true; 6846 break; 6847 } 6848 if (!IsFound) { 6849 ClusteredLoads.emplace_back().emplace_back(LI, 0); 6850 ClusteredDistToLoad.emplace_back().try_emplace(0, LI); 6851 } 6852 } 6853 auto FindMatchingLoads = 6854 [&](ArrayRef<std::pair<LoadInst *, int>> Loads, 6855 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> 6856 &GatheredLoads, 6857 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated, 6858 int &Offset, unsigned &Start) { 6859 if (Loads.empty()) 6860 return GatheredLoads.end(); 6861 SmallVector<std::pair<int, int>> Res; 6862 LoadInst *LI = Loads.front().first; 6863 for (auto [Idx, Data] : enumerate(GatheredLoads)) { 6864 if (Idx < Start) 6865 continue; 6866 ToAdd.clear(); 6867 if (LI->getParent() != Data.front().first->getParent() || 6868 LI->getType() != Data.front().first->getType()) 6869 continue; 6870 std::optional<int> Dist = 6871 getPointersDiff(LI->getType(), LI->getPointerOperand(), 6872 Data.front().first->getType(), 6873 Data.front().first->getPointerOperand(), DL, SE, 6874 /*StrictCheck=*/true); 6875 if (!Dist) 6876 continue; 6877 SmallSet<int, 4> DataDists; 6878 SmallPtrSet<LoadInst *, 4> DataLoads; 6879 for (std::pair<LoadInst *, int> P : Data) { 6880 DataDists.insert(P.second); 6881 
DataLoads.insert(P.first); 6882 } 6883 // Found matching gathered loads - check if all loads are unique or 6884 // can be effectively vectorized. 6885 unsigned NumUniques = 0; 6886 for (auto [Cnt, Pair] : enumerate(Loads)) { 6887 bool Used = DataLoads.contains(Pair.first); 6888 if (!Used && !DataDists.contains(*Dist + Pair.second)) { 6889 ++NumUniques; 6890 ToAdd.insert(Cnt); 6891 } else if (Used) { 6892 Repeated.insert(Cnt); 6893 } 6894 } 6895 if (NumUniques > 0 && 6896 (Loads.size() == NumUniques || 6897 (Loads.size() - NumUniques >= 2 && 6898 Loads.size() - NumUniques >= Loads.size() / 2 && 6899 (has_single_bit(Data.size() + NumUniques) || 6900 bit_ceil(Data.size()) < 6901 bit_ceil(Data.size() + NumUniques))))) { 6902 Offset = *Dist; 6903 Start = Idx + 1; 6904 return std::next(GatheredLoads.begin(), Idx); 6905 } 6906 } 6907 ToAdd.clear(); 6908 return GatheredLoads.end(); 6909 }; 6910 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) { 6911 unsigned Start = 0; 6912 SetVector<unsigned> ToAdd, LocalToAdd, Repeated; 6913 int Offset = 0; 6914 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, 6915 Offset, Start); 6916 while (It != GatheredLoads.end()) { 6917 assert(!LocalToAdd.empty() && "Expected some elements to add."); 6918 for (unsigned Idx : LocalToAdd) 6919 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset); 6920 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end()); 6921 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, 6922 Start); 6923 } 6924 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) { 6925 return !ToAdd.contains(Idx) && !Repeated.contains(Idx); 6926 })) { 6927 auto AddNewLoads = 6928 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) { 6929 for (unsigned Idx : seq<unsigned>(Data.size())) { 6930 if (ToAdd.contains(Idx) || Repeated.contains(Idx)) 6931 continue; 6932 Loads.push_back(Data[Idx]); 6933 } 6934 }; 6935 if (!AddNew) { 6936 LoadInst *LI = Data.front().first; 6937 It = find_if( 6938 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) { 6939 return PD.front().first->getParent() == LI->getParent() && 6940 PD.front().first->getType() == LI->getType(); 6941 }); 6942 while (It != GatheredLoads.end()) { 6943 AddNewLoads(*It); 6944 It = std::find_if( 6945 std::next(It), GatheredLoads.end(), 6946 [&](ArrayRef<std::pair<LoadInst *, int>> PD) { 6947 return PD.front().first->getParent() == LI->getParent() && 6948 PD.front().first->getType() == LI->getType(); 6949 }); 6950 } 6951 } 6952 GatheredLoads.emplace_back().append(Data.begin(), Data.end()); 6953 AddNewLoads(GatheredLoads.emplace_back()); 6954 } 6955 } 6956 } 6957 6958 void BoUpSLP::tryToVectorizeGatheredLoads( 6959 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>, 6960 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 6961 8> &GatheredLoads) { 6962 GatheredLoadsEntriesFirst = VectorizableTree.size(); 6963 6964 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize( 6965 LoadEntriesToVectorize.size()); 6966 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize)) 6967 Set.insert(VectorizableTree[Idx]->Scalars.begin(), 6968 VectorizableTree[Idx]->Scalars.end()); 6969 6970 // Sort loads by distance. 
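  // Note (illustrative): the comparator below sorts by decreasing distance,
  // so e.g. distances {3, 2, 1, 0} stay in that order and a run is treated as
  // consecutive while LastDist - Dist keeps matching the running counter in
  // ProcessGatheredLoads.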
6971 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1, 6972 const std::pair<LoadInst *, int> &L2) { 6973 return L1.second > L2.second; 6974 }; 6975 6976 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) { 6977 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()), 6978 Loads.size()); 6979 Align Alignment = computeCommonAlignment<LoadInst>(Values); 6980 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size()); 6981 return TTI->isLegalMaskedGather(Ty, Alignment) && 6982 !TTI->forceScalarizeMaskedGather(Ty, Alignment); 6983 }; 6984 6985 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads, 6986 BoUpSLP::ValueSet &VectorizedLoads, 6987 SmallVectorImpl<LoadInst *> &NonVectorized, 6988 bool Final, unsigned MaxVF) { 6989 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results; 6990 unsigned StartIdx = 0; 6991 SmallVector<int> CandidateVFs; 6992 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1)) 6993 CandidateVFs.push_back(MaxVF); 6994 for (int NumElts = getFloorFullVectorNumberOfElements( 6995 *TTI, Loads.front()->getType(), MaxVF); 6996 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( 6997 *TTI, Loads.front()->getType(), NumElts - 1)) { 6998 CandidateVFs.push_back(NumElts); 6999 if (VectorizeNonPowerOf2 && NumElts > 2) 7000 CandidateVFs.push_back(NumElts - 1); 7001 } 7002 7003 if (Final && CandidateVFs.empty()) 7004 return Results; 7005 7006 unsigned BestVF = Final ? CandidateVFs.back() : 0; 7007 for (unsigned NumElts : CandidateVFs) { 7008 if (Final && NumElts > BestVF) 7009 continue; 7010 SmallVector<unsigned> MaskedGatherVectorized; 7011 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; 7012 ++Cnt) { 7013 ArrayRef<LoadInst *> Slice = 7014 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt)); 7015 if (VectorizedLoads.count(Slice.front()) || 7016 VectorizedLoads.count(Slice.back()) || 7017 areKnownNonVectorizableLoads(Slice)) 7018 continue; 7019 // Check if it is profitable to try vectorizing gathered loads. It is 7020 // profitable if we have more than 3 consecutive loads or if we have 7021 // less but all users are vectorized or deleted. 7022 bool AllowToVectorize = false; 7023 // Check if it is profitable to vectorize 2-elements loads. 7024 if (NumElts == 2) { 7025 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( 7026 Slice.front()->getType(), ElementCount::getFixed(NumElts)); 7027 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) { 7028 for (LoadInst *LI : Slice) { 7029 // If single use/user - allow to vectorize. 7030 if (LI->hasOneUse()) 7031 continue; 7032 // 1. Check if number of uses equals number of users. 7033 // 2. All users are deleted. 7034 // 3. The load broadcasts are not allowed or the load is not 7035 // broadcasted. 7036 if (static_cast<unsigned int>(std::distance( 7037 LI->user_begin(), LI->user_end())) != LI->getNumUses()) 7038 return false; 7039 if (!IsLegalBroadcastLoad) 7040 continue; 7041 if (LI->hasNUsesOrMore(UsesLimit)) 7042 return false; 7043 for (User *U : LI->users()) { 7044 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI)) 7045 continue; 7046 for (const TreeEntry *UTE : getTreeEntries(U)) { 7047 for (int I : seq<int>(UTE->getNumOperands())) { 7048 if (all_of(UTE->getOperand(I), [LI](Value *V) { 7049 return V == LI || isa<PoisonValue>(V); 7050 })) 7051 // Found legal broadcast - do not vectorize. 
7052 return false; 7053 } 7054 } 7055 } 7056 } 7057 return true; 7058 }; 7059 AllowToVectorize = CheckIfAllowed(Slice); 7060 } else { 7061 AllowToVectorize = 7062 (NumElts >= 3 || 7063 any_of(ValueToGatherNodes.at(Slice.front()), 7064 [=](const TreeEntry *TE) { 7065 return TE->Scalars.size() == 2 && 7066 ((TE->Scalars.front() == Slice.front() && 7067 TE->Scalars.back() == Slice.back()) || 7068 (TE->Scalars.front() == Slice.back() && 7069 TE->Scalars.back() == Slice.front())); 7070 })) && 7071 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), 7072 Slice.size()); 7073 } 7074 if (AllowToVectorize) { 7075 SmallVector<Value *> PointerOps; 7076 OrdersType CurrentOrder; 7077 // Try to build vector load. 7078 ArrayRef<Value *> Values( 7079 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size()); 7080 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, 7081 PointerOps, &BestVF); 7082 if (LS != LoadsState::Gather || 7083 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) { 7084 if (LS == LoadsState::ScatterVectorize) { 7085 if (MaskedGatherVectorized.empty() || 7086 Cnt >= MaskedGatherVectorized.back() + NumElts) 7087 MaskedGatherVectorized.push_back(Cnt); 7088 continue; 7089 } 7090 if (LS != LoadsState::Gather) { 7091 Results.emplace_back(Values, LS); 7092 VectorizedLoads.insert(Slice.begin(), Slice.end()); 7093 // If we vectorized initial block, no need to try to vectorize it 7094 // again. 7095 if (Cnt == StartIdx) 7096 StartIdx += NumElts; 7097 } 7098 // Check if the whole array was vectorized already - exit. 7099 if (StartIdx >= Loads.size()) 7100 break; 7101 // Erase last masked gather candidate, if another candidate within 7102 // the range is found to be better. 7103 if (!MaskedGatherVectorized.empty() && 7104 Cnt < MaskedGatherVectorized.back() + NumElts) 7105 MaskedGatherVectorized.pop_back(); 7106 Cnt += NumElts - 1; 7107 continue; 7108 } 7109 } 7110 if (!AllowToVectorize || BestVF == 0) 7111 registerNonVectorizableLoads(Slice); 7112 } 7113 // Mark masked gathers candidates as vectorized, if any. 7114 for (unsigned Cnt : MaskedGatherVectorized) { 7115 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice( 7116 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt)); 7117 ArrayRef<Value *> Values( 7118 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size()); 7119 Results.emplace_back(Values, LoadsState::ScatterVectorize); 7120 VectorizedLoads.insert(Slice.begin(), Slice.end()); 7121 // If we vectorized initial block, no need to try to vectorize it again. 
7122 if (Cnt == StartIdx) 7123 StartIdx += NumElts; 7124 } 7125 } 7126 for (LoadInst *LI : Loads) { 7127 if (!VectorizedLoads.contains(LI)) 7128 NonVectorized.push_back(LI); 7129 } 7130 return Results; 7131 }; 7132 auto ProcessGatheredLoads = 7133 [&, &TTI = *TTI]( 7134 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads, 7135 bool Final = false) { 7136 SmallVector<LoadInst *> NonVectorized; 7137 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) { 7138 if (LoadsDists.size() <= 1) { 7139 NonVectorized.push_back(LoadsDists.back().first); 7140 continue; 7141 } 7142 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists); 7143 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size()); 7144 transform(LoadsDists, OriginalLoads.begin(), 7145 [](const std::pair<LoadInst *, int> &L) -> LoadInst * { 7146 return L.first; 7147 }); 7148 stable_sort(LocalLoadsDists, LoadSorter); 7149 SmallVector<LoadInst *> Loads; 7150 unsigned MaxConsecutiveDistance = 0; 7151 unsigned CurrentConsecutiveDist = 1; 7152 int LastDist = LocalLoadsDists.front().second; 7153 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads); 7154 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) { 7155 if (isVectorized(L.first)) 7156 continue; 7157 assert(LastDist >= L.second && 7158 "Expected first distance always not less than second"); 7159 if (static_cast<unsigned>(LastDist - L.second) == 7160 CurrentConsecutiveDist) { 7161 ++CurrentConsecutiveDist; 7162 MaxConsecutiveDistance = 7163 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist); 7164 Loads.push_back(L.first); 7165 continue; 7166 } 7167 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 && 7168 !Loads.empty()) 7169 Loads.pop_back(); 7170 CurrentConsecutiveDist = 1; 7171 LastDist = L.second; 7172 Loads.push_back(L.first); 7173 } 7174 if (Loads.size() <= 1) 7175 continue; 7176 if (AllowMaskedGather) 7177 MaxConsecutiveDistance = Loads.size(); 7178 else if (MaxConsecutiveDistance < 2) 7179 continue; 7180 BoUpSLP::ValueSet VectorizedLoads; 7181 SmallVector<LoadInst *> SortedNonVectorized; 7182 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results = 7183 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized, 7184 Final, MaxConsecutiveDistance); 7185 if (!Results.empty() && !SortedNonVectorized.empty() && 7186 OriginalLoads.size() == Loads.size() && 7187 MaxConsecutiveDistance == Loads.size() && 7188 all_of(Results, 7189 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) { 7190 return P.second == LoadsState::ScatterVectorize; 7191 })) { 7192 VectorizedLoads.clear(); 7193 SmallVector<LoadInst *> UnsortedNonVectorized; 7194 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> 7195 UnsortedResults = 7196 GetVectorizedRanges(OriginalLoads, VectorizedLoads, 7197 UnsortedNonVectorized, Final, 7198 OriginalLoads.size()); 7199 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) { 7200 SortedNonVectorized.swap(UnsortedNonVectorized); 7201 Results.swap(UnsortedResults); 7202 } 7203 } 7204 for (auto [Slice, _] : Results) { 7205 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" 7206 << Slice.size() << ")\n"); 7207 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) { 7208 for (Value *L : Slice) 7209 if (!isVectorized(L)) 7210 SortedNonVectorized.push_back(cast<LoadInst>(L)); 7211 continue; 7212 } 7213 7214 // Select maximum VF as a maximum of user gathered nodes and 7215 // distance between scalar loads in these nodes. 
7216 unsigned MaxVF = Slice.size(); 7217 unsigned UserMaxVF = 0; 7218 unsigned InterleaveFactor = 0; 7219 if (MaxVF == 2) { 7220 UserMaxVF = MaxVF; 7221 } else { 7222 // Found distance between segments of the interleaved loads. 7223 std::optional<unsigned> InterleavedLoadsDistance = 0; 7224 unsigned Order = 0; 7225 std::optional<unsigned> CommonVF = 0; 7226 DenseMap<const TreeEntry *, unsigned> EntryToPosition; 7227 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes; 7228 for (auto [Idx, V] : enumerate(Slice)) { 7229 for (const TreeEntry *E : ValueToGatherNodes.at(V)) { 7230 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size()); 7231 unsigned Pos = 7232 EntryToPosition.try_emplace(E, Idx).first->second; 7233 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1); 7234 if (CommonVF) { 7235 if (*CommonVF == 0) { 7236 CommonVF = E->Scalars.size(); 7237 continue; 7238 } 7239 if (*CommonVF != E->Scalars.size()) 7240 CommonVF.reset(); 7241 } 7242 // Check if the load is the part of the interleaved load. 7243 if (Pos != Idx && InterleavedLoadsDistance) { 7244 if (!DeinterleavedNodes.contains(E) && 7245 any_of(E->Scalars, [&, Slice = Slice](Value *V) { 7246 if (isa<Constant>(V)) 7247 return false; 7248 if (isVectorized(V)) 7249 return true; 7250 const auto &Nodes = ValueToGatherNodes.at(V); 7251 return (Nodes.size() != 1 || !Nodes.contains(E)) && 7252 !is_contained(Slice, V); 7253 })) { 7254 InterleavedLoadsDistance.reset(); 7255 continue; 7256 } 7257 DeinterleavedNodes.insert(E); 7258 if (*InterleavedLoadsDistance == 0) { 7259 InterleavedLoadsDistance = Idx - Pos; 7260 continue; 7261 } 7262 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 || 7263 (Idx - Pos) / *InterleavedLoadsDistance < Order) 7264 InterleavedLoadsDistance.reset(); 7265 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1); 7266 } 7267 } 7268 } 7269 DeinterleavedNodes.clear(); 7270 // Check if the large load represents interleaved load operation. 7271 if (InterleavedLoadsDistance.value_or(0) > 1 && 7272 CommonVF.value_or(0) != 0) { 7273 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance); 7274 unsigned VF = *CommonVF; 7275 OrdersType Order; 7276 SmallVector<Value *> PointerOps; 7277 // Segmented load detected - vectorize at maximum vector factor. 7278 if (InterleaveFactor <= Slice.size() && 7279 TTI.isLegalInterleavedAccessType( 7280 getWidenedType(Slice.front()->getType(), VF), 7281 InterleaveFactor, 7282 cast<LoadInst>(Slice.front())->getAlign(), 7283 cast<LoadInst>(Slice.front()) 7284 ->getPointerAddressSpace()) && 7285 canVectorizeLoads(Slice, Slice.front(), Order, 7286 PointerOps) == LoadsState::Vectorize) { 7287 UserMaxVF = InterleaveFactor * VF; 7288 } else { 7289 InterleaveFactor = 0; 7290 } 7291 } 7292 // Cannot represent the loads as consecutive vectorizable nodes - 7293 // just exit. 
7294 unsigned ConsecutiveNodesSize = 0; 7295 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 && 7296 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize), 7297 [&, Slice = Slice](const auto &P) { 7298 const auto *It = find_if(Slice, [&](Value *V) { 7299 return std::get<1>(P).contains(V); 7300 }); 7301 if (It == Slice.end()) 7302 return false; 7303 ArrayRef<Value *> VL = 7304 VectorizableTree[std::get<0>(P)]->Scalars; 7305 ConsecutiveNodesSize += VL.size(); 7306 unsigned Start = std::distance(Slice.begin(), It); 7307 unsigned Sz = Slice.size() - Start; 7308 return Sz < VL.size() || 7309 Slice.slice(std::distance(Slice.begin(), It), 7310 VL.size()) != VL; 7311 })) 7312 continue; 7313 // Try to build long masked gather loads. 7314 UserMaxVF = bit_ceil(UserMaxVF); 7315 if (InterleaveFactor == 0 && 7316 any_of(seq<unsigned>(Slice.size() / UserMaxVF), 7317 [&, Slice = Slice](unsigned Idx) { 7318 OrdersType Order; 7319 SmallVector<Value *> PointerOps; 7320 return canVectorizeLoads( 7321 Slice.slice(Idx * UserMaxVF, UserMaxVF), 7322 Slice[Idx * UserMaxVF], Order, 7323 PointerOps) == 7324 LoadsState::ScatterVectorize; 7325 })) 7326 UserMaxVF = MaxVF; 7327 if (Slice.size() != ConsecutiveNodesSize) 7328 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF); 7329 } 7330 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) { 7331 bool IsVectorized = true; 7332 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) { 7333 ArrayRef<Value *> SubSlice = 7334 Slice.slice(I, std::min(VF, E - I)); 7335 if (isVectorized(SubSlice.front())) 7336 continue; 7337 // Check if the subslice is to be-vectorized entry, which is not 7338 // equal to entry. 7339 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize), 7340 [&](const auto &P) { 7341 return !SubSlice.equals( 7342 VectorizableTree[std::get<0>(P)] 7343 ->Scalars) && 7344 set_is_subset(SubSlice, std::get<1>(P)); 7345 })) 7346 continue; 7347 unsigned Sz = VectorizableTree.size(); 7348 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor); 7349 if (Sz == VectorizableTree.size()) { 7350 IsVectorized = false; 7351 // Try non-interleaved vectorization with smaller vector 7352 // factor. 7353 if (InterleaveFactor > 0) { 7354 VF = 2 * (MaxVF / InterleaveFactor); 7355 InterleaveFactor = 0; 7356 } 7357 continue; 7358 } 7359 } 7360 if (IsVectorized) 7361 break; 7362 } 7363 } 7364 NonVectorized.append(SortedNonVectorized); 7365 } 7366 return NonVectorized; 7367 }; 7368 for (const auto &GLs : GatheredLoads) { 7369 const auto &Ref = GLs.second; 7370 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref); 7371 if (!Ref.empty() && !NonVectorized.empty() && 7372 std::accumulate( 7373 Ref.begin(), Ref.end(), 0u, 7374 [](unsigned S, 7375 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned { 7376 return S + LoadsDists.size(); 7377 }) != NonVectorized.size() && 7378 IsMaskedGatherSupported(NonVectorized)) { 7379 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads; 7380 for (LoadInst *LI : NonVectorized) { 7381 // Reinsert non-vectorized loads to other list of loads with the same 7382 // base pointers. 7383 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, 7384 FinalGatheredLoads, 7385 /*AddNew=*/false); 7386 } 7387 // Final attempt to vectorize non-vectorized loads. 7388 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); 7389 } 7390 } 7391 // Try to vectorize postponed load entries, previously marked as gathered. 
7392 for (unsigned Idx : LoadEntriesToVectorize) { 7393 const TreeEntry &E = *VectorizableTree[Idx]; 7394 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end()); 7395 // Avoid reordering, if possible. 7396 if (!E.ReorderIndices.empty()) { 7397 // Build a mask out of the reorder indices and reorder scalars per this 7398 // mask. 7399 SmallVector<int> ReorderMask; 7400 inversePermutation(E.ReorderIndices, ReorderMask); 7401 reorderScalars(GatheredScalars, ReorderMask); 7402 } 7403 buildTree_rec(GatheredScalars, 0, EdgeInfo()); 7404 } 7405 // If no new entries created, consider it as no gathered loads entries must be 7406 // handled. 7407 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) == 7408 VectorizableTree.size()) 7409 GatheredLoadsEntriesFirst.reset(); 7410 } 7411 7412 /// \return true if the specified list of values has only one instruction that 7413 /// requires scheduling, false otherwise. 7414 #ifndef NDEBUG 7415 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { 7416 Value *NeedsScheduling = nullptr; 7417 for (Value *V : VL) { 7418 if (doesNotNeedToBeScheduled(V)) 7419 continue; 7420 if (!NeedsScheduling) { 7421 NeedsScheduling = V; 7422 continue; 7423 } 7424 return false; 7425 } 7426 return NeedsScheduling; 7427 } 7428 #endif 7429 7430 /// Generates key/subkey pair for the given value to provide effective sorting 7431 /// of the values and better detection of the vectorizable values sequences. The 7432 /// keys/subkeys can be used for better sorting of the values themselves (keys) 7433 /// and in values subgroups (subkeys). 7434 static std::pair<size_t, size_t> generateKeySubkey( 7435 Value *V, const TargetLibraryInfo *TLI, 7436 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, 7437 bool AllowAlternate) { 7438 hash_code Key = hash_value(V->getValueID() + 2); 7439 hash_code SubKey = hash_value(0); 7440 // Sort the loads by the distance between the pointers. 7441 if (auto *LI = dyn_cast<LoadInst>(V)) { 7442 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key); 7443 if (LI->isSimple()) 7444 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); 7445 else 7446 Key = SubKey = hash_value(LI); 7447 } else if (isVectorLikeInstWithConstOps(V)) { 7448 // Sort extracts by the vector operands. 7449 if (isa<ExtractElementInst, UndefValue>(V)) 7450 Key = hash_value(Value::UndefValueVal + 1); 7451 if (auto *EI = dyn_cast<ExtractElementInst>(V)) { 7452 if (!isUndefVector(EI->getVectorOperand()).all() && 7453 !isa<UndefValue>(EI->getIndexOperand())) 7454 SubKey = hash_value(EI->getVectorOperand()); 7455 } 7456 } else if (auto *I = dyn_cast<Instruction>(V)) { 7457 // Sort other instructions just by the opcodes except for CMPInst. 7458 // For CMP also sort by the predicate kind. 7459 if ((isa<BinaryOperator, CastInst>(I)) && 7460 isValidForAlternation(I->getOpcode())) { 7461 if (AllowAlternate) 7462 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); 7463 else 7464 Key = hash_combine(hash_value(I->getOpcode()), Key); 7465 SubKey = hash_combine( 7466 hash_value(I->getOpcode()), hash_value(I->getType()), 7467 hash_value(isa<BinaryOperator>(I) 7468 ? I->getType() 7469 : cast<CastInst>(I)->getOperand(0)->getType())); 7470 // For casts, look through the only operand to improve compile time. 
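      // E.g., a sext of a load gets the load's key hashed into its own
      // key/subkey, so it is keyed differently from, say, a sext of an add;
      // this keeps cast-of-load bundles grouped together instead of being
      // keyed only on the cast opcode.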
7471 if (isa<CastInst>(I)) { 7472 std::pair<size_t, size_t> OpVals = 7473 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, 7474 /*AllowAlternate=*/true); 7475 Key = hash_combine(OpVals.first, Key); 7476 SubKey = hash_combine(OpVals.first, SubKey); 7477 } 7478 } else if (auto *CI = dyn_cast<CmpInst>(I)) { 7479 CmpInst::Predicate Pred = CI->getPredicate(); 7480 if (CI->isCommutative()) 7481 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); 7482 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); 7483 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), 7484 hash_value(SwapPred), 7485 hash_value(CI->getOperand(0)->getType())); 7486 } else if (auto *Call = dyn_cast<CallInst>(I)) { 7487 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); 7488 if (isTriviallyVectorizable(ID)) { 7489 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); 7490 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) { 7491 SubKey = hash_combine(hash_value(I->getOpcode()), 7492 hash_value(Call->getCalledFunction())); 7493 } else { 7494 Key = hash_combine(hash_value(Call), Key); 7495 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); 7496 } 7497 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) 7498 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), 7499 hash_value(Op.Tag), SubKey); 7500 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { 7501 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) 7502 SubKey = hash_value(Gep->getPointerOperand()); 7503 else 7504 SubKey = hash_value(Gep); 7505 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && 7506 !isa<ConstantInt>(I->getOperand(1))) { 7507 // Do not try to vectorize instructions with potentially high cost. 7508 SubKey = hash_value(I); 7509 } else { 7510 SubKey = hash_value(I->getOpcode()); 7511 } 7512 Key = hash_combine(hash_value(I->getParent()), Key); 7513 } 7514 return std::make_pair(Key, SubKey); 7515 } 7516 7517 /// Checks if the specified instruction \p I is an alternate operation for 7518 /// the given \p MainOp and \p AltOp instructions. 7519 static bool isAlternateInstruction(const Instruction *I, 7520 const Instruction *MainOp, 7521 const Instruction *AltOp, 7522 const TargetLibraryInfo &TLI); 7523 7524 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, 7525 ArrayRef<Value *> VL) const { 7526 unsigned Opcode0 = S.getOpcode(); 7527 unsigned Opcode1 = S.getAltOpcode(); 7528 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1)); 7529 // If this pattern is supported by the target then consider it profitable. 7530 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()), 7531 Opcode0, Opcode1, OpcodeMask)) 7532 return true; 7533 SmallVector<ValueList> Operands; 7534 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) { 7535 Operands.emplace_back(); 7536 // Prepare the operand vector. 7537 for (Value *V : VL) { 7538 if (isa<PoisonValue>(V)) { 7539 Operands.back().push_back( 7540 PoisonValue::get(S.getMainOp()->getOperand(I)->getType())); 7541 continue; 7542 } 7543 Operands.back().push_back(cast<Instruction>(V)->getOperand(I)); 7544 } 7545 } 7546 if (Operands.size() == 2) { 7547 // Try find best operands candidates. 
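    // For example, given the two-lane bundle {a0 + b0, b1 + a1}, pairing
    // Operands[0][I] with Operands[1][I + 1] matches (a0, a1); applying the
    // corresponding swap below turns the operand columns into {a0, a1} and
    // {b0, b1}, which are usually cheaper to build than the mixed columns.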
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that the operands do not contain the same values and form either a
  // perfect diamond match or a shuffled match.
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. At least a single operand is constant or a splat.
  // 2. Operands have many loop invariants (the instructions themselves are not
  //    loop invariant).
  // 3. At least a single unique operand is expected to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found the first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         // Do not vectorize the node if the estimated number of vector
         // instructions exceeds the estimated number of buildvector
         // instructions. The number of vector operands is the number of vector
         // instructions plus the number of vector instructions for the
         // operands (buildvectors). The number of buildvector instructions is
         // simply number_of_operands * number_of_scalars.
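         // For a rough illustration: with a 4-wide bundle of a binary main op
         // the buildvector estimate is 2 * 4 = 8, while a vectorized node with
         // 2 unique operand opcodes, 1 non-instruction operand, 1 extra
         // shuffle and the fixed main + alt + shuffle cost of 3 totals 7, so
         // it is accepted as long as UndefCnt stays below (4 - 1) * 2 = 6.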
7632 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() && 7633 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + 7634 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size()); 7635 } 7636 7637 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( 7638 const InstructionsState &S, ArrayRef<Value *> VL, 7639 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, 7640 SmallVectorImpl<Value *> &PointerOps) { 7641 assert(S.getMainOp() && 7642 "Expected instructions with same/alternate opcodes only."); 7643 7644 unsigned ShuffleOrOp = 7645 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); 7646 Instruction *VL0 = S.getMainOp(); 7647 switch (ShuffleOrOp) { 7648 case Instruction::PHI: { 7649 // Too many operands - gather, most probably won't be vectorized. 7650 if (VL0->getNumOperands() > MaxPHINumOperands) 7651 return TreeEntry::NeedToGather; 7652 // Check for terminator values (e.g. invoke). 7653 for (Value *V : VL) { 7654 auto *PHI = dyn_cast<PHINode>(V); 7655 if (!PHI) 7656 continue; 7657 for (Value *Incoming : PHI->incoming_values()) { 7658 Instruction *Term = dyn_cast<Instruction>(Incoming); 7659 if (Term && Term->isTerminator()) { 7660 LLVM_DEBUG(dbgs() 7661 << "SLP: Need to swizzle PHINodes (terminator use).\n"); 7662 return TreeEntry::NeedToGather; 7663 } 7664 } 7665 } 7666 7667 return TreeEntry::Vectorize; 7668 } 7669 case Instruction::ExtractValue: 7670 case Instruction::ExtractElement: { 7671 bool Reuse = canReuseExtract(VL, CurrentOrder); 7672 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and 7673 // non-full registers). 7674 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size())) 7675 return TreeEntry::NeedToGather; 7676 if (Reuse || !CurrentOrder.empty()) 7677 return TreeEntry::Vectorize; 7678 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); 7679 return TreeEntry::NeedToGather; 7680 } 7681 case Instruction::InsertElement: { 7682 // Check that we have a buildvector and not a shuffle of 2 or more 7683 // different vectors. 7684 ValueSet SourceVectors; 7685 for (Value *V : VL) { 7686 SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); 7687 assert(getElementIndex(V) != std::nullopt && 7688 "Non-constant or undef index?"); 7689 } 7690 7691 if (count_if(VL, [&SourceVectors](Value *V) { 7692 return !SourceVectors.contains(V); 7693 }) >= 2) { 7694 // Found 2nd source vector - cancel. 7695 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " 7696 "different source vectors.\n"); 7697 return TreeEntry::NeedToGather; 7698 } 7699 7700 if (any_of(VL, [&SourceVectors](Value *V) { 7701 // The last InsertElement can have multiple uses. 7702 return SourceVectors.contains(V) && !V->hasOneUse(); 7703 })) { 7704 assert(SLPReVec && "Only supported by REVEC."); 7705 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " 7706 "multiple uses.\n"); 7707 return TreeEntry::NeedToGather; 7708 } 7709 7710 return TreeEntry::Vectorize; 7711 } 7712 case Instruction::Load: { 7713 // Check that a vectorized load would load the same memory as a scalar 7714 // load. For example, we don't want to vectorize loads that are smaller 7715 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 7716 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 7717 // from such a struct, we read/write packed bits disagreeing with the 7718 // unvectorized version. 
7719 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { 7720 case LoadsState::Vectorize: 7721 return TreeEntry::Vectorize; 7722 case LoadsState::ScatterVectorize: 7723 if (!IsGraphTransformMode && !VectorizableTree.empty()) { 7724 // Delay slow vectorized nodes for better vectorization attempts. 7725 LoadEntriesToVectorize.insert(VectorizableTree.size()); 7726 return TreeEntry::NeedToGather; 7727 } 7728 return TreeEntry::ScatterVectorize; 7729 case LoadsState::StridedVectorize: 7730 if (!IsGraphTransformMode && VectorizableTree.size() > 1) { 7731 // Delay slow vectorized nodes for better vectorization attempts. 7732 LoadEntriesToVectorize.insert(VectorizableTree.size()); 7733 return TreeEntry::NeedToGather; 7734 } 7735 return TreeEntry::StridedVectorize; 7736 case LoadsState::Gather: 7737 #ifndef NDEBUG 7738 Type *ScalarTy = VL0->getType(); 7739 if (DL->getTypeSizeInBits(ScalarTy) != 7740 DL->getTypeAllocSizeInBits(ScalarTy)) 7741 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); 7742 else if (any_of(VL, [](Value *V) { 7743 auto *LI = dyn_cast<LoadInst>(V); 7744 return !LI || !LI->isSimple(); 7745 })) 7746 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 7747 else 7748 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 7749 #endif // NDEBUG 7750 registerNonVectorizableLoads(VL); 7751 return TreeEntry::NeedToGather; 7752 } 7753 llvm_unreachable("Unexpected state of loads"); 7754 } 7755 case Instruction::ZExt: 7756 case Instruction::SExt: 7757 case Instruction::FPToUI: 7758 case Instruction::FPToSI: 7759 case Instruction::FPExt: 7760 case Instruction::PtrToInt: 7761 case Instruction::IntToPtr: 7762 case Instruction::SIToFP: 7763 case Instruction::UIToFP: 7764 case Instruction::Trunc: 7765 case Instruction::FPTrunc: 7766 case Instruction::BitCast: { 7767 Type *SrcTy = VL0->getOperand(0)->getType(); 7768 for (Value *V : VL) { 7769 if (isa<PoisonValue>(V)) 7770 continue; 7771 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); 7772 if (Ty != SrcTy || !isValidElementType(Ty)) { 7773 LLVM_DEBUG( 7774 dbgs() << "SLP: Gathering casts with different src types.\n"); 7775 return TreeEntry::NeedToGather; 7776 } 7777 } 7778 return TreeEntry::Vectorize; 7779 } 7780 case Instruction::ICmp: 7781 case Instruction::FCmp: { 7782 // Check that all of the compares have the same predicate. 
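    // A lane may also use the swapped form of that predicate; e.g. a bundle
    // mixing "icmp slt %a, %b" and "icmp sgt %c, %d" is still accepted here,
    // and the sgt lanes get their operands commuted when the node is built.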
7783 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 7784 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); 7785 Type *ComparedTy = VL0->getOperand(0)->getType(); 7786 for (Value *V : VL) { 7787 if (isa<PoisonValue>(V)) 7788 continue; 7789 auto *Cmp = cast<CmpInst>(V); 7790 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || 7791 Cmp->getOperand(0)->getType() != ComparedTy) { 7792 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 7793 return TreeEntry::NeedToGather; 7794 } 7795 } 7796 return TreeEntry::Vectorize; 7797 } 7798 case Instruction::Select: 7799 case Instruction::FNeg: 7800 case Instruction::Add: 7801 case Instruction::FAdd: 7802 case Instruction::Sub: 7803 case Instruction::FSub: 7804 case Instruction::Mul: 7805 case Instruction::FMul: 7806 case Instruction::UDiv: 7807 case Instruction::SDiv: 7808 case Instruction::FDiv: 7809 case Instruction::URem: 7810 case Instruction::SRem: 7811 case Instruction::FRem: 7812 case Instruction::Shl: 7813 case Instruction::LShr: 7814 case Instruction::AShr: 7815 case Instruction::And: 7816 case Instruction::Or: 7817 case Instruction::Xor: 7818 case Instruction::Freeze: 7819 if (S.getMainOp()->getType()->isFloatingPointTy() && 7820 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { 7821 auto *I = dyn_cast<Instruction>(V); 7822 return I && I->isBinaryOp() && !I->isFast(); 7823 })) 7824 return TreeEntry::NeedToGather; 7825 return TreeEntry::Vectorize; 7826 case Instruction::GetElementPtr: { 7827 // We don't combine GEPs with complicated (nested) indexing. 7828 for (Value *V : VL) { 7829 auto *I = dyn_cast<GetElementPtrInst>(V); 7830 if (!I) 7831 continue; 7832 if (I->getNumOperands() != 2) { 7833 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 7834 return TreeEntry::NeedToGather; 7835 } 7836 } 7837 7838 // We can't combine several GEPs into one vector if they operate on 7839 // different types. 7840 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); 7841 for (Value *V : VL) { 7842 auto *GEP = dyn_cast<GEPOperator>(V); 7843 if (!GEP) 7844 continue; 7845 Type *CurTy = GEP->getSourceElementType(); 7846 if (Ty0 != CurTy) { 7847 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 7848 return TreeEntry::NeedToGather; 7849 } 7850 } 7851 7852 // We don't combine GEPs with non-constant indexes. 7853 Type *Ty1 = VL0->getOperand(1)->getType(); 7854 for (Value *V : VL) { 7855 auto *I = dyn_cast<GetElementPtrInst>(V); 7856 if (!I) 7857 continue; 7858 auto *Op = I->getOperand(1); 7859 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || 7860 (Op->getType() != Ty1 && 7861 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || 7862 Op->getType()->getScalarSizeInBits() > 7863 DL->getIndexSizeInBits( 7864 V->getType()->getPointerAddressSpace())))) { 7865 LLVM_DEBUG( 7866 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 7867 return TreeEntry::NeedToGather; 7868 } 7869 } 7870 7871 return TreeEntry::Vectorize; 7872 } 7873 case Instruction::Store: { 7874 // Check if the stores are consecutive or if we need to swizzle them. 7875 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); 7876 // Avoid types that are padded when being allocated as scalars, while 7877 // being packed together in a vector (such as i1). 
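    // For instance, i1 has a type size of 1 bit but an alloc size of 8 bits,
    // so the size/alloc-size comparison below fails and such stores are
    // gathered; i8, i32 or float stores pass the check.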
7878 if (DL->getTypeSizeInBits(ScalarTy) != 7879 DL->getTypeAllocSizeInBits(ScalarTy)) { 7880 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); 7881 return TreeEntry::NeedToGather; 7882 } 7883 // Make sure all stores in the bundle are simple - we can't vectorize 7884 // atomic or volatile stores. 7885 for (Value *V : VL) { 7886 auto *SI = cast<StoreInst>(V); 7887 if (!SI->isSimple()) { 7888 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); 7889 return TreeEntry::NeedToGather; 7890 } 7891 PointerOps.push_back(SI->getPointerOperand()); 7892 } 7893 7894 // Check the order of pointer operands. 7895 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { 7896 Value *Ptr0; 7897 Value *PtrN; 7898 if (CurrentOrder.empty()) { 7899 Ptr0 = PointerOps.front(); 7900 PtrN = PointerOps.back(); 7901 } else { 7902 Ptr0 = PointerOps[CurrentOrder.front()]; 7903 PtrN = PointerOps[CurrentOrder.back()]; 7904 } 7905 std::optional<int> Dist = 7906 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); 7907 // Check that the sorted pointer operands are consecutive. 7908 if (static_cast<unsigned>(*Dist) == VL.size() - 1) 7909 return TreeEntry::Vectorize; 7910 } 7911 7912 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 7913 return TreeEntry::NeedToGather; 7914 } 7915 case Instruction::Call: { 7916 if (S.getMainOp()->getType()->isFloatingPointTy() && 7917 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { 7918 auto *I = dyn_cast<Instruction>(V); 7919 return I && !I->isFast(); 7920 })) 7921 return TreeEntry::NeedToGather; 7922 // Check if the calls are all to the same vectorizable intrinsic or 7923 // library function. 7924 CallInst *CI = cast<CallInst>(VL0); 7925 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7926 7927 VFShape Shape = VFShape::get( 7928 CI->getFunctionType(), 7929 ElementCount::getFixed(static_cast<unsigned int>(VL.size())), 7930 false /*HasGlobalPred*/); 7931 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 7932 7933 if (!VecFunc && !isTriviallyVectorizable(ID)) { 7934 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 7935 return TreeEntry::NeedToGather; 7936 } 7937 Function *F = CI->getCalledFunction(); 7938 unsigned NumArgs = CI->arg_size(); 7939 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr); 7940 for (unsigned J = 0; J != NumArgs; ++J) 7941 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) 7942 ScalarArgs[J] = CI->getArgOperand(J); 7943 for (Value *V : VL) { 7944 CallInst *CI2 = dyn_cast<CallInst>(V); 7945 if (!CI2 || CI2->getCalledFunction() != F || 7946 getVectorIntrinsicIDForCall(CI2, TLI) != ID || 7947 (VecFunc && 7948 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || 7949 !CI->hasIdenticalOperandBundleSchema(*CI2)) { 7950 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V 7951 << "\n"); 7952 return TreeEntry::NeedToGather; 7953 } 7954 // Some intrinsics have scalar arguments and should be same in order for 7955 // them to be vectorized. 7956 for (unsigned J = 0; J != NumArgs; ++J) { 7957 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) { 7958 Value *A1J = CI2->getArgOperand(J); 7959 if (ScalarArgs[J] != A1J) { 7960 LLVM_DEBUG(dbgs() 7961 << "SLP: mismatched arguments in call:" << *CI 7962 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n"); 7963 return TreeEntry::NeedToGather; 7964 } 7965 } 7966 } 7967 // Verify that the bundle operands are identical between the two calls. 
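      // E.g., two calls that both carry "deopt" bundles must reference the
      // same bundle values; differing bundle schemas (say, an extra "funclet"
      // bundle on one call) were already rejected by the
      // hasIdenticalOperandBundleSchema check above.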
7968 if (CI->hasOperandBundles() && 7969 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), 7970 CI->op_begin() + CI->getBundleOperandsEndIndex(), 7971 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { 7972 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI 7973 << "!=" << *V << '\n'); 7974 return TreeEntry::NeedToGather; 7975 } 7976 } 7977 7978 return TreeEntry::Vectorize; 7979 } 7980 case Instruction::ShuffleVector: { 7981 if (!S.isAltShuffle()) { 7982 // REVEC can support non alternate shuffle. 7983 if (SLPReVec && getShufflevectorNumGroups(VL)) 7984 return TreeEntry::Vectorize; 7985 // If this is not an alternate sequence of opcode like add-sub 7986 // then do not vectorize this instruction. 7987 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 7988 return TreeEntry::NeedToGather; 7989 } 7990 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) { 7991 LLVM_DEBUG( 7992 dbgs() 7993 << "SLP: ShuffleVector not vectorized, operands are buildvector and " 7994 "the whole alt sequence is not profitable.\n"); 7995 return TreeEntry::NeedToGather; 7996 } 7997 7998 return TreeEntry::Vectorize; 7999 } 8000 default: 8001 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 8002 return TreeEntry::NeedToGather; 8003 } 8004 } 8005 8006 namespace { 8007 /// Allows to correctly handle operands of the phi nodes based on the \p Main 8008 /// PHINode order of incoming basic blocks/values. 8009 class PHIHandler { 8010 DominatorTree &DT; 8011 PHINode *Main = nullptr; 8012 SmallVector<Value *> Phis; 8013 SmallVector<SmallVector<Value *>> Operands; 8014 8015 public: 8016 PHIHandler() = delete; 8017 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis) 8018 : DT(DT), Main(Main), Phis(Phis), 8019 Operands(Main->getNumIncomingValues(), 8020 SmallVector<Value *>(Phis.size(), nullptr)) {} 8021 void buildOperands() { 8022 constexpr unsigned FastLimit = 4; 8023 if (Main->getNumIncomingValues() <= FastLimit) { 8024 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) { 8025 BasicBlock *InBB = Main->getIncomingBlock(I); 8026 if (!DT.isReachableFromEntry(InBB)) { 8027 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType())); 8028 continue; 8029 } 8030 // Prepare the operand vector. 
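        // E.g., if %phi1 lists its incoming blocks as (%bb1, %bb2) while %phi2
        // lists them as (%bb2, %bb1), the getIncomingValueForBlock fallback
        // below still places each phi's %bb1 value into the same operand lane.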
8031 for (auto [Idx, V] : enumerate(Phis)) { 8032 auto *P = dyn_cast<PHINode>(V); 8033 if (!P) { 8034 assert(isa<PoisonValue>(V) && 8035 "Expected isa instruction or poison value."); 8036 Operands[I][Idx] = V; 8037 continue; 8038 } 8039 if (P->getIncomingBlock(I) == InBB) 8040 Operands[I][Idx] = P->getIncomingValue(I); 8041 else 8042 Operands[I][Idx] = P->getIncomingValueForBlock(InBB); 8043 } 8044 } 8045 return; 8046 } 8047 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks; 8048 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) { 8049 BasicBlock *InBB = Main->getIncomingBlock(I); 8050 if (!DT.isReachableFromEntry(InBB)) { 8051 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType())); 8052 continue; 8053 } 8054 Blocks.try_emplace(InBB).first->second.push_back(I); 8055 } 8056 for (auto [Idx, V] : enumerate(Phis)) { 8057 if (isa<PoisonValue>(V)) { 8058 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) 8059 Operands[I][Idx] = V; 8060 continue; 8061 } 8062 auto *P = cast<PHINode>(V); 8063 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) { 8064 BasicBlock *InBB = P->getIncomingBlock(I); 8065 if (InBB == Main->getIncomingBlock(I)) { 8066 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx])) 8067 continue; 8068 Operands[I][Idx] = P->getIncomingValue(I); 8069 continue; 8070 } 8071 auto It = Blocks.find(InBB); 8072 if (It == Blocks.end()) 8073 continue; 8074 Operands[It->second.front()][Idx] = P->getIncomingValue(I); 8075 } 8076 } 8077 for (const auto &P : Blocks) { 8078 if (P.getSecond().size() <= 1) 8079 continue; 8080 unsigned BasicI = P.getSecond().front(); 8081 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) { 8082 assert(all_of(enumerate(Operands[I]), 8083 [&](const auto &Data) { 8084 return !Data.value() || 8085 Data.value() == Operands[BasicI][Data.index()]; 8086 }) && 8087 "Expected empty operands list."); 8088 Operands[I] = Operands[BasicI]; 8089 } 8090 } 8091 } 8092 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; } 8093 }; 8094 } // namespace 8095 8096 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, 8097 const EdgeInfo &UserTreeIdx, 8098 unsigned InterleaveFactor) { 8099 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); 8100 8101 SmallVector<int> ReuseShuffleIndices; 8102 SmallVector<Value *> UniqueValues; 8103 SmallVector<Value *> NonUniqueValueVL; 8104 auto TryToFindDuplicates = [&](const InstructionsState &S, 8105 bool DoNotFail = false) { 8106 // Check that every instruction appears once in this bundle. 8107 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size()); 8108 for (Value *V : VL) { 8109 if (isConstant(V)) { 8110 ReuseShuffleIndices.emplace_back( 8111 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size()); 8112 UniqueValues.emplace_back(V); 8113 continue; 8114 } 8115 auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); 8116 ReuseShuffleIndices.emplace_back(Res.first->second); 8117 if (Res.second) 8118 UniqueValues.emplace_back(V); 8119 } 8120 size_t NumUniqueScalarValues = UniqueValues.size(); 8121 bool IsFullVectors = hasFullVectorsOrPowerOf2( 8122 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues); 8123 if (NumUniqueScalarValues == VL.size() && 8124 (VectorizeNonPowerOf2 || IsFullVectors)) { 8125 ReuseShuffleIndices.clear(); 8126 } else { 8127 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 
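      // For example, a 6-wide bundle that contains repeated scalars on a
      // target whose full registers hold 4 such elements is neither a power of
      // two nor a whole number of registers, so instead of building
      // reuse-shuffle indices the node is gathered below.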
8128 if ((UserTreeIdx.UserTE && 8129 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) || 8130 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) { 8131 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " 8132 "for nodes with padding.\n"); 8133 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8134 return false; 8135 } 8136 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); 8137 if (NumUniqueScalarValues <= 1 || !IsFullVectors || 8138 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) { 8139 return isa<UndefValue>(V) || !isConstant(V); 8140 }))) { 8141 if (DoNotFail && UniquePositions.size() > 1 && 8142 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && 8143 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) { 8144 // Find the number of elements, which forms full vectors. 8145 unsigned PWSz = getFullVectorNumberOfElements( 8146 *TTI, UniqueValues.front()->getType(), UniqueValues.size()); 8147 if (PWSz == VL.size()) { 8148 ReuseShuffleIndices.clear(); 8149 } else { 8150 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); 8151 NonUniqueValueVL.append( 8152 PWSz - UniqueValues.size(), 8153 PoisonValue::get(UniqueValues.front()->getType())); 8154 // Check that extended with poisons operations are still valid for 8155 // vectorization (div/rem are not allowed). 8156 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) { 8157 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 8158 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8159 return false; 8160 } 8161 VL = NonUniqueValueVL; 8162 } 8163 return true; 8164 } 8165 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 8166 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8167 return false; 8168 } 8169 VL = UniqueValues; 8170 } 8171 return true; 8172 }; 8173 8174 InstructionsState S = getSameOpcode(VL, *TLI); 8175 8176 // Don't go into catchswitch blocks, which can happen with PHIs. 8177 // Such blocks can only have PHIs and the catchswitch. There is no 8178 // place to insert a shuffle if we need to, so just avoid that issue. 8179 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) { 8180 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); 8181 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8182 return; 8183 } 8184 8185 // Check if this is a duplicate of another entry. 8186 if (S) { 8187 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n"); 8188 for (TreeEntry *E : getTreeEntries(S.getMainOp())) { 8189 if (E->isSame(VL)) { 8190 // Record the reuse of the tree node. 8191 E->UserTreeIndices.push_back(UserTreeIdx); 8192 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp() 8193 << ".\n"); 8194 return; 8195 } 8196 SmallPtrSet<Value *, 8> Values(E->Scalars.begin(), E->Scalars.end()); 8197 if (all_of(VL, [&](Value *V) { 8198 return isa<PoisonValue>(V) || Values.contains(V); 8199 })) { 8200 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); 8201 if (TryToFindDuplicates(S)) 8202 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8203 ReuseShuffleIndices); 8204 return; 8205 } 8206 } 8207 } 8208 8209 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 8210 // a load), in which case peek through to include it in the tree, without 8211 // ballooning over-budget. 
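  // The exception requires a uniform, non-alternate bundle of at least 4
  // values where every lane is a load or a single-use zext/sext of a
  // single-use load; e.g. four matching "zext (load)" lanes are still admitted
  // at the depth limit, while a mixed add/sub bundle is gathered.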
8212 if (Depth >= RecursionMaxDepth && 8213 !(S && !S.isAltShuffle() && VL.size() >= 4 && 8214 (match(S.getMainOp(), m_Load(m_Value())) || 8215 all_of(VL, [&S](const Value *I) { 8216 return match(I, 8217 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && 8218 cast<Instruction>(I)->getOpcode() == S.getOpcode(); 8219 })))) { 8220 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 8221 if (TryToFindDuplicates(S)) 8222 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8223 ReuseShuffleIndices); 8224 return; 8225 } 8226 8227 // Don't handle scalable vectors 8228 if (S && S.getOpcode() == Instruction::ExtractElement && 8229 isa<ScalableVectorType>( 8230 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) { 8231 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); 8232 if (TryToFindDuplicates(S)) 8233 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8234 ReuseShuffleIndices); 8235 return; 8236 } 8237 8238 // Don't handle vectors. 8239 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) { 8240 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 8241 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8242 return; 8243 } 8244 8245 // If all of the operands are identical or constant we have a simple solution. 8246 // If we deal with insert/extract instructions, they all must have constant 8247 // indices, otherwise we should gather them, not try to vectorize. 8248 // If alternate op node with 2 elements with gathered operands - do not 8249 // vectorize. 8250 auto &&NotProfitableForVectorization = [&S, this, 8251 Depth](ArrayRef<Value *> VL) { 8252 if (!S || !S.isAltShuffle() || VL.size() > 2) 8253 return false; 8254 if (VectorizableTree.size() < MinTreeSize) 8255 return false; 8256 if (Depth >= RecursionMaxDepth - 1) 8257 return true; 8258 // Check if all operands are extracts, part of vector node or can build a 8259 // regular vectorize node. 8260 SmallVector<unsigned, 8> InstsCount; 8261 for (Value *V : VL) { 8262 auto *I = cast<Instruction>(V); 8263 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) { 8264 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op); 8265 })); 8266 } 8267 bool IsCommutative = 8268 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp()); 8269 if ((IsCommutative && 8270 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || 8271 (!IsCommutative && 8272 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; }))) 8273 return true; 8274 assert(VL.size() == 2 && "Expected only 2 alternate op instructions."); 8275 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; 8276 auto *I1 = cast<Instruction>(VL.front()); 8277 auto *I2 = cast<Instruction>(VL.back()); 8278 for (int Op : seq<int>(S.getMainOp()->getNumOperands())) 8279 Candidates.emplace_back().emplace_back(I1->getOperand(Op), 8280 I2->getOperand(Op)); 8281 if (static_cast<unsigned>(count_if( 8282 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { 8283 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); 8284 })) >= S.getMainOp()->getNumOperands() / 2) 8285 return false; 8286 if (S.getMainOp()->getNumOperands() > 2) 8287 return true; 8288 if (IsCommutative) { 8289 // Check permuted operands. 
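      // E.g., for the bundle {a + b, b + c} the permuted pairing matches
      // (a, c) and (b, b); the (b, b) splat-like match below is enough to keep
      // the bundle as a vectorization candidate.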
8290 Candidates.clear(); 8291 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op) 8292 Candidates.emplace_back().emplace_back(I1->getOperand(Op), 8293 I2->getOperand((Op + 1) % E)); 8294 if (any_of( 8295 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { 8296 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); 8297 })) 8298 return false; 8299 } 8300 return true; 8301 }; 8302 SmallVector<unsigned> SortedIndices; 8303 BasicBlock *BB = nullptr; 8304 bool IsScatterVectorizeUserTE = 8305 UserTreeIdx.UserTE && 8306 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; 8307 bool AreAllSameBlock = S && allSameBlock(VL); 8308 bool AreScatterAllGEPSameBlock = 8309 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && 8310 VL.size() > 2 && 8311 all_of(VL, 8312 [&BB](Value *V) { 8313 auto *I = dyn_cast<GetElementPtrInst>(V); 8314 if (!I) 8315 return doesNotNeedToBeScheduled(V); 8316 if (!BB) 8317 BB = I->getParent(); 8318 return BB == I->getParent() && I->getNumOperands() == 2; 8319 }) && 8320 BB && 8321 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, 8322 SortedIndices)); 8323 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; 8324 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) || 8325 (S && 8326 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( 8327 S.getMainOp()) && 8328 !all_of(VL, isVectorLikeInstWithConstOps)) || 8329 NotProfitableForVectorization(VL)) { 8330 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); 8331 if (TryToFindDuplicates(S)) 8332 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8333 ReuseShuffleIndices); 8334 return; 8335 } 8336 8337 // Don't vectorize ephemeral values. 8338 if (S && !EphValues.empty()) { 8339 for (Value *V : VL) { 8340 if (EphValues.count(V)) { 8341 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 8342 << ") is ephemeral.\n"); 8343 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8344 return; 8345 } 8346 } 8347 } 8348 8349 // We now know that this is a vector of instructions of the same type from 8350 // the same block. 8351 8352 // Check that none of the instructions in the bundle are already in the tree. 8353 for (Value *V : VL) { 8354 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) || 8355 doesNotNeedToBeScheduled(V)) 8356 continue; 8357 if (isVectorized(V)) { 8358 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 8359 << ") is already in tree.\n"); 8360 if (TryToFindDuplicates(S)) 8361 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8362 ReuseShuffleIndices); 8363 return; 8364 } 8365 } 8366 8367 // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 8368 if (UserIgnoreList && !UserIgnoreList->empty()) { 8369 for (Value *V : VL) { 8370 if (UserIgnoreList->contains(V)) { 8371 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 8372 if (TryToFindDuplicates(S)) 8373 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8374 ReuseShuffleIndices); 8375 return; 8376 } 8377 } 8378 } 8379 8380 // Special processing for sorted pointers for ScatterVectorize node with 8381 // constant indeces only. 8382 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) { 8383 assert(VL.front()->getType()->isPointerTy() && 8384 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 && 8385 "Expected pointers only."); 8386 // Reset S to make it GetElementPtr kind of node. 
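    // E.g., for a pointer bundle {%gep1, %base, %gep2} feeding a scatter node,
    // S is recomputed from the first GEP so the bundle is handled as a
    // GetElementPtr node; the plain %base pointer later receives a constant 0
    // index when the operands are built.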
8387 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>); 8388 assert(It != VL.end() && "Expected at least one GEP."); 8389 S = getSameOpcode(*It, *TLI); 8390 } 8391 8392 // Check that all of the users of the scalars that we want to vectorize are 8393 // schedulable. 8394 Instruction *VL0 = S.getMainOp(); 8395 BB = VL0->getParent(); 8396 8397 if (S && 8398 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) || 8399 !DT->isReachableFromEntry(BB))) { 8400 // Don't go into unreachable blocks. They may contain instructions with 8401 // dependency cycles which confuse the final scheduling. 8402 // Do not vectorize EH and non-returning blocks, not profitable in most 8403 // cases. 8404 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 8405 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 8406 return; 8407 } 8408 8409 // Check that every instruction appears once in this bundle. 8410 if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) 8411 return; 8412 8413 // Perform specific checks for each particular instruction kind. 8414 OrdersType CurrentOrder; 8415 SmallVector<Value *> PointerOps; 8416 TreeEntry::EntryState State = getScalarsVectorizationState( 8417 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); 8418 if (State == TreeEntry::NeedToGather) { 8419 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8420 ReuseShuffleIndices); 8421 return; 8422 } 8423 8424 auto &BSRef = BlocksSchedules[BB]; 8425 if (!BSRef) 8426 BSRef = std::make_unique<BlockScheduling>(BB); 8427 8428 BlockScheduling &BS = *BSRef; 8429 8430 std::optional<ScheduleData *> Bundle = 8431 BS.tryScheduleBundle(UniqueValues, this, S); 8432 #ifdef EXPENSIVE_CHECKS 8433 // Make sure we didn't break any internal invariants 8434 BS.verify(); 8435 #endif 8436 if (!Bundle) { 8437 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 8438 assert((!BS.getScheduleData(VL0) || 8439 !BS.getScheduleData(VL0)->isPartOfBundle()) && 8440 "tryScheduleBundle should cancelScheduling on failure"); 8441 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 8442 ReuseShuffleIndices); 8443 NonScheduledFirst.insert(VL.front()); 8444 if (S.getOpcode() == Instruction::Load && 8445 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit) 8446 registerNonVectorizableLoads(VL); 8447 return; 8448 } 8449 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 8450 8451 unsigned ShuffleOrOp = 8452 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); 8453 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { 8454 // Postpone PHI nodes creation 8455 SmallVector<unsigned> PHIOps; 8456 for (unsigned I : seq<unsigned>(Operands.size())) { 8457 ArrayRef<Value *> Op = Operands[I]; 8458 if (Op.empty()) 8459 continue; 8460 InstructionsState S = getSameOpcode(Op, *TLI); 8461 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle()) 8462 buildTree_rec(Op, Depth + 1, {TE, I}); 8463 else 8464 PHIOps.push_back(I); 8465 } 8466 for (unsigned I : PHIOps) 8467 buildTree_rec(Operands[I], Depth + 1, {TE, I}); 8468 }; 8469 switch (ShuffleOrOp) { 8470 case Instruction::PHI: { 8471 auto *PH = cast<PHINode>(VL0); 8472 8473 TreeEntry *TE = 8474 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); 8475 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n"; 8476 TE->dump()); 8477 8478 // Keeps the reordered operands to avoid code duplication. 
8479 PHIHandler Handler(*DT, PH, VL); 8480 Handler.buildOperands(); 8481 for (unsigned I : seq<unsigned>(PH->getNumOperands())) 8482 TE->setOperand(I, Handler.getOperands(I)); 8483 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands()); 8484 for (unsigned I : seq<unsigned>(PH->getNumOperands())) 8485 Operands[I] = Handler.getOperands(I); 8486 CreateOperandNodes(TE, Operands); 8487 return; 8488 } 8489 case Instruction::ExtractValue: 8490 case Instruction::ExtractElement: { 8491 if (CurrentOrder.empty()) { 8492 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); 8493 } else { 8494 LLVM_DEBUG({ 8495 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " 8496 "with order"; 8497 for (unsigned Idx : CurrentOrder) 8498 dbgs() << " " << Idx; 8499 dbgs() << "\n"; 8500 }); 8501 fixupOrderingIndices(CurrentOrder); 8502 } 8503 // Insert new order with initial value 0, if it does not exist, 8504 // otherwise return the iterator to the existing one. 8505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8506 ReuseShuffleIndices, CurrentOrder); 8507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry " 8508 "(ExtractValueInst/ExtractElementInst).\n"; 8509 TE->dump()); 8510 // This is a special case, as it does not gather, but at the same time 8511 // we are not extending buildTree_rec() towards the operands. 8512 TE->setOperand(*this); 8513 return; 8514 } 8515 case Instruction::InsertElement: { 8516 assert(ReuseShuffleIndices.empty() && "All inserts should be unique"); 8517 8518 auto OrdCompare = [](const std::pair<int, int> &P1, 8519 const std::pair<int, int> &P2) { 8520 return P1.first > P2.first; 8521 }; 8522 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, 8523 decltype(OrdCompare)> 8524 Indices(OrdCompare); 8525 for (int I = 0, E = VL.size(); I < E; ++I) { 8526 unsigned Idx = *getElementIndex(VL[I]); 8527 Indices.emplace(Idx, I); 8528 } 8529 OrdersType CurrentOrder(VL.size(), VL.size()); 8530 bool IsIdentity = true; 8531 for (int I = 0, E = VL.size(); I < E; ++I) { 8532 CurrentOrder[Indices.top().second] = I; 8533 IsIdentity &= Indices.top().second == I; 8534 Indices.pop(); 8535 } 8536 if (IsIdentity) 8537 CurrentOrder.clear(); 8538 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8539 {}, CurrentOrder); 8540 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n"; 8541 TE->dump()); 8542 8543 TE->setOperand(*this); 8544 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1}); 8545 return; 8546 } 8547 case Instruction::Load: { 8548 // Check that a vectorized load would load the same memory as a scalar 8549 // load. For example, we don't want to vectorize loads that are smaller 8550 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 8551 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 8552 // from such a struct, we read/write packed bits disagreeing with the 8553 // unvectorized version. 
8554 TreeEntry *TE = nullptr; 8555 fixupOrderingIndices(CurrentOrder); 8556 switch (State) { 8557 case TreeEntry::Vectorize: 8558 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8559 ReuseShuffleIndices, CurrentOrder, InterleaveFactor); 8560 if (CurrentOrder.empty()) 8561 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n"; 8562 TE->dump()); 8563 else 8564 LLVM_DEBUG(dbgs() 8565 << "SLP: added a new TreeEntry (jumbled LoadInst).\n"; 8566 TE->dump()); 8567 break; 8568 case TreeEntry::StridedVectorize: 8569 // Vectorizing non-consecutive loads with `llvm.masked.gather`. 8570 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, 8571 UserTreeIdx, ReuseShuffleIndices, CurrentOrder); 8572 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n"; 8573 TE->dump()); 8574 break; 8575 case TreeEntry::ScatterVectorize: 8576 // Vectorizing non-consecutive loads with `llvm.masked.gather`. 8577 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, 8578 UserTreeIdx, ReuseShuffleIndices); 8579 LLVM_DEBUG( 8580 dbgs() 8581 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n"; 8582 TE->dump()); 8583 break; 8584 case TreeEntry::CombinedVectorize: 8585 case TreeEntry::NeedToGather: 8586 llvm_unreachable("Unexpected loads state."); 8587 } 8588 TE->setOperand(*this); 8589 if (State == TreeEntry::ScatterVectorize) 8590 buildTree_rec(PointerOps, Depth + 1, {TE, 0}); 8591 return; 8592 } 8593 case Instruction::ZExt: 8594 case Instruction::SExt: 8595 case Instruction::FPToUI: 8596 case Instruction::FPToSI: 8597 case Instruction::FPExt: 8598 case Instruction::PtrToInt: 8599 case Instruction::IntToPtr: 8600 case Instruction::SIToFP: 8601 case Instruction::UIToFP: 8602 case Instruction::Trunc: 8603 case Instruction::FPTrunc: 8604 case Instruction::BitCast: { 8605 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or( 8606 std::make_pair(std::numeric_limits<unsigned>::min(), 8607 std::numeric_limits<unsigned>::max())); 8608 if (ShuffleOrOp == Instruction::ZExt || 8609 ShuffleOrOp == Instruction::SExt) { 8610 CastMaxMinBWSizes = std::make_pair( 8611 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), 8612 PrevMaxBW), 8613 std::min<unsigned>( 8614 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), 8615 PrevMinBW)); 8616 } else if (ShuffleOrOp == Instruction::Trunc) { 8617 CastMaxMinBWSizes = std::make_pair( 8618 std::max<unsigned>( 8619 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), 8620 PrevMaxBW), 8621 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()), 8622 PrevMinBW)); 8623 } 8624 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8625 ReuseShuffleIndices); 8626 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n"; 8627 TE->dump()); 8628 8629 TE->setOperand(*this); 8630 for (unsigned I : seq<unsigned>(VL0->getNumOperands())) 8631 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 8632 if (ShuffleOrOp == Instruction::Trunc) { 8633 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx); 8634 } else if (ShuffleOrOp == Instruction::SIToFP || 8635 ShuffleOrOp == Instruction::UIToFP) { 8636 unsigned NumSignBits = 8637 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); 8638 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) { 8639 APInt Mask = DB->getDemandedBits(OpI); 8640 NumSignBits = std::max(NumSignBits, Mask.countl_zero()); 8641 } 8642 if (NumSignBits * 2 >= 8643 DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) 8644 ExtraBitWidthNodes.insert(getOperandEntry(TE, 
0)->Idx); 8645 } 8646 return; 8647 } 8648 case Instruction::ICmp: 8649 case Instruction::FCmp: { 8650 // Check that all of the compares have the same predicate. 8651 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 8652 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8653 ReuseShuffleIndices); 8654 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n"; 8655 TE->dump()); 8656 8657 ValueList Left, Right; 8658 VLOperands Ops(VL, S, *this); 8659 if (cast<CmpInst>(VL0)->isCommutative()) { 8660 // Commutative predicate - collect + sort operands of the instructions 8661 // so that each side is more likely to have the same opcode. 8662 assert(P0 == CmpInst::getSwappedPredicate(P0) && 8663 "Commutative Predicate mismatch"); 8664 Ops.reorder(); 8665 Left = Ops.getVL(0); 8666 Right = Ops.getVL(1); 8667 } else { 8668 // Collect operands - commute if it uses the swapped predicate. 8669 for (Value *V : VL) { 8670 if (isa<PoisonValue>(V)) { 8671 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType())); 8672 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType())); 8673 continue; 8674 } 8675 auto *Cmp = cast<CmpInst>(V); 8676 Value *LHS = Cmp->getOperand(0); 8677 Value *RHS = Cmp->getOperand(1); 8678 if (Cmp->getPredicate() != P0) 8679 std::swap(LHS, RHS); 8680 Left.push_back(LHS); 8681 Right.push_back(RHS); 8682 } 8683 } 8684 TE->setOperand(0, Left); 8685 TE->setOperand(1, Right); 8686 buildTree_rec(Left, Depth + 1, {TE, 0}); 8687 buildTree_rec(Right, Depth + 1, {TE, 1}); 8688 if (ShuffleOrOp == Instruction::ICmp) { 8689 unsigned NumSignBits0 = 8690 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); 8691 if (NumSignBits0 * 2 >= 8692 DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) 8693 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx); 8694 unsigned NumSignBits1 = 8695 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT); 8696 if (NumSignBits1 * 2 >= 8697 DL->getTypeSizeInBits(VL0->getOperand(1)->getType())) 8698 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx); 8699 } 8700 return; 8701 } 8702 case Instruction::Select: 8703 case Instruction::FNeg: 8704 case Instruction::Add: 8705 case Instruction::FAdd: 8706 case Instruction::Sub: 8707 case Instruction::FSub: 8708 case Instruction::Mul: 8709 case Instruction::FMul: 8710 case Instruction::UDiv: 8711 case Instruction::SDiv: 8712 case Instruction::FDiv: 8713 case Instruction::URem: 8714 case Instruction::SRem: 8715 case Instruction::FRem: 8716 case Instruction::Shl: 8717 case Instruction::LShr: 8718 case Instruction::AShr: 8719 case Instruction::And: 8720 case Instruction::Or: 8721 case Instruction::Xor: 8722 case Instruction::Freeze: { 8723 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8724 ReuseShuffleIndices); 8725 LLVM_DEBUG( 8726 dbgs() << "SLP: added a new TreeEntry " 8727 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n"; 8728 TE->dump()); 8729 8730 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0)); 8731 for (unsigned I : seq<unsigned>(VL0->getNumOperands())) 8732 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 8733 return; 8734 } 8735 case Instruction::GetElementPtr: { 8736 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8737 ReuseShuffleIndices); 8738 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n"; 8739 TE->dump()); 8740 SmallVector<ValueList, 2> Operands(2); 8741 // Prepare the operand vector for pointer operands. 
8742 for (Value *V : VL) { 8743 auto *GEP = dyn_cast<GetElementPtrInst>(V); 8744 if (!GEP) { 8745 Operands.front().push_back(V); 8746 continue; 8747 } 8748 Operands.front().push_back(GEP->getPointerOperand()); 8749 } 8750 TE->setOperand(0, Operands.front()); 8751 // Need to cast all indices to the same type before vectorization to 8752 // avoid crash. 8753 // Required to be able to find correct matches between different gather 8754 // nodes and reuse the vectorized values rather than trying to gather them 8755 // again. 8756 int IndexIdx = 1; 8757 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); 8758 Type *Ty = all_of(VL, 8759 [VL0Ty, IndexIdx](Value *V) { 8760 auto *GEP = dyn_cast<GetElementPtrInst>(V); 8761 if (!GEP) 8762 return true; 8763 return VL0Ty == GEP->getOperand(IndexIdx)->getType(); 8764 }) 8765 ? VL0Ty 8766 : DL->getIndexType(cast<GetElementPtrInst>(VL0) 8767 ->getPointerOperandType() 8768 ->getScalarType()); 8769 // Prepare the operand vector. 8770 for (Value *V : VL) { 8771 auto *I = dyn_cast<GetElementPtrInst>(V); 8772 if (!I) { 8773 Operands.back().push_back( 8774 ConstantInt::get(Ty, 0, /*isSigned=*/false)); 8775 continue; 8776 } 8777 auto *Op = I->getOperand(IndexIdx); 8778 auto *CI = dyn_cast<ConstantInt>(Op); 8779 if (!CI) 8780 Operands.back().push_back(Op); 8781 else 8782 Operands.back().push_back(ConstantFoldIntegerCast( 8783 CI, Ty, CI->getValue().isSignBitSet(), *DL)); 8784 } 8785 TE->setOperand(IndexIdx, Operands.back()); 8786 8787 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) 8788 buildTree_rec(Operands[I], Depth + 1, {TE, I}); 8789 return; 8790 } 8791 case Instruction::Store: { 8792 bool Consecutive = CurrentOrder.empty(); 8793 if (!Consecutive) 8794 fixupOrderingIndices(CurrentOrder); 8795 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8796 ReuseShuffleIndices, CurrentOrder); 8797 if (Consecutive) 8798 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n"; 8799 TE->dump()); 8800 else 8801 LLVM_DEBUG( 8802 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n"; 8803 TE->dump()); 8804 TE->setOperand(*this); 8805 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0}); 8806 return; 8807 } 8808 case Instruction::Call: { 8809 // Check if the calls are all to the same vectorizable intrinsic or 8810 // library function. 8811 CallInst *CI = cast<CallInst>(VL0); 8812 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8813 8814 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8815 ReuseShuffleIndices); 8816 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n"; 8817 TE->dump()); 8818 TE->setOperand(*this, isCommutative(VL0)); 8819 for (unsigned I : seq<unsigned>(CI->arg_size())) { 8820 // For scalar operands no need to create an entry since no need to 8821 // vectorize it. 8822 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) 8823 continue; 8824 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 8825 } 8826 return; 8827 } 8828 case Instruction::ShuffleVector: { 8829 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 8830 ReuseShuffleIndices); 8831 if (S.isAltShuffle()) { 8832 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n"; 8833 TE->dump()); 8834 } else { 8835 assert(SLPReVec && "Only supported by REVEC."); 8836 LLVM_DEBUG( 8837 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n"; 8838 TE->dump()); 8839 } 8840 8841 // Reorder operands if reordering would enable vectorization. 
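      // E.g., with main predicate slt and alternate predicate sge, a lane
      // written as "icmp sgt %b, %a" is the swapped form of the main
      // predicate, so its operands are swapped back below to keep the left and
      // right operand columns consistent across lanes.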
8842 auto *CI = dyn_cast<CmpInst>(VL0); 8843 if (CI && any_of(VL, [](Value *V) { 8844 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative(); 8845 })) { 8846 auto *MainCI = cast<CmpInst>(S.getMainOp()); 8847 auto *AltCI = cast<CmpInst>(S.getAltOp()); 8848 CmpInst::Predicate MainP = MainCI->getPredicate(); 8849 CmpInst::Predicate AltP = AltCI->getPredicate(); 8850 assert(MainP != AltP && 8851 "Expected different main/alternate predicates."); 8852 ValueList Left, Right; 8853 // Collect operands - commute if it uses the swapped predicate or 8854 // alternate operation. 8855 for (Value *V : VL) { 8856 if (isa<PoisonValue>(V)) { 8857 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType())); 8858 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType())); 8859 continue; 8860 } 8861 auto *Cmp = cast<CmpInst>(V); 8862 Value *LHS = Cmp->getOperand(0); 8863 Value *RHS = Cmp->getOperand(1); 8864 8865 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { 8866 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 8867 std::swap(LHS, RHS); 8868 } else { 8869 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 8870 std::swap(LHS, RHS); 8871 } 8872 Left.push_back(LHS); 8873 Right.push_back(RHS); 8874 } 8875 TE->setOperand(0, Left); 8876 TE->setOperand(1, Right); 8877 buildTree_rec(Left, Depth + 1, {TE, 0}); 8878 buildTree_rec(Right, Depth + 1, {TE, 1}); 8879 return; 8880 } 8881 8882 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI); 8883 for (unsigned I : seq<unsigned>(VL0->getNumOperands())) 8884 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 8885 return; 8886 } 8887 default: 8888 break; 8889 } 8890 llvm_unreachable("Unexpected vectorization of the instructions."); 8891 } 8892 8893 unsigned BoUpSLP::canMapToVector(Type *T) const { 8894 unsigned N = 1; 8895 Type *EltTy = T; 8896 8897 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) { 8898 if (EltTy->isEmptyTy()) 8899 return 0; 8900 if (auto *ST = dyn_cast<StructType>(EltTy)) { 8901 // Check that struct is homogeneous. 8902 for (const auto *Ty : ST->elements()) 8903 if (Ty != *ST->element_begin()) 8904 return 0; 8905 N *= ST->getNumElements(); 8906 EltTy = *ST->element_begin(); 8907 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { 8908 N *= AT->getNumElements(); 8909 EltTy = AT->getElementType(); 8910 } else { 8911 auto *VT = cast<FixedVectorType>(EltTy); 8912 N *= VT->getNumElements(); 8913 EltTy = VT->getElementType(); 8914 } 8915 } 8916 8917 if (!isValidElementType(EltTy)) 8918 return 0; 8919 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N)); 8920 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || 8921 VTSize != DL->getTypeStoreSizeInBits(T)) 8922 return 0; 8923 return N; 8924 } 8925 8926 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, 8927 SmallVectorImpl<unsigned> &CurrentOrder, 8928 bool ResizeAllowed) const { 8929 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>); 8930 assert(It != VL.end() && "Expected at least one extract instruction."); 8931 auto *E0 = cast<Instruction>(*It); 8932 assert( 8933 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) && 8934 "Invalid opcode"); 8935 // Check if all of the extracts come from the same vector and from the 8936 // correct offset. 8937 Value *Vec = E0->getOperand(0); 8938 8939 CurrentOrder.clear(); 8940 8941 // We have to extract from a vector/aggregate with the same number of elements. 
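  // E.g. (illustrative), for VL = {extractelement <4 x i32> %v, i32 0, ...,
  // extractelement <4 x i32> %v, i32 3} all extracts read the same source %v;
  // if the indices already form the identity order 0..3, CurrentOrder is
  // cleared and true is returned, otherwise CurrentOrder records the
  // permutation and false is returned.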
8942 unsigned NElts; 8943 if (E0->getOpcode() == Instruction::ExtractValue) { 8944 NElts = canMapToVector(Vec->getType()); 8945 if (!NElts) 8946 return false; 8947 // Check if load can be rewritten as load of vector. 8948 LoadInst *LI = dyn_cast<LoadInst>(Vec); 8949 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) 8950 return false; 8951 } else { 8952 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); 8953 } 8954 8955 unsigned E = VL.size(); 8956 if (!ResizeAllowed && NElts != E) 8957 return false; 8958 SmallVector<int> Indices(E, PoisonMaskElem); 8959 unsigned MinIdx = NElts, MaxIdx = 0; 8960 for (auto [I, V] : enumerate(VL)) { 8961 auto *Inst = dyn_cast<Instruction>(V); 8962 if (!Inst) 8963 continue; 8964 if (Inst->getOperand(0) != Vec) 8965 return false; 8966 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) 8967 if (isa<UndefValue>(EE->getIndexOperand())) 8968 continue; 8969 std::optional<unsigned> Idx = getExtractIndex(Inst); 8970 if (!Idx) 8971 return false; 8972 const unsigned ExtIdx = *Idx; 8973 if (ExtIdx >= NElts) 8974 continue; 8975 Indices[I] = ExtIdx; 8976 if (MinIdx > ExtIdx) 8977 MinIdx = ExtIdx; 8978 if (MaxIdx < ExtIdx) 8979 MaxIdx = ExtIdx; 8980 } 8981 if (MaxIdx - MinIdx + 1 > E) 8982 return false; 8983 if (MaxIdx + 1 <= E) 8984 MinIdx = 0; 8985 8986 // Check that all of the indices extract from the correct offset. 8987 bool ShouldKeepOrder = true; 8988 // Assign to all items the initial value E + 1 so we can check if the extract 8989 // instruction index was used already. 8990 // Also, later we can check that all the indices are used and we have a 8991 // consecutive access in the extract instructions, by checking that no 8992 // element of CurrentOrder still has value E + 1. 8993 CurrentOrder.assign(E, E); 8994 for (unsigned I = 0; I < E; ++I) { 8995 if (Indices[I] == PoisonMaskElem) 8996 continue; 8997 const unsigned ExtIdx = Indices[I] - MinIdx; 8998 if (CurrentOrder[ExtIdx] != E) { 8999 CurrentOrder.clear(); 9000 return false; 9001 } 9002 ShouldKeepOrder &= ExtIdx == I; 9003 CurrentOrder[ExtIdx] = I; 9004 } 9005 if (ShouldKeepOrder) 9006 CurrentOrder.clear(); 9007 9008 return ShouldKeepOrder; 9009 } 9010 9011 bool BoUpSLP::areAllUsersVectorized( 9012 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { 9013 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) || 9014 all_of(I->users(), [this](User *U) { 9015 return isVectorized(U) || isVectorLikeInstWithConstOps(U) || 9016 (isa<ExtractElementInst>(U) && MustGather.contains(U)); 9017 }); 9018 } 9019 9020 static std::pair<InstructionCost, InstructionCost> 9021 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, 9022 TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9023 ArrayRef<Type *> ArgTys) { 9024 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 9025 9026 // Calculate the cost of the scalar and vector calls. 9027 FastMathFlags FMF; 9028 if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) 9029 FMF = FPCI->getFastMathFlags(); 9030 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF); 9031 auto IntrinsicCost = 9032 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); 9033 9034 auto Shape = VFShape::get(CI->getFunctionType(), 9035 ElementCount::getFixed(VecTy->getNumElements()), 9036 false /*HasGlobalPred*/); 9037 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 9038 auto LibCost = IntrinsicCost; 9039 if (!CI->isNoBuiltin() && VecFunc) { 9040 // Calculate the cost of the vector library call. 
9041 // If the corresponding vector call is cheaper, return its cost. 9042 LibCost = 9043 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput); 9044 } 9045 return {IntrinsicCost, LibCost}; 9046 } 9047 9048 void BoUpSLP::TreeEntry::buildAltOpShuffleMask( 9049 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, 9050 SmallVectorImpl<Value *> *OpScalars, 9051 SmallVectorImpl<Value *> *AltScalars) const { 9052 unsigned Sz = Scalars.size(); 9053 Mask.assign(Sz, PoisonMaskElem); 9054 SmallVector<int> OrderMask; 9055 if (!ReorderIndices.empty()) 9056 inversePermutation(ReorderIndices, OrderMask); 9057 for (unsigned I = 0; I < Sz; ++I) { 9058 unsigned Idx = I; 9059 if (!ReorderIndices.empty()) 9060 Idx = OrderMask[I]; 9061 if (isa<PoisonValue>(Scalars[Idx])) 9062 continue; 9063 auto *OpInst = cast<Instruction>(Scalars[Idx]); 9064 if (IsAltOp(OpInst)) { 9065 Mask[I] = Sz + Idx; 9066 if (AltScalars) 9067 AltScalars->push_back(OpInst); 9068 } else { 9069 Mask[I] = Idx; 9070 if (OpScalars) 9071 OpScalars->push_back(OpInst); 9072 } 9073 } 9074 if (!ReuseShuffleIndices.empty()) { 9075 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); 9076 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) { 9077 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; 9078 }); 9079 Mask.swap(NewMask); 9080 } 9081 } 9082 9083 static bool isAlternateInstruction(const Instruction *I, 9084 const Instruction *MainOp, 9085 const Instruction *AltOp, 9086 const TargetLibraryInfo &TLI) { 9087 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) { 9088 auto *AltCI = cast<CmpInst>(AltOp); 9089 CmpInst::Predicate MainP = MainCI->getPredicate(); 9090 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate(); 9091 assert(MainP != AltP && "Expected different main/alternate predicates."); 9092 auto *CI = cast<CmpInst>(I); 9093 if (isCmpSameOrSwapped(MainCI, CI, TLI)) 9094 return false; 9095 if (isCmpSameOrSwapped(AltCI, CI, TLI)) 9096 return true; 9097 CmpInst::Predicate P = CI->getPredicate(); 9098 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P); 9099 9100 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && 9101 "CmpInst expected to match either main or alternate predicate or " 9102 "their swap."); 9103 return MainP != P && MainP != SwappedP; 9104 } 9105 return I->getOpcode() == AltOp->getOpcode(); 9106 } 9107 9108 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { 9109 assert(!Ops.empty()); 9110 const auto *Op0 = Ops.front(); 9111 9112 const bool IsConstant = all_of(Ops, [](Value *V) { 9113 // TODO: We should allow undef elements here 9114 return isConstant(V) && !isa<UndefValue>(V); 9115 }); 9116 const bool IsUniform = all_of(Ops, [=](Value *V) { 9117 // TODO: We should allow undef elements here 9118 return V == Op0; 9119 }); 9120 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) { 9121 // TODO: We should allow undef elements here 9122 if (auto *CI = dyn_cast<ConstantInt>(V)) 9123 return CI->getValue().isPowerOf2(); 9124 return false; 9125 }); 9126 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) { 9127 // TODO: We should allow undef elements here 9128 if (auto *CI = dyn_cast<ConstantInt>(V)) 9129 return CI->getValue().isNegatedPowerOf2(); 9130 return false; 9131 }); 9132 9133 TTI::OperandValueKind VK = TTI::OK_AnyValue; 9134 if (IsConstant && IsUniform) 9135 VK = TTI::OK_UniformConstantValue; 9136 else if (IsConstant) 9137 VK = TTI::OK_NonUniformConstantValue; 9138 else if (IsUniform) 
9139 VK = TTI::OK_UniformValue; 9140 9141 TTI::OperandValueProperties VP = TTI::OP_None; 9142 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP; 9143 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP; 9144 9145 return {VK, VP}; 9146 } 9147 9148 namespace { 9149 /// The base class for shuffle instruction emission and shuffle cost estimation. 9150 class BaseShuffleAnalysis { 9151 protected: 9152 Type *ScalarTy = nullptr; 9153 9154 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {} 9155 9156 /// V is expected to be a vectorized value. 9157 /// When REVEC is disabled, there is no difference between VF and 9158 /// VNumElements. 9159 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements. 9160 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead 9161 /// of 8. 9162 unsigned getVF(Value *V) const { 9163 assert(V && "V cannot be nullptr"); 9164 assert(isa<FixedVectorType>(V->getType()) && 9165 "V does not have FixedVectorType"); 9166 assert(ScalarTy && "ScalarTy cannot be nullptr"); 9167 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 9168 unsigned VNumElements = 9169 cast<FixedVectorType>(V->getType())->getNumElements(); 9170 assert(VNumElements > ScalarTyNumElements && 9171 "the number of elements of V is not large enough"); 9172 assert(VNumElements % ScalarTyNumElements == 0 && 9173 "the number of elements of V is not a vectorized value"); 9174 return VNumElements / ScalarTyNumElements; 9175 } 9176 9177 /// Checks if the mask is an identity mask. 9178 /// \param IsStrict if is true the function returns false if mask size does 9179 /// not match vector size. 9180 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy, 9181 bool IsStrict) { 9182 int Limit = Mask.size(); 9183 int VF = VecTy->getNumElements(); 9184 int Index = -1; 9185 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit)) 9186 return true; 9187 if (!IsStrict) { 9188 // Consider extract subvector starting from index 0. 9189 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && 9190 Index == 0) 9191 return true; 9192 // All VF-size submasks are identity (e.g. 9193 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). 9194 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) { 9195 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF); 9196 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) || 9197 ShuffleVectorInst::isIdentityMask(Slice, VF); 9198 })) 9199 return true; 9200 } 9201 return false; 9202 } 9203 9204 /// Tries to combine 2 different masks into single one. 9205 /// \param LocalVF Vector length of the permuted input vector. \p Mask may 9206 /// change the size of the vector, \p LocalVF is the original size of the 9207 /// shuffled vector. 9208 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask, 9209 ArrayRef<int> ExtMask) { 9210 unsigned VF = Mask.size(); 9211 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); 9212 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { 9213 if (ExtMask[I] == PoisonMaskElem) 9214 continue; 9215 int MaskedIdx = Mask[ExtMask[I] % VF]; 9216 NewMask[I] = 9217 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF; 9218 } 9219 Mask.swap(NewMask); 9220 } 9221 9222 /// Looks through shuffles trying to reduce final number of shuffles in the 9223 /// code. The function looks through the previously emitted shuffle 9224 /// instructions and properly mark indices in mask as undef. 
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If the 2 operands are of different sizes, the smaller one will be resized
  /// and the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations into a simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If a better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during the looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in a non-resizing identity shuffle
  /// (and thus can be ignored), false otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or a size-changing shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose this
          // info later if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this
      // Op and Mask will be used in the final shuffle.
      // A zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. an identity mask without extra reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
      // is expensive, and the analysis finds out that the source vector is just
      // a broadcast, this original mask can be transformed to the identity mask
      // <0, 1, 2, 3>.
9288 // \code 9289 // %0 = shuffle %v, poison, zeroinitalizer 9290 // %res = shuffle %0, poison, <3, 1, 2, 0> 9291 // \endcode 9292 // may be transformed to 9293 // \code 9294 // %0 = shuffle %v, poison, zeroinitalizer 9295 // %res = shuffle %0, poison, <0, 1, 2, 3> 9296 // \endcode 9297 if (SV->isZeroEltSplat()) { 9298 IdentityOp = SV; 9299 IdentityMask.assign(Mask); 9300 } 9301 int LocalVF = Mask.size(); 9302 if (auto *SVOpTy = 9303 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType())) 9304 LocalVF = SVOpTy->getNumElements(); 9305 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem); 9306 for (auto [Idx, I] : enumerate(Mask)) { 9307 if (I == PoisonMaskElem || 9308 static_cast<unsigned>(I) >= SV->getShuffleMask().size()) 9309 continue; 9310 ExtMask[Idx] = SV->getMaskValue(I); 9311 } 9312 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>( 9313 SV->getOperand(0), 9314 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg)) 9315 .all(); 9316 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>( 9317 SV->getOperand(1), 9318 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg)) 9319 .all(); 9320 if (!IsOp1Undef && !IsOp2Undef) { 9321 // Update mask and mark undef elems. 9322 for (int &I : Mask) { 9323 if (I == PoisonMaskElem) 9324 continue; 9325 if (SV->getMaskValue(I % SV->getShuffleMask().size()) == 9326 PoisonMaskElem) 9327 I = PoisonMaskElem; 9328 } 9329 break; 9330 } 9331 SmallVector<int> ShuffleMask(SV->getShuffleMask()); 9332 combineMasks(LocalVF, ShuffleMask, Mask); 9333 Mask.swap(ShuffleMask); 9334 if (IsOp2Undef) 9335 Op = SV->getOperand(0); 9336 else 9337 Op = SV->getOperand(1); 9338 } 9339 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); 9340 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || 9341 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) { 9342 if (IdentityOp) { 9343 V = IdentityOp; 9344 assert(Mask.size() == IdentityMask.size() && 9345 "Expected masks of same sizes."); 9346 // Clear known poison elements. 9347 for (auto [I, Idx] : enumerate(Mask)) 9348 if (Idx == PoisonMaskElem) 9349 IdentityMask[I] = PoisonMaskElem; 9350 Mask.swap(IdentityMask); 9351 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V); 9352 return SinglePermute && 9353 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()), 9354 /*IsStrict=*/true) || 9355 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && 9356 Shuffle->isZeroEltSplat() && 9357 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))); 9358 } 9359 V = Op; 9360 return false; 9361 } 9362 V = Op; 9363 return true; 9364 } 9365 9366 /// Smart shuffle instruction emission, walks through shuffles trees and 9367 /// tries to find the best matching vector for the actual shuffle 9368 /// instruction. 
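  /// For instance (an illustrative sketch), if \p V2 is all-poison under the
  /// mask, the mask is folded into the shuffles feeding \p V1 via
  /// peekThroughShuffles(); when the combined mask turns out to be an identity,
  /// Builder.createIdentity() is used and no new shuffle is emitted at all.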
9369 template <typename T, typename ShuffleBuilderTy> 9370 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, 9371 ShuffleBuilderTy &Builder, Type *ScalarTy) { 9372 assert(V1 && "Expected at least one vector value."); 9373 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 9374 SmallVector<int> NewMask(Mask); 9375 if (ScalarTyNumElements != 1) { 9376 assert(SLPReVec && "FixedVectorType is not expected."); 9377 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask); 9378 Mask = NewMask; 9379 } 9380 if (V2) 9381 Builder.resizeToMatch(V1, V2); 9382 int VF = Mask.size(); 9383 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) 9384 VF = FTy->getNumElements(); 9385 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>( 9386 V2, buildUseMask(VF, Mask, UseMask::SecondArg)) 9387 .all()) { 9388 // Peek through shuffles. 9389 Value *Op1 = V1; 9390 Value *Op2 = V2; 9391 int VF = 9392 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 9393 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 9394 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 9395 for (int I = 0, E = Mask.size(); I < E; ++I) { 9396 if (Mask[I] < VF) 9397 CombinedMask1[I] = Mask[I]; 9398 else 9399 CombinedMask2[I] = Mask[I] - VF; 9400 } 9401 Value *PrevOp1; 9402 Value *PrevOp2; 9403 do { 9404 PrevOp1 = Op1; 9405 PrevOp2 = Op2; 9406 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false); 9407 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false); 9408 // Check if we have 2 resizing shuffles - need to peek through operands 9409 // again. 9410 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) 9411 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) { 9412 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem); 9413 for (auto [Idx, I] : enumerate(CombinedMask1)) { 9414 if (I == PoisonMaskElem) 9415 continue; 9416 ExtMask1[Idx] = SV1->getMaskValue(I); 9417 } 9418 SmallBitVector UseMask1 = buildUseMask( 9419 cast<FixedVectorType>(SV1->getOperand(1)->getType()) 9420 ->getNumElements(), 9421 ExtMask1, UseMask::SecondArg); 9422 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem); 9423 for (auto [Idx, I] : enumerate(CombinedMask2)) { 9424 if (I == PoisonMaskElem) 9425 continue; 9426 ExtMask2[Idx] = SV2->getMaskValue(I); 9427 } 9428 SmallBitVector UseMask2 = buildUseMask( 9429 cast<FixedVectorType>(SV2->getOperand(1)->getType()) 9430 ->getNumElements(), 9431 ExtMask2, UseMask::SecondArg); 9432 if (SV1->getOperand(0)->getType() == 9433 SV2->getOperand(0)->getType() && 9434 SV1->getOperand(0)->getType() != SV1->getType() && 9435 isUndefVector(SV1->getOperand(1), UseMask1).all() && 9436 isUndefVector(SV2->getOperand(1), UseMask2).all()) { 9437 Op1 = SV1->getOperand(0); 9438 Op2 = SV2->getOperand(0); 9439 SmallVector<int> ShuffleMask1(SV1->getShuffleMask()); 9440 int LocalVF = ShuffleMask1.size(); 9441 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType())) 9442 LocalVF = FTy->getNumElements(); 9443 combineMasks(LocalVF, ShuffleMask1, CombinedMask1); 9444 CombinedMask1.swap(ShuffleMask1); 9445 SmallVector<int> ShuffleMask2(SV2->getShuffleMask()); 9446 LocalVF = ShuffleMask2.size(); 9447 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType())) 9448 LocalVF = FTy->getNumElements(); 9449 combineMasks(LocalVF, ShuffleMask2, CombinedMask2); 9450 CombinedMask2.swap(ShuffleMask2); 9451 } 9452 } 9453 } while (PrevOp1 != Op1 || PrevOp2 != Op2); 9454 Builder.resizeToMatch(Op1, Op2); 9455 VF = std::max(cast<VectorType>(Op1->getType()) 9456 
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
} // namespace

/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as a plain wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account for all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr stays in vectorized code if it's not a
      // GEP instruction. We don't care since its cost is considered free.
      // TODO: We should check for any uses outside of the vectorizable tree
      // rather than just a single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into a masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes, extract element instructions will
    // be generated (whose cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}

void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if they are small (just 2 elements), all-constant, or
  // if all instructions already have the same opcode.
9591 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || 9592 all_of(TE.Scalars, isConstant)) 9593 return; 9594 9595 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) { 9596 return VectorizableTree[Idx]->isSame(TE.Scalars); 9597 })) 9598 return; 9599 9600 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { 9601 Key = hash_combine(hash_value(LI->getParent()), Key); 9602 Value *Ptr = 9603 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth); 9604 if (LoadKeyUsed.contains(Key)) { 9605 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr)); 9606 if (LIt != LoadsMap.end()) { 9607 for (LoadInst *RLI : LIt->second) { 9608 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), 9609 LI->getType(), LI->getPointerOperand(), *DL, *SE, 9610 /*StrictCheck=*/true)) 9611 return hash_value(RLI->getPointerOperand()); 9612 } 9613 for (LoadInst *RLI : LIt->second) { 9614 if (arePointersCompatible(RLI->getPointerOperand(), 9615 LI->getPointerOperand(), *TLI)) { 9616 hash_code SubKey = hash_value(RLI->getPointerOperand()); 9617 return SubKey; 9618 } 9619 } 9620 if (LIt->second.size() > 2) { 9621 hash_code SubKey = 9622 hash_value(LIt->second.back()->getPointerOperand()); 9623 return SubKey; 9624 } 9625 } 9626 } 9627 LoadKeyUsed.insert(Key); 9628 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI); 9629 return hash_value(LI->getPointerOperand()); 9630 }; 9631 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues; 9632 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex; 9633 bool IsOrdered = true; 9634 unsigned NumInstructions = 0; 9635 // Try to "cluster" scalar instructions, to be able to build extra vectorized 9636 // nodes. 9637 for (auto [I, V] : enumerate(TE.Scalars)) { 9638 size_t Key = 1, Idx = 1; 9639 if (auto *Inst = dyn_cast<Instruction>(V); 9640 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) && 9641 !isDeleted(Inst) && !isVectorized(V)) { 9642 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey, 9643 /*AllowAlternate=*/false); 9644 ++NumInstructions; 9645 } 9646 auto &Container = SortedValues[Key]; 9647 if (IsOrdered && !KeyToIndex.contains(V) && 9648 !(isa<Constant, ExtractElementInst>(V) || 9649 isVectorLikeInstWithConstOps(V)) && 9650 ((Container.contains(Idx) && 9651 KeyToIndex.at(Container[Idx].back()).back() != I - 1) || 9652 (!Container.empty() && !Container.contains(Idx) && 9653 KeyToIndex.at(Container.back().second.back()).back() != I - 1))) 9654 IsOrdered = false; 9655 auto &KTI = KeyToIndex[V]; 9656 if (KTI.empty()) 9657 Container[Idx].push_back(V); 9658 KTI.push_back(I); 9659 } 9660 SmallVector<std::pair<unsigned, unsigned>> SubVectors; 9661 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size()); 9662 if (!IsOrdered && NumInstructions > 1) { 9663 unsigned Cnt = 0; 9664 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size()); 9665 for (const auto &D : SortedValues) { 9666 for (const auto &P : D.second) { 9667 unsigned Sz = 0; 9668 for (Value *V : P.second) { 9669 ArrayRef<unsigned> Indices = KeyToIndex.at(V); 9670 for (auto [K, Idx] : enumerate(Indices)) { 9671 TE.ReorderIndices[Cnt + K] = Idx; 9672 TE.Scalars[Cnt + K] = V; 9673 } 9674 Sz += Indices.size(); 9675 Cnt += Indices.size(); 9676 } 9677 if (Sz > 1 && isa<Instruction>(P.second.front())) { 9678 const unsigned SubVF = getFloorFullVectorNumberOfElements( 9679 *TTI, TE.Scalars.front()->getType(), Sz); 9680 SubVectors.emplace_back(Cnt - Sz, SubVF); 9681 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF)) 
9682 DemandedElts.clearBit(I); 9683 } else if (!P.second.empty() && isConstant(P.second.front())) { 9684 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt)) 9685 DemandedElts.clearBit(I); 9686 } 9687 } 9688 } 9689 } 9690 // Reuses always require shuffles, so consider it as profitable. 9691 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty()) 9692 return; 9693 // Do simple cost estimation. 9694 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 9695 InstructionCost Cost = 0; 9696 auto *ScalarTy = TE.Scalars.front()->getType(); 9697 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size()); 9698 for (auto [Idx, Sz] : SubVectors) { 9699 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind, 9700 Idx, getWidenedType(ScalarTy, Sz)); 9701 } 9702 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) { 9703 assert(SLPReVec && "Only supported by REVEC."); 9704 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead 9705 // of CreateInsertElement. 9706 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 9707 for (unsigned I : seq<unsigned>(TE.Scalars.size())) 9708 if (DemandedElts[I]) 9709 Cost += 9710 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt, 9711 CostKind, I * ScalarTyNumElements, FTy); 9712 } else { 9713 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, 9714 /*Extract=*/false, CostKind); 9715 } 9716 int Sz = TE.Scalars.size(); 9717 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(), 9718 TE.ReorderIndices.end()); 9719 for (unsigned I : seq<unsigned>(Sz)) { 9720 Value *V = TE.getOrdered(I); 9721 if (isa<PoisonValue>(V)) { 9722 ReorderMask[I] = PoisonMaskElem; 9723 } else if (isConstant(V) || DemandedElts[I]) { 9724 ReorderMask[I] = I + TE.ReorderIndices.size(); 9725 } 9726 } 9727 Cost += ::getShuffleCost(*TTI, 9728 any_of(ReorderMask, [&](int I) { return I >= Sz; }) 9729 ? TTI::SK_PermuteTwoSrc 9730 : TTI::SK_PermuteSingleSrc, 9731 VecTy, ReorderMask); 9732 DemandedElts = APInt::getAllOnes(VecTy->getNumElements()); 9733 ReorderMask.assign(Sz, PoisonMaskElem); 9734 for (unsigned I : seq<unsigned>(Sz)) { 9735 Value *V = TE.getOrdered(I); 9736 if (isConstant(V)) { 9737 DemandedElts.clearBit(I); 9738 if (!isa<PoisonValue>(V)) 9739 ReorderMask[I] = I; 9740 } else { 9741 ReorderMask[I] = I + Sz; 9742 } 9743 } 9744 InstructionCost BVCost = TTI->getScalarizationOverhead( 9745 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind); 9746 if (!DemandedElts.isAllOnes()) 9747 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask); 9748 if (Cost >= BVCost) { 9749 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end()); 9750 reorderScalars(TE.Scalars, Mask); 9751 TE.ReorderIndices.clear(); 9752 } 9753 } 9754 9755 void BoUpSLP::transformNodes() { 9756 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 9757 BaseGraphSize = VectorizableTree.size(); 9758 // Turn graph transforming mode on and off, when done. 9759 class GraphTransformModeRAAI { 9760 bool &SavedIsGraphTransformMode; 9761 9762 public: 9763 GraphTransformModeRAAI(bool &IsGraphTransformMode) 9764 : SavedIsGraphTransformMode(IsGraphTransformMode) { 9765 IsGraphTransformMode = true; 9766 } 9767 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; } 9768 } TransformContext(IsGraphTransformMode); 9769 // Operands are profitable if they are: 9770 // 1. At least one constant 9771 // or 9772 // 2. Splats 9773 // or 9774 // 3. 
Results in good vectorization opportunity, i.e. may generate vector 9775 // nodes and reduce cost of the graph. 9776 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2, 9777 const InstructionsState &S) { 9778 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; 9779 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands())) 9780 Candidates.emplace_back().emplace_back(I1->getOperand(Op), 9781 I2->getOperand(Op)); 9782 return all_of( 9783 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { 9784 return all_of(Cand, 9785 [](const std::pair<Value *, Value *> &P) { 9786 return isa<Constant>(P.first) || 9787 isa<Constant>(P.second) || P.first == P.second; 9788 }) || 9789 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads); 9790 }); 9791 }; 9792 9793 // Try to reorder gather nodes for better vectorization opportunities. 9794 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) { 9795 TreeEntry &E = *VectorizableTree[Idx]; 9796 if (E.isGather()) 9797 reorderGatherNode(E); 9798 } 9799 9800 // The tree may grow here, so iterate over nodes, built before. 9801 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) { 9802 TreeEntry &E = *VectorizableTree[Idx]; 9803 if (E.isGather()) { 9804 ArrayRef<Value *> VL = E.Scalars; 9805 const unsigned Sz = getVectorElementSize(VL.front()); 9806 unsigned MinVF = getMinVF(2 * Sz); 9807 // Do not try partial vectorization for small nodes (<= 2), nodes with the 9808 // same opcode and same parent block or all constants. 9809 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || 9810 !(!E.hasState() || E.getOpcode() == Instruction::Load || 9811 E.isAltShuffle() || !allSameBlock(VL)) || 9812 allConstant(VL) || isSplat(VL)) 9813 continue; 9814 // Try to find vectorizable sequences and transform them into a series of 9815 // insertvector instructions. 9816 unsigned StartIdx = 0; 9817 unsigned End = VL.size(); 9818 for (unsigned VF = getFloorFullVectorNumberOfElements( 9819 *TTI, VL.front()->getType(), VL.size() - 1); 9820 VF >= MinVF; VF = getFloorFullVectorNumberOfElements( 9821 *TTI, VL.front()->getType(), VF - 1)) { 9822 if (StartIdx + VF > End) 9823 continue; 9824 SmallVector<std::pair<unsigned, unsigned>> Slices; 9825 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { 9826 ArrayRef<Value *> Slice = VL.slice(Cnt, VF); 9827 // If any instruction is vectorized already - do not try again. 9828 // Reuse the existing node, if it fully matches the slice. 9829 if (isVectorized(Slice.front()) && 9830 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true)) 9831 continue; 9832 // Constant already handled effectively - skip. 9833 if (allConstant(Slice)) 9834 continue; 9835 // Do not try to vectorize small splats (less than vector register and 9836 // only with the single non-undef element). 9837 bool IsSplat = isSplat(Slice); 9838 bool IsTwoRegisterSplat = true; 9839 if (IsSplat && VF == 2) { 9840 unsigned NumRegs2VF = ::getNumberOfParts( 9841 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF)); 9842 IsTwoRegisterSplat = NumRegs2VF == 2; 9843 } 9844 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat || 9845 count(Slice, Slice.front()) == 9846 static_cast<long>(isa<UndefValue>(Slice.front()) ? 
VF - 1 9847 : 1)) { 9848 if (IsSplat) 9849 continue; 9850 InstructionsState S = getSameOpcode(Slice, *TLI); 9851 if (!S || S.isAltShuffle() || !allSameBlock(Slice) || 9852 (S.getOpcode() == Instruction::Load && 9853 areKnownNonVectorizableLoads(Slice)) || 9854 (S.getOpcode() != Instruction::Load && 9855 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF))) 9856 continue; 9857 if (VF == 2) { 9858 // Try to vectorize reduced values or if all users are vectorized. 9859 // For expensive instructions extra extracts might be profitable. 9860 if ((!UserIgnoreList || E.Idx != 0) && 9861 TTI->getInstructionCost(S.getMainOp(), CostKind) < 9862 TTI::TCC_Expensive && 9863 !all_of(Slice, [&](Value *V) { 9864 if (isa<PoisonValue>(V)) 9865 return true; 9866 return areAllUsersVectorized(cast<Instruction>(V), 9867 UserIgnoreList); 9868 })) 9869 continue; 9870 if (S.getOpcode() == Instruction::Load) { 9871 OrdersType Order; 9872 SmallVector<Value *> PointerOps; 9873 LoadsState Res = 9874 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); 9875 // Do not vectorize gathers. 9876 if (Res == LoadsState::ScatterVectorize || 9877 Res == LoadsState::Gather) { 9878 if (Res == LoadsState::Gather) { 9879 registerNonVectorizableLoads(Slice); 9880 // If reductions and the scalars from the root node are 9881 // analyzed - mark as non-vectorizable reduction. 9882 if (UserIgnoreList && E.Idx == 0) 9883 analyzedReductionVals(Slice); 9884 } 9885 continue; 9886 } 9887 } else if (S.getOpcode() == Instruction::ExtractElement || 9888 (TTI->getInstructionCost(S.getMainOp(), CostKind) < 9889 TTI::TCC_Expensive && 9890 !CheckOperandsProfitability( 9891 S.getMainOp(), 9892 cast<Instruction>(*find_if(reverse(Slice), 9893 IsaPred<Instruction>)), 9894 S))) { 9895 // Do not vectorize extractelements (handled effectively 9896 // alread). Do not vectorize non-profitable instructions (with 9897 // low cost and non-vectorizable operands.) 9898 continue; 9899 } 9900 } 9901 } 9902 Slices.emplace_back(Cnt, Slice.size()); 9903 } 9904 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { 9905 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); 9906 if (StartIdx == Cnt) 9907 StartIdx = Cnt + Sz; 9908 if (End == Cnt + Sz) 9909 End = Cnt; 9910 }; 9911 for (auto [Cnt, Sz] : Slices) { 9912 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz); 9913 // If any instruction is vectorized already - do not try again. 9914 if (TreeEntry *SE = getSameValuesTreeEntry(Slice.front(), Slice, 9915 /*SameVF=*/true)) { 9916 SE->UserTreeIndices.emplace_back(&E, UINT_MAX); 9917 AddCombinedNode(SE->Idx, Cnt, Sz); 9918 continue; 9919 } 9920 unsigned PrevSize = VectorizableTree.size(); 9921 [[maybe_unused]] unsigned PrevEntriesSize = 9922 LoadEntriesToVectorize.size(); 9923 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); 9924 if (PrevSize + 1 == VectorizableTree.size() && 9925 VectorizableTree[PrevSize]->isGather() && 9926 VectorizableTree[PrevSize]->hasState() && 9927 VectorizableTree[PrevSize]->getOpcode() != 9928 Instruction::ExtractElement && 9929 !isSplat(Slice)) { 9930 if (UserIgnoreList && E.Idx == 0 && VF == 2) 9931 analyzedReductionVals(Slice); 9932 VectorizableTree.pop_back(); 9933 assert(PrevEntriesSize == LoadEntriesToVectorize.size() && 9934 "LoadEntriesToVectorize expected to remain the same"); 9935 continue; 9936 } 9937 AddCombinedNode(PrevSize, Cnt, Sz); 9938 } 9939 } 9940 // Restore ordering, if no extra vectorization happened. 
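      // E.g. (illustrative), if none of the VF-sized slices above produced a
      // reusable tree entry, the clustering reorder from reorderGatherNode() is
      // undone here: applying the mask built from ReorderIndices puts the
      // scalars back in their original positions and ReorderIndices is cleared.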
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if it is profitable to represent a consecutive load + reverse as
      // a strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // The strided load is more profitable than consecutive load +
          // reverse - transform the node to a strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if it is profitable to represent reverse + consecutive stores as
      // a strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // The strided store is more profitable than reverse + consecutive
          // stores - transform the node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
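        // E.g. (illustrative), a reorder mask <0, 2, 1, 3> over four scalar
        // stores is an interleave mask with Factor == 2 (two interleaved
        // fields); if the target reports this interleaved access as legal, the
        // node is marked via setInterleave(2) below.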
10006 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) { 10007 auto *BaseSI = cast<StoreInst>(E.Scalars.front()); 10008 assert(Mask.size() > 1 && "Expected mask greater than 1 element."); 10009 if (Mask.size() < 4) 10010 return 0u; 10011 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) { 10012 if (ShuffleVectorInst::isInterleaveMask( 10013 Mask, Factor, VecTy->getElementCount().getFixedValue()) && 10014 TTI.isLegalInterleavedAccessType( 10015 VecTy, Factor, BaseSI->getAlign(), 10016 BaseSI->getPointerAddressSpace())) 10017 return Factor; 10018 } 10019 10020 return 0u; 10021 }; 10022 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end()); 10023 unsigned InterleaveFactor = IsInterleaveMask(Mask); 10024 if (InterleaveFactor != 0) 10025 E.setInterleave(InterleaveFactor); 10026 } 10027 break; 10028 } 10029 case Instruction::Select: { 10030 if (E.State != TreeEntry::Vectorize) 10031 break; 10032 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars); 10033 if (MinMaxID == Intrinsic::not_intrinsic) 10034 break; 10035 // This node is a minmax node. 10036 E.CombinedOp = TreeEntry::MinMax; 10037 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0)); 10038 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 && 10039 CondEntry->State == TreeEntry::Vectorize) { 10040 // The condition node is part of the combined minmax node. 10041 CondEntry->State = TreeEntry::CombinedVectorize; 10042 } 10043 break; 10044 } 10045 default: 10046 break; 10047 } 10048 } 10049 10050 if (LoadEntriesToVectorize.empty()) { 10051 // Single load node - exit. 10052 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() && 10053 VectorizableTree.front()->getOpcode() == Instruction::Load) 10054 return; 10055 // Small graph with small VF - exit. 10056 constexpr unsigned SmallTree = 3; 10057 constexpr unsigned SmallVF = 2; 10058 if ((VectorizableTree.size() <= SmallTree && 10059 VectorizableTree.front()->Scalars.size() == SmallVF) || 10060 (VectorizableTree.size() <= 2 && UserIgnoreList)) 10061 return; 10062 10063 if (VectorizableTree.front()->isNonPowOf2Vec() && 10064 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList && 10065 getCanonicalGraphSize() <= SmallTree && 10066 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), 10067 [](const std::unique_ptr<TreeEntry> &TE) { 10068 return TE->isGather() && TE->hasState() && 10069 TE->getOpcode() == Instruction::Load && 10070 !allSameBlock(TE->Scalars); 10071 }) == 1) 10072 return; 10073 } 10074 10075 // A list of loads to be gathered during the vectorization process. We can 10076 // try to vectorize them at the end, if profitable. 
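  // Gathered loads are bucketed by (parent basic block, underlying pointer
  // object, load type); e.g. (illustrative) two simple i32 loads from GEPs off
  // the same base pointer in one block land in the same bucket and may later
  // be combined by tryToVectorizeGatheredLoads().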
10077 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>, 10078 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8> 10079 GatheredLoads; 10080 10081 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 10082 TreeEntry &E = *TE; 10083 if (E.isGather() && 10084 ((E.hasState() && E.getOpcode() == Instruction::Load) || 10085 (!E.hasState() && any_of(E.Scalars, 10086 [&](Value *V) { 10087 return isa<LoadInst>(V) && 10088 !isVectorized(V) && 10089 !isDeleted(cast<Instruction>(V)); 10090 }))) && 10091 !isSplat(E.Scalars)) { 10092 for (Value *V : E.Scalars) { 10093 auto *LI = dyn_cast<LoadInst>(V); 10094 if (!LI) 10095 continue; 10096 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple()) 10097 continue; 10098 gatherPossiblyVectorizableLoads( 10099 *this, V, *DL, *SE, *TTI, 10100 GatheredLoads[std::make_tuple( 10101 LI->getParent(), 10102 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth), 10103 LI->getType())]); 10104 } 10105 } 10106 } 10107 // Try to vectorize gathered loads if this is not just a gather of loads. 10108 if (!GatheredLoads.empty()) 10109 tryToVectorizeGatheredLoads(GatheredLoads); 10110 } 10111 10112 /// Merges shuffle masks and emits final shuffle instruction, if required. It 10113 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission, 10114 /// when the actual shuffle instruction is generated only if this is actually 10115 /// required. Otherwise, the shuffle instruction emission is delayed till the 10116 /// end of the process, to reduce the number of emitted instructions and further 10117 /// analysis/transformations. 10118 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { 10119 bool IsFinalized = false; 10120 SmallVector<int> CommonMask; 10121 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; 10122 const TargetTransformInfo &TTI; 10123 InstructionCost Cost = 0; 10124 SmallDenseSet<Value *> VectorizedVals; 10125 BoUpSLP &R; 10126 SmallPtrSetImpl<Value *> &CheckedExtracts; 10127 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 10128 /// While set, still trying to estimate the cost for the same nodes and we 10129 /// can delay actual cost estimation (virtual shuffle instruction emission). 10130 /// May help better estimate the cost if same nodes must be permuted + allows 10131 /// to move most of the long shuffles cost estimation to TTI. 10132 bool SameNodesEstimated = true; 10133 10134 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { 10135 if (Ty->getScalarType()->isPointerTy()) { 10136 Constant *Res = ConstantExpr::getIntToPtr( 10137 ConstantInt::getAllOnesValue( 10138 IntegerType::get(Ty->getContext(), 10139 DL.getTypeStoreSizeInBits(Ty->getScalarType()))), 10140 Ty->getScalarType()); 10141 if (auto *VTy = dyn_cast<VectorType>(Ty)) 10142 Res = ConstantVector::getSplat(VTy->getElementCount(), Res); 10143 return Res; 10144 } 10145 return Constant::getAllOnesValue(Ty); 10146 } 10147 10148 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { 10149 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>)) 10150 return TTI::TCC_Free; 10151 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 10152 InstructionCost GatherCost = 0; 10153 SmallVector<Value *> Gathers(VL); 10154 if (!Root && isSplat(VL)) { 10155 // Found the broadcasting of the single scalar, calculate the cost as 10156 // the broadcast. 
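      // E.g. (illustrative), for VL = {%x, %x, %x, %x} this models one
      // insertelement of %x into lane 0 plus a single SK_Broadcast shuffle,
      // instead of four scalar insertelements.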
10157 const auto *It = find_if_not(VL, IsaPred<UndefValue>); 10158 assert(It != VL.end() && "Expected at least one non-undef value."); 10159 // Add broadcast for non-identity shuffle only. 10160 bool NeedShuffle = 10161 count(VL, *It) > 1 && 10162 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>)); 10163 if (!NeedShuffle) { 10164 if (isa<FixedVectorType>(ScalarTy)) { 10165 assert(SLPReVec && "FixedVectorType is not expected."); 10166 return TTI.getShuffleCost( 10167 TTI::SK_InsertSubvector, VecTy, {}, CostKind, 10168 std::distance(VL.begin(), It) * getNumElements(ScalarTy), 10169 cast<FixedVectorType>(ScalarTy)); 10170 } 10171 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, 10172 CostKind, std::distance(VL.begin(), It), 10173 PoisonValue::get(VecTy), *It); 10174 } 10175 10176 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem); 10177 transform(VL, ShuffleMask.begin(), [](Value *V) { 10178 return isa<PoisonValue>(V) ? PoisonMaskElem : 0; 10179 }); 10180 InstructionCost InsertCost = 10181 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0, 10182 PoisonValue::get(VecTy), *It); 10183 return InsertCost + ::getShuffleCost(TTI, 10184 TargetTransformInfo::SK_Broadcast, 10185 VecTy, ShuffleMask, CostKind, 10186 /*Index=*/0, /*SubTp=*/nullptr, 10187 /*Args=*/*It); 10188 } 10189 return GatherCost + 10190 (all_of(Gathers, IsaPred<UndefValue>) 10191 ? TTI::TCC_Free 10192 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers), 10193 ScalarTy)); 10194 }; 10195 10196 /// Compute the cost of creating a vector containing the extracted values from 10197 /// \p VL. 10198 InstructionCost 10199 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, 10200 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 10201 unsigned NumParts) { 10202 assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); 10203 unsigned NumElts = 10204 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { 10205 auto *EE = dyn_cast<ExtractElementInst>(V); 10206 if (!EE) 10207 return Sz; 10208 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType()); 10209 if (!VecTy) 10210 return Sz; 10211 return std::max(Sz, VecTy->getNumElements()); 10212 }); 10213 // FIXME: this must be moved to TTI for better estimation. 10214 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts); 10215 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask, 10216 SmallVectorImpl<unsigned> &Indices) 10217 -> std::optional<TTI::ShuffleKind> { 10218 if (NumElts <= EltsPerVector) 10219 return std::nullopt; 10220 int OffsetReg0 = 10221 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX, 10222 [](int S, int I) { 10223 if (I == PoisonMaskElem) 10224 return S; 10225 return std::min(S, I); 10226 }), 10227 EltsPerVector); 10228 int OffsetReg1 = OffsetReg0; 10229 DenseSet<int> RegIndices; 10230 // Check that if trying to permute same single/2 input vectors. 
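      // E.g. (illustrative), with EltsPerVector == 4 a sub-mask <0, 1, 8, 9>
      // only touches two EltsPerVector-wide source registers, so it is treated
      // as SK_PermuteTwoSrc; a sub-mask spread over three or more registers
      // makes this helper return std::nullopt and the slice is costed as one
      // generic shuffle instead.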
10231 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; 10232 int FirstRegId = -1; 10233 Indices.assign(1, OffsetReg0); 10234 for (auto [Pos, I] : enumerate(Mask)) { 10235 if (I == PoisonMaskElem) 10236 continue; 10237 int Idx = I - OffsetReg0; 10238 int RegId = 10239 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector; 10240 if (FirstRegId < 0) 10241 FirstRegId = RegId; 10242 RegIndices.insert(RegId); 10243 if (RegIndices.size() > 2) 10244 return std::nullopt; 10245 if (RegIndices.size() == 2) { 10246 ShuffleKind = TTI::SK_PermuteTwoSrc; 10247 if (Indices.size() == 1) { 10248 OffsetReg1 = alignDown( 10249 std::accumulate( 10250 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX, 10251 [&](int S, int I) { 10252 if (I == PoisonMaskElem) 10253 return S; 10254 int RegId = ((I - OffsetReg0) / NumElts) * NumParts + 10255 ((I - OffsetReg0) % NumElts) / EltsPerVector; 10256 if (RegId == FirstRegId) 10257 return S; 10258 return std::min(S, I); 10259 }), 10260 EltsPerVector); 10261 Indices.push_back(OffsetReg1 % NumElts); 10262 } 10263 Idx = I - OffsetReg1; 10264 } 10265 I = (Idx % NumElts) % EltsPerVector + 10266 (RegId == FirstRegId ? 0 : EltsPerVector); 10267 } 10268 return ShuffleKind; 10269 }; 10270 InstructionCost Cost = 0; 10271 10272 // Process extracts in blocks of EltsPerVector to check if the source vector 10273 // operand can be re-used directly. If not, add the cost of creating a 10274 // shuffle to extract the values into a vector register. 10275 for (unsigned Part : seq<unsigned>(NumParts)) { 10276 if (!ShuffleKinds[Part]) 10277 continue; 10278 ArrayRef<int> MaskSlice = Mask.slice( 10279 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part)); 10280 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); 10281 copy(MaskSlice, SubMask.begin()); 10282 SmallVector<unsigned, 2> Indices; 10283 std::optional<TTI::ShuffleKind> RegShuffleKind = 10284 CheckPerRegistersShuffle(SubMask, Indices); 10285 if (!RegShuffleKind) { 10286 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || 10287 !ShuffleVectorInst::isIdentityMask( 10288 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size()))) 10289 Cost += 10290 ::getShuffleCost(TTI, *ShuffleKinds[Part], 10291 getWidenedType(ScalarTy, NumElts), MaskSlice); 10292 continue; 10293 } 10294 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || 10295 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { 10296 Cost += 10297 ::getShuffleCost(TTI, *RegShuffleKind, 10298 getWidenedType(ScalarTy, EltsPerVector), SubMask); 10299 } 10300 const unsigned BaseVF = getFullVectorNumberOfElements( 10301 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector)); 10302 for (unsigned Idx : Indices) { 10303 assert((Idx + EltsPerVector) <= BaseVF && 10304 "SK_ExtractSubvector index out of range"); 10305 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, 10306 getWidenedType(ScalarTy, BaseVF), {}, CostKind, 10307 Idx, getWidenedType(ScalarTy, EltsPerVector)); 10308 } 10309 // Second attempt to check, if just a permute is better estimated than 10310 // subvector extract. 
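      // E.g. (illustrative), on a target where SK_ExtractSubvector is costly,
      // one full-width permute of the whole slice may be cheaper than the
      // per-register shuffle plus the subvector extracts added above; the
      // smaller of the two estimates wins.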
10311 SubMask.assign(NumElts, PoisonMaskElem); 10312 copy(MaskSlice, SubMask.begin()); 10313 InstructionCost OriginalCost = ::getShuffleCost( 10314 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask); 10315 if (OriginalCost < Cost) 10316 Cost = OriginalCost; 10317 } 10318 return Cost; 10319 } 10320 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 10321 /// mask \p Mask, register number \p Part, that includes \p SliceSize 10322 /// elements. 10323 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, 10324 ArrayRef<int> Mask, unsigned Part, 10325 unsigned SliceSize) { 10326 if (SameNodesEstimated) { 10327 // Delay the cost estimation if the same nodes are reshuffling. 10328 // If we already requested the cost of reshuffling of E1 and E2 before, no 10329 // need to estimate another cost with the sub-Mask, instead include this 10330 // sub-Mask into the CommonMask to estimate it later and avoid double cost 10331 // estimation. 10332 if ((InVectors.size() == 2 && 10333 cast<const TreeEntry *>(InVectors.front()) == &E1 && 10334 cast<const TreeEntry *>(InVectors.back()) == E2) || 10335 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) { 10336 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part); 10337 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), 10338 [](int Idx) { return Idx == PoisonMaskElem; }) && 10339 "Expected all poisoned elements."); 10340 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit); 10341 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); 10342 return; 10343 } 10344 // Found non-matching nodes - need to estimate the cost for the matched 10345 // and transform mask. 10346 Cost += createShuffle(InVectors.front(), 10347 InVectors.size() == 1 ? nullptr : InVectors.back(), 10348 CommonMask); 10349 transformMaskAfterShuffle(CommonMask, CommonMask); 10350 } else if (InVectors.size() == 2) { 10351 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); 10352 transformMaskAfterShuffle(CommonMask, CommonMask); 10353 } 10354 SameNodesEstimated = false; 10355 if (!E2 && InVectors.size() == 1) { 10356 unsigned VF = E1.getVectorFactor(); 10357 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) { 10358 VF = std::max(VF, 10359 cast<FixedVectorType>(V1->getType())->getNumElements()); 10360 } else { 10361 const auto *E = cast<const TreeEntry *>(InVectors.front()); 10362 VF = std::max(VF, E->getVectorFactor()); 10363 } 10364 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 10365 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 10366 CommonMask[Idx] = Mask[Idx] + VF; 10367 Cost += createShuffle(InVectors.front(), &E1, CommonMask); 10368 transformMaskAfterShuffle(CommonMask, CommonMask); 10369 } else { 10370 auto P = InVectors.front(); 10371 Cost += createShuffle(&E1, E2, Mask); 10372 unsigned VF = Mask.size(); 10373 if (Value *V1 = P.dyn_cast<Value *>()) { 10374 VF = std::max(VF, 10375 getNumElements(V1->getType())); 10376 } else { 10377 const auto *E = cast<const TreeEntry *>(P); 10378 VF = std::max(VF, E->getVectorFactor()); 10379 } 10380 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 10381 if (Mask[Idx] != PoisonMaskElem) 10382 CommonMask[Idx] = Idx + (InVectors.empty() ? 
0 : VF); 10383 Cost += createShuffle(P, InVectors.front(), CommonMask); 10384 transformMaskAfterShuffle(CommonMask, CommonMask); 10385 } 10386 } 10387 10388 class ShuffleCostBuilder { 10389 const TargetTransformInfo &TTI; 10390 10391 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { 10392 int Index = -1; 10393 return Mask.empty() || 10394 (VF == Mask.size() && 10395 ShuffleVectorInst::isIdentityMask(Mask, VF)) || 10396 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && 10397 Index == 0); 10398 } 10399 10400 public: 10401 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} 10402 ~ShuffleCostBuilder() = default; 10403 InstructionCost createShuffleVector(Value *V1, Value *, 10404 ArrayRef<int> Mask) const { 10405 // Empty mask or identity mask are free. 10406 unsigned VF = 10407 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 10408 if (isEmptyOrIdentity(Mask, VF)) 10409 return TTI::TCC_Free; 10410 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, 10411 cast<VectorType>(V1->getType()), Mask); 10412 } 10413 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { 10414 // Empty mask or identity mask are free. 10415 unsigned VF = 10416 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 10417 if (isEmptyOrIdentity(Mask, VF)) 10418 return TTI::TCC_Free; 10419 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, 10420 cast<VectorType>(V1->getType()), Mask); 10421 } 10422 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } 10423 InstructionCost createPoison(Type *Ty, unsigned VF) const { 10424 return TTI::TCC_Free; 10425 } 10426 void resizeToMatch(Value *&, Value *&) const {} 10427 }; 10428 10429 /// Smart shuffle instruction emission, walks through shuffles trees and 10430 /// tries to find the best matching vector for the actual shuffle 10431 /// instruction. 10432 InstructionCost 10433 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1, 10434 const PointerUnion<Value *, const TreeEntry *> &P2, 10435 ArrayRef<int> Mask) { 10436 ShuffleCostBuilder Builder(TTI); 10437 SmallVector<int> CommonMask(Mask); 10438 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); 10439 unsigned CommonVF = Mask.size(); 10440 InstructionCost ExtraCost = 0; 10441 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E, 10442 unsigned VF) -> InstructionCost { 10443 if (E.isGather() && allConstant(E.Scalars)) 10444 return TTI::TCC_Free; 10445 Type *EScalarTy = E.Scalars.front()->getType(); 10446 bool IsSigned = true; 10447 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) { 10448 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first); 10449 IsSigned = It->second.second; 10450 } 10451 if (EScalarTy != ScalarTy) { 10452 unsigned CastOpcode = Instruction::Trunc; 10453 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); 10454 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); 10455 if (DstSz > SrcSz) 10456 CastOpcode = IsSigned ? 
Instruction::SExt : Instruction::ZExt; 10457 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF), 10458 getWidenedType(EScalarTy, VF), 10459 TTI::CastContextHint::None, CostKind); 10460 } 10461 return TTI::TCC_Free; 10462 }; 10463 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost { 10464 if (isa<Constant>(V)) 10465 return TTI::TCC_Free; 10466 auto *VecTy = cast<VectorType>(V->getType()); 10467 Type *EScalarTy = VecTy->getElementType(); 10468 if (EScalarTy != ScalarTy) { 10469 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL)); 10470 unsigned CastOpcode = Instruction::Trunc; 10471 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); 10472 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); 10473 if (DstSz > SrcSz) 10474 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; 10475 return TTI.getCastInstrCost( 10476 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()), 10477 VecTy, TTI::CastContextHint::None, CostKind); 10478 } 10479 return TTI::TCC_Free; 10480 }; 10481 if (!V1 && !V2 && !P2.isNull()) { 10482 // Shuffle 2 entry nodes. 10483 const TreeEntry *E = cast<const TreeEntry *>(P1); 10484 unsigned VF = E->getVectorFactor(); 10485 const TreeEntry *E2 = cast<const TreeEntry *>(P2); 10486 CommonVF = std::max(VF, E2->getVectorFactor()); 10487 assert(all_of(Mask, 10488 [=](int Idx) { 10489 return Idx < 2 * static_cast<int>(CommonVF); 10490 }) && 10491 "All elements in mask must be less than 2 * CommonVF."); 10492 if (E->Scalars.size() == E2->Scalars.size()) { 10493 SmallVector<int> EMask = E->getCommonMask(); 10494 SmallVector<int> E2Mask = E2->getCommonMask(); 10495 if (!EMask.empty() || !E2Mask.empty()) { 10496 for (int &Idx : CommonMask) { 10497 if (Idx == PoisonMaskElem) 10498 continue; 10499 if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) 10500 Idx = EMask[Idx]; 10501 else if (Idx >= static_cast<int>(CommonVF)) 10502 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) + 10503 E->Scalars.size(); 10504 } 10505 } 10506 CommonVF = E->Scalars.size(); 10507 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) + 10508 GetNodeMinBWAffectedCost(*E2, CommonVF); 10509 } else { 10510 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) + 10511 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor()); 10512 } 10513 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10514 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 10515 } else if (!V1 && P2.isNull()) { 10516 // Shuffle single entry node. 10517 const TreeEntry *E = cast<const TreeEntry *>(P1); 10518 unsigned VF = E->getVectorFactor(); 10519 CommonVF = VF; 10520 assert( 10521 all_of(Mask, 10522 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && 10523 "All elements in mask must be less than CommonVF."); 10524 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { 10525 SmallVector<int> EMask = E->getCommonMask(); 10526 assert(!EMask.empty() && "Expected non-empty common mask."); 10527 for (int &Idx : CommonMask) { 10528 if (Idx != PoisonMaskElem) 10529 Idx = EMask[Idx]; 10530 } 10531 CommonVF = E->Scalars.size(); 10532 } else if (unsigned Factor = E->getInterleaveFactor(); 10533 Factor > 0 && E->Scalars.size() != Mask.size() && 10534 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask, 10535 Factor)) { 10536 // Deinterleaved nodes are free. 
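// (The de-interleave shuffle is expected to fold into the interleaved access
// that produced this node, so the mask is rewritten to an identity mask below
// and no extra shuffle cost is charged for it here.)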
10537 std::iota(CommonMask.begin(), CommonMask.end(), 0); 10538 } 10539 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); 10540 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10541 // Not identity/broadcast? Try to see if the original vector is better. 10542 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && 10543 CommonVF == CommonMask.size() && 10544 any_of(enumerate(CommonMask), 10545 [](const auto &&P) { 10546 return P.value() != PoisonMaskElem && 10547 static_cast<unsigned>(P.value()) != P.index(); 10548 }) && 10549 any_of(CommonMask, 10550 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { 10551 SmallVector<int> ReorderMask; 10552 inversePermutation(E->ReorderIndices, ReorderMask); 10553 ::addMask(CommonMask, ReorderMask); 10554 } 10555 } else if (V1 && P2.isNull()) { 10556 // Shuffle single vector. 10557 ExtraCost += GetValueMinBWAffectedCost(V1); 10558 CommonVF = getVF(V1); 10559 assert( 10560 all_of(Mask, 10561 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && 10562 "All elements in mask must be less than CommonVF."); 10563 } else if (V1 && !V2) { 10564 // Shuffle vector and tree node. 10565 unsigned VF = getVF(V1); 10566 const TreeEntry *E2 = cast<const TreeEntry *>(P2); 10567 CommonVF = std::max(VF, E2->getVectorFactor()); 10568 assert(all_of(Mask, 10569 [=](int Idx) { 10570 return Idx < 2 * static_cast<int>(CommonVF); 10571 }) && 10572 "All elements in mask must be less than 2 * CommonVF."); 10573 if (E2->Scalars.size() == VF && VF != CommonVF) { 10574 SmallVector<int> E2Mask = E2->getCommonMask(); 10575 assert(!E2Mask.empty() && "Expected non-empty common mask."); 10576 for (int &Idx : CommonMask) { 10577 if (Idx == PoisonMaskElem) 10578 continue; 10579 if (Idx >= static_cast<int>(CommonVF)) 10580 Idx = E2Mask[Idx - CommonVF] + VF; 10581 } 10582 CommonVF = VF; 10583 } 10584 ExtraCost += GetValueMinBWAffectedCost(V1); 10585 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10586 ExtraCost += GetNodeMinBWAffectedCost( 10587 *E2, std::min(CommonVF, E2->getVectorFactor())); 10588 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 10589 } else if (!V1 && V2) { 10590 // Shuffle vector and tree node. 
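// (Mirror image of the branch above: here the tree entry supplies the first
// input and the materialized vector the second. Both are replaced by
// placeholder constants of the common width further down, so that only the
// shuffle itself, plus any min-bitwidth casts, ends up being costed.)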
10591 unsigned VF = getVF(V2); 10592 const TreeEntry *E1 = cast<const TreeEntry *>(P1); 10593 CommonVF = std::max(VF, E1->getVectorFactor()); 10594 assert(all_of(Mask, 10595 [=](int Idx) { 10596 return Idx < 2 * static_cast<int>(CommonVF); 10597 }) && 10598 "All elements in mask must be less than 2 * CommonVF."); 10599 if (E1->Scalars.size() == VF && VF != CommonVF) { 10600 SmallVector<int> E1Mask = E1->getCommonMask(); 10601 assert(!E1Mask.empty() && "Expected non-empty common mask."); 10602 for (int &Idx : CommonMask) { 10603 if (Idx == PoisonMaskElem) 10604 continue; 10605 if (Idx >= static_cast<int>(CommonVF)) 10606 Idx = E1Mask[Idx - CommonVF] + VF; 10607 else 10608 Idx = E1Mask[Idx]; 10609 } 10610 CommonVF = VF; 10611 } 10612 ExtraCost += GetNodeMinBWAffectedCost( 10613 *E1, std::min(CommonVF, E1->getVectorFactor())); 10614 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10615 ExtraCost += GetValueMinBWAffectedCost(V2); 10616 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 10617 } else { 10618 assert(V1 && V2 && "Expected both vectors."); 10619 unsigned VF = getVF(V1); 10620 CommonVF = std::max(VF, getVF(V2)); 10621 assert(all_of(Mask, 10622 [=](int Idx) { 10623 return Idx < 2 * static_cast<int>(CommonVF); 10624 }) && 10625 "All elements in mask must be less than 2 * CommonVF."); 10626 ExtraCost += 10627 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2); 10628 if (V1->getType() != V2->getType()) { 10629 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10630 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 10631 } else { 10632 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy) 10633 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 10634 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy) 10635 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 10636 } 10637 } 10638 InVectors.front() = 10639 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size())); 10640 if (InVectors.size() == 2) 10641 InVectors.pop_back(); 10642 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>( 10643 V1, V2, CommonMask, Builder, ScalarTy); 10644 } 10645 10646 public: 10647 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, 10648 ArrayRef<Value *> VectorizedVals, BoUpSLP &R, 10649 SmallPtrSetImpl<Value *> &CheckedExtracts) 10650 : BaseShuffleAnalysis(ScalarTy), TTI(TTI), 10651 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), 10652 CheckedExtracts(CheckedExtracts) {} 10653 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, 10654 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 10655 unsigned NumParts, bool &UseVecBaseAsInput) { 10656 UseVecBaseAsInput = false; 10657 if (Mask.empty()) 10658 return nullptr; 10659 Value *VecBase = nullptr; 10660 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); 10661 if (!E->ReorderIndices.empty()) { 10662 SmallVector<int> ReorderMask(E->ReorderIndices.begin(), 10663 E->ReorderIndices.end()); 10664 reorderScalars(VL, ReorderMask); 10665 } 10666 // Check if it can be considered reused if same extractelements were 10667 // vectorized already. 
10668 bool PrevNodeFound = any_of(
10669 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10670 [&](const std::unique_ptr<TreeEntry> &TE) {
10671 return ((TE->hasState() && !TE->isAltShuffle() &&
10672 TE->getOpcode() == Instruction::ExtractElement) ||
10673 TE->isGather()) &&
10674 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10675 return VL.size() > Data.index() &&
10676 (Mask[Data.index()] == PoisonMaskElem ||
10677 isa<UndefValue>(VL[Data.index()]) ||
10678 Data.value() == VL[Data.index()]);
10679 });
10680 });
10681 SmallPtrSet<Value *, 4> UniqueBases;
10682 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10683 for (unsigned Part : seq<unsigned>(NumParts)) {
10684 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10685 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10686 for (auto [I, V] :
10687 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10688 // Ignore non-extractelement scalars.
10689 if (isa<UndefValue>(V) ||
10690 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10691 continue;
10692 // If all users of the instruction are going to be vectorized and the
10693 // instruction itself is not going to be vectorized, consider the
10694 // instruction dead and remove its cost from the final cost of the
10695 // vectorized tree.
10696 // Also, avoid adjusting the cost for extractelements with multiple uses
10697 // in different graph entries.
10698 auto *EE = cast<ExtractElementInst>(V);
10699 VecBase = EE->getVectorOperand();
10700 UniqueBases.insert(VecBase);
10701 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
10702 if (!CheckedExtracts.insert(V).second ||
10703 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10704 any_of(EE->users(),
10705 [&](User *U) {
10706 return isa<GetElementPtrInst>(U) &&
10707 !R.areAllUsersVectorized(cast<Instruction>(U),
10708 &VectorizedVals);
10709 }) ||
10710 (!VEs.empty() && !is_contained(VEs, E)))
10711 continue;
10712 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10713 if (!EEIdx)
10714 continue;
10715 unsigned Idx = *EEIdx;
10716 // Take credit for the instruction that will become dead.
10717 if (EE->hasOneUse() || !PrevNodeFound) {
10718 Instruction *Ext = EE->user_back();
10719 if (isa<SExtInst, ZExtInst>(Ext) &&
10720 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10721 // Use getExtractWithExtendCost() to calculate the cost of the
10722 // extractelement/ext pair.
10723 Cost -=
10724 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10725 EE->getVectorOperandType(), Idx);
10726 // Add back the cost of the s|zext, which is subtracted separately.
10727 Cost += TTI.getCastInstrCost(
10728 Ext->getOpcode(), Ext->getType(), EE->getType(),
10729 TTI::getCastContextHint(Ext), CostKind, Ext);
10730 continue;
10731 }
10732 }
10733 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10734 CostKind, Idx);
10735 }
10736 }
10737 // Check that the gather of extractelements can be represented as just a
10738 // shuffle of one or two vectors from which the scalars are extracted.
10739 // We have found the bunch of extractelement instructions that must be
10740 // gathered into a vector; they can be represented as a permutation of the
10741 // elements of a single input vector or of two input vectors.
10742 // Skipped if the same extractelements were already vectorized earlier.
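// Roughly: if this gather node reads, say, lanes 0..3 of one source vector and
// lanes 0..3 of another, it can be priced as a single two-source shuffle of
// those vectors rather than as one extract/insert pair per scalar;
// computeExtractCost() below estimates exactly that shuffle, per
// register-sized part.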
10743 if (!PrevNodeFound) 10744 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); 10745 InVectors.assign(1, E); 10746 CommonMask.assign(Mask.begin(), Mask.end()); 10747 transformMaskAfterShuffle(CommonMask, CommonMask); 10748 SameNodesEstimated = false; 10749 if (NumParts != 1 && UniqueBases.size() != 1) { 10750 UseVecBaseAsInput = true; 10751 VecBase = 10752 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size())); 10753 } 10754 return VecBase; 10755 } 10756 /// Checks if the specified entry \p E needs to be delayed because of its 10757 /// dependency nodes. 10758 std::optional<InstructionCost> 10759 needToDelay(const TreeEntry *, 10760 ArrayRef<SmallVector<const TreeEntry *>>) const { 10761 // No need to delay the cost estimation during analysis. 10762 return std::nullopt; 10763 } 10764 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { 10765 if (&E1 == &E2) { 10766 assert(all_of(Mask, 10767 [&](int Idx) { 10768 return Idx < static_cast<int>(E1.getVectorFactor()); 10769 }) && 10770 "Expected single vector shuffle mask."); 10771 add(E1, Mask); 10772 return; 10773 } 10774 if (InVectors.empty()) { 10775 CommonMask.assign(Mask.begin(), Mask.end()); 10776 InVectors.assign({&E1, &E2}); 10777 return; 10778 } 10779 assert(!CommonMask.empty() && "Expected non-empty common mask."); 10780 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); 10781 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size()); 10782 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); 10783 const auto *It = 10784 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); 10785 unsigned Part = std::distance(Mask.begin(), It) / SliceSize; 10786 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); 10787 } 10788 void add(const TreeEntry &E1, ArrayRef<int> Mask) { 10789 if (InVectors.empty()) { 10790 CommonMask.assign(Mask.begin(), Mask.end()); 10791 InVectors.assign(1, &E1); 10792 return; 10793 } 10794 assert(!CommonMask.empty() && "Expected non-empty common mask."); 10795 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); 10796 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size()); 10797 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); 10798 const auto *It = 10799 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); 10800 unsigned Part = std::distance(Mask.begin(), It) / SliceSize; 10801 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); 10802 if (!SameNodesEstimated && InVectors.size() == 1) 10803 InVectors.emplace_back(&E1); 10804 } 10805 /// Adds 2 input vectors and the mask for their shuffling. 10806 void add(Value *V1, Value *V2, ArrayRef<int> Mask) { 10807 // May come only for shuffling of 2 vectors with extractelements, already 10808 // handled in adjustExtracts. 10809 assert(InVectors.size() == 1 && 10810 all_of(enumerate(CommonMask), 10811 [&](auto P) { 10812 if (P.value() == PoisonMaskElem) 10813 return Mask[P.index()] == PoisonMaskElem; 10814 auto *EI = cast<ExtractElementInst>( 10815 cast<const TreeEntry *>(InVectors.front()) 10816 ->getOrdered(P.index())); 10817 return EI->getVectorOperand() == V1 || 10818 EI->getVectorOperand() == V2; 10819 }) && 10820 "Expected extractelement vectors."); 10821 } 10822 /// Adds another one input vector and the mask for the shuffling. 
10823 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { 10824 if (InVectors.empty()) { 10825 assert(CommonMask.empty() && !ForExtracts && 10826 "Expected empty input mask/vectors."); 10827 CommonMask.assign(Mask.begin(), Mask.end()); 10828 InVectors.assign(1, V1); 10829 return; 10830 } 10831 if (ForExtracts) { 10832 // No need to add vectors here, already handled them in adjustExtracts. 10833 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) && 10834 !CommonMask.empty() && 10835 all_of(enumerate(CommonMask), 10836 [&](auto P) { 10837 Value *Scalar = cast<const TreeEntry *>(InVectors[0]) 10838 ->getOrdered(P.index()); 10839 if (P.value() == PoisonMaskElem) 10840 return P.value() == Mask[P.index()] || 10841 isa<UndefValue>(Scalar); 10842 if (isa<Constant>(V1)) 10843 return true; 10844 auto *EI = cast<ExtractElementInst>(Scalar); 10845 return EI->getVectorOperand() == V1; 10846 }) && 10847 "Expected only tree entry for extractelement vectors."); 10848 return; 10849 } 10850 assert(!InVectors.empty() && !CommonMask.empty() && 10851 "Expected only tree entries from extracts/reused buildvectors."); 10852 unsigned VF = getVF(V1); 10853 if (InVectors.size() == 2) { 10854 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); 10855 transformMaskAfterShuffle(CommonMask, CommonMask); 10856 VF = std::max<unsigned>(VF, CommonMask.size()); 10857 } else if (const auto *InTE = 10858 InVectors.front().dyn_cast<const TreeEntry *>()) { 10859 VF = std::max(VF, InTE->getVectorFactor()); 10860 } else { 10861 VF = std::max( 10862 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType()) 10863 ->getNumElements()); 10864 } 10865 InVectors.push_back(V1); 10866 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 10867 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 10868 CommonMask[Idx] = Mask[Idx] + VF; 10869 } 10870 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, 10871 Value *Root = nullptr) { 10872 Cost += getBuildVectorCost(VL, Root); 10873 if (!Root) { 10874 // FIXME: Need to find a way to avoid use of getNullValue here. 10875 SmallVector<Constant *> Vals; 10876 unsigned VF = VL.size(); 10877 if (MaskVF != 0) 10878 VF = std::min(VF, MaskVF); 10879 for (Value *V : VL.take_front(VF)) { 10880 if (isa<UndefValue>(V)) { 10881 Vals.push_back(cast<Constant>(V)); 10882 continue; 10883 } 10884 Vals.push_back(Constant::getNullValue(V->getType())); 10885 } 10886 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) { 10887 assert(SLPReVec && "FixedVectorType is not expected."); 10888 // When REVEC is enabled, we need to expand vector types into scalar 10889 // types. 
10890 unsigned VecTyNumElements = VecTy->getNumElements(); 10891 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr); 10892 for (auto [I, V] : enumerate(Vals)) { 10893 Type *ScalarTy = V->getType()->getScalarType(); 10894 Constant *NewVal; 10895 if (isa<PoisonValue>(V)) 10896 NewVal = PoisonValue::get(ScalarTy); 10897 else if (isa<UndefValue>(V)) 10898 NewVal = UndefValue::get(ScalarTy); 10899 else 10900 NewVal = Constant::getNullValue(ScalarTy); 10901 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements, 10902 NewVal); 10903 } 10904 Vals.swap(NewVals); 10905 } 10906 return ConstantVector::get(Vals); 10907 } 10908 return ConstantVector::getSplat( 10909 ElementCount::getFixed( 10910 cast<FixedVectorType>(Root->getType())->getNumElements()), 10911 getAllOnesValue(*R.DL, ScalarTy->getScalarType())); 10912 } 10913 InstructionCost createFreeze(InstructionCost Cost) { return Cost; } 10914 /// Finalize emission of the shuffles. 10915 InstructionCost 10916 finalize(ArrayRef<int> ExtMask, 10917 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors, 10918 ArrayRef<int> SubVectorsMask, unsigned VF = 0, 10919 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { 10920 IsFinalized = true; 10921 if (Action) { 10922 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); 10923 if (InVectors.size() == 2) 10924 Cost += createShuffle(Vec, InVectors.back(), CommonMask); 10925 else 10926 Cost += createShuffle(Vec, nullptr, CommonMask); 10927 transformMaskAfterShuffle(CommonMask, CommonMask); 10928 assert(VF > 0 && 10929 "Expected vector length for the final value before action."); 10930 Value *V = cast<Value *>(Vec); 10931 Action(V, CommonMask); 10932 InVectors.front() = V; 10933 } 10934 if (!SubVectors.empty()) { 10935 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); 10936 if (InVectors.size() == 2) 10937 Cost += createShuffle(Vec, InVectors.back(), CommonMask); 10938 else 10939 Cost += createShuffle(Vec, nullptr, CommonMask); 10940 transformMaskAfterShuffle(CommonMask, CommonMask); 10941 // Add subvectors permutation cost. 10942 if (!SubVectorsMask.empty()) { 10943 assert(SubVectorsMask.size() <= CommonMask.size() && 10944 "Expected same size of masks for subvectors and common mask."); 10945 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem); 10946 copy(SubVectorsMask, SVMask.begin()); 10947 for (auto [I1, I2] : zip(SVMask, CommonMask)) { 10948 if (I2 != PoisonMaskElem) { 10949 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask"); 10950 I1 = I2 + CommonMask.size(); 10951 } 10952 } 10953 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, 10954 getWidenedType(ScalarTy, CommonMask.size()), 10955 SVMask, CostKind); 10956 } 10957 for (auto [E, Idx] : SubVectors) { 10958 Type *EScalarTy = E->Scalars.front()->getType(); 10959 bool IsSigned = true; 10960 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) { 10961 EScalarTy = 10962 IntegerType::get(EScalarTy->getContext(), It->second.first); 10963 IsSigned = It->second.second; 10964 } 10965 if (ScalarTy != EScalarTy) { 10966 unsigned CastOpcode = Instruction::Trunc; 10967 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); 10968 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); 10969 if (DstSz > SrcSz) 10970 CastOpcode = IsSigned ? 
Instruction::SExt : Instruction::ZExt; 10971 Cost += TTI.getCastInstrCost( 10972 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()), 10973 getWidenedType(EScalarTy, E->getVectorFactor()), 10974 TTI::CastContextHint::Normal, CostKind); 10975 } 10976 Cost += ::getShuffleCost( 10977 TTI, TTI::SK_InsertSubvector, 10978 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx, 10979 getWidenedType(ScalarTy, E->getVectorFactor())); 10980 if (!CommonMask.empty()) { 10981 std::iota(std::next(CommonMask.begin(), Idx), 10982 std::next(CommonMask.begin(), Idx + E->getVectorFactor()), 10983 Idx); 10984 } 10985 } 10986 } 10987 10988 if (!ExtMask.empty()) { 10989 if (CommonMask.empty()) { 10990 CommonMask.assign(ExtMask.begin(), ExtMask.end()); 10991 } else { 10992 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); 10993 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { 10994 if (ExtMask[I] == PoisonMaskElem) 10995 continue; 10996 NewMask[I] = CommonMask[ExtMask[I]]; 10997 } 10998 CommonMask.swap(NewMask); 10999 } 11000 } 11001 if (CommonMask.empty()) { 11002 assert(InVectors.size() == 1 && "Expected only one vector with no mask"); 11003 return Cost; 11004 } 11005 return Cost + 11006 createShuffle(InVectors.front(), 11007 InVectors.size() == 2 ? InVectors.back() : nullptr, 11008 CommonMask); 11009 } 11010 11011 ~ShuffleCostEstimator() { 11012 assert((IsFinalized || CommonMask.empty()) && 11013 "Shuffle construction must be finalized."); 11014 } 11015 }; 11016 11017 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, 11018 unsigned Idx) const { 11019 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx)) 11020 return VE; 11021 const auto *It = 11022 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { 11023 return TE->isGather() && 11024 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { 11025 return EI.EdgeIdx == Idx && EI.UserTE == E; 11026 }) != TE->UserTreeIndices.end(); 11027 }); 11028 assert(It != VectorizableTree.end() && "Expected vectorizable entry."); 11029 return It->get(); 11030 } 11031 11032 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { 11033 if (TE.State == TreeEntry::ScatterVectorize || 11034 TE.State == TreeEntry::StridedVectorize) 11035 return TTI::CastContextHint::GatherScatter; 11036 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && 11037 !TE.isAltShuffle()) { 11038 if (TE.ReorderIndices.empty()) 11039 return TTI::CastContextHint::Normal; 11040 SmallVector<int> Mask; 11041 inversePermutation(TE.ReorderIndices, Mask); 11042 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) 11043 return TTI::CastContextHint::Reversed; 11044 } 11045 return TTI::CastContextHint::None; 11046 } 11047 11048 /// Builds the arguments types vector for the given call instruction with the 11049 /// given \p ID for the specified vector factor. 
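/// For intrinsic calls, operands that TTI reports as scalar operands keep
/// their scalar type; the remaining operands are widened to \p VF lanes,
/// using an integer type of \p MinBW bits when a narrowed bit width is in
/// effect for this expression. For plain (non-intrinsic) calls every argument
/// is simply widened.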
11050 static SmallVector<Type *> 11051 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, 11052 const unsigned VF, unsigned MinBW, 11053 const TargetTransformInfo *TTI) { 11054 SmallVector<Type *> ArgTys; 11055 for (auto [Idx, Arg] : enumerate(CI->args())) { 11056 if (ID != Intrinsic::not_intrinsic) { 11057 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) { 11058 ArgTys.push_back(Arg->getType()); 11059 continue; 11060 } 11061 if (MinBW > 0) { 11062 ArgTys.push_back( 11063 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF)); 11064 continue; 11065 } 11066 } 11067 ArgTys.push_back(getWidenedType(Arg->getType(), VF)); 11068 } 11069 return ArgTys; 11070 } 11071 11072 InstructionCost 11073 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, 11074 SmallPtrSetImpl<Value *> &CheckedExtracts) { 11075 ArrayRef<Value *> VL = E->Scalars; 11076 11077 Type *ScalarTy = getValueType(VL[0]); 11078 if (!isValidElementType(ScalarTy)) 11079 return InstructionCost::getInvalid(); 11080 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 11081 11082 // If we have computed a smaller type for the expression, update VecTy so 11083 // that the costs will be accurate. 11084 auto It = MinBWs.find(E); 11085 Type *OrigScalarTy = ScalarTy; 11086 if (It != MinBWs.end()) { 11087 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy); 11088 ScalarTy = IntegerType::get(F->getContext(), It->second.first); 11089 if (VecTy) 11090 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements()); 11091 } 11092 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 11093 unsigned EntryVF = E->getVectorFactor(); 11094 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF); 11095 11096 if (E->isGather()) { 11097 if (allConstant(VL)) 11098 return 0; 11099 if (isa<InsertElementInst>(VL[0])) 11100 return InstructionCost::getInvalid(); 11101 if (isa<CmpInst>(VL.front())) 11102 ScalarTy = VL.front()->getType(); 11103 return processBuildVector<ShuffleCostEstimator, InstructionCost>( 11104 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); 11105 } 11106 InstructionCost CommonCost = 0; 11107 SmallVector<int> Mask; 11108 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || 11109 !isReverseOrder(E->ReorderIndices))) { 11110 SmallVector<int> NewMask; 11111 if (E->getOpcode() == Instruction::Store) { 11112 // For stores the order is actually a mask. 11113 NewMask.resize(E->ReorderIndices.size()); 11114 copy(E->ReorderIndices, NewMask.begin()); 11115 } else { 11116 inversePermutation(E->ReorderIndices, NewMask); 11117 } 11118 ::addMask(Mask, NewMask); 11119 } 11120 if (!E->ReuseShuffleIndices.empty()) 11121 ::addMask(Mask, E->ReuseShuffleIndices); 11122 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) 11123 CommonCost = 11124 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); 11125 assert((E->State == TreeEntry::Vectorize || 11126 E->State == TreeEntry::ScatterVectorize || 11127 E->State == TreeEntry::StridedVectorize) && 11128 "Unhandled state"); 11129 assert(E->getOpcode() && 11130 ((allSameType(VL) && allSameBlock(VL)) || 11131 (E->getOpcode() == Instruction::GetElementPtr && 11132 E->getMainOp()->getType()->isPointerTy())) && 11133 "Invalid VL"); 11134 Instruction *VL0 = E->getMainOp(); 11135 unsigned ShuffleOrOp = 11136 E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); 11137 if (E->CombinedOp != TreeEntry::NotCombinedOp) 11138 ShuffleOrOp = E->CombinedOp; 11139 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end()); 11140 const unsigned Sz = UniqueValues.size(); 11141 SmallBitVector UsedScalars(Sz, false); 11142 for (unsigned I = 0; I < Sz; ++I) { 11143 if (isa<Instruction>(UniqueValues[I]) && 11144 is_contained(getTreeEntries(UniqueValues[I]), E)) 11145 continue; 11146 UsedScalars.set(I); 11147 } 11148 auto GetCastContextHint = [&](Value *V) { 11149 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1) 11150 return getCastContextHint(*OpTEs.front()); 11151 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); 11152 if (SrcState && SrcState.getOpcode() == Instruction::Load && 11153 !SrcState.isAltShuffle()) 11154 return TTI::CastContextHint::GatherScatter; 11155 return TTI::CastContextHint::None; 11156 }; 11157 auto GetCostDiff = 11158 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, 11159 function_ref<InstructionCost(InstructionCost)> VectorCost) { 11160 // Calculate the cost of this instruction. 11161 InstructionCost ScalarCost = 0; 11162 if (isa<CastInst, CallInst>(VL0)) { 11163 // For some of the instructions no need to calculate cost for each 11164 // particular instruction, we can use the cost of the single 11165 // instruction x total number of scalar instructions. 11166 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); 11167 } else { 11168 for (unsigned I = 0; I < Sz; ++I) { 11169 if (UsedScalars.test(I)) 11170 continue; 11171 ScalarCost += ScalarEltCost(I); 11172 } 11173 } 11174 11175 InstructionCost VecCost = VectorCost(CommonCost); 11176 // Check if the current node must be resized, if the parent node is not 11177 // resized. 11178 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) && 11179 E->Idx != 0 && 11180 (E->getOpcode() != Instruction::Load || 11181 !E->UserTreeIndices.empty())) { 11182 const EdgeInfo &EI = 11183 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) { 11184 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX; 11185 }); 11186 if (EI.UserTE->getOpcode() != Instruction::Select || 11187 EI.EdgeIdx != 0) { 11188 auto UserBWIt = MinBWs.find(EI.UserTE); 11189 Type *UserScalarTy = 11190 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); 11191 if (UserBWIt != MinBWs.end()) 11192 UserScalarTy = IntegerType::get(ScalarTy->getContext(), 11193 UserBWIt->second.first); 11194 if (ScalarTy != UserScalarTy) { 11195 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 11196 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy); 11197 unsigned VecOpcode; 11198 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size()); 11199 if (BWSz > SrcBWSz) 11200 VecOpcode = Instruction::Trunc; 11201 else 11202 VecOpcode = 11203 It->second.second ? Instruction::SExt : Instruction::ZExt; 11204 TTI::CastContextHint CCH = GetCastContextHint(VL0); 11205 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH, 11206 CostKind); 11207 } 11208 } 11209 } 11210 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, 11211 ScalarCost, "Calculated costs for Tree")); 11212 return VecCost - ScalarCost; 11213 }; 11214 // Calculate cost difference from vectorizing set of GEPs. 11215 // Negative value means vectorizing is profitable. 
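// (Both the scalar and the vector form of the addressing are priced via
// getGEPCosts(); the lambda returns vector-minus-scalar so callers can fold
// the pointer-computation savings into their memory-op cost difference.)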
11216 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) { 11217 assert((E->State == TreeEntry::Vectorize || 11218 E->State == TreeEntry::StridedVectorize) && 11219 "Entry state expected to be Vectorize or StridedVectorize here."); 11220 InstructionCost ScalarCost = 0; 11221 InstructionCost VecCost = 0; 11222 std::tie(ScalarCost, VecCost) = getGEPCosts( 11223 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy); 11224 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, 11225 "Calculated GEPs cost for Tree")); 11226 11227 return VecCost - ScalarCost; 11228 }; 11229 11230 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { 11231 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL); 11232 if (MinMaxID == Intrinsic::not_intrinsic) 11233 return InstructionCost::getInvalid(); 11234 Type *CanonicalType = Ty; 11235 if (CanonicalType->isPtrOrPtrVectorTy()) 11236 CanonicalType = CanonicalType->getWithNewType(IntegerType::get( 11237 CanonicalType->getContext(), 11238 DL->getTypeSizeInBits(CanonicalType->getScalarType()))); 11239 11240 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, 11241 {CanonicalType, CanonicalType}); 11242 InstructionCost IntrinsicCost = 11243 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 11244 // If the selects are the only uses of the compares, they will be 11245 // dead and we can adjust the cost by removing their cost. 11246 if (VI && SelectOnly) { 11247 assert((!Ty->isVectorTy() || SLPReVec) && 11248 "Expected only for scalar type."); 11249 auto *CI = cast<CmpInst>(VI->getOperand(0)); 11250 IntrinsicCost -= TTI->getCmpSelInstrCost( 11251 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(), 11252 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 11253 {TTI::OK_AnyValue, TTI::OP_None}, CI); 11254 } 11255 return IntrinsicCost; 11256 }; 11257 switch (ShuffleOrOp) { 11258 case Instruction::PHI: { 11259 // Count reused scalars. 
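// (Roughly: when an incoming-value bundle maps to a tree entry that is padded
// out through ReuseShuffleIndices, the scalar form would repeat those reused
// values, so a TCC_Basic credit per extra lane is subtracted from the common
// shuffle cost below.)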
11260 InstructionCost ScalarCost = 0; 11261 SmallPtrSet<const TreeEntry *, 4> CountedOps; 11262 for (Value *V : UniqueValues) { 11263 auto *PHI = dyn_cast<PHINode>(V); 11264 if (!PHI) 11265 continue; 11266 11267 ValueList Operands(PHI->getNumIncomingValues(), nullptr); 11268 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { 11269 Value *Op = PHI->getIncomingValue(I); 11270 Operands[I] = Op; 11271 } 11272 if (const TreeEntry *OpTE = 11273 getSameValuesTreeEntry(Operands.front(), Operands)) 11274 if (CountedOps.insert(OpTE).second && 11275 !OpTE->ReuseShuffleIndices.empty()) 11276 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - 11277 OpTE->Scalars.size()); 11278 } 11279 11280 return CommonCost - ScalarCost; 11281 } 11282 case Instruction::ExtractValue: 11283 case Instruction::ExtractElement: { 11284 auto GetScalarCost = [&](unsigned Idx) { 11285 if (isa<PoisonValue>(UniqueValues[Idx])) 11286 return InstructionCost(TTI::TCC_Free); 11287 11288 auto *I = cast<Instruction>(UniqueValues[Idx]); 11289 VectorType *SrcVecTy; 11290 if (ShuffleOrOp == Instruction::ExtractElement) { 11291 auto *EE = cast<ExtractElementInst>(I); 11292 SrcVecTy = EE->getVectorOperandType(); 11293 } else { 11294 auto *EV = cast<ExtractValueInst>(I); 11295 Type *AggregateTy = EV->getAggregateOperand()->getType(); 11296 unsigned NumElts; 11297 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy)) 11298 NumElts = ATy->getNumElements(); 11299 else 11300 NumElts = AggregateTy->getStructNumElements(); 11301 SrcVecTy = getWidenedType(OrigScalarTy, NumElts); 11302 } 11303 if (I->hasOneUse()) { 11304 Instruction *Ext = I->user_back(); 11305 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 11306 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) { 11307 // Use getExtractWithExtendCost() to calculate the cost of 11308 // extractelement/ext pair. 11309 InstructionCost Cost = TTI->getExtractWithExtendCost( 11310 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I)); 11311 // Subtract the cost of s|zext which is subtracted separately. 
11312 Cost -= TTI->getCastInstrCost( 11313 Ext->getOpcode(), Ext->getType(), I->getType(), 11314 TTI::getCastContextHint(Ext), CostKind, Ext); 11315 return Cost; 11316 } 11317 } 11318 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, 11319 CostKind, *getExtractIndex(I)); 11320 }; 11321 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; }; 11322 return GetCostDiff(GetScalarCost, GetVectorCost); 11323 } 11324 case Instruction::InsertElement: { 11325 assert(E->ReuseShuffleIndices.empty() && 11326 "Unique insertelements only are expected."); 11327 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); 11328 unsigned const NumElts = SrcVecTy->getNumElements(); 11329 unsigned const NumScalars = VL.size(); 11330 11331 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy); 11332 11333 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 11334 unsigned OffsetBeg = *getElementIndex(VL.front()); 11335 unsigned OffsetEnd = OffsetBeg; 11336 InsertMask[OffsetBeg] = 0; 11337 for (auto [I, V] : enumerate(VL.drop_front())) { 11338 unsigned Idx = *getElementIndex(V); 11339 if (OffsetBeg > Idx) 11340 OffsetBeg = Idx; 11341 else if (OffsetEnd < Idx) 11342 OffsetEnd = Idx; 11343 InsertMask[Idx] = I + 1; 11344 } 11345 unsigned VecScalarsSz = PowerOf2Ceil(NumElts); 11346 if (NumOfParts > 0 && NumOfParts < NumElts) 11347 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); 11348 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * 11349 VecScalarsSz; 11350 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); 11351 unsigned InsertVecSz = std::min<unsigned>( 11352 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), 11353 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); 11354 bool IsWholeSubvector = 11355 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); 11356 // Check if we can safely insert a subvector. If it is not possible, just 11357 // generate a whole-sized vector and shuffle the source vector and the new 11358 // subvector. 11359 if (OffsetBeg + InsertVecSz > VecSz) { 11360 // Align OffsetBeg to generate correct mask. 11361 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); 11362 InsertVecSz = VecSz; 11363 } 11364 11365 APInt DemandedElts = APInt::getZero(NumElts); 11366 // TODO: Add support for Instruction::InsertValue. 11367 SmallVector<int> Mask; 11368 if (!E->ReorderIndices.empty()) { 11369 inversePermutation(E->ReorderIndices, Mask); 11370 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem); 11371 } else { 11372 Mask.assign(VecSz, PoisonMaskElem); 11373 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); 11374 } 11375 bool IsIdentity = true; 11376 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem); 11377 Mask.swap(PrevMask); 11378 for (unsigned I = 0; I < NumScalars; ++I) { 11379 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]); 11380 DemandedElts.setBit(InsertIdx); 11381 IsIdentity &= InsertIdx - OffsetBeg == I; 11382 Mask[InsertIdx - OffsetBeg] = I; 11383 } 11384 assert(Offset < NumElts && "Failed to find vector index offset"); 11385 11386 InstructionCost Cost = 0; 11387 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, 11388 /*Insert*/ true, /*Extract*/ false, 11389 CostKind); 11390 11391 // First cost - resize to actual vector size if not identity shuffle or 11392 // need to shift the vector. 11393 // Do not calculate the cost if the actual size is the register size and 11394 // we can merge this shuffle with the following SK_Select. 
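// (Two-step model: the shuffle right below permutes the new scalars into an
// InsertVecSz-wide vector; the second block further down then inserts or
// blends that subvector into the original destination vector whenever some of
// the destination's original lanes have to survive.)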
11395 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz); 11396 if (!IsIdentity) 11397 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, 11398 InsertVecTy, Mask); 11399 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 11400 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 11401 })); 11402 // Second cost - permutation with subvector, if some elements are from the 11403 // initial vector or inserting a subvector. 11404 // TODO: Implement the analysis of the FirstInsert->getOperand(0) 11405 // subvector of ActualVecTy. 11406 SmallBitVector InMask = 11407 isUndefVector(FirstInsert->getOperand(0), 11408 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); 11409 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { 11410 if (InsertVecSz != VecSz) { 11411 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz); 11412 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {}, 11413 CostKind, OffsetBeg - Offset, InsertVecTy); 11414 } else { 11415 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) 11416 Mask[I] = InMask.test(I) ? PoisonMaskElem : I; 11417 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; 11418 I <= End; ++I) 11419 if (Mask[I] != PoisonMaskElem) 11420 Mask[I] = I + VecSz; 11421 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) 11422 Mask[I] = 11423 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I; 11424 Cost += 11425 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); 11426 } 11427 } 11428 return Cost; 11429 } 11430 case Instruction::ZExt: 11431 case Instruction::SExt: 11432 case Instruction::FPToUI: 11433 case Instruction::FPToSI: 11434 case Instruction::FPExt: 11435 case Instruction::PtrToInt: 11436 case Instruction::IntToPtr: 11437 case Instruction::SIToFP: 11438 case Instruction::UIToFP: 11439 case Instruction::Trunc: 11440 case Instruction::FPTrunc: 11441 case Instruction::BitCast: { 11442 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 11443 Type *SrcScalarTy = VL0->getOperand(0)->getType(); 11444 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size()); 11445 unsigned Opcode = ShuffleOrOp; 11446 unsigned VecOpcode = Opcode; 11447 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() && 11448 (SrcIt != MinBWs.end() || It != MinBWs.end())) { 11449 // Check if the values are candidates to demote. 11450 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType()); 11451 if (SrcIt != MinBWs.end()) { 11452 SrcBWSz = SrcIt->second.first; 11453 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy); 11454 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz); 11455 SrcVecTy = 11456 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements); 11457 } 11458 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType()); 11459 if (BWSz == SrcBWSz) { 11460 VecOpcode = Instruction::BitCast; 11461 } else if (BWSz < SrcBWSz) { 11462 VecOpcode = Instruction::Trunc; 11463 } else if (It != MinBWs.end()) { 11464 assert(BWSz > SrcBWSz && "Invalid cast!"); 11465 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; 11466 } else if (SrcIt != MinBWs.end()) { 11467 assert(BWSz > SrcBWSz && "Invalid cast!"); 11468 VecOpcode = 11469 SrcIt->second.second ? 
Instruction::SExt : Instruction::ZExt; 11470 } 11471 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && 11472 !SrcIt->second.second) { 11473 VecOpcode = Instruction::UIToFP; 11474 } 11475 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { 11476 assert(Idx == 0 && "Expected 0 index only"); 11477 return TTI->getCastInstrCost(Opcode, VL0->getType(), 11478 VL0->getOperand(0)->getType(), 11479 TTI::getCastContextHint(VL0), CostKind, VL0); 11480 }; 11481 auto GetVectorCost = [=](InstructionCost CommonCost) { 11482 // Do not count cost here if minimum bitwidth is in effect and it is just 11483 // a bitcast (here it is just a noop). 11484 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) 11485 return CommonCost; 11486 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; 11487 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0)); 11488 11489 bool IsArithmeticExtendedReduction = 11490 E->Idx == 0 && UserIgnoreList && 11491 all_of(*UserIgnoreList, [](Value *V) { 11492 auto *I = cast<Instruction>(V); 11493 return is_contained({Instruction::Add, Instruction::FAdd, 11494 Instruction::Mul, Instruction::FMul, 11495 Instruction::And, Instruction::Or, 11496 Instruction::Xor}, 11497 I->getOpcode()); 11498 }); 11499 if (IsArithmeticExtendedReduction && 11500 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt)) 11501 return CommonCost; 11502 return CommonCost + 11503 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind, 11504 VecOpcode == Opcode ? VI : nullptr); 11505 }; 11506 return GetCostDiff(GetScalarCost, GetVectorCost); 11507 } 11508 case Instruction::FCmp: 11509 case Instruction::ICmp: 11510 case Instruction::Select: { 11511 CmpPredicate VecPred, SwappedVecPred; 11512 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); 11513 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || 11514 match(VL0, MatchCmp)) 11515 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); 11516 else 11517 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy() 11518 ? CmpInst::BAD_FCMP_PREDICATE 11519 : CmpInst::BAD_ICMP_PREDICATE; 11520 auto GetScalarCost = [&](unsigned Idx) { 11521 if (isa<PoisonValue>(UniqueValues[Idx])) 11522 return InstructionCost(TTI::TCC_Free); 11523 11524 auto *VI = cast<Instruction>(UniqueValues[Idx]); 11525 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy() 11526 ? CmpInst::BAD_FCMP_PREDICATE 11527 : CmpInst::BAD_ICMP_PREDICATE; 11528 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); 11529 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && 11530 !match(VI, MatchCmp)) || 11531 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) && 11532 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred))) 11533 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() 11534 ? 
CmpInst::BAD_FCMP_PREDICATE 11535 : CmpInst::BAD_ICMP_PREDICATE; 11536 11537 InstructionCost ScalarCost = TTI->getCmpSelInstrCost( 11538 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, 11539 CostKind, getOperandInfo(VI->getOperand(0)), 11540 getOperandInfo(VI->getOperand(1)), VI); 11541 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI); 11542 if (IntrinsicCost.isValid()) 11543 ScalarCost = IntrinsicCost; 11544 11545 return ScalarCost; 11546 }; 11547 auto GetVectorCost = [&](InstructionCost CommonCost) { 11548 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); 11549 11550 InstructionCost VecCost = 11551 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred, 11552 CostKind, getOperandInfo(E->getOperand(0)), 11553 getOperandInfo(E->getOperand(1)), VL0); 11554 if (auto *SI = dyn_cast<SelectInst>(VL0)) { 11555 auto *CondType = 11556 getWidenedType(SI->getCondition()->getType(), VL.size()); 11557 unsigned CondNumElements = CondType->getNumElements(); 11558 unsigned VecTyNumElements = getNumElements(VecTy); 11559 assert(VecTyNumElements >= CondNumElements && 11560 VecTyNumElements % CondNumElements == 0 && 11561 "Cannot vectorize Instruction::Select"); 11562 if (CondNumElements != VecTyNumElements) { 11563 // When the return type is i1 but the source is fixed vector type, we 11564 // need to duplicate the condition value. 11565 VecCost += ::getShuffleCost( 11566 *TTI, TTI::SK_PermuteSingleSrc, CondType, 11567 createReplicatedMask(VecTyNumElements / CondNumElements, 11568 CondNumElements)); 11569 } 11570 } 11571 return VecCost + CommonCost; 11572 }; 11573 return GetCostDiff(GetScalarCost, GetVectorCost); 11574 } 11575 case TreeEntry::MinMax: { 11576 auto GetScalarCost = [&](unsigned Idx) { 11577 return GetMinMaxCost(OrigScalarTy); 11578 }; 11579 auto GetVectorCost = [&](InstructionCost CommonCost) { 11580 InstructionCost VecCost = GetMinMaxCost(VecTy); 11581 return VecCost + CommonCost; 11582 }; 11583 return GetCostDiff(GetScalarCost, GetVectorCost); 11584 } 11585 case Instruction::FNeg: 11586 case Instruction::Add: 11587 case Instruction::FAdd: 11588 case Instruction::Sub: 11589 case Instruction::FSub: 11590 case Instruction::Mul: 11591 case Instruction::FMul: 11592 case Instruction::UDiv: 11593 case Instruction::SDiv: 11594 case Instruction::FDiv: 11595 case Instruction::URem: 11596 case Instruction::SRem: 11597 case Instruction::FRem: 11598 case Instruction::Shl: 11599 case Instruction::LShr: 11600 case Instruction::AShr: 11601 case Instruction::And: 11602 case Instruction::Or: 11603 case Instruction::Xor: { 11604 auto GetScalarCost = [&](unsigned Idx) { 11605 if (isa<PoisonValue>(UniqueValues[Idx])) 11606 return InstructionCost(TTI::TCC_Free); 11607 11608 auto *VI = cast<Instruction>(UniqueValues[Idx]); 11609 unsigned OpIdx = isa<UnaryOperator>(VI) ? 
0 : 1; 11610 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); 11611 TTI::OperandValueInfo Op2Info = 11612 TTI::getOperandInfo(VI->getOperand(OpIdx)); 11613 SmallVector<const Value *> Operands(VI->operand_values()); 11614 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, 11615 Op1Info, Op2Info, Operands, VI); 11616 }; 11617 auto GetVectorCost = [=](InstructionCost CommonCost) { 11618 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { 11619 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) { 11620 ArrayRef<Value *> Ops = E->getOperand(I); 11621 if (all_of(Ops, [&](Value *Op) { 11622 auto *CI = dyn_cast<ConstantInt>(Op); 11623 return CI && CI->getValue().countr_one() >= It->second.first; 11624 })) 11625 return CommonCost; 11626 } 11627 } 11628 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1; 11629 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0)); 11630 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx)); 11631 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, 11632 Op2Info, {}, nullptr, TLI) + 11633 CommonCost; 11634 }; 11635 return GetCostDiff(GetScalarCost, GetVectorCost); 11636 } 11637 case Instruction::GetElementPtr: { 11638 return CommonCost + GetGEPCostDiff(VL, VL0); 11639 } 11640 case Instruction::Load: { 11641 auto GetScalarCost = [&](unsigned Idx) { 11642 auto *VI = cast<LoadInst>(UniqueValues[Idx]); 11643 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy, 11644 VI->getAlign(), VI->getPointerAddressSpace(), 11645 CostKind, TTI::OperandValueInfo(), VI); 11646 }; 11647 auto *LI0 = cast<LoadInst>(VL0); 11648 auto GetVectorCost = [&](InstructionCost CommonCost) { 11649 InstructionCost VecLdCost; 11650 switch (E->State) { 11651 case TreeEntry::Vectorize: 11652 if (unsigned Factor = E->getInterleaveFactor()) { 11653 VecLdCost = TTI->getInterleavedMemoryOpCost( 11654 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(), 11655 LI0->getPointerAddressSpace(), CostKind); 11656 11657 } else { 11658 VecLdCost = TTI->getMemoryOpCost( 11659 Instruction::Load, VecTy, LI0->getAlign(), 11660 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); 11661 } 11662 break; 11663 case TreeEntry::StridedVectorize: { 11664 Align CommonAlignment = 11665 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef()); 11666 VecLdCost = TTI->getStridedMemoryOpCost( 11667 Instruction::Load, VecTy, LI0->getPointerOperand(), 11668 /*VariableMask=*/false, CommonAlignment, CostKind); 11669 break; 11670 } 11671 case TreeEntry::ScatterVectorize: { 11672 Align CommonAlignment = 11673 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef()); 11674 VecLdCost = TTI->getGatherScatterOpCost( 11675 Instruction::Load, VecTy, LI0->getPointerOperand(), 11676 /*VariableMask=*/false, CommonAlignment, CostKind); 11677 break; 11678 } 11679 case TreeEntry::CombinedVectorize: 11680 case TreeEntry::NeedToGather: 11681 llvm_unreachable("Unexpected vectorization state."); 11682 } 11683 return VecLdCost + CommonCost; 11684 }; 11685 11686 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); 11687 // If this node generates masked gather load then it is not a terminal node. 11688 // Hence address operand cost is estimated separately. 11689 if (E->State == TreeEntry::ScatterVectorize) 11690 return Cost; 11691 11692 // Estimate cost of GEPs since this tree node is a terminator. 
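// (The loads' address GEPs are not separate tree entries, so the scalar
// versus vector pointer-computation difference has to be folded into this
// node's cost here.)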
11693 SmallVector<Value *> PointerOps(VL.size()); 11694 for (auto [I, V] : enumerate(VL)) 11695 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand(); 11696 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand()); 11697 } 11698 case Instruction::Store: { 11699 bool IsReorder = !E->ReorderIndices.empty(); 11700 auto GetScalarCost = [=](unsigned Idx) { 11701 auto *VI = cast<StoreInst>(VL[Idx]); 11702 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); 11703 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy, 11704 VI->getAlign(), VI->getPointerAddressSpace(), 11705 CostKind, OpInfo, VI); 11706 }; 11707 auto *BaseSI = 11708 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); 11709 auto GetVectorCost = [=](InstructionCost CommonCost) { 11710 // We know that we can merge the stores. Calculate the cost. 11711 InstructionCost VecStCost; 11712 if (E->State == TreeEntry::StridedVectorize) { 11713 Align CommonAlignment = 11714 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef()); 11715 VecStCost = TTI->getStridedMemoryOpCost( 11716 Instruction::Store, VecTy, BaseSI->getPointerOperand(), 11717 /*VariableMask=*/false, CommonAlignment, CostKind); 11718 } else { 11719 assert(E->State == TreeEntry::Vectorize && 11720 "Expected either strided or consecutive stores."); 11721 if (unsigned Factor = E->getInterleaveFactor()) { 11722 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() && 11723 "No reused shuffles expected"); 11724 CommonCost = 0; 11725 VecStCost = TTI->getInterleavedMemoryOpCost( 11726 Instruction::Store, VecTy, Factor, std::nullopt, 11727 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind); 11728 } else { 11729 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0)); 11730 VecStCost = TTI->getMemoryOpCost( 11731 Instruction::Store, VecTy, BaseSI->getAlign(), 11732 BaseSI->getPointerAddressSpace(), CostKind, OpInfo); 11733 } 11734 } 11735 return VecStCost + CommonCost; 11736 }; 11737 SmallVector<Value *> PointerOps(VL.size()); 11738 for (auto [I, V] : enumerate(VL)) { 11739 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I; 11740 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand(); 11741 } 11742 11743 return GetCostDiff(GetScalarCost, GetVectorCost) + 11744 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); 11745 } 11746 case Instruction::Call: { 11747 auto GetScalarCost = [&](unsigned Idx) { 11748 auto *CI = cast<CallInst>(UniqueValues[Idx]); 11749 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 11750 if (ID != Intrinsic::not_intrinsic) { 11751 IntrinsicCostAttributes CostAttrs(ID, *CI, 1); 11752 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 11753 } 11754 return TTI->getCallInstrCost(CI->getCalledFunction(), 11755 CI->getFunctionType()->getReturnType(), 11756 CI->getFunctionType()->params(), CostKind); 11757 }; 11758 auto GetVectorCost = [=](InstructionCost CommonCost) { 11759 auto *CI = cast<CallInst>(VL0); 11760 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 11761 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( 11762 CI, ID, VecTy->getNumElements(), 11763 It != MinBWs.end() ? 
It->second.first : 0, TTI); 11764 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); 11765 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; 11766 }; 11767 return GetCostDiff(GetScalarCost, GetVectorCost); 11768 } 11769 case Instruction::ShuffleVector: { 11770 if (!SLPReVec || E->isAltShuffle()) 11771 assert(E->isAltShuffle() && 11772 ((Instruction::isBinaryOp(E->getOpcode()) && 11773 Instruction::isBinaryOp(E->getAltOpcode())) || 11774 (Instruction::isCast(E->getOpcode()) && 11775 Instruction::isCast(E->getAltOpcode())) || 11776 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 11777 "Invalid Shuffle Vector Operand"); 11778 // Try to find the previous shuffle node with the same operands and same 11779 // main/alternate ops. 11780 auto TryFindNodeWithEqualOperands = [=]() { 11781 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 11782 if (TE.get() == E) 11783 break; 11784 if (TE->hasState() && TE->isAltShuffle() && 11785 ((TE->getOpcode() == E->getOpcode() && 11786 TE->getAltOpcode() == E->getAltOpcode()) || 11787 (TE->getOpcode() == E->getAltOpcode() && 11788 TE->getAltOpcode() == E->getOpcode())) && 11789 TE->hasEqualOperands(*E)) 11790 return true; 11791 } 11792 return false; 11793 }; 11794 auto GetScalarCost = [&](unsigned Idx) { 11795 if (isa<PoisonValue>(UniqueValues[Idx])) 11796 return InstructionCost(TTI::TCC_Free); 11797 11798 auto *VI = cast<Instruction>(UniqueValues[Idx]); 11799 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); 11800 (void)E; 11801 return TTI->getInstructionCost(VI, CostKind); 11802 }; 11803 // Need to clear CommonCost since the final shuffle cost is included into 11804 // vector cost. 11805 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) { 11806 // VecCost is equal to sum of the cost of creating 2 vectors 11807 // and the cost of creating shuffle. 11808 InstructionCost VecCost = 0; 11809 if (TryFindNodeWithEqualOperands()) { 11810 LLVM_DEBUG({ 11811 dbgs() << "SLP: diamond match for alternate node found.\n"; 11812 E->dump(); 11813 }); 11814 // No need to add new vector costs here since we're going to reuse 11815 // same main/alternate vector ops, just do different shuffling. 
11816 } else if (Instruction::isBinaryOp(E->getOpcode())) { 11817 VecCost = 11818 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); 11819 VecCost += 11820 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); 11821 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 11822 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); 11823 VecCost = TTIRef.getCmpSelInstrCost( 11824 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind, 11825 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, 11826 VL0); 11827 VecCost += TTIRef.getCmpSelInstrCost( 11828 E->getOpcode(), VecTy, MaskTy, 11829 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, 11830 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, 11831 E->getAltOp()); 11832 } else { 11833 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType(); 11834 auto *SrcTy = getWidenedType(SrcSclTy, VL.size()); 11835 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { 11836 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 11837 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 11838 unsigned SrcBWSz = 11839 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType()); 11840 if (SrcIt != MinBWs.end()) { 11841 SrcBWSz = SrcIt->second.first; 11842 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz); 11843 SrcTy = getWidenedType(SrcSclTy, VL.size()); 11844 } 11845 if (BWSz <= SrcBWSz) { 11846 if (BWSz < SrcBWSz) 11847 VecCost = 11848 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy, 11849 TTI::CastContextHint::None, CostKind); 11850 LLVM_DEBUG({ 11851 dbgs() 11852 << "SLP: alternate extension, which should be truncated.\n"; 11853 E->dump(); 11854 }); 11855 return VecCost; 11856 } 11857 } 11858 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy, 11859 TTI::CastContextHint::None, CostKind); 11860 VecCost += 11861 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy, 11862 TTI::CastContextHint::None, CostKind); 11863 } 11864 SmallVector<int> Mask; 11865 E->buildAltOpShuffleMask( 11866 [&](Instruction *I) { 11867 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 11868 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), 11869 *TLI); 11870 }, 11871 Mask); 11872 VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, 11873 FinalVecTy, Mask, CostKind); 11874 // Patterns like [fadd,fsub] can be combined into a single instruction 11875 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we 11876 // need to take into account their order when looking for the most used 11877 // order. 11878 unsigned Opcode0 = E->getOpcode(); 11879 unsigned Opcode1 = E->getAltOpcode(); 11880 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1)); 11881 // If this pattern is supported by the target then we consider the 11882 // order. 11883 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { 11884 InstructionCost AltVecCost = TTIRef.getAltInstrCost( 11885 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); 11886 return AltVecCost < VecCost ? AltVecCost : VecCost; 11887 } 11888 // TODO: Check the reverse order too. 11889 return VecCost; 11890 }; 11891 if (SLPReVec && !E->isAltShuffle()) 11892 return GetCostDiff( 11893 GetScalarCost, [&](InstructionCost) -> InstructionCost { 11894 // If a group uses mask in order, the shufflevector can be 11895 // eliminated by instcombine. Then the cost is 0. 
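        // Within each GroupSize-sized group, every scalar must be an
        // extract-subvector shuffle whose index starts at 0 and follows
        // directly after the previous member; if any group breaks this
        // pattern, a single-source permute over the whole vector is costed,
        // otherwise the shuffles are free.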
        assert(isa<ShuffleVectorInst>(VL.front()) &&
               "Not supported shufflevector usage.");
        auto *SV = cast<ShuffleVectorInst>(VL.front());
        unsigned SVNumElements =
            cast<FixedVectorType>(SV->getOperand(0)->getType())
                ->getNumElements();
        unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
        for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
          ArrayRef<Value *> Group = VL.slice(I, GroupSize);
          int NextIndex = 0;
          if (!all_of(Group, [&](Value *V) {
                assert(isa<ShuffleVectorInst>(V) &&
                       "Not supported shufflevector usage.");
                auto *SV = cast<ShuffleVectorInst>(V);
                int Index;
                [[maybe_unused]] bool IsExtractSubvectorMask =
                    SV->isExtractSubvectorMask(Index);
                assert(IsExtractSubvectorMask &&
                       "Not supported shufflevector usage.");
                if (NextIndex != Index)
                  return false;
                NextIndex += SV->getShuffleMask().size();
                return true;
              }))
            return ::getShuffleCost(
                *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                calculateShufflevectorMask(E->Scalars));
        }
        return TTI::TCC_Free;
      });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with second gather nodes if they have fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or they are extractelements, which form a shuffle.
  SmallVector<int> Mask;
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
11979 if (VectorizableTree[0]->isGather() || 11980 (VectorizableTree[1]->isGather() && 11981 VectorizableTree[0]->State != TreeEntry::ScatterVectorize && 11982 VectorizableTree[0]->State != TreeEntry::StridedVectorize)) 11983 return false; 11984 11985 return true; 11986 } 11987 11988 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, 11989 TargetTransformInfo *TTI, 11990 bool MustMatchOrInst) { 11991 // Look past the root to find a source value. Arbitrarily follow the 11992 // path through operand 0 of any 'or'. Also, peek through optional 11993 // shift-left-by-multiple-of-8-bits. 11994 Value *ZextLoad = Root; 11995 const APInt *ShAmtC; 11996 bool FoundOr = false; 11997 while (!isa<ConstantExpr>(ZextLoad) && 11998 (match(ZextLoad, m_Or(m_Value(), m_Value())) || 11999 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && 12000 ShAmtC->urem(8) == 0))) { 12001 auto *BinOp = cast<BinaryOperator>(ZextLoad); 12002 ZextLoad = BinOp->getOperand(0); 12003 if (BinOp->getOpcode() == Instruction::Or) 12004 FoundOr = true; 12005 } 12006 // Check if the input is an extended load of the required or/shift expression. 12007 Value *Load; 12008 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || 12009 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) 12010 return false; 12011 12012 // Require that the total load bit width is a legal integer type. 12013 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. 12014 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. 12015 Type *SrcTy = Load->getType(); 12016 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; 12017 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) 12018 return false; 12019 12020 // Everything matched - assume that we can fold the whole sequence using 12021 // load combining. 12022 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " 12023 << *(cast<Instruction>(Root)) << "\n"); 12024 12025 return true; 12026 } 12027 12028 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { 12029 if (RdxKind != RecurKind::Or) 12030 return false; 12031 12032 unsigned NumElts = VectorizableTree[0]->Scalars.size(); 12033 Value *FirstReduced = VectorizableTree[0]->Scalars[0]; 12034 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI, 12035 /* MatchOr */ false); 12036 } 12037 12038 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const { 12039 // Peek through a final sequence of stores and check if all operations are 12040 // likely to be load-combined. 12041 unsigned NumElts = Stores.size(); 12042 for (Value *Scalar : Stores) { 12043 Value *X; 12044 if (!match(Scalar, m_Store(m_Value(X), m_Value())) || 12045 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true)) 12046 return false; 12047 } 12048 return true; 12049 } 12050 12051 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { 12052 if (!DebugCounter::shouldExecute(VectorizedGraphs)) 12053 return true; 12054 12055 // Graph is empty - do nothing. 12056 if (VectorizableTree.empty()) { 12057 assert(ExternalUses.empty() && "We shouldn't have any external users"); 12058 12059 return true; 12060 } 12061 12062 // No need to vectorize inserts of gathered values. 
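  // That is, a two-node tree whose root scalars are insertelements over a
  // single gather node is treated as not worth vectorizing when that gather
  // either has a vector factor of at most 2 or is neither a splat nor an
  // all-constant sequence.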
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable to vectorize, so we can skip it if the cost threshold is left
  // at its default. The cost of vectorized PHI nodes is almost always 0 + the
  // cost of gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes form an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
          return isa<ExtractElementInst, UndefValue>(V) ||
                 (IsAllowedSingleBVNode &&
                  !V->hasNUsesOrMore(UsesLimit) &&
                  any_of(V->users(), IsaPred<InsertElementInst>));
        });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
12132 return true; 12133 } 12134 12135 bool BoUpSLP::isTreeNotExtendable() const { 12136 if (getCanonicalGraphSize() != getTreeSize()) { 12137 constexpr unsigned SmallTree = 3; 12138 if (VectorizableTree.front()->isNonPowOf2Vec() && 12139 getCanonicalGraphSize() <= SmallTree && 12140 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), 12141 [](const std::unique_ptr<TreeEntry> &TE) { 12142 return TE->isGather() && TE->hasState() && 12143 TE->getOpcode() == Instruction::Load && 12144 !allSameBlock(TE->Scalars); 12145 }) == 1) 12146 return true; 12147 return false; 12148 } 12149 bool Res = false; 12150 for (unsigned Idx : seq<unsigned>(getTreeSize())) { 12151 TreeEntry &E = *VectorizableTree[Idx]; 12152 if (!E.isGather()) 12153 continue; 12154 if (E.hasState() && E.getOpcode() != Instruction::Load) 12155 return false; 12156 if (isSplat(E.Scalars) || allConstant(E.Scalars)) 12157 continue; 12158 Res = true; 12159 } 12160 return Res; 12161 } 12162 12163 InstructionCost BoUpSLP::getSpillCost() const { 12164 // Walk from the bottom of the tree to the top, tracking which values are 12165 // live. When we see a call instruction that is not part of our tree, 12166 // query TTI to see if there is a cost to keeping values live over it 12167 // (for example, if spills and fills are required). 12168 unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); 12169 InstructionCost Cost = 0; 12170 12171 SmallPtrSet<Instruction *, 4> LiveValues; 12172 Instruction *PrevInst = nullptr; 12173 12174 // The entries in VectorizableTree are not necessarily ordered by their 12175 // position in basic blocks. Collect them and order them by dominance so later 12176 // instructions are guaranteed to be visited first. For instructions in 12177 // different basic blocks, we only scan to the beginning of the block, so 12178 // their order does not matter, as long as all instructions in a basic block 12179 // are grouped together. Using dominance ensures a deterministic order. 12180 SmallVector<Instruction *, 16> OrderedScalars; 12181 for (const auto &TEPtr : VectorizableTree) { 12182 if (TEPtr->State != TreeEntry::Vectorize) 12183 continue; 12184 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); 12185 if (!Inst) 12186 continue; 12187 OrderedScalars.push_back(Inst); 12188 } 12189 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { 12190 auto *NodeA = DT->getNode(A->getParent()); 12191 auto *NodeB = DT->getNode(B->getParent()); 12192 assert(NodeA && "Should only process reachable instructions"); 12193 assert(NodeB && "Should only process reachable instructions"); 12194 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 12195 "Different nodes should have different DFS numbers"); 12196 if (NodeA != NodeB) 12197 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn(); 12198 return B->comesBefore(A); 12199 }); 12200 12201 for (Instruction *Inst : OrderedScalars) { 12202 if (!PrevInst) { 12203 PrevInst = Inst; 12204 continue; 12205 } 12206 12207 // Update LiveValues. 12208 LiveValues.erase(PrevInst); 12209 for (auto &J : PrevInst->operands()) { 12210 if (isa<Instruction>(&*J) && isVectorized(&*J)) 12211 LiveValues.insert(cast<Instruction>(&*J)); 12212 } 12213 12214 LLVM_DEBUG({ 12215 dbgs() << "SLP: #LV: " << LiveValues.size(); 12216 for (auto *X : LiveValues) 12217 dbgs() << " " << X->getName(); 12218 dbgs() << ", Looking at "; 12219 Inst->dump(); 12220 }); 12221 12222 // Now find the sequence of instructions between PrevInst and Inst. 
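    // Scan the instructions between Inst and PrevInst (crossing the block
    // boundary if needed) and count the real calls among them; assume-like
    // intrinsics and intrinsics that are cheaper than a library call are
    // skipped. Each counted call is then charged the target's cost of keeping
    // the currently live vector values across a call.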
12223 unsigned NumCalls = 0; 12224 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), 12225 PrevInstIt = 12226 PrevInst->getIterator().getReverse(); 12227 while (InstIt != PrevInstIt) { 12228 if (PrevInstIt == PrevInst->getParent()->rend()) { 12229 PrevInstIt = Inst->getParent()->rbegin(); 12230 continue; 12231 } 12232 12233 auto NoCallIntrinsic = [this](Instruction *I) { 12234 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 12235 if (II->isAssumeLikeIntrinsic()) 12236 return true; 12237 FastMathFlags FMF; 12238 SmallVector<Type *, 4> Tys; 12239 for (auto &ArgOp : II->args()) 12240 Tys.push_back(ArgOp->getType()); 12241 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 12242 FMF = FPMO->getFastMathFlags(); 12243 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, 12244 FMF); 12245 InstructionCost IntrCost = 12246 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput); 12247 InstructionCost CallCost = TTI->getCallInstrCost( 12248 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput); 12249 if (IntrCost < CallCost) 12250 return true; 12251 } 12252 return false; 12253 }; 12254 12255 // Debug information does not impact spill cost. 12256 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) && 12257 &*PrevInstIt != PrevInst) 12258 NumCalls++; 12259 12260 ++PrevInstIt; 12261 } 12262 12263 if (NumCalls) { 12264 SmallVector<Type *, 4> V; 12265 for (auto *II : LiveValues) { 12266 auto *ScalarTy = II->getType(); 12267 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) 12268 ScalarTy = VectorTy->getElementType(); 12269 V.push_back(getWidenedType(ScalarTy, BundleWidth)); 12270 } 12271 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); 12272 } 12273 12274 PrevInst = Inst; 12275 } 12276 12277 return Cost; 12278 } 12279 12280 /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the 12281 /// buildvector sequence. 12282 static bool isFirstInsertElement(const InsertElementInst *IE1, 12283 const InsertElementInst *IE2) { 12284 if (IE1 == IE2) 12285 return false; 12286 const auto *I1 = IE1; 12287 const auto *I2 = IE2; 12288 const InsertElementInst *PrevI1; 12289 const InsertElementInst *PrevI2; 12290 unsigned Idx1 = *getElementIndex(IE1); 12291 unsigned Idx2 = *getElementIndex(IE2); 12292 do { 12293 if (I2 == IE1) 12294 return true; 12295 if (I1 == IE2) 12296 return false; 12297 PrevI1 = I1; 12298 PrevI2 = I2; 12299 if (I1 && (I1 == IE1 || I1->hasOneUse()) && 12300 getElementIndex(I1).value_or(Idx2) != Idx2) 12301 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0)); 12302 if (I2 && ((I2 == IE2 || I2->hasOneUse())) && 12303 getElementIndex(I2).value_or(Idx1) != Idx1) 12304 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0)); 12305 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); 12306 llvm_unreachable("Two different buildvectors not expected."); 12307 } 12308 12309 namespace { 12310 /// Returns incoming Value *, if the requested type is Value * too, or a default 12311 /// value, otherwise. 12312 struct ValueSelect { 12313 template <typename U> 12314 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) { 12315 return V; 12316 } 12317 template <typename U> 12318 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) { 12319 return U(); 12320 } 12321 }; 12322 } // namespace 12323 12324 /// Does the analysis of the provided shuffle masks and performs the requested 12325 /// actions on the vectors with the given shuffle masks. It tries to do it in 12326 /// several steps. 12327 /// 1. 
///    If the Base vector is not an undef vector, resize the very first mask to
///    have a common VF and perform the action for 2 input vectors (including
///    the non-undef Base). Other shuffle masks are combined with the result of
///    the first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
///    the action only for 1 vector with the given mask, if it is not the
///    identity mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
///    vectors, combining the masks properly between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining the shuffles between the
    // steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size;
      // we can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
12392 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask, 12393 /*ForSingleMask=*/false); 12394 std::pair<T *, bool> Res2 = 12395 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 12396 ArrayRef<int> SecMask = VMIt->second; 12397 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 12398 if (Mask[I] != PoisonMaskElem) { 12399 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars."); 12400 if (Res1.second) 12401 Mask[I] = I; 12402 } else if (SecMask[I] != PoisonMaskElem) { 12403 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); 12404 Mask[I] = (Res2.second ? I : SecMask[I]) + VF; 12405 } 12406 } 12407 Prev = Action(Mask, {Res1.first, Res2.first}); 12408 } 12409 VMIt = std::next(VMIt); 12410 } 12411 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all(); 12412 // Perform requested actions for the remaining masks/vectors. 12413 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { 12414 // Shuffle other input vectors, if any. 12415 std::pair<T *, bool> Res = 12416 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 12417 ArrayRef<int> SecMask = VMIt->second; 12418 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 12419 if (SecMask[I] != PoisonMaskElem) { 12420 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) && 12421 "Multiple uses of scalars."); 12422 Mask[I] = (Res.second ? I : SecMask[I]) + VF; 12423 } else if (Mask[I] != PoisonMaskElem) { 12424 Mask[I] = I; 12425 } 12426 } 12427 Prev = Action(Mask, {Prev, Res.first}); 12428 } 12429 return Prev; 12430 } 12431 12432 namespace { 12433 /// Data type for handling buildvector sequences with the reused scalars from 12434 /// other tree entries. 12435 template <typename T> struct ShuffledInsertData { 12436 /// List of insertelements to be replaced by shuffles. 12437 SmallVector<InsertElementInst *> InsertElements; 12438 /// The parent vectors and shuffle mask for the given list of inserts. 12439 MapVector<T, SmallVector<int>> ValueMasks; 12440 }; 12441 } // namespace 12442 12443 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { 12444 InstructionCost Cost = 0; 12445 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " 12446 << VectorizableTree.size() << ".\n"); 12447 12448 unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); 12449 12450 SmallPtrSet<Value *, 4> CheckedExtracts; 12451 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { 12452 TreeEntry &TE = *VectorizableTree[I]; 12453 // No need to count the cost for combined entries, they are combined and 12454 // just skip their cost. 12455 if (TE.State == TreeEntry::CombinedVectorize) { 12456 LLVM_DEBUG( 12457 dbgs() << "SLP: Skipping cost for combined node that starts with " 12458 << *TE.Scalars[0] << ".\n"; 12459 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); 12460 continue; 12461 } 12462 if (TE.isGather() && TE.hasState()) { 12463 if (const TreeEntry *E = 12464 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars); 12465 E && E->getVectorFactor() == TE.getVectorFactor()) { 12466 // Some gather nodes might be absolutely the same as some vectorizable 12467 // nodes after reordering, need to handle it. 12468 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " 12469 << shortBundleName(TE.Scalars, TE.Idx) << ".\n" 12470 << "SLP: Current total cost = " << Cost << "\n"); 12471 continue; 12472 } 12473 } 12474 12475 // Exclude cost of gather loads nodes which are not used. 
These nodes were 12476 // built as part of the final attempt to vectorize gathered loads. 12477 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) && 12478 "Expected gather nodes with users only."); 12479 12480 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); 12481 Cost += C; 12482 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " 12483 << shortBundleName(TE.Scalars, TE.Idx) << ".\n" 12484 << "SLP: Current total cost = " << Cost << "\n"); 12485 } 12486 12487 SmallPtrSet<Value *, 16> ExtractCostCalculated; 12488 InstructionCost ExtractCost = 0; 12489 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts; 12490 SmallVector<APInt> DemandedElts; 12491 SmallDenseSet<Value *, 4> UsedInserts; 12492 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts; 12493 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses; 12494 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount; 12495 SmallPtrSet<Value *, 4> ScalarOpsFromCasts; 12496 // Keep track {Scalar, Index, User} tuple. 12497 // On AArch64, this helps in fusing a mov instruction, associated with 12498 // extractelement, with fmul in the backend so that extractelement is free. 12499 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx; 12500 for (ExternalUser &EU : ExternalUses) { 12501 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane); 12502 } 12503 for (ExternalUser &EU : ExternalUses) { 12504 // Uses by ephemeral values are free (because the ephemeral value will be 12505 // removed prior to code generation, and so the extraction will be 12506 // removed as well). 12507 if (EphValues.count(EU.User)) 12508 continue; 12509 12510 // Used in unreachable blocks or in EH pads (rarely executed) or is 12511 // terminated with unreachable instruction. 12512 if (BasicBlock *UserParent = 12513 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr; 12514 UserParent && 12515 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() || 12516 isa_and_present<UnreachableInst>(UserParent->getTerminator()))) 12517 continue; 12518 12519 // We only add extract cost once for the same scalar. 12520 if (!isa_and_nonnull<InsertElementInst>(EU.User) && 12521 !ExtractCostCalculated.insert(EU.Scalar).second) 12522 continue; 12523 12524 // No extract cost for vector "scalar" 12525 if (isa<FixedVectorType>(EU.Scalar->getType())) 12526 continue; 12527 12528 // If found user is an insertelement, do not calculate extract cost but try 12529 // to detect it as a final shuffled/identity match. 12530 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User); 12531 VU && VU->getOperand(1) == EU.Scalar) { 12532 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { 12533 if (!UsedInserts.insert(VU).second) 12534 continue; 12535 std::optional<unsigned> InsertIdx = getElementIndex(VU); 12536 if (InsertIdx) { 12537 const TreeEntry *ScalarTE = &EU.E; 12538 auto *It = find_if( 12539 ShuffledInserts, 12540 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) { 12541 // Checks if 2 insertelements are from the same buildvector. 
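                  // The callback below stops the walk (returns nullptr) when
                  // an insertelement is itself vectorized but its source
                  // vector operand is not, so the chain is not followed past
                  // that point.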
12542 InsertElementInst *VecInsert = Data.InsertElements.front(); 12543 return areTwoInsertFromSameBuildVector( 12544 VU, VecInsert, [this](InsertElementInst *II) -> Value * { 12545 Value *Op0 = II->getOperand(0); 12546 if (isVectorized(II) && !isVectorized(Op0)) 12547 return nullptr; 12548 return Op0; 12549 }); 12550 }); 12551 int VecId = -1; 12552 if (It == ShuffledInserts.end()) { 12553 auto &Data = ShuffledInserts.emplace_back(); 12554 Data.InsertElements.emplace_back(VU); 12555 DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); 12556 VecId = ShuffledInserts.size() - 1; 12557 auto It = MinBWs.find(ScalarTE); 12558 if (It != MinBWs.end() && 12559 VectorCasts 12560 .insert(std::make_pair(ScalarTE, FTy->getElementType())) 12561 .second) { 12562 unsigned BWSz = It->second.first; 12563 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType()); 12564 unsigned VecOpcode; 12565 if (DstBWSz < BWSz) 12566 VecOpcode = Instruction::Trunc; 12567 else 12568 VecOpcode = 12569 It->second.second ? Instruction::SExt : Instruction::ZExt; 12570 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 12571 InstructionCost C = TTI->getCastInstrCost( 12572 VecOpcode, FTy, 12573 getWidenedType(IntegerType::get(FTy->getContext(), BWSz), 12574 FTy->getNumElements()), 12575 TTI::CastContextHint::None, CostKind); 12576 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 12577 << " for extending externally used vector with " 12578 "non-equal minimum bitwidth.\n"); 12579 Cost += C; 12580 } 12581 } else { 12582 if (isFirstInsertElement(VU, It->InsertElements.front())) 12583 It->InsertElements.front() = VU; 12584 VecId = std::distance(ShuffledInserts.begin(), It); 12585 } 12586 int InIdx = *InsertIdx; 12587 SmallVectorImpl<int> &Mask = 12588 ShuffledInserts[VecId].ValueMasks[ScalarTE]; 12589 if (Mask.empty()) 12590 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 12591 Mask[InIdx] = EU.Lane; 12592 DemandedElts[VecId].setBit(InIdx); 12593 continue; 12594 } 12595 } 12596 } 12597 12598 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 12599 // If we plan to rewrite the tree in a smaller type, we will need to sign 12600 // extend the extracted value back to the original type. Here, we account 12601 // for the extract and the added cost of the sign extend if needed. 12602 InstructionCost ExtraCost = TTI::TCC_Free; 12603 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth); 12604 const TreeEntry *Entry = &EU.E; 12605 auto It = MinBWs.find(Entry); 12606 if (It != MinBWs.end()) { 12607 auto *MinTy = IntegerType::get(F->getContext(), It->second.first); 12608 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL)) 12609 ? Instruction::ZExt 12610 : Instruction::SExt; 12611 VecTy = getWidenedType(MinTy, BundleWidth); 12612 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), 12613 VecTy, EU.Lane); 12614 } else { 12615 ExtraCost = 12616 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, 12617 EU.Lane, EU.Scalar, ScalarUserAndIdx); 12618 } 12619 // Leave the scalar instructions as is if they are cheaper than extracts. 12620 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr || 12621 Entry->getOpcode() == Instruction::Load) { 12622 // Checks if the user of the external scalar is phi in loop body. 
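      // Such PHI users (a PHI placed in the scalar's own block or in the same
      // loop as the scalar) are skipped when filling ValueToExtUses below and
      // are not recorded in ExtractsCount.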
12623 auto IsPhiInLoop = [&](const ExternalUser &U) { 12624 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) { 12625 auto *I = cast<Instruction>(U.Scalar); 12626 const Loop *L = LI->getLoopFor(Phi->getParent()); 12627 return L && (Phi->getParent() == I->getParent() || 12628 L == LI->getLoopFor(I->getParent())); 12629 } 12630 return false; 12631 }; 12632 if (!ValueToExtUses) { 12633 ValueToExtUses.emplace(); 12634 for_each(enumerate(ExternalUses), [&](const auto &P) { 12635 // Ignore phis in loops. 12636 if (IsPhiInLoop(P.value())) 12637 return; 12638 12639 ValueToExtUses->try_emplace(P.value().Scalar, P.index()); 12640 }); 12641 } 12642 // Can use original instruction, if no operands vectorized or they are 12643 // marked as externally used already. 12644 auto *Inst = cast<Instruction>(EU.Scalar); 12645 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); 12646 auto OperandIsScalar = [&](Value *V) { 12647 if (!isVectorized(V)) { 12648 // Some extractelements might be not vectorized, but 12649 // transformed into shuffle and removed from the function, 12650 // consider it here. 12651 if (auto *EE = dyn_cast<ExtractElementInst>(V)) 12652 return !EE->hasOneUse() || !MustGather.contains(EE); 12653 return true; 12654 } 12655 return ValueToExtUses->contains(V); 12656 }; 12657 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar); 12658 bool CanBeUsedAsScalarCast = false; 12659 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) { 12660 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0)); 12661 Op && all_of(Op->operands(), OperandIsScalar)) { 12662 InstructionCost OpCost = 12663 (isVectorized(Op) && !ValueToExtUses->contains(Op)) 12664 ? TTI->getInstructionCost(Op, CostKind) 12665 : 0; 12666 if (ScalarCost + OpCost <= ExtraCost) { 12667 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true; 12668 ScalarCost += OpCost; 12669 } 12670 } 12671 } 12672 if (CanBeUsedAsScalar) { 12673 bool KeepScalar = ScalarCost <= ExtraCost; 12674 // Try to keep original scalar if the user is the phi node from the same 12675 // block as the root phis, currently vectorized. It allows to keep 12676 // better ordering info of PHIs, being vectorized currently. 
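        // Keeping the scalar is deemed profitable for a PHI user when the root
        // bundle is a PHI, the scalar is already no more expensive than the
        // extract (or at most TCC_Basic more expensive with a root wider than
        // 2 lanes), the instruction has few uses, every non-vectorized user is
        // a PHI in the root's block, and at most 2 scalars of this entry are
        // used externally.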
12677 bool IsProfitablePHIUser = 12678 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && 12679 VectorizableTree.front()->Scalars.size() > 2)) && 12680 VectorizableTree.front()->getOpcode() == Instruction::PHI && 12681 !Inst->hasNUsesOrMore(UsesLimit) && 12682 none_of(Inst->users(), 12683 [&](User *U) { 12684 auto *PHIUser = dyn_cast<PHINode>(U); 12685 return (!PHIUser || 12686 PHIUser->getParent() != 12687 cast<Instruction>( 12688 VectorizableTree.front()->getMainOp()) 12689 ->getParent()) && 12690 !isVectorized(U); 12691 }) && 12692 count_if(Entry->Scalars, [&](Value *V) { 12693 return ValueToExtUses->contains(V); 12694 }) <= 2; 12695 if (IsProfitablePHIUser) { 12696 KeepScalar = true; 12697 } else if (KeepScalar && ScalarCost != TTI::TCC_Free && 12698 ExtraCost - ScalarCost <= TTI::TCC_Basic && 12699 (!GatheredLoadsEntriesFirst.has_value() || 12700 Entry->Idx < *GatheredLoadsEntriesFirst)) { 12701 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { 12702 return ValueToExtUses->contains(V); 12703 }); 12704 auto It = ExtractsCount.find(Entry); 12705 if (It != ExtractsCount.end()) { 12706 assert(ScalarUsesCount >= It->getSecond().size() && 12707 "Expected total number of external uses not less than " 12708 "number of scalar uses."); 12709 ScalarUsesCount -= It->getSecond().size(); 12710 } 12711 // Keep original scalar if number of externally used instructions in 12712 // the same entry is not power of 2. It may help to do some extra 12713 // vectorization for now. 12714 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount); 12715 } 12716 if (KeepScalar) { 12717 ExternalUsesAsOriginalScalar.insert(EU.Scalar); 12718 for_each(Inst->operands(), [&](Value *V) { 12719 auto It = ValueToExtUses->find(V); 12720 if (It != ValueToExtUses->end()) { 12721 // Replace all uses to avoid compiler crash. 12722 ExternalUses[It->second].User = nullptr; 12723 } 12724 }); 12725 ExtraCost = ScalarCost; 12726 if (!IsPhiInLoop(EU)) 12727 ExtractsCount[Entry].insert(Inst); 12728 if (CanBeUsedAsScalarCast) { 12729 ScalarOpsFromCasts.insert(Inst->getOperand(0)); 12730 // Update the users of the operands of the cast operand to avoid 12731 // compiler crash. 12732 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) { 12733 for_each(IOp->operands(), [&](Value *V) { 12734 auto It = ValueToExtUses->find(V); 12735 if (It != ValueToExtUses->end()) { 12736 // Replace all uses to avoid compiler crash. 12737 ExternalUses[It->second].User = nullptr; 12738 } 12739 }); 12740 } 12741 } 12742 } 12743 } 12744 } 12745 12746 ExtractCost += ExtraCost; 12747 } 12748 // Insert externals for extract of operands of casts to be emitted as scalars 12749 // instead of extractelement. 12750 for (Value *V : ScalarOpsFromCasts) { 12751 ExternalUsesAsOriginalScalar.insert(V); 12752 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) { 12753 ExternalUses.emplace_back(V, nullptr, *TEs.front(), 12754 TEs.front()->findLaneForValue(V)); 12755 } 12756 } 12757 // Add reduced value cost, if resized. 12758 if (!VectorizedVals.empty()) { 12759 const TreeEntry &Root = *VectorizableTree.front(); 12760 auto BWIt = MinBWs.find(&Root); 12761 if (BWIt != MinBWs.end()) { 12762 Type *DstTy = Root.Scalars.front()->getType(); 12763 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType()); 12764 unsigned SrcSz = 12765 ReductionBitWidth == 0 ? 
BWIt->second.first : ReductionBitWidth; 12766 if (OriginalSz != SrcSz) { 12767 unsigned Opcode = Instruction::Trunc; 12768 if (OriginalSz > SrcSz) 12769 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt; 12770 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz); 12771 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) { 12772 assert(SLPReVec && "Only supported by REVEC."); 12773 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements()); 12774 } 12775 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, 12776 TTI::CastContextHint::None, 12777 TTI::TCK_RecipThroughput); 12778 } 12779 } 12780 } 12781 12782 InstructionCost SpillCost = getSpillCost(); 12783 Cost += SpillCost + ExtractCost; 12784 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask, 12785 bool) { 12786 InstructionCost C = 0; 12787 unsigned VF = Mask.size(); 12788 unsigned VecVF = TE->getVectorFactor(); 12789 if (VF != VecVF && 12790 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || 12791 !ShuffleVectorInst::isIdentityMask(Mask, VF))) { 12792 SmallVector<int> OrigMask(VecVF, PoisonMaskElem); 12793 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), 12794 OrigMask.begin()); 12795 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, 12796 getWidenedType(TE->getMainOp()->getType(), VecVF), 12797 OrigMask); 12798 LLVM_DEBUG( 12799 dbgs() << "SLP: Adding cost " << C 12800 << " for final shuffle of insertelement external users.\n"; 12801 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); 12802 Cost += C; 12803 return std::make_pair(TE, true); 12804 } 12805 return std::make_pair(TE, false); 12806 }; 12807 // Calculate the cost of the reshuffled vectors, if any. 12808 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { 12809 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0); 12810 auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); 12811 unsigned VF = 0; 12812 auto EstimateShufflesCost = [&](ArrayRef<int> Mask, 12813 ArrayRef<const TreeEntry *> TEs) { 12814 assert((TEs.size() == 1 || TEs.size() == 2) && 12815 "Expected exactly 1 or 2 tree entries."); 12816 if (TEs.size() == 1) { 12817 if (VF == 0) 12818 VF = TEs.front()->getVectorFactor(); 12819 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); 12820 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) && 12821 !all_of(enumerate(Mask), [=](const auto &Data) { 12822 return Data.value() == PoisonMaskElem || 12823 (Data.index() < VF && 12824 static_cast<int>(Data.index()) == Data.value()); 12825 })) { 12826 InstructionCost C = 12827 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask); 12828 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 12829 << " for final shuffle of insertelement " 12830 "external users.\n"; 12831 TEs.front()->dump(); 12832 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 12833 Cost += C; 12834 } 12835 } else { 12836 if (VF == 0) { 12837 if (TEs.front() && 12838 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) 12839 VF = TEs.front()->getVectorFactor(); 12840 else 12841 VF = Mask.size(); 12842 } 12843 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); 12844 InstructionCost C = 12845 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); 12846 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 12847 << " for final shuffle of vector node and external " 12848 "insertelement users.\n"; 12849 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); 12850 dbgs() << "SLP: 
Current total cost = " << Cost << "\n"); 12851 Cost += C; 12852 } 12853 VF = Mask.size(); 12854 return TEs.back(); 12855 }; 12856 (void)performExtractsShuffleAction<const TreeEntry>( 12857 MutableArrayRef(Vector.data(), Vector.size()), Base, 12858 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, 12859 EstimateShufflesCost); 12860 InstructionCost InsertCost = TTI->getScalarizationOverhead( 12861 cast<FixedVectorType>( 12862 ShuffledInserts[I].InsertElements.front()->getType()), 12863 DemandedElts[I], 12864 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput); 12865 Cost -= InsertCost; 12866 } 12867 12868 // Add the cost for reduced value resize (if required). 12869 if (ReductionBitWidth != 0) { 12870 assert(UserIgnoreList && "Expected reduction tree."); 12871 const TreeEntry &E = *VectorizableTree.front(); 12872 auto It = MinBWs.find(&E); 12873 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) { 12874 unsigned SrcSize = It->second.first; 12875 unsigned DstSize = ReductionBitWidth; 12876 unsigned Opcode = Instruction::Trunc; 12877 if (SrcSize < DstSize) { 12878 bool IsArithmeticExtendedReduction = 12879 all_of(*UserIgnoreList, [](Value *V) { 12880 auto *I = cast<Instruction>(V); 12881 return is_contained({Instruction::Add, Instruction::FAdd, 12882 Instruction::Mul, Instruction::FMul, 12883 Instruction::And, Instruction::Or, 12884 Instruction::Xor}, 12885 I->getOpcode()); 12886 }); 12887 if (IsArithmeticExtendedReduction) 12888 Opcode = 12889 Instruction::BitCast; // Handle it by getExtendedReductionCost 12890 else 12891 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt; 12892 } 12893 if (Opcode != Instruction::BitCast) { 12894 auto *SrcVecTy = 12895 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor()); 12896 auto *DstVecTy = 12897 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor()); 12898 TTI::CastContextHint CCH = getCastContextHint(E); 12899 InstructionCost CastCost; 12900 switch (E.getOpcode()) { 12901 case Instruction::SExt: 12902 case Instruction::ZExt: 12903 case Instruction::Trunc: { 12904 const TreeEntry *OpTE = getOperandEntry(&E, 0); 12905 CCH = getCastContextHint(*OpTE); 12906 break; 12907 } 12908 default: 12909 break; 12910 } 12911 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, 12912 TTI::TCK_RecipThroughput); 12913 Cost += CastCost; 12914 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost 12915 << " for final resize for reduction from " << SrcVecTy 12916 << " to " << DstVecTy << "\n"; 12917 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 12918 } 12919 } 12920 } 12921 12922 #ifndef NDEBUG 12923 SmallString<256> Str; 12924 { 12925 raw_svector_ostream OS(Str); 12926 OS << "SLP: Spill Cost = " << SpillCost << ".\n" 12927 << "SLP: Extract Cost = " << ExtractCost << ".\n" 12928 << "SLP: Total Cost = " << Cost << ".\n"; 12929 } 12930 LLVM_DEBUG(dbgs() << Str); 12931 if (ViewSLPTree) 12932 ViewGraph(this, "SLP" + F->getName(), false, Str); 12933 #endif 12934 12935 return Cost; 12936 } 12937 12938 /// Tries to find extractelement instructions with constant indices from fixed 12939 /// vector type and gather such instructions into a bunch, which highly likely 12940 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 12941 /// successful, the matched scalars are replaced by poison values in \p VL for 12942 /// future analysis. 
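/// \returns the kind of shuffle (single- or two-source) that can rebuild the
/// extracted scalars, or std::nullopt if no such shuffle is found; on failure
/// \p VL is restored to its original contents.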
12943 std::optional<TTI::ShuffleKind> 12944 BoUpSLP::tryToGatherSingleRegisterExtractElements( 12945 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { 12946 // Scan list of gathered scalars for extractelements that can be represented 12947 // as shuffles. 12948 MapVector<Value *, SmallVector<int>> VectorOpToIdx; 12949 SmallVector<int> UndefVectorExtracts; 12950 for (int I = 0, E = VL.size(); I < E; ++I) { 12951 auto *EI = dyn_cast<ExtractElementInst>(VL[I]); 12952 if (!EI) { 12953 if (isa<UndefValue>(VL[I])) 12954 UndefVectorExtracts.push_back(I); 12955 continue; 12956 } 12957 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); 12958 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) 12959 continue; 12960 std::optional<unsigned> Idx = getExtractIndex(EI); 12961 // Undefined index. 12962 if (!Idx) { 12963 UndefVectorExtracts.push_back(I); 12964 continue; 12965 } 12966 if (Idx >= VecTy->getNumElements()) { 12967 UndefVectorExtracts.push_back(I); 12968 continue; 12969 } 12970 SmallBitVector ExtractMask(VecTy->getNumElements(), true); 12971 ExtractMask.reset(*Idx); 12972 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { 12973 UndefVectorExtracts.push_back(I); 12974 continue; 12975 } 12976 VectorOpToIdx[EI->getVectorOperand()].push_back(I); 12977 } 12978 // Sort the vector operands by the maximum number of uses in extractelements. 12979 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors = 12980 VectorOpToIdx.takeVector(); 12981 stable_sort(Vectors, [](const auto &P1, const auto &P2) { 12982 return P1.second.size() > P2.second.size(); 12983 }); 12984 // Find the best pair of the vectors or a single vector. 12985 const int UndefSz = UndefVectorExtracts.size(); 12986 unsigned SingleMax = 0; 12987 unsigned PairMax = 0; 12988 if (!Vectors.empty()) { 12989 SingleMax = Vectors.front().second.size() + UndefSz; 12990 if (Vectors.size() > 1) { 12991 auto *ItNext = std::next(Vectors.begin()); 12992 PairMax = SingleMax + ItNext->second.size(); 12993 } 12994 } 12995 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) 12996 return std::nullopt; 12997 // Check if better to perform a shuffle of 2 vectors or just of a single 12998 // vector. 12999 SmallVector<Value *> SavedVL(VL.begin(), VL.end()); 13000 SmallVector<Value *> GatheredExtracts( 13001 VL.size(), PoisonValue::get(VL.front()->getType())); 13002 if (SingleMax >= PairMax && SingleMax) { 13003 for (int Idx : Vectors.front().second) 13004 std::swap(GatheredExtracts[Idx], VL[Idx]); 13005 } else if (!Vectors.empty()) { 13006 for (unsigned Idx : {0, 1}) 13007 for (int Idx : Vectors[Idx].second) 13008 std::swap(GatheredExtracts[Idx], VL[Idx]); 13009 } 13010 // Add extracts from undefs too. 13011 for (int Idx : UndefVectorExtracts) 13012 std::swap(GatheredExtracts[Idx], VL[Idx]); 13013 // Check that gather of extractelements can be represented as just a 13014 // shuffle of a single/two vectors the scalars are extracted from. 13015 std::optional<TTI::ShuffleKind> Res = 13016 isFixedVectorShuffle(GatheredExtracts, Mask, AC); 13017 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) { 13018 // TODO: try to check other subsets if possible. 13019 // Restore the original VL if attempt was not successful. 13020 copy(SavedVL, VL.begin()); 13021 return std::nullopt; 13022 } 13023 // Restore unused scalars from mask, if some of the extractelements were not 13024 // selected for shuffle. 
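  // Positions whose mask element is still PoisonMaskElem but whose extracted
  // slot holds a genuine undef (not poison) get the original scalar swapped
  // back into VL; the remaining matched positions keep the poison placeholders
  // written above.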
13025 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { 13026 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) && 13027 isa<UndefValue>(GatheredExtracts[I])) { 13028 std::swap(VL[I], GatheredExtracts[I]); 13029 continue; 13030 } 13031 auto *EI = dyn_cast<ExtractElementInst>(VL[I]); 13032 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || 13033 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || 13034 is_contained(UndefVectorExtracts, I)) 13035 continue; 13036 } 13037 return Res; 13038 } 13039 13040 /// Tries to find extractelement instructions with constant indices from fixed 13041 /// vector type and gather such instructions into a bunch, which highly likely 13042 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 13043 /// successful, the matched scalars are replaced by poison values in \p VL for 13044 /// future analysis. 13045 SmallVector<std::optional<TTI::ShuffleKind>> 13046 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, 13047 SmallVectorImpl<int> &Mask, 13048 unsigned NumParts) const { 13049 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); 13050 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); 13051 Mask.assign(VL.size(), PoisonMaskElem); 13052 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 13053 for (unsigned Part : seq<unsigned>(NumParts)) { 13054 // Scan list of gathered scalars for extractelements that can be represented 13055 // as shuffles. 13056 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice( 13057 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); 13058 SmallVector<int> SubMask; 13059 std::optional<TTI::ShuffleKind> Res = 13060 tryToGatherSingleRegisterExtractElements(SubVL, SubMask); 13061 ShufflesRes[Part] = Res; 13062 copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); 13063 } 13064 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) { 13065 return Res.has_value(); 13066 })) 13067 ShufflesRes.clear(); 13068 return ShufflesRes; 13069 } 13070 13071 std::optional<TargetTransformInfo::ShuffleKind> 13072 BoUpSLP::isGatherShuffledSingleRegisterEntry( 13073 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, 13074 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) { 13075 Entries.clear(); 13076 // TODO: currently checking only for Scalars in the tree entry, need to count 13077 // reused elements too for better cost estimation. 13078 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get() 13079 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0) 13080 : TE->UserTreeIndices.front(); 13081 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); 13082 const BasicBlock *TEInsertBlock = nullptr; 13083 // Main node of PHI entries keeps the correct order of operands/incoming 13084 // blocks. 
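  // For a PHI user the vector value has to be available at the end of the
  // corresponding incoming block, so that block's terminator is taken as the
  // insertion point; otherwise the parent block of the user bundle's last
  // instruction is used.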
13085 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) { 13086 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); 13087 TEInsertPt = TEInsertBlock->getTerminator(); 13088 } else { 13089 TEInsertBlock = TEInsertPt->getParent(); 13090 } 13091 if (!DT->isReachableFromEntry(TEInsertBlock)) 13092 return std::nullopt; 13093 auto *NodeUI = DT->getNode(TEInsertBlock); 13094 assert(NodeUI && "Should only process reachable instructions"); 13095 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); 13096 auto CheckOrdering = [&](const Instruction *InsertPt) { 13097 // Argument InsertPt is an instruction where vector code for some other 13098 // tree entry (one that shares one or more scalars with TE) is going to be 13099 // generated. This lambda returns true if insertion point of vector code 13100 // for the TE dominates that point (otherwise dependency is the other way 13101 // around). The other node is not limited to be of a gather kind. Gather 13102 // nodes are not scheduled and their vector code is inserted before their 13103 // first user. If user is PHI, that is supposed to be at the end of a 13104 // predecessor block. Otherwise it is the last instruction among scalars of 13105 // the user node. So, instead of checking dependency between instructions 13106 // themselves, we check dependency between their insertion points for vector 13107 // code (since each scalar instruction ends up as a lane of a vector 13108 // instruction). 13109 const BasicBlock *InsertBlock = InsertPt->getParent(); 13110 auto *NodeEUI = DT->getNode(InsertBlock); 13111 if (!NodeEUI) 13112 return false; 13113 assert((NodeUI == NodeEUI) == 13114 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && 13115 "Different nodes should have different DFS numbers"); 13116 // Check the order of the gather nodes users. 13117 if (TEInsertPt->getParent() != InsertBlock && 13118 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) 13119 return false; 13120 if (TEInsertPt->getParent() == InsertBlock && 13121 TEInsertPt->comesBefore(InsertPt)) 13122 return false; 13123 return true; 13124 }; 13125 // Find all tree entries used by the gathered values. If no common entries 13126 // found - not a shuffle. 13127 // Here we build a set of tree nodes for each gathered value and trying to 13128 // find the intersection between these sets. If we have at least one common 13129 // tree node for each gathered value - we have just a permutation of the 13130 // single vector. If we have 2 different sets, we're in situation where we 13131 // have a permutation of 2 input vectors. 13132 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; 13133 DenseMap<Value *, int> UsedValuesEntry; 13134 for (Value *V : VL) { 13135 if (isConstant(V)) 13136 continue; 13137 // Build a list of tree entries where V is used. 13138 SmallPtrSet<const TreeEntry *, 4> VToTEs; 13139 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { 13140 if (TEPtr == TE || TEPtr->Idx == 0) 13141 continue; 13142 assert(any_of(TEPtr->Scalars, 13143 [&](Value *V) { return GatheredScalars.contains(V); }) && 13144 "Must contain at least single gathered value."); 13145 assert(TEPtr->UserTreeIndices.size() == 1 && 13146 "Expected only single user of a gather node."); 13147 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); 13148 13149 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp()); 13150 const Instruction *InsertPt = 13151 UserPHI ? 
UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() 13152 : &getLastInstructionInBundle(UseEI.UserTE); 13153 if (TEInsertPt == InsertPt) { 13154 // If 2 gathers are operands of the same entry (regardless of whether 13155 // user is PHI or else), compare operands indices, use the earlier one 13156 // as the base. 13157 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) 13158 continue; 13159 // If the user instruction is used for some reason in different 13160 // vectorized nodes - make it depend on index. 13161 if (TEUseEI.UserTE != UseEI.UserTE && 13162 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) 13163 continue; 13164 } 13165 13166 // Check if the user node of the TE comes after user node of TEPtr, 13167 // otherwise TEPtr depends on TE. 13168 if ((TEInsertBlock != InsertPt->getParent() || 13169 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && 13170 !CheckOrdering(InsertPt)) 13171 continue; 13172 VToTEs.insert(TEPtr); 13173 } 13174 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) { 13175 const TreeEntry *VTE = VTEs.front(); 13176 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && 13177 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) { 13178 VTEs = VTEs.drop_front(); 13179 // Iterate through all vectorized nodes. 13180 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) { 13181 return MTE->State == TreeEntry::Vectorize; 13182 }); 13183 if (MIt == VTEs.end()) 13184 continue; 13185 VTE = *MIt; 13186 } 13187 if (none_of(TE->CombinedEntriesWithIndices, 13188 [&](const auto &P) { return P.first == VTE->Idx; })) { 13189 Instruction &LastBundleInst = getLastInstructionInBundle(VTE); 13190 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) 13191 continue; 13192 } 13193 VToTEs.insert(VTE); 13194 } 13195 if (VToTEs.empty()) 13196 continue; 13197 if (UsedTEs.empty()) { 13198 // The first iteration, just insert the list of nodes to vector. 13199 UsedTEs.push_back(VToTEs); 13200 UsedValuesEntry.try_emplace(V, 0); 13201 } else { 13202 // Need to check if there are any previously used tree nodes which use V. 13203 // If there are no such nodes, consider that we have another one input 13204 // vector. 13205 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); 13206 unsigned Idx = 0; 13207 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { 13208 // Do we have a non-empty intersection of previously listed tree entries 13209 // and tree entries using current V? 13210 set_intersect(VToTEs, Set); 13211 if (!VToTEs.empty()) { 13212 // Yes, write the new subset and continue analysis for the next 13213 // scalar. 13214 Set.swap(VToTEs); 13215 break; 13216 } 13217 VToTEs = SavedVToTEs; 13218 ++Idx; 13219 } 13220 // No non-empty intersection found - need to add a second set of possible 13221 // source vectors. 13222 if (Idx == UsedTEs.size()) { 13223 // If the number of input vectors is greater than 2 - not a permutation, 13224 // fallback to the regular gather. 13225 // TODO: support multiple reshuffled nodes. 13226 if (UsedTEs.size() == 2) 13227 continue; 13228 UsedTEs.push_back(SavedVToTEs); 13229 Idx = UsedTEs.size() - 1; 13230 } 13231 UsedValuesEntry.try_emplace(V, Idx); 13232 } 13233 } 13234 13235 if (UsedTEs.empty()) { 13236 Entries.clear(); 13237 return std::nullopt; 13238 } 13239 13240 unsigned VF = 0; 13241 if (UsedTEs.size() == 1) { 13242 // Keep the order to avoid non-determinism. 
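    // Reaching here means all tracked scalars of VL share at least one
    // candidate tree entry, so the gather can only be a permutation of a
    // single source vector. Hypothetical example: for VL = {%c, %a, %d, %b}
    // and an entry whose scalars are {%a, %b, %c, %d}, the lookup below
    // typically yields SK_PermuteSingleSrc with mask <2, 0, 3, 1> (subject to
    // the profitability checks further down).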
13243 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(), 13244 UsedTEs.front().end()); 13245 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { 13246 return TE1->Idx < TE2->Idx; 13247 }); 13248 // Try to find the perfect match in another gather node at first. 13249 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { 13250 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); 13251 }); 13252 if (It != FirstEntries.end() && 13253 ((*It)->getVectorFactor() == VL.size() || 13254 ((*It)->getVectorFactor() == TE->Scalars.size() && 13255 TE->ReuseShuffleIndices.size() == VL.size() && 13256 (*It)->isSame(TE->Scalars)))) { 13257 Entries.push_back(*It); 13258 if ((*It)->getVectorFactor() == VL.size()) { 13259 std::iota(std::next(Mask.begin(), Part * VL.size()), 13260 std::next(Mask.begin(), (Part + 1) * VL.size()), 0); 13261 } else { 13262 SmallVector<int> CommonMask = TE->getCommonMask(); 13263 copy(CommonMask, Mask.begin()); 13264 } 13265 // Clear undef scalars. 13266 for (unsigned I : seq<unsigned>(VL.size())) 13267 if (isa<PoisonValue>(VL[I])) 13268 Mask[Part * VL.size() + I] = PoisonMaskElem; 13269 return TargetTransformInfo::SK_PermuteSingleSrc; 13270 } 13271 // No perfect match, just shuffle, so choose the first tree node from the 13272 // tree. 13273 Entries.push_back(FirstEntries.front()); 13274 VF = FirstEntries.front()->getVectorFactor(); 13275 } else { 13276 // Try to find nodes with the same vector factor. 13277 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); 13278 // Keep the order of tree nodes to avoid non-determinism. 13279 DenseMap<int, const TreeEntry *> VFToTE; 13280 for (const TreeEntry *TE : UsedTEs.front()) { 13281 unsigned VF = TE->getVectorFactor(); 13282 auto It = VFToTE.find(VF); 13283 if (It != VFToTE.end()) { 13284 if (It->second->Idx > TE->Idx) 13285 It->getSecond() = TE; 13286 continue; 13287 } 13288 VFToTE.try_emplace(VF, TE); 13289 } 13290 // Same, keep the order to avoid non-determinism. 13291 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(), 13292 UsedTEs.back().end()); 13293 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { 13294 return TE1->Idx < TE2->Idx; 13295 }); 13296 for (const TreeEntry *TE : SecondEntries) { 13297 auto It = VFToTE.find(TE->getVectorFactor()); 13298 if (It != VFToTE.end()) { 13299 VF = It->first; 13300 Entries.push_back(It->second); 13301 Entries.push_back(TE); 13302 break; 13303 } 13304 } 13305 // No 2 source vectors with the same vector factor - just choose 2 with max 13306 // index. 13307 if (Entries.empty()) { 13308 Entries.push_back(*llvm::max_element( 13309 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) { 13310 return TE1->Idx < TE2->Idx; 13311 })); 13312 Entries.push_back(SecondEntries.front()); 13313 VF = std::max(Entries.front()->getVectorFactor(), 13314 Entries.back()->getVectorFactor()); 13315 } else { 13316 VF = Entries.front()->getVectorFactor(); 13317 } 13318 } 13319 13320 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>); 13321 // Checks if the 2 PHIs are compatible in terms of high possibility to be 13322 // vectorized. 13323 auto AreCompatiblePHIs = [&](Value *V, Value *V1) { 13324 auto *PHI = cast<PHINode>(V); 13325 auto *PHI1 = cast<PHINode>(V1); 13326 // Check that all incoming values are compatible/from same parent (if they 13327 // are instructions). 
13328 // The incoming values are compatible if they all are constants, or 13329 // instruction with the same/alternate opcodes from the same basic block. 13330 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { 13331 Value *In = PHI->getIncomingValue(I); 13332 Value *In1 = PHI1->getIncomingValue(I); 13333 if (isConstant(In) && isConstant(In1)) 13334 continue; 13335 if (!getSameOpcode({In, In1}, *TLI)) 13336 return false; 13337 if (cast<Instruction>(In)->getParent() != 13338 cast<Instruction>(In1)->getParent()) 13339 return false; 13340 } 13341 return true; 13342 }; 13343 // Check if the value can be ignored during analysis for shuffled gathers. 13344 // We suppose it is better to ignore instruction, which do not form splats, 13345 // are not vectorized/not extractelements (these instructions will be handled 13346 // by extractelements processing) or may form vector node in future. 13347 auto MightBeIgnored = [=](Value *V) { 13348 auto *I = dyn_cast<Instruction>(V); 13349 return I && !IsSplatOrUndefs && !isVectorized(I) && 13350 !isVectorLikeInstWithConstOps(I) && 13351 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I); 13352 }; 13353 // Check that the neighbor instruction may form a full vector node with the 13354 // current instruction V. It is possible, if they have same/alternate opcode 13355 // and same parent basic block. 13356 auto NeighborMightBeIgnored = [&](Value *V, int Idx) { 13357 Value *V1 = VL[Idx]; 13358 bool UsedInSameVTE = false; 13359 auto It = UsedValuesEntry.find(V1); 13360 if (It != UsedValuesEntry.end()) 13361 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; 13362 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && 13363 getSameOpcode({V, V1}, *TLI) && 13364 cast<Instruction>(V)->getParent() == 13365 cast<Instruction>(V1)->getParent() && 13366 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1)); 13367 }; 13368 // Build a shuffle mask for better cost estimation and vector emission. 13369 SmallBitVector UsedIdxs(Entries.size()); 13370 SmallVector<std::pair<unsigned, int>> EntryLanes; 13371 for (int I = 0, E = VL.size(); I < E; ++I) { 13372 Value *V = VL[I]; 13373 auto It = UsedValuesEntry.find(V); 13374 if (It == UsedValuesEntry.end()) 13375 continue; 13376 // Do not try to shuffle scalars, if they are constants, or instructions 13377 // that can be vectorized as a result of the following vector build 13378 // vectorization. 13379 if (isConstant(V) || (MightBeIgnored(V) && 13380 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || 13381 (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))) 13382 continue; 13383 unsigned Idx = It->second; 13384 EntryLanes.emplace_back(Idx, I); 13385 UsedIdxs.set(Idx); 13386 } 13387 // Iterate through all shuffled scalars and select entries, which can be used 13388 // for final shuffle. 13389 SmallVector<const TreeEntry *> TempEntries; 13390 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) { 13391 if (!UsedIdxs.test(I)) 13392 continue; 13393 // Fix the entry number for the given scalar. If it is the first entry, set 13394 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). 13395 // These indices are used when calculating final shuffle mask as the vector 13396 // offset. 
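    // Hypothetical example: if only the second of two candidate entries is
    // referenced by EntryLanes, it is renumbered to 0 here so the surviving
    // entries stay densely numbered and the Pair.first * VF offsets used for
    // the final mask stay correct.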
13397 for (std::pair<unsigned, int> &Pair : EntryLanes) 13398 if (Pair.first == I) 13399 Pair.first = TempEntries.size(); 13400 TempEntries.push_back(Entries[I]); 13401 } 13402 Entries.swap(TempEntries); 13403 if (EntryLanes.size() == Entries.size() && 13404 !VL.equals(ArrayRef(TE->Scalars) 13405 .slice(Part * VL.size(), 13406 std::min<int>(VL.size(), TE->Scalars.size())))) { 13407 // We may have here 1 or 2 entries only. If the number of scalars is equal 13408 // to the number of entries, no need to do the analysis, it is not very 13409 // profitable. Since VL is not the same as TE->Scalars, it means we already 13410 // have some shuffles before. Cut off not profitable case. 13411 Entries.clear(); 13412 return std::nullopt; 13413 } 13414 // Build the final mask, check for the identity shuffle, if possible. 13415 bool IsIdentity = Entries.size() == 1; 13416 // Pair.first is the offset to the vector, while Pair.second is the index of 13417 // scalar in the list. 13418 for (const std::pair<unsigned, int> &Pair : EntryLanes) { 13419 unsigned Idx = Part * VL.size() + Pair.second; 13420 Mask[Idx] = 13421 Pair.first * VF + 13422 (ForOrder ? std::distance( 13423 Entries[Pair.first]->Scalars.begin(), 13424 find(Entries[Pair.first]->Scalars, VL[Pair.second])) 13425 : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); 13426 IsIdentity &= Mask[Idx] == Pair.second; 13427 } 13428 if (ForOrder || IsIdentity || Entries.empty()) { 13429 switch (Entries.size()) { 13430 case 1: 13431 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) 13432 return TargetTransformInfo::SK_PermuteSingleSrc; 13433 break; 13434 case 2: 13435 if (EntryLanes.size() > 2 || VL.size() <= 2) 13436 return TargetTransformInfo::SK_PermuteTwoSrc; 13437 break; 13438 default: 13439 break; 13440 } 13441 } else if (!isa<VectorType>(VL.front()->getType()) && 13442 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { 13443 // Do the cost estimation if shuffle beneficial than buildvector. 13444 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()), 13445 std::next(Mask.begin(), (Part + 1) * VL.size())); 13446 int MinElement = SubMask.front(), MaxElement = SubMask.front(); 13447 for (int Idx : SubMask) { 13448 if (Idx == PoisonMaskElem) 13449 continue; 13450 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF) 13451 MinElement = Idx; 13452 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF) 13453 MaxElement = Idx; 13454 } 13455 assert(MaxElement >= 0 && MinElement >= 0 && 13456 MaxElement % VF >= MinElement % VF && 13457 "Expected at least single element."); 13458 unsigned NewVF = std::max<unsigned>( 13459 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(), 13460 (MaxElement % VF) - 13461 (MinElement % VF) + 1)); 13462 if (NewVF < VF) { 13463 for_each(SubMask, [&](int &Idx) { 13464 if (Idx == PoisonMaskElem) 13465 return; 13466 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF + 13467 (Idx >= static_cast<int>(VF) ? 
NewVF : 0); 13468 }); 13469 } else { 13470 NewVF = VF; 13471 } 13472 13473 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 13474 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF); 13475 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); 13476 auto GetShuffleCost = [&, 13477 &TTI = *TTI](ArrayRef<int> Mask, 13478 ArrayRef<const TreeEntry *> Entries, 13479 VectorType *VecTy) -> InstructionCost { 13480 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 && 13481 ShuffleVectorInst::isDeInterleaveMaskOfFactor( 13482 Mask, Entries.front()->getInterleaveFactor())) 13483 return TTI::TCC_Free; 13484 return ::getShuffleCost(TTI, 13485 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc 13486 : TTI::SK_PermuteSingleSrc, 13487 VecTy, Mask, CostKind); 13488 }; 13489 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy); 13490 InstructionCost FirstShuffleCost = 0; 13491 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end()); 13492 if (Entries.size() == 1 || !Entries[0]->isGather()) { 13493 FirstShuffleCost = ShuffleCost; 13494 } else { 13495 // Transform mask to include only first entry. 13496 APInt DemandedElts = APInt::getAllOnes(SubMask.size()); 13497 bool IsIdentity = true; 13498 for (auto [I, Idx] : enumerate(FirstMask)) { 13499 if (Idx >= static_cast<int>(NewVF)) { 13500 Idx = PoisonMaskElem; 13501 } else { 13502 DemandedElts.clearBit(I); 13503 if (Idx != PoisonMaskElem) 13504 IsIdentity &= static_cast<int>(I) == Idx; 13505 } 13506 } 13507 if (!IsIdentity) 13508 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy); 13509 FirstShuffleCost += TTI->getScalarizationOverhead( 13510 MaskVecTy, DemandedElts, /*Insert=*/true, 13511 /*Extract=*/false, CostKind); 13512 } 13513 InstructionCost SecondShuffleCost = 0; 13514 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end()); 13515 if (Entries.size() == 1 || !Entries[1]->isGather()) { 13516 SecondShuffleCost = ShuffleCost; 13517 } else { 13518 // Transform mask to include only first entry. 
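      // Here the mask is reduced to the elements coming from the second entry:
      // lanes that refer to the first entry are poisoned, the remaining
      // indices are rebased by subtracting NewVF, and the poisoned positions
      // are later costed as scalar inserts via DemandedElts.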
13519 APInt DemandedElts = APInt::getAllOnes(SubMask.size()); 13520 bool IsIdentity = true; 13521 for (auto [I, Idx] : enumerate(SecondMask)) { 13522 if (Idx < static_cast<int>(NewVF) && Idx >= 0) { 13523 Idx = PoisonMaskElem; 13524 } else { 13525 DemandedElts.clearBit(I); 13526 if (Idx != PoisonMaskElem) { 13527 Idx -= NewVF; 13528 IsIdentity &= static_cast<int>(I) == Idx; 13529 } 13530 } 13531 } 13532 if (!IsIdentity) 13533 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy); 13534 SecondShuffleCost += TTI->getScalarizationOverhead( 13535 MaskVecTy, DemandedElts, /*Insert=*/true, 13536 /*Extract=*/false, CostKind); 13537 } 13538 APInt DemandedElts = APInt::getAllOnes(SubMask.size()); 13539 for (auto [I, Idx] : enumerate(SubMask)) 13540 if (Idx == PoisonMaskElem) 13541 DemandedElts.clearBit(I); 13542 InstructionCost BuildVectorCost = 13543 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true, 13544 /*Extract=*/false, CostKind); 13545 const TreeEntry *BestEntry = nullptr; 13546 if (FirstShuffleCost < ShuffleCost) { 13547 std::for_each(std::next(Mask.begin(), Part * VL.size()), 13548 std::next(Mask.begin(), (Part + 1) * VL.size()), 13549 [&](int &Idx) { 13550 if (Idx >= static_cast<int>(VF)) 13551 Idx = PoisonMaskElem; 13552 }); 13553 BestEntry = Entries.front(); 13554 ShuffleCost = FirstShuffleCost; 13555 } 13556 if (SecondShuffleCost < ShuffleCost) { 13557 std::for_each(std::next(Mask.begin(), Part * VL.size()), 13558 std::next(Mask.begin(), (Part + 1) * VL.size()), 13559 [&](int &Idx) { 13560 if (Idx < static_cast<int>(VF)) 13561 Idx = PoisonMaskElem; 13562 else 13563 Idx -= VF; 13564 }); 13565 BestEntry = Entries[1]; 13566 ShuffleCost = SecondShuffleCost; 13567 } 13568 if (BuildVectorCost >= ShuffleCost) { 13569 if (BestEntry) { 13570 Entries.clear(); 13571 Entries.push_back(BestEntry); 13572 } 13573 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc 13574 : TargetTransformInfo::SK_PermuteSingleSrc; 13575 } 13576 } 13577 Entries.clear(); 13578 // Clear the corresponding mask elements. 13579 std::fill(std::next(Mask.begin(), Part * VL.size()), 13580 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); 13581 return std::nullopt; 13582 } 13583 13584 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 13585 BoUpSLP::isGatherShuffledEntry( 13586 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, 13587 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts, 13588 bool ForOrder) { 13589 assert(NumParts > 0 && NumParts < VL.size() && 13590 "Expected positive number of registers."); 13591 Entries.clear(); 13592 // No need to check for the topmost gather node. 13593 if (TE == VectorizableTree.front().get() && 13594 (!GatheredLoadsEntriesFirst.has_value() || 13595 none_of(ArrayRef(VectorizableTree).drop_front(), 13596 [](const std::unique_ptr<TreeEntry> &TE) { 13597 return !TE->isGather(); 13598 }))) 13599 return {}; 13600 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not 13601 // implemented yet. 
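  // Hypothetical example: six scalars on a target whose vector registers hold
  // four such elements would need a partial register for the tail, so such
  // nodes are rejected here.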
13602 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) 13603 return {}; 13604 Mask.assign(VL.size(), PoisonMaskElem); 13605 assert((TE->UserTreeIndices.size() == 1 || 13606 TE == VectorizableTree.front().get()) && 13607 "Expected only single user of the gather node."); 13608 assert(VL.size() % NumParts == 0 && 13609 "Number of scalars must be divisible by NumParts."); 13610 if (!TE->UserTreeIndices.empty() && 13611 TE->UserTreeIndices.front().UserTE->isGather() && 13612 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) { 13613 assert( 13614 (TE->Idx == 0 || 13615 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) || 13616 isSplat(TE->Scalars)) && 13617 "Expected splat or extractelements only node."); 13618 return {}; 13619 } 13620 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 13621 SmallVector<std::optional<TTI::ShuffleKind>> Res; 13622 for (unsigned Part : seq<unsigned>(NumParts)) { 13623 ArrayRef<Value *> SubVL = 13624 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); 13625 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); 13626 std::optional<TTI::ShuffleKind> SubRes = 13627 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, 13628 ForOrder); 13629 if (!SubRes) 13630 SubEntries.clear(); 13631 Res.push_back(SubRes); 13632 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && 13633 SubEntries.front()->getVectorFactor() == VL.size() && 13634 (SubEntries.front()->isSame(TE->Scalars) || 13635 SubEntries.front()->isSame(VL))) { 13636 SmallVector<const TreeEntry *> LocalSubEntries; 13637 LocalSubEntries.swap(SubEntries); 13638 Entries.clear(); 13639 Res.clear(); 13640 std::iota(Mask.begin(), Mask.end(), 0); 13641 // Clear undef scalars. 13642 for (int I = 0, Sz = VL.size(); I < Sz; ++I) 13643 if (isa<PoisonValue>(VL[I])) 13644 Mask[I] = PoisonMaskElem; 13645 Entries.emplace_back(1, LocalSubEntries.front()); 13646 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); 13647 return Res; 13648 } 13649 } 13650 if (all_of(Res, 13651 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { 13652 Entries.clear(); 13653 return {}; 13654 } 13655 return Res; 13656 } 13657 13658 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, 13659 Type *ScalarTy) const { 13660 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 13661 bool DuplicateNonConst = false; 13662 // Find the cost of inserting/extracting values from the vector. 13663 // Check if the same elements are inserted several times and count them as 13664 // shuffle candidates. 13665 APInt ShuffledElements = APInt::getZero(VL.size()); 13666 DenseMap<Value *, unsigned> UniqueElements; 13667 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 13668 InstructionCost Cost; 13669 auto EstimateInsertCost = [&](unsigned I, Value *V) { 13670 if (V->getType() != ScalarTy) { 13671 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(), 13672 TTI::CastContextHint::None, CostKind); 13673 V = nullptr; 13674 } 13675 if (!ForPoisonSrc) 13676 Cost += 13677 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 13678 I, Constant::getNullValue(VecTy), V); 13679 }; 13680 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem); 13681 for (unsigned I = 0, E = VL.size(); I < E; ++I) { 13682 Value *V = VL[I]; 13683 // No need to shuffle duplicates for constants. 
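    // A repeated constant can simply be rematerialized in the build vector, so
    // it never triggers the extra single-source permute that duplicated
    // non-constant values cost below.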
13684 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) { 13685 ShuffledElements.setBit(I); 13686 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I; 13687 continue; 13688 } 13689 13690 auto Res = UniqueElements.try_emplace(V, I); 13691 if (Res.second) { 13692 EstimateInsertCost(I, V); 13693 ShuffleMask[I] = I; 13694 continue; 13695 } 13696 13697 DuplicateNonConst = true; 13698 ShuffledElements.setBit(I); 13699 ShuffleMask[I] = Res.first->second; 13700 } 13701 if (ForPoisonSrc) { 13702 if (isa<FixedVectorType>(ScalarTy)) { 13703 assert(SLPReVec && "Only supported by REVEC."); 13704 // We don't need to insert elements one by one. Instead, we can insert the 13705 // entire vector into the destination. 13706 Cost = 0; 13707 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 13708 for (unsigned I : seq<unsigned>(VL.size())) 13709 if (!ShuffledElements[I]) 13710 Cost += TTI->getShuffleCost( 13711 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind, 13712 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy)); 13713 } else { 13714 Cost = TTI->getScalarizationOverhead(VecTy, 13715 /*DemandedElts*/ ~ShuffledElements, 13716 /*Insert*/ true, 13717 /*Extract*/ false, CostKind, VL); 13718 } 13719 } 13720 if (DuplicateNonConst) 13721 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, 13722 VecTy, ShuffleMask); 13723 return Cost; 13724 } 13725 13726 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { 13727 auto &Res = EntryToLastInstruction.try_emplace(E).first->second; 13728 if (Res) 13729 return *Res; 13730 // Get the basic block this bundle is in. All instructions in the bundle 13731 // should be in this block (except for extractelement-like instructions with 13732 // constant indices or gathered loads). 
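  // For example (hypothetical bundle): for scalars {%a, %b} that both live in
  // this block with %a preceding %b in program order, the bundle's last
  // instruction is %b, which later becomes the reference insertion point for
  // the vectorized bundle.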
13733 auto *Front = E->getMainOp(); 13734 auto *BB = Front->getParent(); 13735 assert(((GatheredLoadsEntriesFirst.has_value() && 13736 E->getOpcode() == Instruction::Load && E->isGather() && 13737 E->Idx < *GatheredLoadsEntriesFirst) || 13738 all_of(E->Scalars, 13739 [=](Value *V) -> bool { 13740 if (E->getOpcode() == Instruction::GetElementPtr && 13741 !isa<GetElementPtrInst>(V)) 13742 return true; 13743 auto *I = dyn_cast<Instruction>(V); 13744 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB || 13745 isVectorLikeInstWithConstOps(I); 13746 })) && 13747 "Expected gathered loads or GEPs or instructions from same basic " 13748 "block."); 13749 13750 auto FindLastInst = [&]() { 13751 Instruction *LastInst = Front; 13752 for (Value *V : E->Scalars) { 13753 auto *I = dyn_cast<Instruction>(V); 13754 if (!I) 13755 continue; 13756 if (LastInst->getParent() == I->getParent()) { 13757 if (LastInst->comesBefore(I)) 13758 LastInst = I; 13759 continue; 13760 } 13761 assert(((E->getOpcode() == Instruction::GetElementPtr && 13762 !isa<GetElementPtrInst>(I)) || 13763 (isVectorLikeInstWithConstOps(LastInst) && 13764 isVectorLikeInstWithConstOps(I)) || 13765 (GatheredLoadsEntriesFirst.has_value() && 13766 E->getOpcode() == Instruction::Load && E->isGather() && 13767 E->Idx < *GatheredLoadsEntriesFirst)) && 13768 "Expected vector-like or non-GEP in GEP node insts only."); 13769 if (!DT->isReachableFromEntry(LastInst->getParent())) { 13770 LastInst = I; 13771 continue; 13772 } 13773 if (!DT->isReachableFromEntry(I->getParent())) 13774 continue; 13775 auto *NodeA = DT->getNode(LastInst->getParent()); 13776 auto *NodeB = DT->getNode(I->getParent()); 13777 assert(NodeA && "Should only process reachable instructions"); 13778 assert(NodeB && "Should only process reachable instructions"); 13779 assert((NodeA == NodeB) == 13780 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 13781 "Different nodes should have different DFS numbers"); 13782 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) 13783 LastInst = I; 13784 } 13785 BB = LastInst->getParent(); 13786 return LastInst; 13787 }; 13788 13789 auto FindFirstInst = [&]() { 13790 Instruction *FirstInst = Front; 13791 for (Value *V : E->Scalars) { 13792 auto *I = dyn_cast<Instruction>(V); 13793 if (!I) 13794 continue; 13795 if (FirstInst->getParent() == I->getParent()) { 13796 if (I->comesBefore(FirstInst)) 13797 FirstInst = I; 13798 continue; 13799 } 13800 assert(((E->getOpcode() == Instruction::GetElementPtr && 13801 !isa<GetElementPtrInst>(I)) || 13802 (isVectorLikeInstWithConstOps(FirstInst) && 13803 isVectorLikeInstWithConstOps(I))) && 13804 "Expected vector-like or non-GEP in GEP node insts only."); 13805 if (!DT->isReachableFromEntry(FirstInst->getParent())) { 13806 FirstInst = I; 13807 continue; 13808 } 13809 if (!DT->isReachableFromEntry(I->getParent())) 13810 continue; 13811 auto *NodeA = DT->getNode(FirstInst->getParent()); 13812 auto *NodeB = DT->getNode(I->getParent()); 13813 assert(NodeA && "Should only process reachable instructions"); 13814 assert(NodeB && "Should only process reachable instructions"); 13815 assert((NodeA == NodeB) == 13816 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 13817 "Different nodes should have different DFS numbers"); 13818 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn()) 13819 FirstInst = I; 13820 } 13821 return FirstInst; 13822 }; 13823 13824 // Set insertpoint for gathered loads to the very first load. 
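  // For these load nodes the vectorized code is placed at the earliest of the
  // scalar loads rather than after the latest one, hence FindFirstInst below.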
13825 if (GatheredLoadsEntriesFirst.has_value() && 13826 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && 13827 E->getOpcode() == Instruction::Load) { 13828 Res = FindFirstInst(); 13829 return *Res; 13830 } 13831 13832 // Set the insert point to the beginning of the basic block if the entry 13833 // should not be scheduled. 13834 if (doesNotNeedToSchedule(E->Scalars) || 13835 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) { 13836 if ((E->getOpcode() == Instruction::GetElementPtr && 13837 any_of(E->Scalars, 13838 [](Value *V) { 13839 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V); 13840 })) || 13841 all_of(E->Scalars, 13842 [](Value *V) { 13843 return isa<PoisonValue>(V) || 13844 (!isVectorLikeInstWithConstOps(V) && 13845 isUsedOutsideBlock(V)); 13846 }) || 13847 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) { 13848 return isa<ExtractElementInst, UndefValue>(V) || 13849 areAllOperandsNonInsts(V); 13850 }))) 13851 Res = FindLastInst(); 13852 else 13853 Res = FindFirstInst(); 13854 return *Res; 13855 } 13856 13857 // Find the last instruction. The common case should be that BB has been 13858 // scheduled, and the last instruction is VL.back(). So we start with 13859 // VL.back() and iterate over schedule data until we reach the end of the 13860 // bundle. The end of the bundle is marked by null ScheduleData. 13861 if (BlocksSchedules.count(BB) && !E->isGather()) { 13862 Value *V = E->isOneOf(E->Scalars.back()); 13863 if (doesNotNeedToBeScheduled(V)) 13864 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); 13865 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); 13866 if (Bundle && Bundle->isPartOfBundle()) 13867 for (; Bundle; Bundle = Bundle->NextInBundle) 13868 Res = Bundle->Inst; 13869 } 13870 13871 // LastInst can still be null at this point if there's either not an entry 13872 // for BB in BlocksSchedules or there's no ScheduleData available for 13873 // VL.back(). This can be the case if buildTree_rec aborts for various 13874 // reasons (e.g., the maximum recursion depth is reached, the maximum region 13875 // size is reached, etc.). ScheduleData is initialized in the scheduling 13876 // "dry-run". 13877 // 13878 // If this happens, we can still find the last instruction by brute force. We 13879 // iterate forwards from Front (inclusive) until we either see all 13880 // instructions in the bundle or reach the end of the block. If Front is the 13881 // last instruction in program order, LastInst will be set to Front, and we 13882 // will visit all the remaining instructions in the block. 13883 // 13884 // One of the reasons we exit early from buildTree_rec is to place an upper 13885 // bound on compile-time. Thus, taking an additional compile-time hit here is 13886 // not ideal. However, this should be exceedingly rare since it requires that 13887 // we both exit early from buildTree_rec and that the bundle be out-of-order 13888 // (causing us to iterate all the way to the end of the block). 13889 if (!Res) 13890 Res = FindLastInst(); 13891 assert(Res && "Failed to find last instruction in bundle"); 13892 return *Res; 13893 } 13894 13895 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { 13896 auto *Front = E->getMainOp(); 13897 Instruction *LastInst = &getLastInstructionInBundle(E); 13898 assert(LastInst && "Failed to find last instruction in bundle"); 13899 BasicBlock::iterator LastInstIt = LastInst->getIterator(); 13900 // If the instruction is PHI, set the insert point after all the PHIs. 
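  // Hypothetical example: if the bundle's last instruction is a PHI, new
  // vector instructions cannot be interleaved with the PHI group, so the
  // insertion point becomes the block's first non-PHI instruction instead.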
13901 bool IsPHI = isa<PHINode>(LastInst); 13902 if (IsPHI) 13903 LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); 13904 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) { 13905 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); 13906 } else { 13907 // Set the insertion point after the last instruction in the bundle. Set the 13908 // debug location to Front. 13909 Builder.SetInsertPoint( 13910 LastInst->getParent(), 13911 LastInst->getNextNonDebugInstruction()->getIterator()); 13912 } 13913 Builder.SetCurrentDebugLocation(Front->getDebugLoc()); 13914 } 13915 13916 Value *BoUpSLP::gather( 13917 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy, 13918 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) { 13919 // List of instructions/lanes from current block and/or the blocks which are 13920 // part of the current loop. These instructions will be inserted at the end to 13921 // make it possible to optimize loops and hoist invariant instructions out of 13922 // the loops body with better chances for success. 13923 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; 13924 SmallSet<int, 4> PostponedIndices; 13925 Loop *L = LI->getLoopFor(Builder.GetInsertBlock()); 13926 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { 13927 SmallPtrSet<BasicBlock *, 4> Visited; 13928 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second) 13929 InsertBB = InsertBB->getSinglePredecessor(); 13930 return InsertBB && InsertBB == InstBB; 13931 }; 13932 for (int I = 0, E = VL.size(); I < E; ++I) { 13933 if (auto *Inst = dyn_cast<Instruction>(VL[I])) 13934 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || 13935 isVectorized(Inst) || 13936 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) && 13937 PostponedIndices.insert(I).second) 13938 PostponedInsts.emplace_back(Inst, I); 13939 } 13940 13941 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos, 13942 Type *Ty) { 13943 Value *Scalar = V; 13944 if (Scalar->getType() != Ty) { 13945 assert(Scalar->getType()->isIntOrIntVectorTy() && 13946 Ty->isIntOrIntVectorTy() && "Expected integer types only."); 13947 Value *V = Scalar; 13948 if (auto *CI = dyn_cast<CastInst>(Scalar); 13949 isa_and_nonnull<SExtInst, ZExtInst>(CI)) { 13950 Value *Op = CI->getOperand(0); 13951 if (auto *IOp = dyn_cast<Instruction>(Op); 13952 !IOp || !(isDeleted(IOp) || isVectorized(IOp))) 13953 V = Op; 13954 } 13955 Scalar = Builder.CreateIntCast( 13956 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); 13957 } 13958 13959 Instruction *InsElt; 13960 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) { 13961 assert(SLPReVec && "FixedVectorType is not expected."); 13962 Vec = 13963 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); 13964 auto *II = dyn_cast<IntrinsicInst>(Vec); 13965 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) 13966 return Vec; 13967 InsElt = II; 13968 } else { 13969 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); 13970 InsElt = dyn_cast<InsertElementInst>(Vec); 13971 if (!InsElt) 13972 return Vec; 13973 } 13974 GatherShuffleExtractSeq.insert(InsElt); 13975 CSEBlocks.insert(InsElt->getParent()); 13976 // Add to our 'need-to-extract' list. 13977 if (isa<Instruction>(V)) { 13978 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) { 13979 // Find which lane we need to extract. 
13980 User *UserOp = nullptr; 13981 if (Scalar != V) { 13982 if (auto *SI = dyn_cast<Instruction>(Scalar)) 13983 UserOp = SI; 13984 } else { 13985 UserOp = InsElt; 13986 } 13987 if (UserOp) { 13988 unsigned FoundLane = Entries.front()->findLaneForValue(V); 13989 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane); 13990 } 13991 } 13992 } 13993 return Vec; 13994 }; 13995 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 13996 Value *Vec = PoisonValue::get(VecTy); 13997 SmallVector<int> NonConsts; 13998 SmallVector<int> Mask(VL.size()); 13999 std::iota(Mask.begin(), Mask.end(), 0); 14000 Value *OriginalRoot = Root; 14001 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root); 14002 SV && isa<PoisonValue>(SV->getOperand(1)) && 14003 SV->getOperand(0)->getType() == VecTy) { 14004 Root = SV->getOperand(0); 14005 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end()); 14006 } 14007 // Insert constant values at first. 14008 for (int I = 0, E = VL.size(); I < E; ++I) { 14009 if (PostponedIndices.contains(I)) 14010 continue; 14011 if (!isConstant(VL[I])) { 14012 NonConsts.push_back(I); 14013 continue; 14014 } 14015 if (isa<PoisonValue>(VL[I])) 14016 continue; 14017 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy); 14018 Mask[I] = I + E; 14019 } 14020 if (Root) { 14021 if (isa<PoisonValue>(Vec)) { 14022 Vec = OriginalRoot; 14023 } else { 14024 Vec = CreateShuffle(Root, Vec, Mask); 14025 if (auto *OI = dyn_cast<Instruction>(OriginalRoot); 14026 OI && OI->hasNUses(0) && 14027 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { 14028 return TE->VectorizedValue == OI; 14029 })) 14030 eraseInstruction(OI); 14031 } 14032 } 14033 // Insert non-constant values. 14034 for (int I : NonConsts) 14035 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy); 14036 // Append instructions, which are/may be part of the loop, in the end to make 14037 // it possible to hoist non-loop-based instructions. 14038 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts) 14039 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy); 14040 14041 return Vec; 14042 } 14043 14044 /// Merges shuffle masks and emits final shuffle instruction, if required. It 14045 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission, 14046 /// when the actual shuffle instruction is generated only if this is actually 14047 /// required. Otherwise, the shuffle instruction emission is delayed till the 14048 /// end of the process, to reduce the number of emitted instructions and further 14049 /// analysis/transformations. 14050 /// The class also will look through the previously emitted shuffle instructions 14051 /// and properly mark indices in mask as undef. 14052 /// For example, given the code 14053 /// \code 14054 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> 14055 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> 14056 /// \endcode 14057 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will 14058 /// look through %s1 and %s2 and emit 14059 /// \code 14060 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> 14061 /// \endcode 14062 /// instead. 14063 /// If 2 operands are of different size, the smallest one will be resized and 14064 /// the mask recalculated properly. 
14065 /// For example, given the code 14066 /// \code 14067 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> 14068 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> 14069 /// \endcode 14070 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will 14071 /// look through %s1 and %s2 and emit 14072 /// \code 14073 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> 14074 /// \endcode 14075 /// instead. 14076 class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { 14077 bool IsFinalized = false; 14078 /// Combined mask for all applied operands and masks. It is built during 14079 /// analysis and actual emission of shuffle vector instructions. 14080 SmallVector<int> CommonMask; 14081 /// List of operands for the shuffle vector instruction. It hold at max 2 14082 /// operands, if the 3rd is going to be added, the first 2 are combined into 14083 /// shuffle with \p CommonMask mask, the first operand sets to be the 14084 /// resulting shuffle and the second operand sets to be the newly added 14085 /// operand. The \p CommonMask is transformed in the proper way after that. 14086 SmallVector<Value *, 2> InVectors; 14087 IRBuilderBase &Builder; 14088 BoUpSLP &R; 14089 14090 class ShuffleIRBuilder { 14091 IRBuilderBase &Builder; 14092 /// Holds all of the instructions that we gathered. 14093 SetVector<Instruction *> &GatherShuffleExtractSeq; 14094 /// A list of blocks that we are going to CSE. 14095 DenseSet<BasicBlock *> &CSEBlocks; 14096 /// Data layout. 14097 const DataLayout &DL; 14098 14099 public: 14100 ShuffleIRBuilder(IRBuilderBase &Builder, 14101 SetVector<Instruction *> &GatherShuffleExtractSeq, 14102 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL) 14103 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), 14104 CSEBlocks(CSEBlocks), DL(DL) {} 14105 ~ShuffleIRBuilder() = default; 14106 /// Creates shufflevector for the 2 operands with the given mask. 14107 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) { 14108 if (V1->getType() != V2->getType()) { 14109 assert(V1->getType()->isIntOrIntVectorTy() && 14110 V1->getType()->isIntOrIntVectorTy() && 14111 "Expected integer vector types only."); 14112 if (V1->getType() != V2->getType()) { 14113 if (cast<VectorType>(V2->getType()) 14114 ->getElementType() 14115 ->getIntegerBitWidth() < cast<VectorType>(V1->getType()) 14116 ->getElementType() 14117 ->getIntegerBitWidth()) 14118 V2 = Builder.CreateIntCast( 14119 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL))); 14120 else 14121 V1 = Builder.CreateIntCast( 14122 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL))); 14123 } 14124 } 14125 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask); 14126 if (auto *I = dyn_cast<Instruction>(Vec)) { 14127 GatherShuffleExtractSeq.insert(I); 14128 CSEBlocks.insert(I->getParent()); 14129 } 14130 return Vec; 14131 } 14132 /// Creates permutation of the single vector operand with the given mask, if 14133 /// it is not identity mask. 
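    /// For example (hypothetical IR), mask <1, 0> over a two-element %v emits
    /// \code
    /// %perm = shufflevector <2 x ty> %v, poison, <1, 0>
    /// \endcode
    /// while the identity mask <0, 1> simply returns %v unchanged.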
14134 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) { 14135 if (Mask.empty()) 14136 return V1; 14137 unsigned VF = Mask.size(); 14138 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements(); 14139 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF)) 14140 return V1; 14141 Value *Vec = Builder.CreateShuffleVector(V1, Mask); 14142 if (auto *I = dyn_cast<Instruction>(Vec)) { 14143 GatherShuffleExtractSeq.insert(I); 14144 CSEBlocks.insert(I->getParent()); 14145 } 14146 return Vec; 14147 } 14148 Value *createIdentity(Value *V) { return V; } 14149 Value *createPoison(Type *Ty, unsigned VF) { 14150 return PoisonValue::get(getWidenedType(Ty, VF)); 14151 } 14152 /// Resizes 2 input vector to match the sizes, if the they are not equal 14153 /// yet. The smallest vector is resized to the size of the larger vector. 14154 void resizeToMatch(Value *&V1, Value *&V2) { 14155 if (V1->getType() == V2->getType()) 14156 return; 14157 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 14158 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); 14159 int VF = std::max(V1VF, V2VF); 14160 int MinVF = std::min(V1VF, V2VF); 14161 SmallVector<int> IdentityMask(VF, PoisonMaskElem); 14162 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF), 14163 0); 14164 Value *&Op = MinVF == V1VF ? V1 : V2; 14165 Op = Builder.CreateShuffleVector(Op, IdentityMask); 14166 if (auto *I = dyn_cast<Instruction>(Op)) { 14167 GatherShuffleExtractSeq.insert(I); 14168 CSEBlocks.insert(I->getParent()); 14169 } 14170 if (MinVF == V1VF) 14171 V1 = Op; 14172 else 14173 V2 = Op; 14174 } 14175 }; 14176 14177 /// Smart shuffle instruction emission, walks through shuffles trees and 14178 /// tries to find the best matching vector for the actual shuffle 14179 /// instruction. 14180 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) { 14181 assert(V1 && "Expected at least one vector value."); 14182 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, 14183 R.CSEBlocks, *R.DL); 14184 return BaseShuffleAnalysis::createShuffle<Value *>( 14185 V1, V2, Mask, ShuffleBuilder, ScalarTy); 14186 } 14187 14188 /// Cast value \p V to the vector type with the same number of elements, but 14189 /// the base type \p ScalarTy. 14190 Value *castToScalarTyElem(Value *V, 14191 std::optional<bool> IsSigned = std::nullopt) { 14192 auto *VecTy = cast<VectorType>(V->getType()); 14193 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0); 14194 if (VecTy->getElementType() == ScalarTy->getScalarType()) 14195 return V; 14196 return Builder.CreateIntCast( 14197 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()), 14198 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); 14199 } 14200 14201 public: 14202 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) 14203 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {} 14204 14205 /// Adjusts extractelements after reusing them. 
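  /// If the source vectors of those extracts were themselves vectorized, the
  /// shuffle is built directly on the vectorized values (hypothetical example:
  /// a gather of extracts from %vec.a and %vec.b turns into one two-source
  /// shuffle of %vec.a and %vec.b), and extractelements that are no longer
  /// needed afterwards are erased.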
14206 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, 14207 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 14208 unsigned NumParts, bool &UseVecBaseAsInput) { 14209 UseVecBaseAsInput = false; 14210 SmallPtrSet<Value *, 4> UniqueBases; 14211 Value *VecBase = nullptr; 14212 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); 14213 if (!E->ReorderIndices.empty()) { 14214 SmallVector<int> ReorderMask(E->ReorderIndices.begin(), 14215 E->ReorderIndices.end()); 14216 reorderScalars(VL, ReorderMask); 14217 } 14218 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { 14219 int Idx = Mask[I]; 14220 if (Idx == PoisonMaskElem) 14221 continue; 14222 auto *EI = cast<ExtractElementInst>(VL[I]); 14223 VecBase = EI->getVectorOperand(); 14224 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty()) 14225 VecBase = TEs.front()->VectorizedValue; 14226 assert(VecBase && "Expected vectorized value."); 14227 UniqueBases.insert(VecBase); 14228 // If the only one use is vectorized - can delete the extractelement 14229 // itself. 14230 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) || 14231 (NumParts != 1 && count(VL, EI) > 1) || 14232 any_of(EI->users(), [&](User *U) { 14233 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U); 14234 return UTEs.empty() || UTEs.size() > 1 || 14235 (isa<GetElementPtrInst>(U) && 14236 !R.areAllUsersVectorized(cast<Instruction>(U))) || 14237 (!UTEs.empty() && 14238 count_if(R.VectorizableTree, 14239 [&](const std::unique_ptr<TreeEntry> &TE) { 14240 return any_of(TE->UserTreeIndices, 14241 [&](const EdgeInfo &Edge) { 14242 return Edge.UserTE == 14243 UTEs.front(); 14244 }) && 14245 is_contained(VL, EI); 14246 }) != 1); 14247 })) 14248 continue; 14249 R.eraseInstruction(EI); 14250 } 14251 if (NumParts == 1 || UniqueBases.size() == 1) { 14252 assert(VecBase && "Expected vectorized value."); 14253 return castToScalarTyElem(VecBase); 14254 } 14255 UseVecBaseAsInput = true; 14256 auto TransformToIdentity = [](MutableArrayRef<int> Mask) { 14257 for (auto [I, Idx] : enumerate(Mask)) 14258 if (Idx != PoisonMaskElem) 14259 Idx = I; 14260 }; 14261 // Perform multi-register vector shuffle, joining them into a single virtual 14262 // long vector. 14263 // Need to shuffle each part independently and then insert all this parts 14264 // into a long virtual vector register, forming the original vector. 
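    // Hypothetical example: with NumParts == 2 and 4 lanes per part, part 0 is
    // shuffled from its own bases into lanes 0-3 and part 1 into lanes 4-7 of
    // a single 8-wide virtual vector that the rest of the analysis consumes.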
14265 Value *Vec = nullptr; 14266 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); 14267 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 14268 for (unsigned Part : seq<unsigned>(NumParts)) { 14269 unsigned Limit = getNumElems(VL.size(), SliceSize, Part); 14270 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit); 14271 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit); 14272 constexpr int MaxBases = 2; 14273 SmallVector<Value *, MaxBases> Bases(MaxBases); 14274 auto VLMask = zip(SubVL, SubMask); 14275 const unsigned VF = std::accumulate( 14276 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) { 14277 if (std::get<1>(D) == PoisonMaskElem) 14278 return S; 14279 Value *VecOp = 14280 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand(); 14281 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); 14282 !TEs.empty()) 14283 VecOp = TEs.front()->VectorizedValue; 14284 assert(VecOp && "Expected vectorized value."); 14285 const unsigned Size = 14286 cast<FixedVectorType>(VecOp->getType())->getNumElements(); 14287 return std::max(S, Size); 14288 }); 14289 for (const auto [V, I] : VLMask) { 14290 if (I == PoisonMaskElem) 14291 continue; 14292 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand(); 14293 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty()) 14294 VecOp = TEs.front()->VectorizedValue; 14295 assert(VecOp && "Expected vectorized value."); 14296 VecOp = castToScalarTyElem(VecOp); 14297 Bases[I / VF] = VecOp; 14298 } 14299 if (!Bases.front()) 14300 continue; 14301 Value *SubVec; 14302 if (Bases.back()) { 14303 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); 14304 TransformToIdentity(SubMask); 14305 } else { 14306 SubVec = Bases.front(); 14307 } 14308 if (!Vec) { 14309 Vec = SubVec; 14310 assert((Part == 0 || all_of(seq<unsigned>(0, Part), 14311 [&](unsigned P) { 14312 ArrayRef<int> SubMask = 14313 Mask.slice(P * SliceSize, 14314 getNumElems(Mask.size(), 14315 SliceSize, P)); 14316 return all_of(SubMask, [](int Idx) { 14317 return Idx == PoisonMaskElem; 14318 }); 14319 })) && 14320 "Expected first part or all previous parts masked."); 14321 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); 14322 } else { 14323 unsigned NewVF = 14324 cast<FixedVectorType>(Vec->getType())->getNumElements(); 14325 if (Vec->getType() != SubVec->getType()) { 14326 unsigned SubVecVF = 14327 cast<FixedVectorType>(SubVec->getType())->getNumElements(); 14328 NewVF = std::max(NewVF, SubVecVF); 14329 } 14330 // Adjust SubMask. 14331 for (int &Idx : SubMask) 14332 if (Idx != PoisonMaskElem) 14333 Idx += NewVF; 14334 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); 14335 Vec = createShuffle(Vec, SubVec, VecMask); 14336 TransformToIdentity(VecMask); 14337 } 14338 } 14339 copy(VecMask, Mask.begin()); 14340 return Vec; 14341 } 14342 /// Checks if the specified entry \p E needs to be delayed because of its 14343 /// dependency nodes. 14344 std::optional<Value *> 14345 needToDelay(const TreeEntry *E, 14346 ArrayRef<SmallVector<const TreeEntry *>> Deps) const { 14347 // No need to delay emission if all deps are ready. 14348 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) { 14349 return all_of( 14350 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); 14351 })) 14352 return std::nullopt; 14353 // Postpone gather emission, will be emitted after the end of the 14354 // process to keep correct order. 
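    // A dummy load from a poison pointer of the right vector type is returned
    // as a placeholder; it is replaced by the real gather once all dependent
    // entries have been vectorized.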
14355 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor()); 14356 return Builder.CreateAlignedLoad( 14357 ResVecTy, 14358 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())), 14359 MaybeAlign()); 14360 } 14361 /// Adds 2 input vectors (in form of tree entries) and the mask for their 14362 /// shuffling. 14363 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { 14364 Value *V1 = E1.VectorizedValue; 14365 if (V1->getType()->isIntOrIntVectorTy()) 14366 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { 14367 if (isa<PoisonValue>(V)) 14368 return false; 14369 return !isKnownNonNegative( 14370 V, SimplifyQuery(*R.DL)); 14371 })); 14372 Value *V2 = E2.VectorizedValue; 14373 if (V2->getType()->isIntOrIntVectorTy()) 14374 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) { 14375 if (isa<PoisonValue>(V)) 14376 return false; 14377 return !isKnownNonNegative( 14378 V, SimplifyQuery(*R.DL)); 14379 })); 14380 add(V1, V2, Mask); 14381 } 14382 /// Adds single input vector (in form of tree entry) and the mask for its 14383 /// shuffling. 14384 void add(const TreeEntry &E1, ArrayRef<int> Mask) { 14385 Value *V1 = E1.VectorizedValue; 14386 if (V1->getType()->isIntOrIntVectorTy()) 14387 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { 14388 if (isa<PoisonValue>(V)) 14389 return false; 14390 return !isKnownNonNegative( 14391 V, SimplifyQuery(*R.DL)); 14392 })); 14393 add(V1, Mask); 14394 } 14395 /// Adds 2 input vectors and the mask for their shuffling. 14396 void add(Value *V1, Value *V2, ArrayRef<int> Mask) { 14397 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); 14398 assert(isa<FixedVectorType>(V1->getType()) && 14399 isa<FixedVectorType>(V2->getType()) && 14400 "castToScalarTyElem expects V1 and V2 to be FixedVectorType"); 14401 V1 = castToScalarTyElem(V1); 14402 V2 = castToScalarTyElem(V2); 14403 if (InVectors.empty()) { 14404 InVectors.push_back(V1); 14405 InVectors.push_back(V2); 14406 CommonMask.assign(Mask.begin(), Mask.end()); 14407 return; 14408 } 14409 Value *Vec = InVectors.front(); 14410 if (InVectors.size() == 2) { 14411 Vec = createShuffle(Vec, InVectors.back(), CommonMask); 14412 transformMaskAfterShuffle(CommonMask, CommonMask); 14413 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() != 14414 Mask.size()) { 14415 Vec = createShuffle(Vec, nullptr, CommonMask); 14416 transformMaskAfterShuffle(CommonMask, CommonMask); 14417 } 14418 V1 = createShuffle(V1, V2, Mask); 14419 unsigned VF = std::max(getVF(V1), getVF(Vec)); 14420 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 14421 if (Mask[Idx] != PoisonMaskElem) 14422 CommonMask[Idx] = Idx + VF; 14423 InVectors.front() = Vec; 14424 if (InVectors.size() == 2) 14425 InVectors.back() = V1; 14426 else 14427 InVectors.push_back(V1); 14428 } 14429 /// Adds another one input vector and the mask for the shuffling. 
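  /// If \p V1 is already one of the tracked operands, only \p CommonMask is
  /// updated; otherwise, when two operands are already tracked, they are first
  /// folded into a single shuffle so that no more than two input vectors stay
  /// live at any time.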
14430 void add(Value *V1, ArrayRef<int> Mask, bool = false) { 14431 assert(isa<FixedVectorType>(V1->getType()) && 14432 "castToScalarTyElem expects V1 to be FixedVectorType"); 14433 V1 = castToScalarTyElem(V1); 14434 if (InVectors.empty()) { 14435 InVectors.push_back(V1); 14436 CommonMask.assign(Mask.begin(), Mask.end()); 14437 return; 14438 } 14439 const auto *It = find(InVectors, V1); 14440 if (It == InVectors.end()) { 14441 if (InVectors.size() == 2 || 14442 InVectors.front()->getType() != V1->getType()) { 14443 Value *V = InVectors.front(); 14444 if (InVectors.size() == 2) { 14445 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask); 14446 transformMaskAfterShuffle(CommonMask, CommonMask); 14447 } else if (cast<FixedVectorType>(V->getType())->getNumElements() != 14448 CommonMask.size()) { 14449 V = createShuffle(InVectors.front(), nullptr, CommonMask); 14450 transformMaskAfterShuffle(CommonMask, CommonMask); 14451 } 14452 unsigned VF = std::max(CommonMask.size(), Mask.size()); 14453 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 14454 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem) 14455 CommonMask[Idx] = 14456 V->getType() != V1->getType() 14457 ? Idx + VF 14458 : Mask[Idx] + cast<FixedVectorType>(V1->getType()) 14459 ->getNumElements(); 14460 if (V->getType() != V1->getType()) 14461 V1 = createShuffle(V1, nullptr, Mask); 14462 InVectors.front() = V; 14463 if (InVectors.size() == 2) 14464 InVectors.back() = V1; 14465 else 14466 InVectors.push_back(V1); 14467 return; 14468 } 14469 // Check if second vector is required if the used elements are already 14470 // used from the first one. 14471 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 14472 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) { 14473 InVectors.push_back(V1); 14474 break; 14475 } 14476 } 14477 unsigned VF = 0; 14478 for (Value *V : InVectors) 14479 VF = std::max(VF, getVF(V)); 14480 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 14481 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 14482 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); 14483 } 14484 /// Adds another one input vector and the mask for the shuffling. 14485 void addOrdered(Value *V1, ArrayRef<unsigned> Order) { 14486 SmallVector<int> NewMask; 14487 inversePermutation(Order, NewMask); 14488 add(V1, NewMask); 14489 } 14490 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, 14491 Value *Root = nullptr) { 14492 return R.gather(VL, Root, ScalarTy, 14493 [&](Value *V1, Value *V2, ArrayRef<int> Mask) { 14494 return createShuffle(V1, V2, Mask); 14495 }); 14496 } 14497 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } 14498 /// Finalize emission of the shuffles. 14499 /// \param Action the action (if any) to be performed before final applying of 14500 /// the \p ExtMask mask. 
14501 Value * 14502 finalize(ArrayRef<int> ExtMask, 14503 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors, 14504 ArrayRef<int> SubVectorsMask, unsigned VF = 0, 14505 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { 14506 IsFinalized = true; 14507 if (Action) { 14508 Value *Vec = InVectors.front(); 14509 if (InVectors.size() == 2) { 14510 Vec = createShuffle(Vec, InVectors.back(), CommonMask); 14511 InVectors.pop_back(); 14512 } else { 14513 Vec = createShuffle(Vec, nullptr, CommonMask); 14514 } 14515 transformMaskAfterShuffle(CommonMask, CommonMask); 14516 assert(VF > 0 && 14517 "Expected vector length for the final value before action."); 14518 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); 14519 if (VecVF < VF) { 14520 SmallVector<int> ResizeMask(VF, PoisonMaskElem); 14521 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0); 14522 Vec = createShuffle(Vec, nullptr, ResizeMask); 14523 } 14524 Action(Vec, CommonMask); 14525 InVectors.front() = Vec; 14526 } 14527 if (!SubVectors.empty()) { 14528 Value *Vec = InVectors.front(); 14529 if (InVectors.size() == 2) { 14530 Vec = createShuffle(Vec, InVectors.back(), CommonMask); 14531 InVectors.pop_back(); 14532 } else { 14533 Vec = createShuffle(Vec, nullptr, CommonMask); 14534 } 14535 transformMaskAfterShuffle(CommonMask, CommonMask); 14536 auto CreateSubVectors = [&](Value *Vec, 14537 SmallVectorImpl<int> &CommonMask) { 14538 for (auto [E, Idx] : SubVectors) { 14539 Value *V = E->VectorizedValue; 14540 if (V->getType()->isIntOrIntVectorTy()) 14541 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) { 14542 if (isa<PoisonValue>(V)) 14543 return false; 14544 return !isKnownNonNegative( 14545 V, SimplifyQuery(*R.DL)); 14546 })); 14547 unsigned InsertionIndex = Idx * getNumElements(ScalarTy); 14548 Vec = createInsertVector( 14549 Builder, Vec, V, InsertionIndex, 14550 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2, 14551 _3)); 14552 if (!CommonMask.empty()) { 14553 std::iota(std::next(CommonMask.begin(), Idx), 14554 std::next(CommonMask.begin(), Idx + E->getVectorFactor()), 14555 Idx); 14556 } 14557 } 14558 return Vec; 14559 }; 14560 if (SubVectorsMask.empty()) { 14561 Vec = CreateSubVectors(Vec, CommonMask); 14562 } else { 14563 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem); 14564 copy(SubVectorsMask, SVMask.begin()); 14565 for (auto [I1, I2] : zip(SVMask, CommonMask)) { 14566 if (I2 != PoisonMaskElem) { 14567 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask"); 14568 I1 = I2 + CommonMask.size(); 14569 } 14570 } 14571 Value *InsertVec = 14572 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask); 14573 Vec = createShuffle(InsertVec, Vec, SVMask); 14574 transformMaskAfterShuffle(CommonMask, SVMask); 14575 } 14576 InVectors.front() = Vec; 14577 } 14578 14579 if (!ExtMask.empty()) { 14580 if (CommonMask.empty()) { 14581 CommonMask.assign(ExtMask.begin(), ExtMask.end()); 14582 } else { 14583 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); 14584 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { 14585 if (ExtMask[I] == PoisonMaskElem) 14586 continue; 14587 NewMask[I] = CommonMask[ExtMask[I]]; 14588 } 14589 CommonMask.swap(NewMask); 14590 } 14591 } 14592 if (CommonMask.empty()) { 14593 assert(InVectors.size() == 1 && "Expected only one vector with no mask"); 14594 return InVectors.front(); 14595 } 14596 if (InVectors.size() == 2) 14597 return createShuffle(InVectors.front(), InVectors.back(), CommonMask); 
14598 return createShuffle(InVectors.front(), nullptr, CommonMask); 14599 } 14600 14601 ~ShuffleInstructionBuilder() { 14602 assert((IsFinalized || CommonMask.empty()) && 14603 "Shuffle construction must be finalized."); 14604 } 14605 }; 14606 14607 BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, 14608 unsigned NodeIdx) { 14609 ArrayRef<Value *> VL = E->getOperand(NodeIdx); 14610 InstructionsState S = getSameOpcode(VL, *TLI); 14611 // Special processing for GEPs bundle, which may include non-gep values. 14612 if (!S && VL.front()->getType()->isPointerTy()) { 14613 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>); 14614 if (It != VL.end()) 14615 S = getSameOpcode(*It, *TLI); 14616 } 14617 if (!S) 14618 return nullptr; 14619 auto CheckSameVE = [&](const TreeEntry *VE) { 14620 return any_of(VE->UserTreeIndices, 14621 [E, NodeIdx](const EdgeInfo &EI) { 14622 return EI.UserTE == E && EI.EdgeIdx == NodeIdx; 14623 }) || 14624 any_of(VectorizableTree, 14625 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { 14626 return TE->isOperandGatherNode( 14627 {const_cast<TreeEntry *>(E), NodeIdx}) && 14628 VE->isSame(TE->Scalars); 14629 }); 14630 }; 14631 TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL); 14632 if (VE && CheckSameVE(VE)) 14633 return VE; 14634 return nullptr; 14635 } 14636 14637 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, 14638 bool PostponedPHIs) { 14639 ValueList &VL = E->getOperand(NodeIdx); 14640 const unsigned VF = VL.size(); 14641 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) { 14642 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { 14643 // V may be affected by MinBWs. 14644 // We want ShuffleInstructionBuilder to correctly support REVEC. The key 14645 // factor is the number of elements, not their type. 14646 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType(); 14647 unsigned NumElements = getNumElements(VL.front()->getType()); 14648 ShuffleInstructionBuilder ShuffleBuilder( 14649 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements) 14650 : ScalarTy, 14651 Builder, *this); 14652 ShuffleBuilder.add(V, Mask); 14653 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors( 14654 E->CombinedEntriesWithIndices.size()); 14655 transform(E->CombinedEntriesWithIndices, SubVectors.begin(), 14656 [&](const auto &P) { 14657 return std::make_pair(VectorizableTree[P.first].get(), 14658 P.second); 14659 }); 14660 assert((E->CombinedEntriesWithIndices.empty() || 14661 E->ReorderIndices.empty()) && 14662 "Expected either combined subnodes or reordering"); 14663 return ShuffleBuilder.finalize({}, SubVectors, {}); 14664 }; 14665 Value *V = vectorizeTree(VE, PostponedPHIs); 14666 if (VF * getNumElements(VL[0]->getType()) != 14667 cast<FixedVectorType>(V->getType())->getNumElements()) { 14668 if (!VE->ReuseShuffleIndices.empty()) { 14669 // Reshuffle to get only unique values. 14670 // If some of the scalars are duplicated in the vectorization 14671 // tree entry, we do not vectorize them but instead generate a 14672 // mask for the reuses. But if there are several users of the 14673 // same entry, they may have different vectorization factors. 14674 // This is especially important for PHI nodes. In this case, we 14675 // need to adapt the resulting instruction for the user 14676 // vectorization factor and have to reshuffle it again to take 14677 // only unique elements of the vector. 
Without this code the 14678 // function incorrectly returns reduced vector instruction with 14679 // the same elements, not with the unique ones. 14680 14681 // block: 14682 // %phi = phi <2 x > { .., %entry} {%shuffle, %block} 14683 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> 14684 // ... (use %2) 14685 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} 14686 // br %block 14687 SmallVector<int> Mask(VF, PoisonMaskElem); 14688 for (auto [I, V] : enumerate(VL)) { 14689 if (isa<PoisonValue>(V)) 14690 continue; 14691 Mask[I] = VE->findLaneForValue(V); 14692 } 14693 V = FinalShuffle(V, Mask); 14694 } else { 14695 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && 14696 "Expected vectorization factor less " 14697 "than original vector size."); 14698 SmallVector<int> UniformMask(VF, 0); 14699 std::iota(UniformMask.begin(), UniformMask.end(), 0); 14700 V = FinalShuffle(V, UniformMask); 14701 } 14702 } 14703 // Need to update the operand gather node, if actually the operand is not a 14704 // vectorized node, but the buildvector/gather node, which matches one of 14705 // the vectorized nodes. 14706 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { 14707 return EI.UserTE == E && EI.EdgeIdx == NodeIdx; 14708 }) == VE->UserTreeIndices.end()) { 14709 auto *It = 14710 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { 14711 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E && 14712 TE->UserTreeIndices.front().EdgeIdx == NodeIdx; 14713 }); 14714 assert(It != VectorizableTree.end() && "Expected gather node operand."); 14715 (*It)->VectorizedValue = V; 14716 } 14717 return V; 14718 } 14719 14720 // Find the corresponding gather entry and vectorize it. 14721 // Allows to be more accurate with tree/graph transformations, checks for the 14722 // correctness of the transformations in many cases. 14723 auto *I = find_if(VectorizableTree, 14724 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) { 14725 return TE->isOperandGatherNode({E, NodeIdx}); 14726 }); 14727 assert(I != VectorizableTree.end() && "Gather node is not in the graph."); 14728 assert(I->get()->UserTreeIndices.size() == 1 && 14729 "Expected only single user for the gather node."); 14730 assert(I->get()->isSame(VL) && "Expected same list of scalars."); 14731 return vectorizeTree(I->get(), PostponedPHIs); 14732 } 14733 14734 template <typename BVTy, typename ResTy, typename... Args> 14735 ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, 14736 Args &...Params) { 14737 assert(E->isGather() && "Expected gather node."); 14738 unsigned VF = E->getVectorFactor(); 14739 14740 bool NeedFreeze = false; 14741 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), 14742 E->ReuseShuffleIndices.end()); 14743 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); 14744 // Clear values, to be replaced by insertvector instructions. 
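// For example (hypothetical shapes): if CombinedEntriesWithIndices records a
// subnode with vector factor 2 at scalar offset 4 of an 8-wide gather, then
// GatheredScalars[4..5] are poisoned here and those lanes are re-inserted
// later as a subvector by the shuffle builder, so the plain gather below
// only has to materialize the remaining lanes.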
14745 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices) 14746 for_each(MutableArrayRef(GatheredScalars) 14747 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), 14748 [&](Value *&V) { V = PoisonValue::get(V->getType()); }); 14749 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors( 14750 E->CombinedEntriesWithIndices.size()); 14751 transform(E->CombinedEntriesWithIndices, SubVectors.begin(), 14752 [&](const auto &P) { 14753 return std::make_pair(VectorizableTree[P.first].get(), P.second); 14754 }); 14755 // Build a mask out of the reorder indices and reorder scalars per this 14756 // mask. 14757 SmallVector<int> ReorderMask(E->ReorderIndices.begin(), 14758 E->ReorderIndices.end()); 14759 if (!ReorderMask.empty()) 14760 reorderScalars(GatheredScalars, ReorderMask); 14761 SmallVector<int> SubVectorsMask; 14762 inversePermutation(E->ReorderIndices, SubVectorsMask); 14763 // Transform non-clustered elements in the mask to poison (-1). 14764 // "Clustered" operations will be reordered using this mask later. 14765 if (!SubVectors.empty() && !SubVectorsMask.empty()) { 14766 for (unsigned I : seq<unsigned>(GatheredScalars.size())) 14767 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]]) 14768 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem; 14769 } else { 14770 SubVectorsMask.clear(); 14771 } 14772 SmallVector<Value *> StoredGS(GatheredScalars); 14773 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF, 14774 unsigned I, unsigned SliceSize, 14775 bool IsNotPoisonous) { 14776 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { 14777 return isa<UndefValue>(V) && !isa<PoisonValue>(V); 14778 })) 14779 return false; 14780 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; 14781 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; 14782 if (UserTE->getNumOperands() != 2) 14783 return false; 14784 if (!IsNotPoisonous) { 14785 auto *It = 14786 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) { 14787 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { 14788 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; 14789 }) != TE->UserTreeIndices.end(); 14790 }); 14791 if (It == VectorizableTree.end()) 14792 return false; 14793 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end()); 14794 if (!(*It)->ReorderIndices.empty()) { 14795 inversePermutation((*It)->ReorderIndices, ReorderMask); 14796 reorderScalars(GS, ReorderMask); 14797 } 14798 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) { 14799 Value *V0 = std::get<0>(P); 14800 Value *V1 = std::get<1>(P); 14801 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) || 14802 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) && 14803 is_contained(E->Scalars, V1)); 14804 })) 14805 return false; 14806 } 14807 int Idx; 14808 if ((Mask.size() < InputVF && 14809 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) && 14810 Idx == 0) || 14811 (Mask.size() == InputVF && 14812 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { 14813 std::iota( 14814 std::next(Mask.begin(), I * SliceSize), 14815 std::next(Mask.begin(), 14816 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), 14817 0); 14818 } else { 14819 unsigned IVal = 14820 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); 14821 std::fill( 14822 std::next(Mask.begin(), I * SliceSize), 14823 std::next(Mask.begin(), 14824 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), 14825 IVal); 14826 } 14827 return true; 14828 }; 14829 BVTy ShuffleBuilder(ScalarTy, Params...); 14830 
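// What follows picks between several gather strategies: reusing the vectors
// the scalars were extracted from (ExtractShuffles), reusing whole matching
// tree entries (GatherShuffles), or falling back to a plain buildvector of
// constants and unique scalars. FindReusedSplat above is a helper that, for
// splat-like gathers, tries to reuse an already existing vector as the
// broadcast source instead of emitting a new gather (rough description).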
ResTy Res = ResTy(); 14831 SmallVector<int> Mask; 14832 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem); 14833 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles; 14834 Value *ExtractVecBase = nullptr; 14835 bool UseVecBaseAsInput = false; 14836 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; 14837 SmallVector<SmallVector<const TreeEntry *>> Entries; 14838 Type *OrigScalarTy = GatheredScalars.front()->getType(); 14839 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size()); 14840 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size()); 14841 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) { 14842 // Check for gathered extracts. 14843 bool Resized = false; 14844 ExtractShuffles = 14845 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); 14846 if (!ExtractShuffles.empty()) { 14847 SmallVector<const TreeEntry *> ExtractEntries; 14848 for (auto [Idx, I] : enumerate(ExtractMask)) { 14849 if (I == PoisonMaskElem) 14850 continue; 14851 if (ArrayRef<TreeEntry *> TEs = getTreeEntries( 14852 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()); 14853 !TEs.empty()) 14854 ExtractEntries.append(TEs.begin(), TEs.end()); 14855 } 14856 if (std::optional<ResTy> Delayed = 14857 ShuffleBuilder.needToDelay(E, ExtractEntries)) { 14858 // Delay emission of gathers which are not ready yet. 14859 PostponedGathers.insert(E); 14860 // Postpone gather emission, will be emitted after the end of the 14861 // process to keep correct order. 14862 return *Delayed; 14863 } 14864 if (Value *VecBase = ShuffleBuilder.adjustExtracts( 14865 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { 14866 ExtractVecBase = VecBase; 14867 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) 14868 if (VF == VecBaseTy->getNumElements() && 14869 GatheredScalars.size() != VF) { 14870 Resized = true; 14871 GatheredScalars.append(VF - GatheredScalars.size(), 14872 PoisonValue::get(OrigScalarTy)); 14873 NumParts = 14874 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF); 14875 } 14876 } 14877 } 14878 // Gather extracts after we check for full matched gathers only. 14879 if (!ExtractShuffles.empty() || !E->hasState() || 14880 E->getOpcode() != Instruction::Load || 14881 (((E->hasState() && E->getOpcode() == Instruction::Load) || 14882 any_of(E->Scalars, IsaPred<LoadInst>)) && 14883 any_of(E->Scalars, 14884 [this](Value *V) { 14885 return isa<LoadInst>(V) && isVectorized(V); 14886 })) || 14887 (E->hasState() && E->isAltShuffle()) || 14888 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) || 14889 isSplat(E->Scalars) || 14890 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { 14891 GatherShuffles = 14892 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); 14893 } 14894 if (!GatherShuffles.empty()) { 14895 if (std::optional<ResTy> Delayed = 14896 ShuffleBuilder.needToDelay(E, Entries)) { 14897 // Delay emission of gathers which are not ready yet. 14898 PostponedGathers.insert(E); 14899 // Postpone gather emission, will be emitted after the end of the 14900 // process to keep correct order. 14901 return *Delayed; 14902 } 14903 if (GatherShuffles.size() == 1 && 14904 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && 14905 Entries.front().front()->isSame(E->Scalars)) { 14906 // Perfect match in the graph, will reuse the previously vectorized 14907 // node. Cost is 0. 
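// Illustrative case (made-up values): if this gather asks for exactly the
// scalars {%a, %b, %c, %d} that some already vectorized entry produces, a
// single permute (or a plain identity mask) over that entry's vector is
// enough and no new gather code is emitted below.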
14908 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14909 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14910 // Restore the mask for previous partially matched values.
14911 Mask.resize(E->Scalars.size());
14912 const TreeEntry *FrontTE = Entries.front().front();
14913 if (FrontTE->ReorderIndices.empty() &&
14914 ((FrontTE->ReuseShuffleIndices.empty() &&
14915 E->Scalars.size() == FrontTE->Scalars.size()) ||
14916 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14917 std::iota(Mask.begin(), Mask.end(), 0);
14918 } else {
14919 for (auto [I, V] : enumerate(E->Scalars)) {
14920 if (isa<PoisonValue>(V)) {
14921 Mask[I] = PoisonMaskElem;
14922 continue;
14923 }
14924 Mask[I] = FrontTE->findLaneForValue(V);
14925 }
14926 }
14927 ShuffleBuilder.add(*FrontTE, Mask);
14928 // Fully matched entry found, no need to insert subvectors.
14929 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14930 return Res;
14931 }
14932 if (!Resized) {
14933 if (GatheredScalars.size() != VF &&
14934 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14935 return any_of(TEs, [&](const TreeEntry *TE) {
14936 return TE->getVectorFactor() == VF;
14937 });
14938 }))
14939 GatheredScalars.append(VF - GatheredScalars.size(),
14940 PoisonValue::get(OrigScalarTy));
14941 }
14942 // Remove shuffled elements from list of gathers.
14943 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14944 if (Mask[I] != PoisonMaskElem)
14945 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14946 }
14947 }
14948 }
14949 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14950 SmallVectorImpl<int> &ReuseMask,
14951 bool IsRootPoison) {
14952 // For splats we can emit broadcasts instead of gathers, so try to find
14953 // such sequences.
14954 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14955 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14956 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14957 SmallVector<int> UndefPos;
14958 DenseMap<Value *, unsigned> UniquePositions;
14959 // Gather unique non-const values and all constant values.
14960 // For repeated values, just shuffle them.
14961 int NumNonConsts = 0;
14962 int SinglePos = 0;
14963 for (auto [I, V] : enumerate(Scalars)) {
14964 if (isa<UndefValue>(V)) {
14965 if (!isa<PoisonValue>(V)) {
14966 ReuseMask[I] = I;
14967 UndefPos.push_back(I);
14968 }
14969 continue;
14970 }
14971 if (isConstant(V)) {
14972 ReuseMask[I] = I;
14973 continue;
14974 }
14975 ++NumNonConsts;
14976 SinglePos = I;
14977 Value *OrigV = V;
14978 Scalars[I] = PoisonValue::get(OrigScalarTy);
14979 if (IsSplat) {
14980 Scalars.front() = OrigV;
14981 ReuseMask[I] = 0;
14982 } else {
14983 const auto Res = UniquePositions.try_emplace(OrigV, I);
14984 Scalars[Res.first->second] = OrigV;
14985 ReuseMask[I] = Res.first->second;
14986 }
14987 }
14988 if (NumNonConsts == 1) {
14989 // Restore single insert element.
14990 if (IsSplat) {
14991 ReuseMask.assign(VF, PoisonMaskElem);
14992 std::swap(Scalars.front(), Scalars[SinglePos]);
14993 if (!UndefPos.empty() && UndefPos.front() == 0)
14994 Scalars.front() = UndefValue::get(OrigScalarTy);
14995 }
14996 ReuseMask[SinglePos] = SinglePos;
14997 } else if (!UndefPos.empty() && IsSplat) {
14998 // For undef values, try to replace them with the simple broadcast.
14999 // We can do it if the broadcasted value is guaranteed to be
15000 // non-poisonous, or by freezing the incoming scalar value first.
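// Illustrative IR shape of the freeze fallback (types and names made up):
//   %ins    = insertelement <4 x i32> poison, i32 %x, i64 0
//   %splat  = shufflevector <4 x i32> %ins, <4 x i32> poison,
//                           <4 x i32> zeroinitializer
//   %frozen = freeze <4 x i32> %splat  ; only when NeedFreeze gets set below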
15001 auto *It = find_if(Scalars, [this, E](Value *V) { 15002 return !isa<UndefValue>(V) && 15003 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) || 15004 (E->UserTreeIndices.size() == 1 && 15005 any_of(V->uses(), [E](const Use &U) { 15006 // Check if the value already used in the same operation in 15007 // one of the nodes already. 15008 return E->UserTreeIndices.front().EdgeIdx != 15009 U.getOperandNo() && 15010 is_contained( 15011 E->UserTreeIndices.front().UserTE->Scalars, 15012 U.getUser()); 15013 }))); 15014 }); 15015 if (It != Scalars.end()) { 15016 // Replace undefs by the non-poisoned scalars and emit broadcast. 15017 int Pos = std::distance(Scalars.begin(), It); 15018 for (int I : UndefPos) { 15019 // Set the undef position to the non-poisoned scalar. 15020 ReuseMask[I] = Pos; 15021 // Replace the undef by the poison, in the mask it is replaced by 15022 // non-poisoned scalar already. 15023 if (I != Pos) 15024 Scalars[I] = PoisonValue::get(OrigScalarTy); 15025 } 15026 } else { 15027 // Replace undefs by the poisons, emit broadcast and then emit 15028 // freeze. 15029 for (int I : UndefPos) { 15030 ReuseMask[I] = PoisonMaskElem; 15031 if (isa<UndefValue>(Scalars[I])) 15032 Scalars[I] = PoisonValue::get(OrigScalarTy); 15033 } 15034 NeedFreeze = true; 15035 } 15036 } 15037 }; 15038 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { 15039 bool IsNonPoisoned = true; 15040 bool IsUsedInExpr = true; 15041 Value *Vec1 = nullptr; 15042 if (!ExtractShuffles.empty()) { 15043 // Gather of extractelements can be represented as just a shuffle of 15044 // a single/two vectors the scalars are extracted from. 15045 // Find input vectors. 15046 Value *Vec2 = nullptr; 15047 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { 15048 if (!Mask.empty() && Mask[I] != PoisonMaskElem) 15049 ExtractMask[I] = PoisonMaskElem; 15050 } 15051 if (UseVecBaseAsInput) { 15052 Vec1 = ExtractVecBase; 15053 } else { 15054 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { 15055 if (ExtractMask[I] == PoisonMaskElem) 15056 continue; 15057 if (isa<UndefValue>(E->Scalars[I])) 15058 continue; 15059 auto *EI = cast<ExtractElementInst>(StoredGS[I]); 15060 Value *VecOp = EI->getVectorOperand(); 15061 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp); 15062 !TEs.empty() && TEs.front()->VectorizedValue) 15063 VecOp = TEs.front()->VectorizedValue; 15064 if (!Vec1) { 15065 Vec1 = VecOp; 15066 } else if (Vec1 != VecOp) { 15067 assert((!Vec2 || Vec2 == VecOp) && 15068 "Expected only 1 or 2 vectors shuffle."); 15069 Vec2 = VecOp; 15070 } 15071 } 15072 } 15073 if (Vec2) { 15074 IsUsedInExpr = false; 15075 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) && 15076 isGuaranteedNotToBePoison(Vec2, AC); 15077 ShuffleBuilder.add(Vec1, Vec2, ExtractMask); 15078 } else if (Vec1) { 15079 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC); 15080 IsUsedInExpr &= FindReusedSplat( 15081 ExtractMask, 15082 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0, 15083 ExtractMask.size(), IsNotPoisonedVec); 15084 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); 15085 IsNonPoisoned &= IsNotPoisonedVec; 15086 } else { 15087 IsUsedInExpr = false; 15088 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask, 15089 /*ForExtracts=*/true); 15090 } 15091 } 15092 if (!GatherShuffles.empty()) { 15093 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts); 15094 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); 15095 for (const auto [I, TEs] : enumerate(Entries)) { 15096 if 
(TEs.empty()) { 15097 assert(!GatherShuffles[I] && 15098 "No shuffles with empty entries list expected."); 15099 continue; 15100 } 15101 assert((TEs.size() == 1 || TEs.size() == 2) && 15102 "Expected shuffle of 1 or 2 entries."); 15103 unsigned Limit = getNumElems(Mask.size(), SliceSize, I); 15104 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit); 15105 VecMask.assign(VecMask.size(), PoisonMaskElem); 15106 copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); 15107 if (TEs.size() == 1) { 15108 bool IsNotPoisonedVec = 15109 TEs.front()->VectorizedValue 15110 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) 15111 : true; 15112 IsUsedInExpr &= 15113 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, 15114 SliceSize, IsNotPoisonedVec); 15115 ShuffleBuilder.add(*TEs.front(), VecMask); 15116 IsNonPoisoned &= IsNotPoisonedVec; 15117 } else { 15118 IsUsedInExpr = false; 15119 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); 15120 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) 15121 IsNonPoisoned &= 15122 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) && 15123 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC); 15124 } 15125 } 15126 } 15127 // Try to figure out best way to combine values: build a shuffle and insert 15128 // elements or just build several shuffles. 15129 // Insert non-constant scalars. 15130 SmallVector<Value *> NonConstants(GatheredScalars); 15131 int EMSz = ExtractMask.size(); 15132 int MSz = Mask.size(); 15133 // Try to build constant vector and shuffle with it only if currently we 15134 // have a single permutation and more than 1 scalar constants. 15135 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); 15136 bool IsIdentityShuffle = 15137 ((UseVecBaseAsInput || 15138 all_of(ExtractShuffles, 15139 [](const std::optional<TTI::ShuffleKind> &SK) { 15140 return SK.value_or(TTI::SK_PermuteTwoSrc) == 15141 TTI::SK_PermuteSingleSrc; 15142 })) && 15143 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && 15144 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || 15145 (!GatherShuffles.empty() && 15146 all_of(GatherShuffles, 15147 [](const std::optional<TTI::ShuffleKind> &SK) { 15148 return SK.value_or(TTI::SK_PermuteTwoSrc) == 15149 TTI::SK_PermuteSingleSrc; 15150 }) && 15151 none_of(Mask, [&](int I) { return I >= MSz; }) && 15152 ShuffleVectorInst::isIdentityMask(Mask, MSz)); 15153 bool EnoughConstsForShuffle = 15154 IsSingleShuffle && 15155 (none_of(GatheredScalars, 15156 [](Value *V) { 15157 return isa<UndefValue>(V) && !isa<PoisonValue>(V); 15158 }) || 15159 any_of(GatheredScalars, 15160 [](Value *V) { 15161 return isa<Constant>(V) && !isa<UndefValue>(V); 15162 })) && 15163 (!IsIdentityShuffle || 15164 (GatheredScalars.size() == 2 && 15165 any_of(GatheredScalars, 15166 [](Value *V) { return !isa<UndefValue>(V); })) || 15167 count_if(GatheredScalars, [](Value *V) { 15168 return isa<Constant>(V) && !isa<PoisonValue>(V); 15169 }) > 1); 15170 // NonConstants array contains just non-constant values, GatheredScalars 15171 // contains only constant to build final vector and then shuffle. 15172 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { 15173 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I])) 15174 NonConstants[I] = PoisonValue::get(OrigScalarTy); 15175 else 15176 GatheredScalars[I] = PoisonValue::get(OrigScalarTy); 15177 } 15178 // Generate constants for final shuffle and build a mask for them. 
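// E.g., when enough constants are present, for gathered scalars
// {1, %x, 3, %y} (hypothetical) GatheredScalars is reduced to the constant
// part {1, poison, 3, poison} while NonConstants keeps {poison, %x, poison,
// %y}; the constant vector is built first and the non-constant lanes are
// blended in through the finalize() callback below.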
15179 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) { 15180 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); 15181 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); 15182 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); 15183 ShuffleBuilder.add(BV, BVMask); 15184 } 15185 if (all_of(NonConstants, [=](Value *V) { 15186 return isa<PoisonValue>(V) || 15187 (IsSingleShuffle && ((IsIdentityShuffle && 15188 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V)); 15189 })) 15190 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, 15191 SubVectorsMask); 15192 else 15193 Res = ShuffleBuilder.finalize( 15194 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(), 15195 [&](Value *&Vec, SmallVectorImpl<int> &Mask) { 15196 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); 15197 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); 15198 }); 15199 } else if (!allConstant(GatheredScalars)) { 15200 // Gather unique scalars and all constants. 15201 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); 15202 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); 15203 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); 15204 ShuffleBuilder.add(BV, ReuseMask); 15205 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, 15206 SubVectorsMask); 15207 } else { 15208 // Gather all constants. 15209 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem); 15210 for (auto [I, V] : enumerate(GatheredScalars)) { 15211 if (!isa<PoisonValue>(V)) 15212 Mask[I] = I; 15213 } 15214 Value *BV = ShuffleBuilder.gather(GatheredScalars); 15215 ShuffleBuilder.add(BV, Mask); 15216 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, 15217 SubVectorsMask); 15218 } 15219 15220 if (NeedFreeze) 15221 Res = ShuffleBuilder.createFreeze(Res); 15222 return Res; 15223 } 15224 15225 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy, 15226 bool PostponedPHIs) { 15227 for (auto [EIdx, _] : E->CombinedEntriesWithIndices) 15228 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs); 15229 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy, 15230 Builder, *this); 15231 } 15232 15233 /// \returns \p I after propagating metadata from \p VL only for instructions in 15234 /// \p VL. 
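/// This is a thin convenience wrapper around llvm::propagateMetadata: it
/// merely filters \p VL down to the instructions it contains, so callers can
/// pass scalar lists that also hold constants or poison values without
/// special-casing them.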
15235 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) { 15236 SmallVector<Value *> Insts; 15237 for (Value *V : VL) 15238 if (isa<Instruction>(V)) 15239 Insts.push_back(V); 15240 return llvm::propagateMetadata(Inst, Insts); 15241 } 15242 15243 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { 15244 IRBuilderBase::InsertPointGuard Guard(Builder); 15245 15246 if (E->VectorizedValue && 15247 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || 15248 E->isAltShuffle())) { 15249 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 15250 return E->VectorizedValue; 15251 } 15252 15253 Value *V = E->Scalars.front(); 15254 Type *ScalarTy = V->getType(); 15255 if (!isa<CmpInst>(V)) 15256 ScalarTy = getValueType(V); 15257 auto It = MinBWs.find(E); 15258 if (It != MinBWs.end()) { 15259 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy); 15260 ScalarTy = IntegerType::get(F->getContext(), It->second.first); 15261 if (VecTy) 15262 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements()); 15263 } 15264 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); 15265 if (E->isGather()) { 15266 // Set insert point for non-reduction initial nodes. 15267 if (E->hasState() && E->Idx == 0 && !UserIgnoreList) 15268 setInsertPointAfterBundle(E); 15269 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); 15270 E->VectorizedValue = Vec; 15271 return Vec; 15272 } 15273 15274 bool IsReverseOrder = 15275 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); 15276 auto FinalShuffle = [&](Value *V, const TreeEntry *E) { 15277 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); 15278 if (E->getOpcode() == Instruction::Store && 15279 E->State == TreeEntry::Vectorize) { 15280 ArrayRef<int> Mask = 15281 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), 15282 E->ReorderIndices.size()); 15283 ShuffleBuilder.add(V, Mask); 15284 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) { 15285 ShuffleBuilder.addOrdered(V, {}); 15286 } else { 15287 ShuffleBuilder.addOrdered(V, E->ReorderIndices); 15288 } 15289 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors( 15290 E->CombinedEntriesWithIndices.size()); 15291 transform( 15292 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) { 15293 return std::make_pair(VectorizableTree[P.first].get(), P.second); 15294 }); 15295 assert( 15296 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) && 15297 "Expected either combined subnodes or reordering"); 15298 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {}); 15299 }; 15300 15301 assert(!E->isGather() && "Unhandled state"); 15302 unsigned ShuffleOrOp = 15303 E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode();
15304 Instruction *VL0 = E->getMainOp();
15305 auto GetOperandSignedness = [&](unsigned Idx) {
15306 const TreeEntry *OpE = getOperandEntry(E, Idx);
15307 bool IsSigned = false;
15308 auto It = MinBWs.find(OpE);
15309 if (It != MinBWs.end())
15310 IsSigned = It->second.second;
15311 else
15312 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15313 if (isa<PoisonValue>(R))
15314 return false;
15315 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15316 });
15317 return IsSigned;
15318 };
15319 switch (ShuffleOrOp) {
15320 case Instruction::PHI: {
15321 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15322 E != VectorizableTree.front().get() ||
15323 !E->UserTreeIndices.empty()) &&
15324 "PHI reordering is free.");
15325 if (PostponedPHIs && E->VectorizedValue)
15326 return E->VectorizedValue;
15327 auto *PH = cast<PHINode>(VL0);
15328 Builder.SetInsertPoint(PH->getParent(),
15329 PH->getParent()->getFirstNonPHIIt());
15330 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15331 if (PostponedPHIs || !E->VectorizedValue) {
15332 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15333 E->PHI = NewPhi;
15334 Value *V = NewPhi;
15335
15336 // Adjust insertion point once all PHIs have been generated.
15337 Builder.SetInsertPoint(PH->getParent(),
15338 PH->getParent()->getFirstInsertionPt());
15339 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15340
15341 V = FinalShuffle(V, E);
15342
15343 E->VectorizedValue = V;
15344 if (PostponedPHIs)
15345 return V;
15346 }
15347 PHINode *NewPhi = cast<PHINode>(E->PHI);
15348 // If the phi node is fully emitted - exit.
15349 if (NewPhi->getNumIncomingValues() != 0)
15350 return NewPhi;
15351
15352 // PHINodes may have multiple entries from the same block. We want to
15353 // visit every block once.
15354 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15355
15356 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15357 ValueList Operands;
15358 BasicBlock *IBB = PH->getIncomingBlock(I);
15359
15360 // Stop emission if all incoming values are generated.
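// (Vectorizing an operand below may recursively complete this very PHI,
// e.g. through a cycle in the graph; in that case the remaining incoming
// blocks are already filled in and must not be added twice.)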
15361 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { 15362 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15363 return NewPhi; 15364 } 15365 15366 if (!VisitedBBs.insert(IBB).second) { 15367 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 15368 continue; 15369 } 15370 15371 Builder.SetInsertPoint(IBB->getTerminator()); 15372 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 15373 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); 15374 if (VecTy != Vec->getType()) { 15375 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || 15376 MinBWs.contains(getOperandEntry(E, I))) && 15377 "Expected item in MinBWs."); 15378 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I)); 15379 } 15380 NewPhi->addIncoming(Vec, IBB); 15381 } 15382 15383 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 15384 "Invalid number of incoming values"); 15385 assert(E->VectorizedValue && "Expected vectorized value."); 15386 return E->VectorizedValue; 15387 } 15388 15389 case Instruction::ExtractElement: { 15390 Value *V = E->getSingleOperand(0); 15391 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) 15392 V = TEs.front()->VectorizedValue; 15393 setInsertPointAfterBundle(E); 15394 V = FinalShuffle(V, E); 15395 E->VectorizedValue = V; 15396 return V; 15397 } 15398 case Instruction::ExtractValue: { 15399 auto *LI = cast<LoadInst>(E->getSingleOperand(0)); 15400 Builder.SetInsertPoint(LI); 15401 Value *Ptr = LI->getPointerOperand(); 15402 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); 15403 Value *NewV = ::propagateMetadata(V, E->Scalars); 15404 NewV = FinalShuffle(NewV, E); 15405 E->VectorizedValue = NewV; 15406 return NewV; 15407 } 15408 case Instruction::InsertElement: { 15409 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); 15410 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); 15411 Value *V = vectorizeOperand(E, 1, PostponedPHIs); 15412 ArrayRef<Value *> Op = E->getOperand(1); 15413 Type *ScalarTy = Op.front()->getType(); 15414 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) { 15415 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); 15416 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1)); 15417 assert(Res.first > 0 && "Expected item in MinBWs."); 15418 V = Builder.CreateIntCast( 15419 V, 15420 getWidenedType( 15421 ScalarTy, 15422 cast<FixedVectorType>(V->getType())->getNumElements()), 15423 Res.second); 15424 } 15425 15426 // Create InsertVector shuffle if necessary 15427 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 15428 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 15429 })); 15430 const unsigned NumElts = 15431 cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); 15432 const unsigned NumScalars = E->Scalars.size(); 15433 15434 unsigned Offset = *getElementIndex(VL0); 15435 assert(Offset < NumElts && "Failed to find vector index offset"); 15436 15437 // Create shuffle to resize vector 15438 SmallVector<int> Mask; 15439 if (!E->ReorderIndices.empty()) { 15440 inversePermutation(E->ReorderIndices, Mask); 15441 Mask.append(NumElts - NumScalars, PoisonMaskElem); 15442 } else { 15443 Mask.assign(NumElts, PoisonMaskElem); 15444 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); 15445 } 15446 // Create InsertVector shuffle if necessary 15447 bool IsIdentity = true; 15448 SmallVector<int> PrevMask(NumElts, 
PoisonMaskElem); 15449 Mask.swap(PrevMask); 15450 for (unsigned I = 0; I < NumScalars; ++I) { 15451 Value *Scalar = E->Scalars[PrevMask[I]]; 15452 unsigned InsertIdx = *getElementIndex(Scalar); 15453 IsIdentity &= InsertIdx - Offset == I; 15454 Mask[InsertIdx - Offset] = I; 15455 } 15456 if (!IsIdentity || NumElts != NumScalars) { 15457 Value *V2 = nullptr; 15458 bool IsVNonPoisonous = 15459 !isConstant(V) && isGuaranteedNotToBePoison(V, AC); 15460 SmallVector<int> InsertMask(Mask); 15461 if (NumElts != NumScalars && Offset == 0) { 15462 // Follow all insert element instructions from the current buildvector 15463 // sequence. 15464 InsertElementInst *Ins = cast<InsertElementInst>(VL0); 15465 do { 15466 std::optional<unsigned> InsertIdx = getElementIndex(Ins); 15467 if (!InsertIdx) 15468 break; 15469 if (InsertMask[*InsertIdx] == PoisonMaskElem) 15470 InsertMask[*InsertIdx] = *InsertIdx; 15471 if (!Ins->hasOneUse()) 15472 break; 15473 Ins = dyn_cast_or_null<InsertElementInst>( 15474 Ins->getUniqueUndroppableUser()); 15475 } while (Ins); 15476 SmallBitVector UseMask = 15477 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); 15478 SmallBitVector IsFirstPoison = 15479 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 15480 SmallBitVector IsFirstUndef = 15481 isUndefVector(FirstInsert->getOperand(0), UseMask); 15482 if (!IsFirstPoison.all()) { 15483 unsigned Idx = 0; 15484 for (unsigned I = 0; I < NumElts; I++) { 15485 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) && 15486 IsFirstUndef.test(I)) { 15487 if (IsVNonPoisonous) { 15488 InsertMask[I] = I < NumScalars ? I : 0; 15489 continue; 15490 } 15491 if (!V2) 15492 V2 = UndefValue::get(V->getType()); 15493 if (Idx >= NumScalars) 15494 Idx = NumScalars - 1; 15495 InsertMask[I] = NumScalars + Idx; 15496 ++Idx; 15497 } else if (InsertMask[I] != PoisonMaskElem && 15498 Mask[I] == PoisonMaskElem) { 15499 InsertMask[I] = PoisonMaskElem; 15500 } 15501 } 15502 } else { 15503 InsertMask = Mask; 15504 } 15505 } 15506 if (!V2) 15507 V2 = PoisonValue::get(V->getType()); 15508 V = Builder.CreateShuffleVector(V, V2, InsertMask); 15509 if (auto *I = dyn_cast<Instruction>(V)) { 15510 GatherShuffleExtractSeq.insert(I); 15511 CSEBlocks.insert(I->getParent()); 15512 } 15513 } 15514 15515 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 15516 for (unsigned I = 0; I < NumElts; I++) { 15517 if (Mask[I] != PoisonMaskElem) 15518 InsertMask[Offset + I] = I; 15519 } 15520 SmallBitVector UseMask = 15521 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); 15522 SmallBitVector IsFirstUndef = 15523 isUndefVector(FirstInsert->getOperand(0), UseMask); 15524 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && 15525 NumElts != NumScalars) { 15526 if (IsFirstUndef.all()) { 15527 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) { 15528 SmallBitVector IsFirstPoison = 15529 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 15530 if (!IsFirstPoison.all()) { 15531 for (unsigned I = 0; I < NumElts; I++) { 15532 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) 15533 InsertMask[I] = I + NumElts; 15534 } 15535 } 15536 V = Builder.CreateShuffleVector( 15537 V, 15538 IsFirstPoison.all() ? 
PoisonValue::get(V->getType()) 15539 : FirstInsert->getOperand(0), 15540 InsertMask, cast<Instruction>(E->Scalars.back())->getName()); 15541 if (auto *I = dyn_cast<Instruction>(V)) { 15542 GatherShuffleExtractSeq.insert(I); 15543 CSEBlocks.insert(I->getParent()); 15544 } 15545 } 15546 } else { 15547 SmallBitVector IsFirstPoison = 15548 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 15549 for (unsigned I = 0; I < NumElts; I++) { 15550 if (InsertMask[I] == PoisonMaskElem) 15551 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I; 15552 else 15553 InsertMask[I] += NumElts; 15554 } 15555 V = Builder.CreateShuffleVector( 15556 FirstInsert->getOperand(0), V, InsertMask, 15557 cast<Instruction>(E->Scalars.back())->getName()); 15558 if (auto *I = dyn_cast<Instruction>(V)) { 15559 GatherShuffleExtractSeq.insert(I); 15560 CSEBlocks.insert(I->getParent()); 15561 } 15562 } 15563 } 15564 15565 ++NumVectorInstructions; 15566 E->VectorizedValue = V; 15567 return V; 15568 } 15569 case Instruction::ZExt: 15570 case Instruction::SExt: 15571 case Instruction::FPToUI: 15572 case Instruction::FPToSI: 15573 case Instruction::FPExt: 15574 case Instruction::PtrToInt: 15575 case Instruction::IntToPtr: 15576 case Instruction::SIToFP: 15577 case Instruction::UIToFP: 15578 case Instruction::Trunc: 15579 case Instruction::FPTrunc: 15580 case Instruction::BitCast: { 15581 setInsertPointAfterBundle(E); 15582 15583 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); 15584 if (E->VectorizedValue) { 15585 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15586 return E->VectorizedValue; 15587 } 15588 15589 auto *CI = cast<CastInst>(VL0); 15590 Instruction::CastOps VecOpcode = CI->getOpcode(); 15591 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType(); 15592 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 15593 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() && 15594 (SrcIt != MinBWs.end() || It != MinBWs.end() || 15595 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) { 15596 // Check if the values are candidates to demote. 15597 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); 15598 if (SrcIt != MinBWs.end()) 15599 SrcBWSz = SrcIt->second.first; 15600 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType()); 15601 if (BWSz == SrcBWSz) { 15602 VecOpcode = Instruction::BitCast; 15603 } else if (BWSz < SrcBWSz) { 15604 VecOpcode = Instruction::Trunc; 15605 } else if (It != MinBWs.end()) { 15606 assert(BWSz > SrcBWSz && "Invalid cast!"); 15607 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; 15608 } else if (SrcIt != MinBWs.end()) { 15609 assert(BWSz > SrcBWSz && "Invalid cast!"); 15610 VecOpcode = 15611 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt; 15612 } 15613 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && 15614 !SrcIt->second.second) { 15615 VecOpcode = Instruction::UIToFP; 15616 } 15617 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) 15618 ? 
InVec 15619 : Builder.CreateCast(VecOpcode, InVec, VecTy); 15620 V = FinalShuffle(V, E); 15621 15622 E->VectorizedValue = V; 15623 ++NumVectorInstructions; 15624 return V; 15625 } 15626 case Instruction::FCmp: 15627 case Instruction::ICmp: { 15628 setInsertPointAfterBundle(E); 15629 15630 Value *L = vectorizeOperand(E, 0, PostponedPHIs); 15631 if (E->VectorizedValue) { 15632 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15633 return E->VectorizedValue; 15634 } 15635 Value *R = vectorizeOperand(E, 1, PostponedPHIs); 15636 if (E->VectorizedValue) { 15637 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15638 return E->VectorizedValue; 15639 } 15640 if (L->getType() != R->getType()) { 15641 assert((getOperandEntry(E, 0)->isGather() || 15642 getOperandEntry(E, 1)->isGather() || 15643 MinBWs.contains(getOperandEntry(E, 0)) || 15644 MinBWs.contains(getOperandEntry(E, 1))) && 15645 "Expected item in MinBWs."); 15646 if (cast<VectorType>(L->getType()) 15647 ->getElementType() 15648 ->getIntegerBitWidth() < cast<VectorType>(R->getType()) 15649 ->getElementType() 15650 ->getIntegerBitWidth()) { 15651 Type *CastTy = R->getType(); 15652 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0)); 15653 } else { 15654 Type *CastTy = L->getType(); 15655 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1)); 15656 } 15657 } 15658 15659 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 15660 Value *V = Builder.CreateCmp(P0, L, R); 15661 propagateIRFlags(V, E->Scalars, VL0); 15662 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end()) 15663 ICmp->setSameSign(/*B=*/false); 15664 // Do not cast for cmps. 15665 VecTy = cast<FixedVectorType>(V->getType()); 15666 V = FinalShuffle(V, E); 15667 15668 E->VectorizedValue = V; 15669 ++NumVectorInstructions; 15670 return V; 15671 } 15672 case Instruction::Select: { 15673 setInsertPointAfterBundle(E); 15674 15675 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); 15676 if (E->VectorizedValue) { 15677 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15678 return E->VectorizedValue; 15679 } 15680 Value *True = vectorizeOperand(E, 1, PostponedPHIs); 15681 if (E->VectorizedValue) { 15682 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15683 return E->VectorizedValue; 15684 } 15685 Value *False = vectorizeOperand(E, 2, PostponedPHIs); 15686 if (E->VectorizedValue) { 15687 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15688 return E->VectorizedValue; 15689 } 15690 if (True->getType() != VecTy || False->getType() != VecTy) { 15691 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() || 15692 getOperandEntry(E, 2)->isGather() || 15693 MinBWs.contains(getOperandEntry(E, 1)) || 15694 MinBWs.contains(getOperandEntry(E, 2))) && 15695 "Expected item in MinBWs."); 15696 if (True->getType() != VecTy) 15697 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1)); 15698 if (False->getType() != VecTy) 15699 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2)); 15700 } 15701 15702 unsigned CondNumElements = getNumElements(Cond->getType()); 15703 unsigned TrueNumElements = getNumElements(True->getType()); 15704 assert(TrueNumElements >= CondNumElements && 15705 TrueNumElements % CondNumElements == 0 && 15706 "Cannot vectorize Instruction::Select"); 15707 assert(TrueNumElements == getNumElements(False->getType()) && 15708 "Cannot vectorize Instruction::Select"); 15709 if (CondNumElements != TrueNumElements) { 15710 // 
When the return type is i1 but the source is fixed vector type, we 15711 // need to duplicate the condition value. 15712 Cond = Builder.CreateShuffleVector( 15713 Cond, createReplicatedMask(TrueNumElements / CondNumElements, 15714 CondNumElements)); 15715 } 15716 assert(getNumElements(Cond->getType()) == TrueNumElements && 15717 "Cannot vectorize Instruction::Select"); 15718 Value *V = Builder.CreateSelect(Cond, True, False); 15719 V = FinalShuffle(V, E); 15720 15721 E->VectorizedValue = V; 15722 ++NumVectorInstructions; 15723 return V; 15724 } 15725 case Instruction::FNeg: { 15726 setInsertPointAfterBundle(E); 15727 15728 Value *Op = vectorizeOperand(E, 0, PostponedPHIs); 15729 15730 if (E->VectorizedValue) { 15731 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15732 return E->VectorizedValue; 15733 } 15734 15735 Value *V = Builder.CreateUnOp( 15736 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); 15737 propagateIRFlags(V, E->Scalars, VL0); 15738 if (auto *I = dyn_cast<Instruction>(V)) 15739 V = ::propagateMetadata(I, E->Scalars); 15740 15741 V = FinalShuffle(V, E); 15742 15743 E->VectorizedValue = V; 15744 ++NumVectorInstructions; 15745 15746 return V; 15747 } 15748 case Instruction::Freeze: { 15749 setInsertPointAfterBundle(E); 15750 15751 Value *Op = vectorizeOperand(E, 0, PostponedPHIs); 15752 15753 if (E->VectorizedValue) { 15754 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15755 return E->VectorizedValue; 15756 } 15757 15758 if (Op->getType() != VecTy) { 15759 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || 15760 MinBWs.contains(getOperandEntry(E, 0))) && 15761 "Expected item in MinBWs."); 15762 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0)); 15763 } 15764 Value *V = Builder.CreateFreeze(Op); 15765 V = FinalShuffle(V, E); 15766 15767 E->VectorizedValue = V; 15768 ++NumVectorInstructions; 15769 15770 return V; 15771 } 15772 case Instruction::Add: 15773 case Instruction::FAdd: 15774 case Instruction::Sub: 15775 case Instruction::FSub: 15776 case Instruction::Mul: 15777 case Instruction::FMul: 15778 case Instruction::UDiv: 15779 case Instruction::SDiv: 15780 case Instruction::FDiv: 15781 case Instruction::URem: 15782 case Instruction::SRem: 15783 case Instruction::FRem: 15784 case Instruction::Shl: 15785 case Instruction::LShr: 15786 case Instruction::AShr: 15787 case Instruction::And: 15788 case Instruction::Or: 15789 case Instruction::Xor: { 15790 setInsertPointAfterBundle(E); 15791 15792 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); 15793 if (E->VectorizedValue) { 15794 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15795 return E->VectorizedValue; 15796 } 15797 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); 15798 if (E->VectorizedValue) { 15799 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15800 return E->VectorizedValue; 15801 } 15802 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { 15803 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) { 15804 ArrayRef<Value *> Ops = E->getOperand(I); 15805 if (all_of(Ops, [&](Value *Op) { 15806 auto *CI = dyn_cast<ConstantInt>(Op); 15807 return CI && CI->getValue().countr_one() >= It->second.first; 15808 })) { 15809 V = FinalShuffle(I == 0 ? 
RHS : LHS, E); 15810 E->VectorizedValue = V; 15811 ++NumVectorInstructions; 15812 return V; 15813 } 15814 } 15815 } 15816 if (LHS->getType() != VecTy || RHS->getType() != VecTy) { 15817 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || 15818 getOperandEntry(E, 1)->isGather() || 15819 MinBWs.contains(getOperandEntry(E, 0)) || 15820 MinBWs.contains(getOperandEntry(E, 1))) && 15821 "Expected item in MinBWs."); 15822 if (LHS->getType() != VecTy) 15823 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0)); 15824 if (RHS->getType() != VecTy) 15825 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1)); 15826 } 15827 15828 Value *V = Builder.CreateBinOp( 15829 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, 15830 RHS); 15831 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); 15832 if (auto *I = dyn_cast<Instruction>(V)) { 15833 V = ::propagateMetadata(I, E->Scalars); 15834 // Drop nuw flags for abs(sub(commutative), true). 15835 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub && 15836 any_of(E->Scalars, [](Value *V) { 15837 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V)); 15838 })) 15839 I->setHasNoUnsignedWrap(/*b=*/false); 15840 } 15841 15842 V = FinalShuffle(V, E); 15843 15844 E->VectorizedValue = V; 15845 ++NumVectorInstructions; 15846 15847 return V; 15848 } 15849 case Instruction::Load: { 15850 // Loads are inserted at the head of the tree because we don't want to 15851 // sink them all the way down past store instructions. 15852 setInsertPointAfterBundle(E); 15853 15854 LoadInst *LI = cast<LoadInst>(VL0); 15855 Instruction *NewLI; 15856 Value *PO = LI->getPointerOperand(); 15857 if (E->State == TreeEntry::Vectorize) { 15858 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); 15859 } else if (E->State == TreeEntry::StridedVectorize) { 15860 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand(); 15861 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand(); 15862 PO = IsReverseOrder ? PtrN : Ptr0; 15863 std::optional<int> Diff = getPointersDiff( 15864 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); 15865 Type *StrideTy = DL->getIndexType(PO->getType()); 15866 Value *StrideVal; 15867 if (Diff) { 15868 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1); 15869 StrideVal = 15870 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride * 15871 DL->getTypeAllocSize(ScalarTy)); 15872 } else { 15873 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr); 15874 transform(E->Scalars, PointerOps.begin(), [](Value *V) { 15875 return cast<LoadInst>(V)->getPointerOperand(); 15876 }); 15877 OrdersType Order; 15878 std::optional<Value *> Stride = 15879 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, 15880 &*Builder.GetInsertPoint()); 15881 Value *NewStride = 15882 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); 15883 StrideVal = Builder.CreateMul( 15884 NewStride, 15885 ConstantInt::get( 15886 StrideTy, 15887 (IsReverseOrder ? 
-1 : 1) * 15888 static_cast<int>(DL->getTypeAllocSize(ScalarTy)))); 15889 } 15890 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars); 15891 auto *Inst = Builder.CreateIntrinsic( 15892 Intrinsic::experimental_vp_strided_load, 15893 {VecTy, PO->getType(), StrideTy}, 15894 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()), 15895 Builder.getInt32(E->Scalars.size())}); 15896 Inst->addParamAttr( 15897 /*ArgNo=*/0, 15898 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); 15899 NewLI = Inst; 15900 } else { 15901 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); 15902 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); 15903 if (E->VectorizedValue) { 15904 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15905 return E->VectorizedValue; 15906 } 15907 if (isa<FixedVectorType>(ScalarTy)) { 15908 assert(SLPReVec && "FixedVectorType is not expected."); 15909 // CreateMaskedGather expects VecTy and VecPtr have same size. We need 15910 // to expand VecPtr if ScalarTy is a vector type. 15911 unsigned ScalarTyNumElements = 15912 cast<FixedVectorType>(ScalarTy)->getNumElements(); 15913 unsigned VecTyNumElements = 15914 cast<FixedVectorType>(VecTy)->getNumElements(); 15915 assert(VecTyNumElements % ScalarTyNumElements == 0 && 15916 "Cannot expand getelementptr."); 15917 unsigned VF = VecTyNumElements / ScalarTyNumElements; 15918 SmallVector<Constant *> Indices(VecTyNumElements); 15919 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) { 15920 return Builder.getInt64(I % ScalarTyNumElements); 15921 }); 15922 VecPtr = Builder.CreateGEP( 15923 VecTy->getElementType(), 15924 Builder.CreateShuffleVector( 15925 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)), 15926 ConstantVector::get(Indices)); 15927 } 15928 // Use the minimum alignment of the gathered loads. 
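// Illustrative result for the scatter-vectorized case (made-up types and
// alignment): four i32 loads from unrelated pointers become roughly
//   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(
//            <4 x ptr> %VecPtr, i32 4, <4 x i1> splat (i1 true),
//            <4 x i32> poison)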
15929 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars); 15930 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); 15931 } 15932 Value *V = ::propagateMetadata(NewLI, E->Scalars); 15933 15934 V = FinalShuffle(V, E); 15935 E->VectorizedValue = V; 15936 ++NumVectorInstructions; 15937 return V; 15938 } 15939 case Instruction::Store: { 15940 auto *SI = cast<StoreInst>(VL0); 15941 15942 setInsertPointAfterBundle(E); 15943 15944 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); 15945 if (VecValue->getType() != VecTy) 15946 VecValue = 15947 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0)); 15948 VecValue = FinalShuffle(VecValue, E); 15949 15950 Value *Ptr = SI->getPointerOperand(); 15951 Instruction *ST; 15952 if (E->State == TreeEntry::Vectorize) { 15953 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); 15954 } else { 15955 assert(E->State == TreeEntry::StridedVectorize && 15956 "Expected either strided or consecutive stores."); 15957 if (!E->ReorderIndices.empty()) { 15958 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]); 15959 Ptr = SI->getPointerOperand(); 15960 } 15961 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars); 15962 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType()); 15963 auto *Inst = Builder.CreateIntrinsic( 15964 Intrinsic::experimental_vp_strided_store, 15965 {VecTy, Ptr->getType(), StrideTy}, 15966 {VecValue, Ptr, 15967 ConstantInt::get( 15968 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))), 15969 Builder.getAllOnesMask(VecTy->getElementCount()), 15970 Builder.getInt32(E->Scalars.size())}); 15971 Inst->addParamAttr( 15972 /*ArgNo=*/1, 15973 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); 15974 ST = Inst; 15975 } 15976 15977 Value *V = ::propagateMetadata(ST, E->Scalars); 15978 15979 E->VectorizedValue = V; 15980 ++NumVectorInstructions; 15981 return V; 15982 } 15983 case Instruction::GetElementPtr: { 15984 auto *GEP0 = cast<GetElementPtrInst>(VL0); 15985 setInsertPointAfterBundle(E); 15986 15987 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); 15988 if (E->VectorizedValue) { 15989 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15990 return E->VectorizedValue; 15991 } 15992 15993 SmallVector<Value *> OpVecs; 15994 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { 15995 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); 15996 if (E->VectorizedValue) { 15997 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 15998 return E->VectorizedValue; 15999 } 16000 OpVecs.push_back(OpVec); 16001 } 16002 16003 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); 16004 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) { 16005 SmallVector<Value *> GEPs; 16006 for (Value *V : E->Scalars) { 16007 if (isa<GetElementPtrInst>(V)) 16008 GEPs.push_back(V); 16009 } 16010 V = ::propagateMetadata(I, GEPs); 16011 } 16012 16013 V = FinalShuffle(V, E); 16014 16015 E->VectorizedValue = V; 16016 ++NumVectorInstructions; 16017 16018 return V; 16019 } 16020 case Instruction::Call: { 16021 CallInst *CI = cast<CallInst>(VL0); 16022 setInsertPointAfterBundle(E); 16023 16024 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 16025 16026 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( 16027 CI, ID, VecTy->getNumElements(), 16028 It != MinBWs.end() ? 
It->second.first : 0, TTI); 16029 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); 16030 bool UseIntrinsic = ID != Intrinsic::not_intrinsic && 16031 VecCallCosts.first <= VecCallCosts.second; 16032 16033 Value *ScalarArg = nullptr; 16034 SmallVector<Value *> OpVecs; 16035 SmallVector<Type *, 2> TysForDecl; 16036 // Add return type if intrinsic is overloaded on it. 16037 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) 16038 TysForDecl.push_back(VecTy); 16039 auto *CEI = cast<CallInst>(VL0); 16040 for (unsigned I : seq<unsigned>(0, CI->arg_size())) { 16041 ValueList OpVL; 16042 // Some intrinsics have scalar arguments. This argument should not be 16043 // vectorized. 16044 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) { 16045 ScalarArg = CEI->getArgOperand(I); 16046 // if decided to reduce bitwidth of abs intrinsic, it second argument 16047 // must be set false (do not return poison, if value issigned min). 16048 if (ID == Intrinsic::abs && It != MinBWs.end() && 16049 It->second.first < DL->getTypeSizeInBits(CEI->getType())) 16050 ScalarArg = Builder.getFalse(); 16051 OpVecs.push_back(ScalarArg); 16052 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI)) 16053 TysForDecl.push_back(ScalarArg->getType()); 16054 continue; 16055 } 16056 16057 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs); 16058 if (E->VectorizedValue) { 16059 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 16060 return E->VectorizedValue; 16061 } 16062 ScalarArg = CEI->getArgOperand(I); 16063 if (cast<VectorType>(OpVec->getType())->getElementType() != 16064 ScalarArg->getType()->getScalarType() && 16065 It == MinBWs.end()) { 16066 auto *CastTy = 16067 getWidenedType(ScalarArg->getType(), VecTy->getNumElements()); 16068 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I)); 16069 } else if (It != MinBWs.end()) { 16070 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I)); 16071 } 16072 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); 16073 OpVecs.push_back(OpVec); 16074 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI)) 16075 TysForDecl.push_back(OpVec->getType()); 16076 } 16077 16078 Function *CF; 16079 if (!UseIntrinsic) { 16080 VFShape Shape = 16081 VFShape::get(CI->getFunctionType(), 16082 ElementCount::getFixed( 16083 static_cast<unsigned>(VecTy->getNumElements())), 16084 false /*HasGlobalPred*/); 16085 CF = VFDatabase(*CI).getVectorizedFunction(Shape); 16086 } else { 16087 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl); 16088 } 16089 16090 SmallVector<OperandBundleDef, 1> OpBundles; 16091 CI->getOperandBundlesAsDefs(OpBundles); 16092 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); 16093 16094 propagateIRFlags(V, E->Scalars, VL0); 16095 V = FinalShuffle(V, E); 16096 16097 E->VectorizedValue = V; 16098 ++NumVectorInstructions; 16099 return V; 16100 } 16101 case Instruction::ShuffleVector: { 16102 Value *V; 16103 if (SLPReVec && !E->isAltShuffle()) { 16104 setInsertPointAfterBundle(E); 16105 Value *Src = vectorizeOperand(E, 0, PostponedPHIs); 16106 if (E->VectorizedValue) { 16107 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 16108 return E->VectorizedValue; 16109 } 16110 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars)); 16111 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) { 16112 assert(isa<PoisonValue>(SVSrc->getOperand(1)) && 16113 "Not supported shufflevector usage."); 16114 
SmallVector<int> NewMask(ThisMask.size()); 16115 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) { 16116 return SVSrc->getShuffleMask()[Mask]; 16117 }); 16118 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask); 16119 } else { 16120 V = Builder.CreateShuffleVector(Src, ThisMask); 16121 } 16122 propagateIRFlags(V, E->Scalars, VL0); 16123 if (auto *I = dyn_cast<Instruction>(V)) 16124 V = ::propagateMetadata(I, E->Scalars); 16125 V = FinalShuffle(V, E); 16126 } else { 16127 assert(E->isAltShuffle() && 16128 ((Instruction::isBinaryOp(E->getOpcode()) && 16129 Instruction::isBinaryOp(E->getAltOpcode())) || 16130 (Instruction::isCast(E->getOpcode()) && 16131 Instruction::isCast(E->getAltOpcode())) || 16132 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 16133 "Invalid Shuffle Vector Operand"); 16134 16135 Value *LHS = nullptr, *RHS = nullptr; 16136 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { 16137 setInsertPointAfterBundle(E); 16138 LHS = vectorizeOperand(E, 0, PostponedPHIs); 16139 if (E->VectorizedValue) { 16140 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 16141 return E->VectorizedValue; 16142 } 16143 RHS = vectorizeOperand(E, 1, PostponedPHIs); 16144 } else { 16145 setInsertPointAfterBundle(E); 16146 LHS = vectorizeOperand(E, 0, PostponedPHIs); 16147 } 16148 if (E->VectorizedValue) { 16149 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 16150 return E->VectorizedValue; 16151 } 16152 if (LHS && RHS && 16153 ((Instruction::isBinaryOp(E->getOpcode()) && 16154 (LHS->getType() != VecTy || RHS->getType() != VecTy)) || 16155 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) { 16156 assert((It != MinBWs.end() || 16157 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather || 16158 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather || 16159 MinBWs.contains(getOperandEntry(E, 0)) || 16160 MinBWs.contains(getOperandEntry(E, 1))) && 16161 "Expected item in MinBWs."); 16162 Type *CastTy = VecTy; 16163 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) { 16164 if (cast<VectorType>(LHS->getType()) 16165 ->getElementType() 16166 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType()) 16167 ->getElementType() 16168 ->getIntegerBitWidth()) 16169 CastTy = RHS->getType(); 16170 else 16171 CastTy = LHS->getType(); 16172 } 16173 if (LHS->getType() != CastTy) 16174 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0)); 16175 if (RHS->getType() != CastTy) 16176 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1)); 16177 } 16178 16179 Value *V0, *V1; 16180 if (Instruction::isBinaryOp(E->getOpcode())) { 16181 V0 = Builder.CreateBinOp( 16182 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); 16183 V1 = Builder.CreateBinOp( 16184 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); 16185 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 16186 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); 16187 auto *AltCI = cast<CmpInst>(E->getAltOp()); 16188 CmpInst::Predicate AltPred = AltCI->getPredicate(); 16189 V1 = Builder.CreateCmp(AltPred, LHS, RHS); 16190 } else { 16191 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) { 16192 unsigned SrcBWSz = DL->getTypeSizeInBits( 16193 cast<VectorType>(LHS->getType())->getElementType()); 16194 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 16195 if (BWSz <= SrcBWSz) { 16196 if (BWSz < SrcBWSz) 16197 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first); 16198 assert(LHS->getType() == VecTy && 
16199 "Expected same type as operand."); 16200 if (auto *I = dyn_cast<Instruction>(LHS)) 16201 LHS = ::propagateMetadata(I, E->Scalars); 16202 LHS = FinalShuffle(LHS, E); 16203 E->VectorizedValue = LHS; 16204 ++NumVectorInstructions; 16205 return LHS; 16206 } 16207 } 16208 V0 = Builder.CreateCast( 16209 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); 16210 V1 = Builder.CreateCast( 16211 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); 16212 } 16213 // Add V0 and V1 to later analysis to try to find and remove matching 16214 // instruction, if any. 16215 for (Value *V : {V0, V1}) { 16216 if (auto *I = dyn_cast<Instruction>(V)) { 16217 GatherShuffleExtractSeq.insert(I); 16218 CSEBlocks.insert(I->getParent()); 16219 } 16220 } 16221 16222 // Create shuffle to take alternate operations from the vector. 16223 // Also, gather up main and alt scalar ops to propagate IR flags to 16224 // each vector operation. 16225 ValueList OpScalars, AltScalars; 16226 SmallVector<int> Mask; 16227 E->buildAltOpShuffleMask( 16228 [E, this](Instruction *I) { 16229 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 16230 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), 16231 *TLI); 16232 }, 16233 Mask, &OpScalars, &AltScalars); 16234 16235 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end()); 16236 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end()); 16237 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) { 16238 // Drop nuw flags for abs(sub(commutative), true). 16239 if (auto *I = dyn_cast<Instruction>(Vec); 16240 I && Opcode == Instruction::Sub && !MinBWs.contains(E) && 16241 any_of(E->Scalars, [](Value *V) { 16242 if (isa<PoisonValue>(V)) 16243 return false; 16244 auto *IV = cast<Instruction>(V); 16245 return IV->getOpcode() == Instruction::Sub && isCommutative(IV); 16246 })) 16247 I->setHasNoUnsignedWrap(/*b=*/false); 16248 }; 16249 DropNuwFlag(V0, E->getOpcode()); 16250 DropNuwFlag(V1, E->getAltOpcode()); 16251 16252 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) { 16253 assert(SLPReVec && "FixedVectorType is not expected."); 16254 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask); 16255 } 16256 V = Builder.CreateShuffleVector(V0, V1, Mask); 16257 if (auto *I = dyn_cast<Instruction>(V)) { 16258 V = ::propagateMetadata(I, E->Scalars); 16259 GatherShuffleExtractSeq.insert(I); 16260 CSEBlocks.insert(I->getParent()); 16261 } 16262 } 16263 16264 E->VectorizedValue = V; 16265 ++NumVectorInstructions; 16266 16267 return V; 16268 } 16269 default: 16270 llvm_unreachable("unknown inst"); 16271 } 16272 return nullptr; 16273 } 16274 16275 Value *BoUpSLP::vectorizeTree() { 16276 ExtraValueToDebugLocsMap ExternallyUsedValues; 16277 return vectorizeTree(ExternallyUsedValues); 16278 } 16279 16280 Value * 16281 BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, 16282 Instruction *ReductionRoot) { 16283 // All blocks must be scheduled before any instructions are inserted. 16284 for (auto &BSIter : BlocksSchedules) { 16285 scheduleBlock(BSIter.second.get()); 16286 } 16287 // Clean Entry-to-LastInstruction table. It can be affected after scheduling, 16288 // need to rebuild it. 
16289 EntryToLastInstruction.clear();
16290
16291 if (ReductionRoot)
16292 Builder.SetInsertPoint(ReductionRoot->getParent(),
16293 ReductionRoot->getIterator());
16294 else
16295 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16296
16297 // Emit gathered loads first to emit better code for the users of those
16298 // gathered loads.
16299 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16300 if (GatheredLoadsEntriesFirst.has_value() &&
16301 TE->Idx >= *GatheredLoadsEntriesFirst &&
16302 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16303 assert((!TE->UserTreeIndices.empty() ||
16304 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16305 "Expected gathered load node.");
16306 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16307 }
16308 }
16309 // Postpone emission of PHI operands to avoid cyclic dependency issues.
16310 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16311 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16312 if (TE->State == TreeEntry::Vectorize &&
16313 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16314 TE->VectorizedValue)
16315 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16316 // Run through the list of postponed gathers and emit them, replacing the temp
16317 // emitted allocas with actual vector instructions.
16318 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16319 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16320 for (const TreeEntry *E : PostponedNodes) {
16321 auto *TE = const_cast<TreeEntry *>(E);
16322 if (auto *VecTE = getSameValuesTreeEntry(
16323 TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
16324 TE->UserTreeIndices.front().EdgeIdx));
16325 VecTE && VecTE->isSame(TE->Scalars))
16326 // Found a gather node which is exactly the same as one of the
16327 // vectorized nodes. It may happen after reordering.
16328 continue;
16329 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16330 TE->VectorizedValue = nullptr;
16331 auto *UserI =
16332 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16333 // If the user is a PHI node, its vector code has to be inserted right before
16334 // the block terminator. Since the node was delayed, there were some unresolved
16335 // dependencies at the moment the stub instruction was emitted. If any of these
16336 // dependencies turns out to be an operand of another PHI coming from this same
16337 // block, the position of the stub instruction becomes invalid. This is because
16338 // the source vector that is supposed to feed this gather node was inserted at
16339 // the end of the block [after the stub instruction]. So we need to adjust the
16340 // insertion point again to the end of the block.
16341 if (isa<PHINode>(UserI)) {
16342 // Insert before all users.
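// (I.e. start from the block terminator and move the insertion point up to the
// earliest non-PHI user of PrevVec in this block, so the emitted vector code
// ends up before all of its in-block users.)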
16343 Instruction *InsertPt = PrevVec->getParent()->getTerminator(); 16344 for (User *U : PrevVec->users()) { 16345 if (U == UserI) 16346 continue; 16347 auto *UI = dyn_cast<Instruction>(U); 16348 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent()) 16349 continue; 16350 if (UI->comesBefore(InsertPt)) 16351 InsertPt = UI; 16352 } 16353 Builder.SetInsertPoint(InsertPt); 16354 } else { 16355 Builder.SetInsertPoint(PrevVec); 16356 } 16357 Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); 16358 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); 16359 if (auto *VecI = dyn_cast<Instruction>(Vec); 16360 VecI && VecI->getParent() == Builder.GetInsertBlock() && 16361 Builder.GetInsertPoint()->comesBefore(VecI)) 16362 VecI->moveBeforePreserving(*Builder.GetInsertBlock(), 16363 Builder.GetInsertPoint()); 16364 if (Vec->getType() != PrevVec->getType()) { 16365 assert(Vec->getType()->isIntOrIntVectorTy() && 16366 PrevVec->getType()->isIntOrIntVectorTy() && 16367 "Expected integer vector types only."); 16368 std::optional<bool> IsSigned; 16369 for (Value *V : TE->Scalars) { 16370 if (isVectorized(V)) { 16371 for (const TreeEntry *MNTE : getTreeEntries(V)) { 16372 auto It = MinBWs.find(MNTE); 16373 if (It != MinBWs.end()) { 16374 IsSigned = IsSigned.value_or(false) || It->second.second; 16375 if (*IsSigned) 16376 break; 16377 } 16378 } 16379 if (IsSigned.value_or(false)) 16380 break; 16381 // Scan through gather nodes. 16382 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) { 16383 auto It = MinBWs.find(BVE); 16384 if (It != MinBWs.end()) { 16385 IsSigned = IsSigned.value_or(false) || It->second.second; 16386 if (*IsSigned) 16387 break; 16388 } 16389 } 16390 if (IsSigned.value_or(false)) 16391 break; 16392 if (auto *EE = dyn_cast<ExtractElementInst>(V)) { 16393 IsSigned = 16394 IsSigned.value_or(false) || 16395 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL)); 16396 continue; 16397 } 16398 if (IsSigned.value_or(false)) 16399 break; 16400 } 16401 } 16402 if (IsSigned.value_or(false)) { 16403 // Final attempt - check user node. 16404 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE); 16405 if (It != MinBWs.end()) 16406 IsSigned = It->second.second; 16407 } 16408 assert(IsSigned && 16409 "Expected user node or perfect diamond match in MinBWs."); 16410 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned); 16411 } 16412 PrevVec->replaceAllUsesWith(Vec); 16413 PostponedValues.try_emplace(Vec).first->second.push_back(TE); 16414 // Replace the stub vector node, if it was used before for one of the 16415 // buildvector nodes already. 16416 auto It = PostponedValues.find(PrevVec); 16417 if (It != PostponedValues.end()) { 16418 for (TreeEntry *VTE : It->getSecond()) 16419 VTE->VectorizedValue = Vec; 16420 } 16421 eraseInstruction(PrevVec); 16422 } 16423 16424 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() 16425 << " values .\n"); 16426 16427 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts; 16428 // Maps vector instruction to original insertelement instruction 16429 DenseMap<Value *, InsertElementInst *> VectorToInsertElement; 16430 // Maps extract Scalar to the corresponding extractelement instruction in the 16431 // basic block. Only one extractelement per block should be emitted. 
16432 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>> 16433 ScalarToEEs; 16434 SmallDenseSet<Value *, 4> UsedInserts; 16435 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts; 16436 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser; 16437 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts; 16438 // Extract all of the elements with the external uses. 16439 for (const auto &ExternalUse : ExternalUses) { 16440 Value *Scalar = ExternalUse.Scalar; 16441 llvm::User *User = ExternalUse.User; 16442 16443 // Skip users that we already RAUW. This happens when one instruction 16444 // has multiple uses of the same value. 16445 if (User && !is_contained(Scalar->users(), User)) 16446 continue; 16447 const TreeEntry *E = &ExternalUse.E; 16448 assert(E && "Invalid scalar"); 16449 assert(!E->isGather() && "Extracting from a gather list"); 16450 // Non-instruction pointers are not deleted, just skip them. 16451 if (E->getOpcode() == Instruction::GetElementPtr && 16452 !isa<GetElementPtrInst>(Scalar)) 16453 continue; 16454 16455 Value *Vec = E->VectorizedValue; 16456 assert(Vec && "Can't find vectorizable value"); 16457 16458 Value *Lane = Builder.getInt32(ExternalUse.Lane); 16459 auto ExtractAndExtendIfNeeded = [&](Value *Vec) { 16460 if (Scalar->getType() != Vec->getType()) { 16461 Value *Ex = nullptr; 16462 Value *ExV = nullptr; 16463 auto *Inst = dyn_cast<Instruction>(Scalar); 16464 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst); 16465 auto It = ScalarToEEs.find(Scalar); 16466 if (It != ScalarToEEs.end()) { 16467 // No need to emit many extracts, just move the only one in the 16468 // current block. 16469 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent() 16470 : Builder.GetInsertBlock()); 16471 if (EEIt != It->second.end()) { 16472 Value *PrevV = EEIt->second.first; 16473 if (auto *I = dyn_cast<Instruction>(PrevV); 16474 I && !ReplaceInst && 16475 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && 16476 Builder.GetInsertPoint()->comesBefore(I)) { 16477 I->moveBefore(*Builder.GetInsertPoint()->getParent(), 16478 Builder.GetInsertPoint()); 16479 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second)) 16480 CI->moveAfter(I); 16481 } 16482 Ex = PrevV; 16483 ExV = EEIt->second.second ? EEIt->second.second : Ex; 16484 } 16485 } 16486 if (!Ex) { 16487 // "Reuse" the existing extract to improve final codegen. 16488 if (ReplaceInst) { 16489 // Leave the instruction as is, if it cheaper extracts and all 16490 // operands are scalar. 
16491 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) { 16492 IgnoredExtracts.insert(EE); 16493 Ex = EE; 16494 } else { 16495 auto *CloneInst = Inst->clone(); 16496 CloneInst->insertBefore(Inst->getIterator()); 16497 if (Inst->hasName()) 16498 CloneInst->takeName(Inst); 16499 Ex = CloneInst; 16500 } 16501 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar); 16502 ES && isa<Instruction>(Vec)) { 16503 Value *V = ES->getVectorOperand(); 16504 auto *IVec = cast<Instruction>(Vec); 16505 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty()) 16506 V = ETEs.front()->VectorizedValue; 16507 if (auto *IV = dyn_cast<Instruction>(V); 16508 !IV || IV == Vec || IV->getParent() != IVec->getParent() || 16509 IV->comesBefore(IVec)) 16510 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); 16511 else 16512 Ex = Builder.CreateExtractElement(Vec, Lane); 16513 } else if (auto *VecTy = 16514 dyn_cast<FixedVectorType>(Scalar->getType())) { 16515 assert(SLPReVec && "FixedVectorType is not expected."); 16516 unsigned VecTyNumElements = VecTy->getNumElements(); 16517 // When REVEC is enabled, we need to extract a vector. 16518 // Note: The element size of Scalar may be different from the 16519 // element size of Vec. 16520 Ex = createExtractVector(Builder, Vec, VecTyNumElements, 16521 ExternalUse.Lane * VecTyNumElements); 16522 } else { 16523 Ex = Builder.CreateExtractElement(Vec, Lane); 16524 } 16525 // If necessary, sign-extend or zero-extend ScalarRoot 16526 // to the larger type. 16527 ExV = Ex; 16528 if (Scalar->getType() != Ex->getType()) 16529 ExV = Builder.CreateIntCast( 16530 Ex, Scalar->getType(), 16531 !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); 16532 auto *I = dyn_cast<Instruction>(Ex); 16533 ScalarToEEs[Scalar].try_emplace(I ? I->getParent() 16534 : &F->getEntryBlock(), 16535 std::make_pair(Ex, ExV)); 16536 } 16537 // The then branch of the previous if may produce constants, since 0 16538 // operand might be a constant. 16539 if (auto *ExI = dyn_cast<Instruction>(Ex); 16540 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) { 16541 GatherShuffleExtractSeq.insert(ExI); 16542 CSEBlocks.insert(ExI->getParent()); 16543 } 16544 return ExV; 16545 } 16546 assert(isa<FixedVectorType>(Scalar->getType()) && 16547 isa<InsertElementInst>(Scalar) && 16548 "In-tree scalar of vector type is not insertelement?"); 16549 auto *IE = cast<InsertElementInst>(Scalar); 16550 VectorToInsertElement.try_emplace(Vec, IE); 16551 return Vec; 16552 }; 16553 // If User == nullptr, the Scalar remains as scalar in vectorized 16554 // instructions or is used as extra arg. Generate ExtractElement instruction 16555 // and update the record for this scalar in ExternallyUsedValues. 
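// (ScalarsWithNullptrUser below makes sure each such scalar is processed only
// once, even if it appears in several ExternalUses entries.)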
16556 if (!User) { 16557 if (!ScalarsWithNullptrUser.insert(Scalar).second) 16558 continue; 16559 assert( 16560 (ExternallyUsedValues.count(Scalar) || 16561 Scalar->hasNUsesOrMore(UsesLimit) || 16562 ExternalUsesAsOriginalScalar.contains(Scalar) || 16563 any_of( 16564 Scalar->users(), 16565 [&, TTI = TTI](llvm::User *U) { 16566 if (ExternalUsesAsOriginalScalar.contains(U)) 16567 return true; 16568 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U); 16569 return !UseEntries.empty() && 16570 (E->State == TreeEntry::Vectorize || 16571 E->State == TreeEntry::StridedVectorize) && 16572 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { 16573 return (UseEntry->State == TreeEntry::Vectorize || 16574 UseEntry->State == 16575 TreeEntry::StridedVectorize) && 16576 doesInTreeUserNeedToExtract( 16577 Scalar, getRootEntryInstruction(*UseEntry), 16578 TLI, TTI); 16579 }); 16580 })) && 16581 "Scalar with nullptr User must be registered in " 16582 "ExternallyUsedValues map or remain as scalar in vectorized " 16583 "instructions"); 16584 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 16585 if (auto *PHI = dyn_cast<PHINode>(VecI)) { 16586 if (PHI->getParent()->isLandingPad()) 16587 Builder.SetInsertPoint( 16588 PHI->getParent(), 16589 std::next( 16590 PHI->getParent()->getLandingPadInst()->getIterator())); 16591 else 16592 Builder.SetInsertPoint(PHI->getParent(), 16593 PHI->getParent()->getFirstNonPHIIt()); 16594 } else { 16595 Builder.SetInsertPoint(VecI->getParent(), 16596 std::next(VecI->getIterator())); 16597 } 16598 } else { 16599 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); 16600 } 16601 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 16602 // Required to update internally referenced instructions. 16603 if (Scalar != NewInst) { 16604 assert((!isa<ExtractElementInst>(Scalar) || 16605 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) && 16606 "Extractelements should not be replaced."); 16607 Scalar->replaceAllUsesWith(NewInst); 16608 } 16609 continue; 16610 } 16611 16612 if (auto *VU = dyn_cast<InsertElementInst>(User); 16613 VU && VU->getOperand(1) == Scalar) { 16614 // Skip if the scalar is another vector op or Vec is not an instruction. 16615 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { 16616 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { 16617 if (!UsedInserts.insert(VU).second) 16618 continue; 16619 // Need to use original vector, if the root is truncated. 
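// (If this entry was narrowed via MinBWs, the vector is cast back to the
// original element type first; the cast is cached in VectorCasts so that it is
// emitted only once per (vector, element type) pair.)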
16620 auto BWIt = MinBWs.find(E); 16621 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { 16622 auto *ScalarTy = FTy->getElementType(); 16623 auto Key = std::make_pair(Vec, ScalarTy); 16624 auto VecIt = VectorCasts.find(Key); 16625 if (VecIt == VectorCasts.end()) { 16626 IRBuilderBase::InsertPointGuard Guard(Builder); 16627 if (auto *IVec = dyn_cast<PHINode>(Vec)) { 16628 if (IVec->getParent()->isLandingPad()) 16629 Builder.SetInsertPoint(IVec->getParent(), 16630 std::next(IVec->getParent() 16631 ->getLandingPadInst() 16632 ->getIterator())); 16633 else 16634 Builder.SetInsertPoint( 16635 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime()); 16636 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) { 16637 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); 16638 } 16639 Vec = Builder.CreateIntCast( 16640 Vec, 16641 getWidenedType( 16642 ScalarTy, 16643 cast<FixedVectorType>(Vec->getType())->getNumElements()), 16644 BWIt->second.second); 16645 VectorCasts.try_emplace(Key, Vec); 16646 } else { 16647 Vec = VecIt->second; 16648 } 16649 } 16650 16651 std::optional<unsigned> InsertIdx = getElementIndex(VU); 16652 if (InsertIdx) { 16653 auto *It = find_if( 16654 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) { 16655 // Checks if 2 insertelements are from the same buildvector. 16656 InsertElementInst *VecInsert = Data.InsertElements.front(); 16657 return areTwoInsertFromSameBuildVector( 16658 VU, VecInsert, 16659 [](InsertElementInst *II) { return II->getOperand(0); }); 16660 }); 16661 unsigned Idx = *InsertIdx; 16662 if (It == ShuffledInserts.end()) { 16663 (void)ShuffledInserts.emplace_back(); 16664 It = std::next(ShuffledInserts.begin(), 16665 ShuffledInserts.size() - 1); 16666 } 16667 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; 16668 if (Mask.empty()) 16669 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 16670 Mask[Idx] = ExternalUse.Lane; 16671 It->InsertElements.push_back(cast<InsertElementInst>(User)); 16672 continue; 16673 } 16674 } 16675 } 16676 } 16677 16678 // Generate extracts for out-of-tree users. 16679 // Find the insertion point for the extractelement lane. 
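// For a PHI user the extract is emitted at the end of the corresponding
// incoming block (unless that block ends in a catchswitch); for any other user
// it is emitted right before the user itself.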
16680 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 16681 if (PHINode *PH = dyn_cast<PHINode>(User)) { 16682 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { 16683 if (PH->getIncomingValue(I) == Scalar) { 16684 Instruction *IncomingTerminator = 16685 PH->getIncomingBlock(I)->getTerminator(); 16686 if (isa<CatchSwitchInst>(IncomingTerminator)) { 16687 Builder.SetInsertPoint(VecI->getParent(), 16688 std::next(VecI->getIterator())); 16689 } else { 16690 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator()); 16691 } 16692 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 16693 PH->setOperand(I, NewInst); 16694 } 16695 } 16696 } else { 16697 Builder.SetInsertPoint(cast<Instruction>(User)); 16698 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 16699 User->replaceUsesOfWith(Scalar, NewInst); 16700 } 16701 } else { 16702 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); 16703 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 16704 User->replaceUsesOfWith(Scalar, NewInst); 16705 } 16706 16707 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 16708 } 16709 16710 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { 16711 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 16712 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 16713 int VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 16714 for (int I = 0, E = Mask.size(); I < E; ++I) { 16715 if (Mask[I] < VF) 16716 CombinedMask1[I] = Mask[I]; 16717 else 16718 CombinedMask2[I] = Mask[I] - VF; 16719 } 16720 ShuffleInstructionBuilder ShuffleBuilder( 16721 cast<VectorType>(V1->getType())->getElementType(), Builder, *this); 16722 ShuffleBuilder.add(V1, CombinedMask1); 16723 if (V2) 16724 ShuffleBuilder.add(V2, CombinedMask2); 16725 return ShuffleBuilder.finalize({}, {}, {}); 16726 }; 16727 16728 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask, 16729 bool ForSingleMask) { 16730 unsigned VF = Mask.size(); 16731 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); 16732 if (VF != VecVF) { 16733 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { 16734 Vec = CreateShuffle(Vec, nullptr, Mask); 16735 return std::make_pair(Vec, true); 16736 } 16737 if (!ForSingleMask) { 16738 SmallVector<int> ResizeMask(VF, PoisonMaskElem); 16739 for (unsigned I = 0; I < VF; ++I) { 16740 if (Mask[I] != PoisonMaskElem) 16741 ResizeMask[Mask[I]] = Mask[I]; 16742 } 16743 Vec = CreateShuffle(Vec, nullptr, ResizeMask); 16744 } 16745 } 16746 16747 return std::make_pair(Vec, false); 16748 }; 16749 // Perform shuffling of the vectorize tree entries for better handling of 16750 // external extracts. 16751 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { 16752 // Find the first and the last instruction in the list of insertelements. 
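// (The list is sorted below so that the first insertelement of the buildvector
// chain comes first; the combined shuffle is then emitted at the last
// insertelement.)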
16753 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); 16754 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); 16755 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); 16756 Builder.SetInsertPoint(LastInsert); 16757 auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); 16758 Value *NewInst = performExtractsShuffleAction<Value>( 16759 MutableArrayRef(Vector.data(), Vector.size()), 16760 FirstInsert->getOperand(0), 16761 [](Value *Vec) { 16762 return cast<VectorType>(Vec->getType()) 16763 ->getElementCount() 16764 .getKnownMinValue(); 16765 }, 16766 ResizeToVF, 16767 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, 16768 ArrayRef<Value *> Vals) { 16769 assert((Vals.size() == 1 || Vals.size() == 2) && 16770 "Expected exactly 1 or 2 input values."); 16771 if (Vals.size() == 1) { 16772 // Do not create shuffle if the mask is a simple identity 16773 // non-resizing mask. 16774 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) 16775 ->getNumElements() || 16776 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) 16777 return CreateShuffle(Vals.front(), nullptr, Mask); 16778 return Vals.front(); 16779 } 16780 return CreateShuffle(Vals.front() ? Vals.front() 16781 : FirstInsert->getOperand(0), 16782 Vals.back(), Mask); 16783 }); 16784 auto It = ShuffledInserts[I].InsertElements.rbegin(); 16785 // Rebuild buildvector chain. 16786 InsertElementInst *II = nullptr; 16787 if (It != ShuffledInserts[I].InsertElements.rend()) 16788 II = *It; 16789 SmallVector<Instruction *> Inserts; 16790 while (It != ShuffledInserts[I].InsertElements.rend()) { 16791 assert(II && "Must be an insertelement instruction."); 16792 if (*It == II) 16793 ++It; 16794 else 16795 Inserts.push_back(cast<Instruction>(II)); 16796 II = dyn_cast<InsertElementInst>(II->getOperand(0)); 16797 } 16798 for (Instruction *II : reverse(Inserts)) { 16799 II->replaceUsesOfWith(II->getOperand(0), NewInst); 16800 if (auto *NewI = dyn_cast<Instruction>(NewInst)) 16801 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) 16802 II->moveAfter(NewI); 16803 NewInst = II; 16804 } 16805 LastInsert->replaceAllUsesWith(NewInst); 16806 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { 16807 IE->replaceUsesOfWith(IE->getOperand(0), 16808 PoisonValue::get(IE->getOperand(0)->getType())); 16809 IE->replaceUsesOfWith(IE->getOperand(1), 16810 PoisonValue::get(IE->getOperand(1)->getType())); 16811 eraseInstruction(IE); 16812 } 16813 CSEBlocks.insert(LastInsert->getParent()); 16814 } 16815 16816 SmallVector<Instruction *> RemovedInsts; 16817 // For each vectorized value: 16818 for (auto &TEPtr : VectorizableTree) { 16819 TreeEntry *Entry = TEPtr.get(); 16820 16821 // No need to handle users of gathered values. 
16822 if (Entry->isGather()) 16823 continue; 16824 16825 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 16826 16827 // For each lane: 16828 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 16829 Value *Scalar = Entry->Scalars[Lane]; 16830 16831 if (Entry->getOpcode() == Instruction::GetElementPtr && 16832 !isa<GetElementPtrInst>(Scalar)) 16833 continue; 16834 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar); 16835 EE && IgnoredExtracts.contains(EE)) 16836 continue; 16837 if (isa<PoisonValue>(Scalar)) 16838 continue; 16839 #ifndef NDEBUG 16840 Type *Ty = Scalar->getType(); 16841 if (!Ty->isVoidTy()) { 16842 for (User *U : Scalar->users()) { 16843 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 16844 16845 // It is legal to delete users in the ignorelist. 16846 assert((isVectorized(U) || 16847 (UserIgnoreList && UserIgnoreList->contains(U)) || 16848 (isa_and_nonnull<Instruction>(U) && 16849 isDeleted(cast<Instruction>(U)))) && 16850 "Deleting out-of-tree value"); 16851 } 16852 } 16853 #endif 16854 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 16855 auto *I = cast<Instruction>(Scalar); 16856 RemovedInsts.push_back(I); 16857 } 16858 } 16859 16860 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the 16861 // new vector instruction. 16862 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue)) 16863 V->mergeDIAssignID(RemovedInsts); 16864 16865 // Clear up reduction references, if any. 16866 if (UserIgnoreList) { 16867 for (Instruction *I : RemovedInsts) { 16868 const TreeEntry *IE = getTreeEntries(I).front(); 16869 if (IE->Idx != 0 && 16870 !(VectorizableTree.front()->isGather() && 16871 !IE->UserTreeIndices.empty() && 16872 (ValueToGatherNodes.lookup(I).contains( 16873 VectorizableTree.front().get()) || 16874 any_of(IE->UserTreeIndices, 16875 [&](const EdgeInfo &EI) { 16876 return EI.UserTE == VectorizableTree.front().get() && 16877 EI.EdgeIdx == UINT_MAX; 16878 }))) && 16879 !(GatheredLoadsEntriesFirst.has_value() && 16880 IE->Idx >= *GatheredLoadsEntriesFirst && 16881 VectorizableTree.front()->isGather() && 16882 is_contained(VectorizableTree.front()->Scalars, I))) 16883 continue; 16884 SmallVector<SelectInst *> LogicalOpSelects; 16885 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { 16886 // Do not replace condition of the logical op in form select <cond>. 16887 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) && 16888 (match(U.getUser(), m_LogicalAnd()) || 16889 match(U.getUser(), m_LogicalOr())) && 16890 U.getOperandNo() == 0; 16891 if (IsPoisoningLogicalOp) { 16892 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser())); 16893 return false; 16894 } 16895 return UserIgnoreList->contains(U.getUser()); 16896 }); 16897 // Replace conditions of the poisoning logical ops with the non-poison 16898 // constant value. 16899 for (SelectInst *SI : LogicalOpSelects) 16900 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType())); 16901 } 16902 } 16903 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias 16904 // cache correctness. 16905 // NOTE: removeInstructionAndOperands only marks the instruction for deletion 16906 // - instructions are not deleted until later. 
16907 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16908
16909 Builder.ClearInsertionPoint();
16910 InstrElementSize.clear();
16911
16912 const TreeEntry &RootTE = *VectorizableTree.front();
16913 Value *Vec = RootTE.VectorizedValue;
16914 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16915 It != MinBWs.end() &&
16916 ReductionBitWidth != It->second.first) {
16917 IRBuilder<>::InsertPointGuard Guard(Builder);
16918 Builder.SetInsertPoint(ReductionRoot->getParent(),
16919 ReductionRoot->getIterator());
16920 Vec = Builder.CreateIntCast(
16921 Vec,
16922 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16923 cast<VectorType>(Vec->getType())->getElementCount()),
16924 It->second.second);
16925 }
16926 return Vec;
16927 }
16928
16929 void BoUpSLP::optimizeGatherSequence() {
16930 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16931 << " gather sequence instructions.\n");
16932 // LICM InsertElementInst sequences.
16933 for (Instruction *I : GatherShuffleExtractSeq) {
16934 if (isDeleted(I))
16935 continue;
16936
16937 // Check if this block is inside a loop.
16938 Loop *L = LI->getLoopFor(I->getParent());
16939 if (!L)
16940 continue;
16941
16942 // Check if it has a preheader.
16943 BasicBlock *PreHeader = L->getLoopPreheader();
16944 if (!PreHeader)
16945 continue;
16946
16947 // If the vector or the element that we insert into it are
16948 // instructions that are defined inside the loop, then we can't
16949 // hoist this instruction.
16950 if (any_of(I->operands(), [L](Value *V) {
16951 auto *OpI = dyn_cast<Instruction>(V);
16952 return OpI && L->contains(OpI);
16953 }))
16954 continue;
16955
16956 // We can hoist this instruction. Move it to the pre-header.
16957 I->moveBefore(PreHeader->getTerminator()->getIterator());
16958 CSEBlocks.insert(PreHeader);
16959 }
16960
16961 // Make a list of all reachable blocks in our CSE queue.
16962 SmallVector<const DomTreeNode *, 8> CSEWorkList;
16963 CSEWorkList.reserve(CSEBlocks.size());
16964 for (BasicBlock *BB : CSEBlocks)
16965 if (DomTreeNode *N = DT->getNode(BB)) {
16966 assert(DT->isReachableFromEntry(N));
16967 CSEWorkList.push_back(N);
16968 }
16969
16970 // Sort blocks by domination. This ensures we visit a block after all blocks
16971 // dominating it are visited.
16972 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16973 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16974 "Different nodes should have different DFS numbers");
16975 return A->getDFSNumIn() < B->getDFSNumIn();
16976 });
16977
16978 // Less defined shuffles can be replaced by the more defined copies.
16979 // Between two shuffles one is less defined if it has the same vector operands
16980 // and its mask indices are the same as in the first one or undefs. E.g.
16981 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
16982 // poison, <0, 0, 0, 0>.
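// When such a pair is found in the scan below, the less defined shuffle is
// replaced by the more defined one, and the surviving shuffle receives the
// merged mask computed by IsIdenticalOrLessDefined (in the example above that
// merged mask is <0, 0, 0, 0>).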
16983 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1, 16984 Instruction *I2, 16985 SmallVectorImpl<int> &NewMask) { 16986 if (I1->getType() != I2->getType()) 16987 return false; 16988 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); 16989 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); 16990 if (!SI1 || !SI2) 16991 return I1->isIdenticalTo(I2); 16992 if (SI1->isIdenticalTo(SI2)) 16993 return true; 16994 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) 16995 if (SI1->getOperand(I) != SI2->getOperand(I)) 16996 return false; 16997 // Check if the second instruction is more defined than the first one. 16998 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); 16999 ArrayRef<int> SM1 = SI1->getShuffleMask(); 17000 // Count trailing undefs in the mask to check the final number of used 17001 // registers. 17002 unsigned LastUndefsCnt = 0; 17003 for (int I = 0, E = NewMask.size(); I < E; ++I) { 17004 if (SM1[I] == PoisonMaskElem) 17005 ++LastUndefsCnt; 17006 else 17007 LastUndefsCnt = 0; 17008 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem && 17009 NewMask[I] != SM1[I]) 17010 return false; 17011 if (NewMask[I] == PoisonMaskElem) 17012 NewMask[I] = SM1[I]; 17013 } 17014 // Check if the last undefs actually change the final number of used vector 17015 // registers. 17016 return SM1.size() - LastUndefsCnt > 1 && 17017 ::getNumberOfParts(*TTI, SI1->getType()) == 17018 ::getNumberOfParts( 17019 *TTI, getWidenedType(SI1->getType()->getElementType(), 17020 SM1.size() - LastUndefsCnt)); 17021 }; 17022 // Perform O(N^2) search over the gather/shuffle sequences and merge identical 17023 // instructions. TODO: We can further optimize this scan if we split the 17024 // instructions into different buckets based on the insert lane. 17025 SmallVector<Instruction *, 16> Visited; 17026 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 17027 assert(*I && 17028 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 17029 "Worklist not sorted properly!"); 17030 BasicBlock *BB = (*I)->getBlock(); 17031 // For all instructions in blocks containing gather sequences: 17032 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 17033 if (isDeleted(&In)) 17034 continue; 17035 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) && 17036 !GatherShuffleExtractSeq.contains(&In)) 17037 continue; 17038 17039 // Check if we can replace this instruction with any of the 17040 // visited instructions. 
17041 bool Replaced = false; 17042 for (Instruction *&V : Visited) { 17043 SmallVector<int> NewMask; 17044 if (IsIdenticalOrLessDefined(&In, V, NewMask) && 17045 DT->dominates(V->getParent(), In.getParent())) { 17046 In.replaceAllUsesWith(V); 17047 eraseInstruction(&In); 17048 if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) 17049 if (!NewMask.empty()) 17050 SI->setShuffleMask(NewMask); 17051 Replaced = true; 17052 break; 17053 } 17054 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && 17055 GatherShuffleExtractSeq.contains(V) && 17056 IsIdenticalOrLessDefined(V, &In, NewMask) && 17057 DT->dominates(In.getParent(), V->getParent())) { 17058 In.moveAfter(V); 17059 V->replaceAllUsesWith(&In); 17060 eraseInstruction(V); 17061 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) 17062 if (!NewMask.empty()) 17063 SI->setShuffleMask(NewMask); 17064 V = &In; 17065 Replaced = true; 17066 break; 17067 } 17068 } 17069 if (!Replaced) { 17070 assert(!is_contained(Visited, &In)); 17071 Visited.push_back(&In); 17072 } 17073 } 17074 } 17075 CSEBlocks.clear(); 17076 GatherShuffleExtractSeq.clear(); 17077 } 17078 17079 BoUpSLP::ScheduleData * 17080 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { 17081 ScheduleData *Bundle = nullptr; 17082 ScheduleData *PrevInBundle = nullptr; 17083 for (Value *V : VL) { 17084 if (doesNotNeedToBeScheduled(V)) 17085 continue; 17086 ScheduleData *BundleMember = getScheduleData(V); 17087 assert(BundleMember && 17088 "no ScheduleData for bundle member " 17089 "(maybe not in same basic block)"); 17090 assert(BundleMember->isSchedulingEntity() && 17091 "bundle member already part of other bundle"); 17092 if (PrevInBundle) { 17093 PrevInBundle->NextInBundle = BundleMember; 17094 } else { 17095 Bundle = BundleMember; 17096 } 17097 17098 // Group the instructions to a bundle. 17099 BundleMember->FirstInBundle = Bundle; 17100 PrevInBundle = BundleMember; 17101 } 17102 assert(Bundle && "Failed to find schedule bundle"); 17103 return Bundle; 17104 } 17105 17106 // Groups the instructions to a bundle (which is then a single scheduling entity) 17107 // and schedules instructions until the bundle gets ready. 17108 std::optional<BoUpSLP::ScheduleData *> 17109 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 17110 const InstructionsState &S) { 17111 // No need to schedule PHIs, insertelement, extractelement and extractvalue 17112 // instructions. 17113 if (isa<PHINode>(S.getMainOp()) || 17114 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL)) 17115 return nullptr; 17116 17117 // Initialize the instruction bundle. 17118 Instruction *OldScheduleEnd = ScheduleEnd; 17119 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); 17120 17121 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, 17122 ScheduleData *Bundle) { 17123 // The scheduling region got new instructions at the lower end (or it is a 17124 // new region for the first bundle). This makes it necessary to 17125 // recalculate all dependencies. 17126 // It is seldom that this needs to be done a second time after adding the 17127 // initial bundle to the region. 
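// (In that case every ScheduleData's dependencies in the region are cleared
// below and recomputed by calculateDependencies as needed.)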
17128 if (ScheduleEnd != OldScheduleEnd) {
17129 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17130 if (ScheduleData *SD = getScheduleData(I))
17131 SD->clearDependencies();
17132 ReSchedule = true;
17133 }
17134 if (Bundle) {
17135 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17136 << " in block " << BB->getName() << "\n");
17137 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17138 }
17139
17140 if (ReSchedule) {
17141 resetSchedule();
17142 initialFillReadyList(ReadyInsts);
17143 }
17144
17145 // Now try to schedule the new bundle or (if no bundle) just calculate
17146 // dependencies. As soon as the bundle is "ready" it means that there are no
17147 // cyclic dependencies and we can schedule it. Note that it's important that
17148 // we don't "schedule" the bundle yet (see cancelScheduling).
17149 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17150 !ReadyInsts.empty()) {
17151 ScheduleData *Picked = ReadyInsts.pop_back_val();
17152 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17153 "must be ready to schedule");
17154 schedule(Picked, ReadyInsts);
17155 }
17156 };
17157
17158 // Make sure that the scheduling region contains all
17159 // instructions of the bundle.
17160 for (Value *V : VL) {
17161 if (doesNotNeedToBeScheduled(V))
17162 continue;
17163 if (!extendSchedulingRegion(V, S)) {
17164 // If the scheduling region got new instructions at the lower end (or it
17165 // is a new region for the first bundle), it is necessary to recalculate
17166 // all dependencies.
17167 // Otherwise the compiler may crash trying to incorrectly calculate
17168 // dependencies and emit instructions in the wrong order at the actual
17169 // scheduling.
17170 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17171 return std::nullopt;
17172 }
17173 }
17174
17175 bool ReSchedule = false;
17176 for (Value *V : VL) {
17177 if (doesNotNeedToBeScheduled(V))
17178 continue;
17179 ScheduleData *BundleMember = getScheduleData(V);
17180 assert(BundleMember &&
17181 "no ScheduleData for bundle member (maybe not in same basic block)");
17182
17183 // Make sure we don't leave the pieces of the bundle in the ready list when
17184 // the whole bundle might not be ready.
17185 ReadyInsts.remove(BundleMember);
17186
17187 if (!BundleMember->IsScheduled)
17188 continue;
17189 // A bundle member was scheduled as a single instruction before and now
17190 // needs to be scheduled as part of the bundle. We just get rid of the
17191 // existing schedule.
17192 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 17193 << " was already scheduled\n"); 17194 ReSchedule = true; 17195 } 17196 17197 auto *Bundle = buildBundle(VL); 17198 TryScheduleBundleImpl(ReSchedule, Bundle); 17199 if (!Bundle->isReady()) { 17200 cancelScheduling(VL, S.getMainOp()); 17201 return std::nullopt; 17202 } 17203 return Bundle; 17204 } 17205 17206 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, 17207 Value *OpValue) { 17208 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || 17209 doesNotNeedToSchedule(VL)) 17210 return; 17211 17212 if (doesNotNeedToBeScheduled(OpValue)) 17213 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); 17214 ScheduleData *Bundle = getScheduleData(OpValue); 17215 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 17216 assert(!Bundle->IsScheduled && 17217 "Can't cancel bundle which is already scheduled"); 17218 assert(Bundle->isSchedulingEntity() && 17219 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && 17220 "tried to unbundle something which is not a bundle"); 17221 17222 // Remove the bundle from the ready list. 17223 if (Bundle->isReady()) 17224 ReadyInsts.remove(Bundle); 17225 17226 // Un-bundle: make single instructions out of the bundle. 17227 ScheduleData *BundleMember = Bundle; 17228 while (BundleMember) { 17229 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 17230 BundleMember->FirstInBundle = BundleMember; 17231 ScheduleData *Next = BundleMember->NextInBundle; 17232 BundleMember->NextInBundle = nullptr; 17233 BundleMember->TE = nullptr; 17234 if (BundleMember->unscheduledDepsInBundle() == 0) { 17235 ReadyInsts.insert(BundleMember); 17236 } 17237 BundleMember = Next; 17238 } 17239 } 17240 17241 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { 17242 // Allocate a new ScheduleData for the instruction. 17243 if (ChunkPos >= ChunkSize) { 17244 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); 17245 ChunkPos = 0; 17246 } 17247 return &(ScheduleDataChunks.back()[ChunkPos++]); 17248 } 17249 17250 bool BoUpSLP::BlockScheduling::extendSchedulingRegion( 17251 Value *V, const InstructionsState &S) { 17252 Instruction *I = dyn_cast<Instruction>(V); 17253 assert(I && "bundle member must be an instruction"); 17254 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && 17255 !doesNotNeedToBeScheduled(I) && 17256 "phi nodes/insertelements/extractelements/extractvalues don't need to " 17257 "be scheduled"); 17258 if (getScheduleData(I)) 17259 return true; 17260 if (!ScheduleStart) { 17261 // It's the first instruction in the new region. 17262 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 17263 ScheduleStart = I; 17264 ScheduleEnd = I->getNextNode(); 17265 assert(ScheduleEnd && "tried to vectorize a terminator?"); 17266 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 17267 return true; 17268 } 17269 // Search up and down at the same time, because we don't know if the new 17270 // instruction is above or below the existing scheduling region. 17271 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted 17272 // against the budget. Otherwise debug info could affect codegen. 
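// The walk below advances one step upwards and one step downwards per
// iteration, skipping assume-like intrinsics, until it either reaches I or
// exceeds ScheduleRegionSizeLimit.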
17273 BasicBlock::reverse_iterator UpIter = 17274 ++ScheduleStart->getIterator().getReverse(); 17275 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 17276 BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); 17277 BasicBlock::iterator LowerEnd = BB->end(); 17278 auto IsAssumeLikeIntr = [](const Instruction &I) { 17279 if (auto *II = dyn_cast<IntrinsicInst>(&I)) 17280 return II->isAssumeLikeIntrinsic(); 17281 return false; 17282 }; 17283 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 17284 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 17285 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && 17286 &*DownIter != I) { 17287 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { 17288 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); 17289 return false; 17290 } 17291 17292 ++UpIter; 17293 ++DownIter; 17294 17295 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 17296 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 17297 } 17298 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { 17299 assert(I->getParent() == ScheduleStart->getParent() && 17300 "Instruction is in wrong basic block."); 17301 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 17302 ScheduleStart = I; 17303 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I 17304 << "\n"); 17305 return true; 17306 } 17307 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && 17308 "Expected to reach top of the basic block or instruction down the " 17309 "lower end."); 17310 assert(I->getParent() == ScheduleEnd->getParent() && 17311 "Instruction is in wrong basic block."); 17312 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 17313 nullptr); 17314 ScheduleEnd = I->getNextNode(); 17315 assert(ScheduleEnd && "tried to vectorize a terminator?"); 17316 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 17317 return true; 17318 } 17319 17320 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 17321 Instruction *ToI, 17322 ScheduleData *PrevLoadStore, 17323 ScheduleData *NextLoadStore) { 17324 ScheduleData *CurrentLoadStore = PrevLoadStore; 17325 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 17326 // No need to allocate data for non-schedulable instructions. 17327 if (doesNotNeedToBeScheduled(I)) 17328 continue; 17329 ScheduleData *SD = ScheduleDataMap.lookup(I); 17330 if (!SD) { 17331 SD = allocateScheduleDataChunks(); 17332 ScheduleDataMap[I] = SD; 17333 } 17334 assert(!isInSchedulingRegion(SD) && 17335 "new ScheduleData already in scheduling region"); 17336 SD->init(SchedulingRegionID, I); 17337 17338 if (I->mayReadOrWriteMemory() && 17339 (!isa<IntrinsicInst>(I) || 17340 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && 17341 cast<IntrinsicInst>(I)->getIntrinsicID() != 17342 Intrinsic::pseudoprobe))) { 17343 // Update the linked list of memory accessing instructions. 
17344 if (CurrentLoadStore) {
17345 CurrentLoadStore->NextLoadStore = SD;
17346 } else {
17347 FirstLoadStoreInRegion = SD;
17348 }
17349 CurrentLoadStore = SD;
17350 }
17351
17352 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17353 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17354 RegionHasStackSave = true;
17355 }
17356 if (NextLoadStore) {
17357 if (CurrentLoadStore)
17358 CurrentLoadStore->NextLoadStore = NextLoadStore;
17359 } else {
17360 LastLoadStoreInRegion = CurrentLoadStore;
17361 }
17362 }
17363
17364 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17365 bool InsertInReadyList,
17366 BoUpSLP *SLP) {
17367 assert(SD->isSchedulingEntity());
17368
17369 SmallVector<ScheduleData *, 10> WorkList;
17370 WorkList.push_back(SD);
17371
17372 while (!WorkList.empty()) {
17373 ScheduleData *SD = WorkList.pop_back_val();
17374 for (ScheduleData *BundleMember = SD; BundleMember;
17375 BundleMember = BundleMember->NextInBundle) {
17376 assert(isInSchedulingRegion(BundleMember));
17377 if (BundleMember->hasValidDependencies())
17378 continue;
17379
17380 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17381 << "\n");
17382 BundleMember->Dependencies = 0;
17383 BundleMember->resetUnscheduledDeps();
17384
17385 // Handle def-use chain dependencies.
17386 for (User *U : BundleMember->Inst->users()) {
17387 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17388 BundleMember->Dependencies++;
17389 ScheduleData *DestBundle = UseSD->FirstInBundle;
17390 if (!DestBundle->IsScheduled)
17391 BundleMember->incrementUnscheduledDeps(1);
17392 if (!DestBundle->hasValidDependencies())
17393 WorkList.push_back(DestBundle);
17394 }
17395 }
17396
17397 auto MakeControlDependent = [&](Instruction *I) {
17398 auto *DepDest = getScheduleData(I);
17399 assert(DepDest && "must be in schedule window");
17400 DepDest->ControlDependencies.push_back(BundleMember);
17401 BundleMember->Dependencies++;
17402 ScheduleData *DestBundle = DepDest->FirstInBundle;
17403 if (!DestBundle->IsScheduled)
17404 BundleMember->incrementUnscheduledDeps(1);
17405 if (!DestBundle->hasValidDependencies())
17406 WorkList.push_back(DestBundle);
17407 };
17408
17409 // Any instruction which isn't safe to speculate at the beginning of the
17410 // block is control dependent on any early exit or non-willreturn call
17411 // which precedes it.
17412 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17413 for (Instruction *I = BundleMember->Inst->getNextNode();
17414 I != ScheduleEnd; I = I->getNextNode()) {
17415 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17416 continue;
17417
17418 // Add the dependency
17419 MakeControlDependent(I);
17420
17421 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17422 // Everything past here must be control dependent on I.
17423 break;
17424 }
17425 }
17426
17427 if (RegionHasStackSave) {
17428 // If we have an inalloca alloca instruction, it needs to be scheduled
17429 // after any preceding stacksave. We also need to prevent any alloca
17430 // from reordering above a preceding stackrestore.
17431 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17432 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17433 for (Instruction *I = BundleMember->Inst->getNextNode();
17434 I != ScheduleEnd; I = I->getNextNode()) {
17435 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17436 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17437 // Any allocas past here must be control dependent on I, and I
17438 // must be memory dependent on BundleMember->Inst.
17439 break;
17440
17441 if (!isa<AllocaInst>(I))
17442 continue;
17443
17444 // Add the dependency
17445 MakeControlDependent(I);
17446 }
17447 }
17448
17449 // In addition to the cases handled just above, we need to prevent
17450 // allocas and loads/stores from moving below a stacksave or a
17451 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17452 // thought to be conservative. Moving loads/stores below a stackrestore
17453 // can lead to incorrect code.
17454 if (isa<AllocaInst>(BundleMember->Inst) ||
17455 BundleMember->Inst->mayReadOrWriteMemory()) {
17456 for (Instruction *I = BundleMember->Inst->getNextNode();
17457 I != ScheduleEnd; I = I->getNextNode()) {
17458 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17459 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17460 continue;
17461
17462 // Add the dependency
17463 MakeControlDependent(I);
17464 break;
17465 }
17466 }
17467 }
17468
17469 // Handle the memory dependencies (if any).
17470 ScheduleData *DepDest = BundleMember->NextLoadStore;
17471 if (!DepDest)
17472 continue;
17473 Instruction *SrcInst = BundleMember->Inst;
17474 assert(SrcInst->mayReadOrWriteMemory() &&
17475 "NextLoadStore list for a non-memory-affecting bundle?");
17476 MemoryLocation SrcLoc = getLocation(SrcInst);
17477 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17478 unsigned NumAliased = 0;
17479 unsigned DistToSrc = 1;
17480
17481 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17482 assert(isInSchedulingRegion(DepDest));
17483
17484 // We have two limits to reduce the complexity:
17485 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17486 // SLP->isAliased (which is the expensive part in this loop).
17487 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17488 // the whole loop (even if the loop is fast, it's quadratic).
17489 // It's important for the loop break condition (see below) to
17490 // check this limit even between two read-only instructions.
17491 if (DistToSrc >= MaxMemDepDistance ||
17492 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17493 (NumAliased >= AliasedCheckLimit ||
17494 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17495
17496 // We increment the counter only if the locations are aliased
17497 // (instead of counting all alias checks). This gives a better
17498 // balance between reduced runtime and accurate dependencies.
17499 NumAliased++;
17500
17501 DepDest->MemoryDependencies.push_back(BundleMember);
17502 BundleMember->Dependencies++;
17503 ScheduleData *DestBundle = DepDest->FirstInBundle;
17504 if (!DestBundle->IsScheduled) {
17505 BundleMember->incrementUnscheduledDeps(1);
17506 }
17507 if (!DestBundle->hasValidDependencies()) {
17508 WorkList.push_back(DestBundle);
17509 }
17510 }
17511
17512 // Example, explaining the loop break condition: Let's assume our
17513 // starting instruction is i0 and MaxMemDepDistance = 3.
17514 // 17515 // +--------v--v--v 17516 // i0,i1,i2,i3,i4,i5,i6,i7,i8 17517 // +--------^--^--^ 17518 // 17519 // MaxMemDepDistance let us stop alias-checking at i3 and we add 17520 // dependencies from i0 to i3,i4,.. (even if they are not aliased). 17521 // Previously we already added dependencies from i3 to i6,i7,i8 17522 // (because of MaxMemDepDistance). As we added a dependency from 17523 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 17524 // and we can abort this loop at i6. 17525 if (DistToSrc >= 2 * MaxMemDepDistance) 17526 break; 17527 DistToSrc++; 17528 } 17529 } 17530 if (InsertInReadyList && SD->isReady()) { 17531 ReadyInsts.insert(SD); 17532 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst 17533 << "\n"); 17534 } 17535 } 17536 } 17537 17538 void BoUpSLP::BlockScheduling::resetSchedule() { 17539 assert(ScheduleStart && 17540 "tried to reset schedule on block which has not been scheduled"); 17541 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 17542 if (ScheduleData *SD = getScheduleData(I)) { 17543 assert(isInSchedulingRegion(SD) && 17544 "ScheduleData not in scheduling region"); 17545 SD->IsScheduled = false; 17546 SD->resetUnscheduledDeps(); 17547 } 17548 } 17549 ReadyInsts.clear(); 17550 } 17551 17552 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 17553 if (!BS->ScheduleStart) 17554 return; 17555 17556 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 17557 17558 // A key point - if we got here, pre-scheduling was able to find a valid 17559 // scheduling of the sub-graph of the scheduling window which consists 17560 // of all vector bundles and their transitive users. As such, we do not 17561 // need to reschedule anything *outside of* that subgraph. 17562 17563 BS->resetSchedule(); 17564 17565 // For the real scheduling we use a more sophisticated ready-list: it is 17566 // sorted by the original instruction location. This lets the final schedule 17567 // be as close as possible to the original instruction order. 17568 // WARNING: If changing this order causes a correctness issue, that means 17569 // there is some missing dependence edge in the schedule data graph. 17570 struct ScheduleDataCompare { 17571 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { 17572 return SD2->SchedulingPriority < SD1->SchedulingPriority; 17573 } 17574 }; 17575 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 17576 17577 // Ensure that all dependency data is updated (for nodes in the sub-graph) 17578 // and fill the ready-list with initial instructions. 17579 int Idx = 0; 17580 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 17581 I = I->getNextNode()) { 17582 if (ScheduleData *SD = BS->getScheduleData(I)) { 17583 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst); 17584 assert((isVectorLikeInstWithConstOps(SD->Inst) || 17585 SD->isPartOfBundle() == 17586 (!SDTEs.empty() && 17587 !doesNotNeedToSchedule(SDTEs.front()->Scalars))) && 17588 "scheduler and vectorizer bundle mismatch"); 17589 SD->FirstInBundle->SchedulingPriority = Idx++; 17590 17591 if (SD->isSchedulingEntity() && SD->isPartOfBundle()) 17592 BS->calculateDependencies(SD, false, this); 17593 } 17594 } 17595 BS->initialFillReadyList(ReadyInsts); 17596 17597 Instruction *LastScheduledInst = BS->ScheduleEnd; 17598 17599 // Do the "real" scheduling. 
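// Illustrative note (assuming three ready bundles with SchedulingPriority 3,
// 7 and 5): ScheduleDataCompare orders the ready set as 7, 5, 3, so the
// bundle that was latest in the original order is picked first and placed
// closest to ScheduleEnd. Emitting bottom-up this way keeps the final
// schedule close to the original instruction order.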
17600 while (!ReadyInsts.empty()) { 17601 ScheduleData *Picked = *ReadyInsts.begin(); 17602 ReadyInsts.erase(ReadyInsts.begin()); 17603 17604 // Move the scheduled instruction(s) to their dedicated places, if not 17605 // there yet. 17606 for (ScheduleData *BundleMember = Picked; BundleMember; 17607 BundleMember = BundleMember->NextInBundle) { 17608 Instruction *PickedInst = BundleMember->Inst; 17609 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) 17610 PickedInst->moveAfter(LastScheduledInst->getPrevNode()); 17611 LastScheduledInst = PickedInst; 17612 } 17613 17614 BS->schedule(Picked, ReadyInsts); 17615 } 17616 17617 // Check that we didn't break any of our invariants. 17618 #ifdef EXPENSIVE_CHECKS 17619 BS->verify(); 17620 #endif 17621 17622 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) 17623 // Check that all schedulable entities got scheduled 17624 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { 17625 ScheduleData *SD = BS->getScheduleData(I); 17626 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies()) 17627 assert(SD->IsScheduled && "must be scheduled at this point"); 17628 } 17629 #endif 17630 17631 // Avoid duplicate scheduling of the block. 17632 BS->ScheduleStart = nullptr; 17633 } 17634 17635 unsigned BoUpSLP::getVectorElementSize(Value *V) { 17636 // If V is a store, just return the width of the stored value (or value 17637 // truncated just before storing) without traversing the expression tree. 17638 // This is the common case. 17639 if (auto *Store = dyn_cast<StoreInst>(V)) 17640 return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); 17641 17642 if (auto *IEI = dyn_cast<InsertElementInst>(V)) 17643 return getVectorElementSize(IEI->getOperand(1)); 17644 17645 auto E = InstrElementSize.find(V); 17646 if (E != InstrElementSize.end()) 17647 return E->second; 17648 17649 // If V is not a store, we can traverse the expression tree to find loads 17650 // that feed it. The type of the loaded value may indicate a more suitable 17651 // width than V's type. We want to base the vector element size on the width 17652 // of memory operations where possible. 17653 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist; 17654 SmallPtrSet<Instruction *, 16> Visited; 17655 if (auto *I = dyn_cast<Instruction>(V)) { 17656 Worklist.emplace_back(I, I->getParent(), 0); 17657 Visited.insert(I); 17658 } 17659 17660 // Traverse the expression tree in bottom-up order looking for loads. If we 17661 // encounter an instruction we don't yet handle, we give up. 17662 auto Width = 0u; 17663 Value *FirstNonBool = nullptr; 17664 while (!Worklist.empty()) { 17665 auto [I, Parent, Level] = Worklist.pop_back_val(); 17666 17667 // We should only be looking at scalar instructions here. If the current 17668 // instruction has a vector type, skip. 17669 auto *Ty = I->getType(); 17670 if (isa<VectorType>(Ty)) 17671 continue; 17672 if (Ty != Builder.getInt1Ty() && !FirstNonBool) 17673 FirstNonBool = I; 17674 if (Level > RecursionMaxDepth) 17675 continue; 17676 17677 // If the current instruction is a load, update MaxWidth to reflect the 17678 // width of the loaded value. 17679 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I)) 17680 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty)); 17681 17682 // Otherwise, we need to visit the operands of the instruction. We only 17683 // handle the interesting cases from buildTree here. 
If an operand is an 17684 // instruction we haven't yet visited and from the same basic block as the 17685 // user or the use is a PHI node, we add it to the worklist. 17686 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst, 17687 BinaryOperator, UnaryOperator>(I)) { 17688 for (Use &U : I->operands()) { 17689 if (auto *J = dyn_cast<Instruction>(U.get())) 17690 if (Visited.insert(J).second && 17691 (isa<PHINode>(I) || J->getParent() == Parent)) { 17692 Worklist.emplace_back(J, J->getParent(), Level + 1); 17693 continue; 17694 } 17695 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty()) 17696 FirstNonBool = U.get(); 17697 } 17698 } else { 17699 break; 17700 } 17701 } 17702 17703 // If we didn't encounter a memory access in the expression tree, or if we 17704 // gave up for some reason, just return the width of V. Otherwise, return the 17705 // maximum width we found. 17706 if (!Width) { 17707 if (V->getType() == Builder.getInt1Ty() && FirstNonBool) 17708 V = FirstNonBool; 17709 Width = DL->getTypeSizeInBits(V->getType()); 17710 } 17711 17712 for (Instruction *I : Visited) 17713 InstrElementSize[I] = Width; 17714 17715 return Width; 17716 } 17717 17718 bool BoUpSLP::collectValuesToDemote( 17719 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth, 17720 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited, 17721 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel, 17722 bool &IsProfitableToDemote, bool IsTruncRoot) const { 17723 // We can always demote constants. 17724 if (all_of(E.Scalars, IsaPred<Constant>)) 17725 return true; 17726 17727 unsigned OrigBitWidth = 17728 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType()); 17729 if (OrigBitWidth == BitWidth) { 17730 MaxDepthLevel = 1; 17731 return true; 17732 } 17733 17734 // Check if the node was analyzed already and must keep its original bitwidth. 17735 if (NodesToKeepBWs.contains(E.Idx)) 17736 return false; 17737 17738 // If the value is not a vectorized instruction in the expression and not used 17739 // by the insertelement instruction and not used in multiple vector nodes, it 17740 // cannot be demoted. 17741 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) { 17742 if (isa<PoisonValue>(R)) 17743 return false; 17744 return !isKnownNonNegative(R, SimplifyQuery(*DL)); 17745 }); 17746 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool { 17747 if (isa<PoisonValue>(V)) 17748 return true; 17749 if (getTreeEntries(V).size() > 1) 17750 return false; 17751 // For lat shuffle of sext/zext with many uses need to check the extra bit 17752 // for unsigned values, otherwise may have incorrect casting for reused 17753 // scalars. 
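// Worked example (illustrative): with OrigBitWidth = 32 and BitWidth = 16,
// the mask below covers bits [16, 32); if those bits are known to be zero,
// the value already fits into 16 bits and no extra sign bit is required.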
17754 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL)); 17755 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) { 17756 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 17757 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL))) 17758 return true; 17759 } 17760 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); 17761 unsigned BitWidth1 = OrigBitWidth - NumSignBits; 17762 if (IsSignedNode) 17763 ++BitWidth1; 17764 if (auto *I = dyn_cast<Instruction>(V)) { 17765 APInt Mask = DB->getDemandedBits(I); 17766 unsigned BitWidth2 = 17767 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero()); 17768 while (!IsSignedNode && BitWidth2 < OrigBitWidth) { 17769 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1); 17770 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL))) 17771 break; 17772 BitWidth2 *= 2; 17773 } 17774 BitWidth1 = std::min(BitWidth1, BitWidth2); 17775 } 17776 BitWidth = std::max(BitWidth, BitWidth1); 17777 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); 17778 }; 17779 auto FinalAnalysis = [&, TTI = TTI]() { 17780 if (!IsProfitableToDemote) 17781 return false; 17782 bool Res = all_of( 17783 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth))); 17784 // Demote gathers. 17785 if (Res && E.isGather()) { 17786 // Check possible extractelement instructions bases and final vector 17787 // length. 17788 SmallPtrSet<Value *, 4> UniqueBases; 17789 for (Value *V : E.Scalars) { 17790 auto *EE = dyn_cast<ExtractElementInst>(V); 17791 if (!EE) 17792 continue; 17793 UniqueBases.insert(EE->getVectorOperand()); 17794 } 17795 const unsigned VF = E.Scalars.size(); 17796 Type *OrigScalarTy = E.Scalars.front()->getType(); 17797 if (UniqueBases.size() <= 2 || 17798 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) == 17799 ::getNumberOfParts( 17800 *TTI, 17801 getWidenedType( 17802 IntegerType::get(OrigScalarTy->getContext(), BitWidth), 17803 VF))) 17804 ToDemote.push_back(E.Idx); 17805 } 17806 return Res; 17807 }; 17808 if (E.isGather() || !Visited.insert(&E).second || 17809 any_of(E.Scalars, [&](Value *V) { 17810 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) { 17811 return isa<InsertElementInst>(U) && !isVectorized(U); 17812 }); 17813 })) 17814 return FinalAnalysis(); 17815 17816 if (any_of(E.Scalars, [&](Value *V) { 17817 return !all_of(V->users(), [=](User *U) { 17818 return isVectorized(U) || 17819 (E.Idx == 0 && UserIgnoreList && 17820 UserIgnoreList->contains(U)) || 17821 (!isa<CmpInst>(U) && U->getType()->isSized() && 17822 !U->getType()->isScalableTy() && 17823 DL->getTypeSizeInBits(U->getType()) <= BitWidth); 17824 }) && !IsPotentiallyTruncated(V, BitWidth); 17825 })) 17826 return false; 17827 17828 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands, 17829 bool &NeedToExit) { 17830 NeedToExit = false; 17831 unsigned InitLevel = MaxDepthLevel; 17832 for (const TreeEntry *Op : Operands) { 17833 unsigned Level = InitLevel; 17834 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth, 17835 ToDemote, Visited, NodesToKeepBWs, Level, 17836 IsProfitableToDemote, IsTruncRoot)) { 17837 if (!IsProfitableToDemote) 17838 return false; 17839 NeedToExit = true; 17840 if (!FinalAnalysis()) 17841 return false; 17842 continue; 17843 } 17844 MaxDepthLevel = std::max(MaxDepthLevel, Level); 17845 } 17846 return true; 17847 }; 17848 auto AttemptCheckBitwidth = 17849 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) { 17850 // Try all bitwidth < 
OrigBitWidth. 17851 NeedToExit = false; 17852 unsigned BestFailBitwidth = 0; 17853 for (; BitWidth < OrigBitWidth; BitWidth *= 2) { 17854 if (Checker(BitWidth, OrigBitWidth)) 17855 return true; 17856 if (BestFailBitwidth == 0 && FinalAnalysis()) 17857 BestFailBitwidth = BitWidth; 17858 } 17859 if (BitWidth >= OrigBitWidth) { 17860 if (BestFailBitwidth == 0) { 17861 BitWidth = OrigBitWidth; 17862 return false; 17863 } 17864 MaxDepthLevel = 1; 17865 BitWidth = BestFailBitwidth; 17866 NeedToExit = true; 17867 return true; 17868 } 17869 return false; 17870 }; 17871 auto TryProcessInstruction = 17872 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {}, 17873 function_ref<bool(unsigned, unsigned)> Checker = {}) { 17874 if (Operands.empty()) { 17875 if (!IsTruncRoot) 17876 MaxDepthLevel = 1; 17877 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1, 17878 std::ref(BitWidth))); 17879 } else { 17880 // Several vectorized uses? Check if we can truncate it, otherwise - 17881 // exit. 17882 if (E.UserTreeIndices.size() > 1 && 17883 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1, 17884 std::ref(BitWidth)))) 17885 return false; 17886 bool NeedToExit = false; 17887 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit)) 17888 return false; 17889 if (NeedToExit) 17890 return true; 17891 if (!ProcessOperands(Operands, NeedToExit)) 17892 return false; 17893 if (NeedToExit) 17894 return true; 17895 } 17896 17897 ++MaxDepthLevel; 17898 // Record the entry that we can demote. 17899 ToDemote.push_back(E.Idx); 17900 return IsProfitableToDemote; 17901 }; 17902 switch (E.getOpcode()) { 17903 17904 // We can always demote truncations and extensions. Since truncations can 17905 // seed additional demotion, we save the truncated value. 17906 case Instruction::Trunc: 17907 if (IsProfitableToDemoteRoot) 17908 IsProfitableToDemote = true; 17909 return TryProcessInstruction(BitWidth); 17910 case Instruction::ZExt: 17911 case Instruction::SExt: 17912 IsProfitableToDemote = true; 17913 return TryProcessInstruction(BitWidth); 17914 17915 // We can demote certain binary operations if we can demote both of their 17916 // operands. 17917 case Instruction::Add: 17918 case Instruction::Sub: 17919 case Instruction::Mul: 17920 case Instruction::And: 17921 case Instruction::Or: 17922 case Instruction::Xor: { 17923 return TryProcessInstruction( 17924 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}); 17925 } 17926 case Instruction::Freeze: 17927 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0)); 17928 case Instruction::Shl: { 17929 // If we are truncating the result of this SHL, and if it's a shift of an 17930 // inrange amount, we can always perform a SHL in a smaller type. 17931 auto ShlChecker = [&](unsigned BitWidth, unsigned) { 17932 return all_of(E.Scalars, [&](Value *V) { 17933 if (isa<PoisonValue>(V)) 17934 return true; 17935 auto *I = cast<Instruction>(V); 17936 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 17937 return AmtKnownBits.getMaxValue().ult(BitWidth); 17938 }); 17939 }; 17940 return TryProcessInstruction( 17941 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker); 17942 } 17943 case Instruction::LShr: { 17944 // If this is a truncate of a logical shr, we can truncate it to a smaller 17945 // lshr iff we know that the bits we would otherwise be shifting in are 17946 // already zeros. 
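// Worked example (illustrative): an i32 lshr can be demoted to i16 when the
// shift amount is known to be less than 16 and bits [16, 32) of the shifted
// operand are known zero, so nothing non-zero is ever shifted into the low
// 16 bits.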
17947 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 17948 return all_of(E.Scalars, [&](Value *V) { 17949 if (isa<PoisonValue>(V)) 17950 return true; 17951 auto *I = cast<Instruction>(V); 17952 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 17953 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 17954 return AmtKnownBits.getMaxValue().ult(BitWidth) && 17955 MaskedValueIsZero(I->getOperand(0), ShiftedBits, 17956 SimplifyQuery(*DL)); 17957 }); 17958 }; 17959 return TryProcessInstruction( 17960 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, 17961 LShrChecker); 17962 } 17963 case Instruction::AShr: { 17964 // If this is a truncate of an arithmetic shr, we can truncate it to a 17965 // smaller ashr iff we know that all the bits from the sign bit of the 17966 // original type and the sign bit of the truncate type are similar. 17967 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 17968 return all_of(E.Scalars, [&](Value *V) { 17969 if (isa<PoisonValue>(V)) 17970 return true; 17971 auto *I = cast<Instruction>(V); 17972 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 17973 unsigned ShiftedBits = OrigBitWidth - BitWidth; 17974 return AmtKnownBits.getMaxValue().ult(BitWidth) && 17975 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, 17976 nullptr, DT); 17977 }); 17978 }; 17979 return TryProcessInstruction( 17980 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, 17981 AShrChecker); 17982 } 17983 case Instruction::UDiv: 17984 case Instruction::URem: { 17985 // UDiv and URem can be truncated if all the truncated bits are zero. 17986 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 17987 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 17988 return all_of(E.Scalars, [&](Value *V) { 17989 auto *I = cast<Instruction>(V); 17990 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 17991 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && 17992 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); 17993 }); 17994 }; 17995 return TryProcessInstruction( 17996 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker); 17997 } 17998 17999 // We can demote selects if we can demote their true and false values. 18000 case Instruction::Select: { 18001 return TryProcessInstruction( 18002 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)}); 18003 } 18004 18005 // We can demote phis if we can demote all their incoming operands. Note that 18006 // we don't need to worry about cycles since we ensure single use above. 
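// Illustrative example: a PHI such as
//   %p = phi i32 [ 0, %entry ], [ %next, %loop ]
// can only be demoted to a narrower type if all incoming operands (here the
// constant 0 and %next) can be demoted as well.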
18007 case Instruction::PHI: { 18008 const unsigned NumOps = E.getNumOperands(); 18009 SmallVector<const TreeEntry *> Ops(NumOps); 18010 transform(seq<unsigned>(0, NumOps), Ops.begin(), 18011 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1)); 18012 18013 return TryProcessInstruction(BitWidth, Ops); 18014 } 18015 18016 case Instruction::Call: { 18017 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp()); 18018 if (!IC) 18019 break; 18020 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI); 18021 if (ID != Intrinsic::abs && ID != Intrinsic::smin && 18022 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax) 18023 break; 18024 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0)); 18025 function_ref<bool(unsigned, unsigned)> CallChecker; 18026 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 18027 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 18028 return all_of(E.Scalars, [&](Value *V) { 18029 auto *I = cast<Instruction>(V); 18030 if (ID == Intrinsic::umin || ID == Intrinsic::umax) { 18031 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 18032 return MaskedValueIsZero(I->getOperand(0), Mask, 18033 SimplifyQuery(*DL)) && 18034 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); 18035 } 18036 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) && 18037 "Expected min/max intrinsics only."); 18038 unsigned SignBits = OrigBitWidth - BitWidth; 18039 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); 18040 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, 18041 nullptr, DT); 18042 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, 18043 nullptr, DT); 18044 return SignBits <= Op0SignBits && 18045 ((SignBits != Op0SignBits && 18046 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) || 18047 MaskedValueIsZero(I->getOperand(0), Mask, 18048 SimplifyQuery(*DL))) && 18049 SignBits <= Op1SignBits && 18050 ((SignBits != Op1SignBits && 18051 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) || 18052 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL))); 18053 }); 18054 }; 18055 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 18056 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 18057 return all_of(E.Scalars, [&](Value *V) { 18058 auto *I = cast<Instruction>(V); 18059 unsigned SignBits = OrigBitWidth - BitWidth; 18060 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); 18061 unsigned Op0SignBits = 18062 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT); 18063 return SignBits <= Op0SignBits && 18064 ((SignBits != Op0SignBits && 18065 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) || 18066 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))); 18067 }); 18068 }; 18069 if (ID != Intrinsic::abs) { 18070 Operands.push_back(getOperandEntry(&E, 1)); 18071 CallChecker = CompChecker; 18072 } else { 18073 CallChecker = AbsChecker; 18074 } 18075 InstructionCost BestCost = 18076 std::numeric_limits<InstructionCost::CostType>::max(); 18077 unsigned BestBitWidth = BitWidth; 18078 unsigned VF = E.Scalars.size(); 18079 // Choose the best bitwidth based on cost estimations. 
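// Sketch of the selection below: Checker prices the intrinsic at each
// candidate width (doubling up to the original width) via getVectorCallCosts,
// remembers the cheapest width in BestBitWidth, and deliberately returns
// false so that AttemptCheckBitwidth keeps iterating over all candidates.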
18080 auto Checker = [&](unsigned BitWidth, unsigned) { 18081 unsigned MinBW = PowerOf2Ceil(BitWidth); 18082 SmallVector<Type *> ArgTys = 18083 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI); 18084 auto VecCallCosts = getVectorCallCosts( 18085 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF), 18086 TTI, TLI, ArgTys); 18087 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second); 18088 if (Cost < BestCost) { 18089 BestCost = Cost; 18090 BestBitWidth = BitWidth; 18091 } 18092 return false; 18093 }; 18094 [[maybe_unused]] bool NeedToExit; 18095 (void)AttemptCheckBitwidth(Checker, NeedToExit); 18096 BitWidth = BestBitWidth; 18097 return TryProcessInstruction(BitWidth, Operands, CallChecker); 18098 } 18099 18100 // Otherwise, conservatively give up. 18101 default: 18102 break; 18103 } 18104 MaxDepthLevel = 1; 18105 return FinalAnalysis(); 18106 } 18107 18108 static RecurKind getRdxKind(Value *V); 18109 18110 void BoUpSLP::computeMinimumValueSizes() { 18111 // We only attempt to truncate integer expressions. 18112 bool IsStoreOrInsertElt = 18113 VectorizableTree.front()->hasState() && 18114 (VectorizableTree.front()->getOpcode() == Instruction::Store || 18115 VectorizableTree.front()->getOpcode() == Instruction::InsertElement); 18116 if ((IsStoreOrInsertElt || UserIgnoreList) && 18117 ExtraBitWidthNodes.size() <= 1 && 18118 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || 18119 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2)) 18120 return; 18121 18122 unsigned NodeIdx = 0; 18123 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather()) 18124 NodeIdx = 1; 18125 18126 // Ensure the roots of the vectorizable tree don't form a cycle. 18127 if (VectorizableTree[NodeIdx]->isGather() || 18128 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) || 18129 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices, 18130 [NodeIdx](const EdgeInfo &EI) { 18131 return EI.UserTE->Idx > NodeIdx; 18132 }))) 18133 return; 18134 18135 // The first value node for store/insertelement is sext/zext/trunc? Skip it, 18136 // resize to the final type. 18137 bool IsTruncRoot = false; 18138 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt; 18139 SmallVector<unsigned> RootDemotes; 18140 SmallDenseSet<unsigned, 8> NodesToKeepBWs; 18141 if (NodeIdx != 0 && 18142 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && 18143 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { 18144 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph."); 18145 IsTruncRoot = true; 18146 RootDemotes.push_back(NodeIdx); 18147 IsProfitableToDemoteRoot = true; 18148 ++NodeIdx; 18149 } 18150 18151 // Analyzed the reduction already and not profitable - exit. 18152 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front())) 18153 return; 18154 18155 SmallVector<unsigned> ToDemote; 18156 auto ComputeMaxBitWidth = 18157 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, 18158 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { 18159 ToDemote.clear(); 18160 // Check if the root is trunc and the next node is gather/buildvector, then 18161 // keep trunc in scalars, which is free in most cases. 18162 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && 18163 !NodesToKeepBWs.contains(E.Idx) && 18164 E.Idx > (IsStoreOrInsertElt ? 
2u : 1u) && 18165 all_of(E.Scalars, [&](Value *V) { 18166 return V->hasOneUse() || isa<Constant>(V) || 18167 (!V->hasNUsesOrMore(UsesLimit) && 18168 none_of(V->users(), [&](User *U) { 18169 ArrayRef<TreeEntry *> TEs = getTreeEntries(U); 18170 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; 18171 if (TEs.empty() || is_contained(TEs, UserTE)) 18172 return false; 18173 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, 18174 SelectInst>(U) || 18175 !isa<CastInst, BinaryOperator, FreezeInst, PHINode, 18176 SelectInst>(UserTE->getMainOp())) 18177 return true; 18178 unsigned UserTESz = DL->getTypeSizeInBits( 18179 UserTE->Scalars.front()->getType()); 18180 if (all_of(TEs, [&](const TreeEntry *TE) { 18181 auto It = MinBWs.find(TE); 18182 return It != MinBWs.end() && 18183 It->second.first > UserTESz; 18184 })) 18185 return true; 18186 return DL->getTypeSizeInBits(U->getType()) > UserTESz; 18187 })); 18188 })) { 18189 ToDemote.push_back(E.Idx); 18190 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; 18191 auto It = MinBWs.find(UserTE); 18192 if (It != MinBWs.end()) 18193 return It->second.first; 18194 unsigned MaxBitWidth = 18195 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType()); 18196 MaxBitWidth = bit_ceil(MaxBitWidth); 18197 if (MaxBitWidth < 8 && MaxBitWidth > 1) 18198 MaxBitWidth = 8; 18199 return MaxBitWidth; 18200 } 18201 18202 if (!E.hasState()) 18203 return 0u; 18204 18205 unsigned VF = E.getVectorFactor(); 18206 Type *ScalarTy = E.Scalars.front()->getType(); 18207 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 18208 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType()); 18209 if (!TreeRootIT) 18210 return 0u; 18211 18212 if (any_of(E.Scalars, 18213 [&](Value *V) { return AnalyzedMinBWVals.contains(V); })) 18214 return 0u; 18215 18216 unsigned NumParts = ::getNumberOfParts( 18217 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements)); 18218 18219 // The maximum bit width required to represent all the values that can be 18220 // demoted without loss of precision. It would be safe to truncate the roots 18221 // of the expression to this width. 18222 unsigned MaxBitWidth = 1u; 18223 18224 // True if the roots can be zero-extended back to their original type, 18225 // rather than sign-extended. We know that if the leading bits are not 18226 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to 18227 // True. 18228 // Determine if the sign bit of all the roots is known to be zero. If not, 18229 // IsKnownPositive is set to False. 18230 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) { 18231 if (isa<PoisonValue>(R)) 18232 return true; 18233 KnownBits Known = computeKnownBits(R, *DL); 18234 return Known.isNonNegative(); 18235 }); 18236 18237 // We first check if all the bits of the roots are demanded. If they're not, 18238 // we can truncate the roots to this narrower type. 18239 for (Value *Root : E.Scalars) { 18240 if (isa<PoisonValue>(Root)) 18241 continue; 18242 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT); 18243 TypeSize NumTypeBits = 18244 DL->getTypeSizeInBits(Root->getType()->getScalarType()); 18245 unsigned BitWidth1 = NumTypeBits - NumSignBits; 18246 // If we can't prove that the sign bit is zero, we must add one to the 18247 // maximum bit width to account for the unknown sign bit. This preserves 18248 // the existing sign bit so we can safely sign-extend the root back to the 18249 // original type. 
Otherwise, if we know the sign bit is zero, we will 18250 // zero-extend the root instead. 18251 // 18252 // FIXME: This is somewhat suboptimal, as there will be cases where adding 18253 // one to the maximum bit width will yield a larger-than-necessary 18254 // type. In general, we need to add an extra bit only if we can't 18255 // prove that the upper bit of the original type is equal to the 18256 // upper bit of the proposed smaller type. If these two bits are 18257 // the same (either zero or one) we know that sign-extending from 18258 // the smaller type will result in the same value. Here, since we 18259 // can't yet prove this, we are just making the proposed smaller 18260 // type larger to ensure correctness. 18261 if (!IsKnownPositive) 18262 ++BitWidth1; 18263 18264 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root)); 18265 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); 18266 MaxBitWidth = 18267 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth); 18268 } 18269 18270 if (MaxBitWidth < 8 && MaxBitWidth > 1) 18271 MaxBitWidth = 8; 18272 18273 // If the original type is large, but reduced type does not improve the reg 18274 // use - ignore it. 18275 if (NumParts > 1 && 18276 NumParts == 18277 ::getNumberOfParts( 18278 *TTI, getWidenedType(IntegerType::get(F->getContext(), 18279 bit_ceil(MaxBitWidth)), 18280 VF))) 18281 return 0u; 18282 18283 unsigned Opcode = E.getOpcode(); 18284 bool IsProfitableToDemote = Opcode == Instruction::Trunc || 18285 Opcode == Instruction::SExt || 18286 Opcode == Instruction::ZExt || NumParts > 1; 18287 // Conservatively determine if we can actually truncate the roots of the 18288 // expression. Collect the values that can be demoted in ToDemote and 18289 // additional roots that require investigating in Roots. 18290 DenseSet<const TreeEntry *> Visited; 18291 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1; 18292 bool NeedToDemote = IsProfitableToDemote; 18293 18294 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth, 18295 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel, 18296 NeedToDemote, IsTruncRoot) || 18297 (MaxDepthLevel <= Limit && 18298 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 18299 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) || 18300 DL->getTypeSizeInBits(TreeRootIT) / 18301 DL->getTypeSizeInBits( 18302 E.getMainOp()->getOperand(0)->getType()) > 18303 2))))) 18304 return 0u; 18305 // Round MaxBitWidth up to the next power-of-two. 18306 MaxBitWidth = bit_ceil(MaxBitWidth); 18307 18308 return MaxBitWidth; 18309 }; 18310 18311 // If we can truncate the root, we must collect additional values that might 18312 // be demoted as a result. That is, those seeded by truncations we will 18313 // modify. 18314 // Add reduction ops sizes, if any. 18315 if (UserIgnoreList && 18316 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) { 18317 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n 18318 // x i1> to in)). 
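// For illustration (hypothetical IR): in a reduction such as
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// the result is at most 8, so the reduction can be performed on the i1 lanes
// (ReductionBitWidth = 1) and widened afterwards.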
18319 if (all_of(*UserIgnoreList, 18320 [](Value *V) { 18321 return isa<PoisonValue>(V) || 18322 cast<Instruction>(V)->getOpcode() == Instruction::Add; 18323 }) && 18324 VectorizableTree.front()->State == TreeEntry::Vectorize && 18325 VectorizableTree.front()->getOpcode() == Instruction::ZExt && 18326 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() == 18327 Builder.getInt1Ty()) { 18328 ReductionBitWidth = 1; 18329 } else { 18330 for (Value *V : *UserIgnoreList) { 18331 if (isa<PoisonValue>(V)) 18332 continue; 18333 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); 18334 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType()); 18335 unsigned BitWidth1 = NumTypeBits - NumSignBits; 18336 if (!isKnownNonNegative(V, SimplifyQuery(*DL))) 18337 ++BitWidth1; 18338 unsigned BitWidth2 = BitWidth1; 18339 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) { 18340 APInt Mask = DB->getDemandedBits(cast<Instruction>(V)); 18341 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); 18342 } 18343 ReductionBitWidth = 18344 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth); 18345 } 18346 if (ReductionBitWidth < 8 && ReductionBitWidth > 1) 18347 ReductionBitWidth = 8; 18348 18349 ReductionBitWidth = bit_ceil(ReductionBitWidth); 18350 } 18351 } 18352 bool IsTopRoot = NodeIdx == 0; 18353 while (NodeIdx < VectorizableTree.size() && 18354 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && 18355 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { 18356 RootDemotes.push_back(NodeIdx); 18357 ++NodeIdx; 18358 IsTruncRoot = true; 18359 } 18360 bool IsSignedCmp = false; 18361 while (NodeIdx < VectorizableTree.size()) { 18362 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars; 18363 unsigned Limit = 2; 18364 if (IsTopRoot && 18365 ReductionBitWidth == 18366 DL->getTypeSizeInBits( 18367 VectorizableTree.front()->Scalars.front()->getType())) 18368 Limit = 3; 18369 unsigned MaxBitWidth = ComputeMaxBitWidth( 18370 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, 18371 IsTruncRoot, IsSignedCmp); 18372 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { 18373 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) 18374 ReductionBitWidth = bit_ceil(MaxBitWidth); 18375 else if (MaxBitWidth == 0) 18376 ReductionBitWidth = 0; 18377 } 18378 18379 for (unsigned Idx : RootDemotes) { 18380 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) { 18381 uint32_t OrigBitWidth = 18382 DL->getTypeSizeInBits(V->getType()->getScalarType()); 18383 if (OrigBitWidth > MaxBitWidth) { 18384 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth); 18385 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); 18386 } 18387 return false; 18388 })) 18389 ToDemote.push_back(Idx); 18390 } 18391 RootDemotes.clear(); 18392 IsTopRoot = false; 18393 IsProfitableToDemoteRoot = true; 18394 18395 if (ExtraBitWidthNodes.empty()) { 18396 NodeIdx = VectorizableTree.size(); 18397 } else { 18398 unsigned NewIdx = 0; 18399 do { 18400 NewIdx = *ExtraBitWidthNodes.begin(); 18401 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin()); 18402 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty()); 18403 NodeIdx = NewIdx; 18404 IsTruncRoot = 18405 NodeIdx < VectorizableTree.size() && 18406 any_of(VectorizableTree[NodeIdx]->UserTreeIndices, 18407 [](const EdgeInfo &EI) { 18408 return EI.EdgeIdx == 0 && 18409 EI.UserTE->getOpcode() == Instruction::Trunc && 18410 !EI.UserTE->isAltShuffle(); 18411 }); 18412 IsSignedCmp = 
18413 NodeIdx < VectorizableTree.size() && 18414 any_of( 18415 VectorizableTree[NodeIdx]->UserTreeIndices, 18416 [&](const EdgeInfo &EI) { 18417 return (EI.UserTE->hasState() && 18418 EI.UserTE->getOpcode() == Instruction::ICmp) && 18419 any_of(EI.UserTE->Scalars, [&](Value *V) { 18420 auto *IC = dyn_cast<ICmpInst>(V); 18421 return IC && 18422 (IC->isSigned() || 18423 !isKnownNonNegative(IC->getOperand(0), 18424 SimplifyQuery(*DL)) || 18425 !isKnownNonNegative(IC->getOperand(1), 18426 SimplifyQuery(*DL))); 18427 }); 18428 }); 18429 } 18430 18431 // If the maximum bit width we compute is less than the width of the roots' 18432 // type, we can proceed with the narrowing. Otherwise, do nothing. 18433 if (MaxBitWidth == 0 || 18434 MaxBitWidth >= 18435 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType()) 18436 ->getBitWidth()) { 18437 if (UserIgnoreList) 18438 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end()); 18439 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end()); 18440 continue; 18441 } 18442 18443 // Finally, map the values we can demote to the maximum bit with we 18444 // computed. 18445 for (unsigned Idx : ToDemote) { 18446 TreeEntry *TE = VectorizableTree[Idx].get(); 18447 if (MinBWs.contains(TE)) 18448 continue; 18449 bool IsSigned = any_of(TE->Scalars, [&](Value *R) { 18450 if (isa<PoisonValue>(R)) 18451 return false; 18452 return !isKnownNonNegative(R, SimplifyQuery(*DL)); 18453 }); 18454 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); 18455 } 18456 } 18457 } 18458 18459 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { 18460 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 18461 auto *TTI = &AM.getResult<TargetIRAnalysis>(F); 18462 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); 18463 auto *AA = &AM.getResult<AAManager>(F); 18464 auto *LI = &AM.getResult<LoopAnalysis>(F); 18465 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); 18466 auto *AC = &AM.getResult<AssumptionAnalysis>(F); 18467 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); 18468 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 18469 18470 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); 18471 if (!Changed) 18472 return PreservedAnalyses::all(); 18473 18474 PreservedAnalyses PA; 18475 PA.preserveSet<CFGAnalyses>(); 18476 return PA; 18477 } 18478 18479 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, 18480 TargetTransformInfo *TTI_, 18481 TargetLibraryInfo *TLI_, AAResults *AA_, 18482 LoopInfo *LI_, DominatorTree *DT_, 18483 AssumptionCache *AC_, DemandedBits *DB_, 18484 OptimizationRemarkEmitter *ORE_) { 18485 if (!RunSLPVectorization) 18486 return false; 18487 SE = SE_; 18488 TTI = TTI_; 18489 TLI = TLI_; 18490 AA = AA_; 18491 LI = LI_; 18492 DT = DT_; 18493 AC = AC_; 18494 DB = DB_; 18495 DL = &F.getDataLayout(); 18496 18497 Stores.clear(); 18498 GEPs.clear(); 18499 bool Changed = false; 18500 18501 // If the target claims to have no vector registers don't attempt 18502 // vectorization. 18503 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { 18504 LLVM_DEBUG( 18505 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); 18506 return false; 18507 } 18508 18509 // Don't vectorize when the attribute NoImplicitFloat is used. 
18510 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 18511 return false; 18512 18513 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 18514 18515 // Use the bottom up slp vectorizer to construct chains that start with 18516 // store instructions. 18517 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); 18518 18519 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 18520 // delete instructions. 18521 18522 // Update DFS numbers now so that we can use them for ordering. 18523 DT->updateDFSNumbers(); 18524 18525 // Scan the blocks in the function in post order. 18526 for (auto *BB : post_order(&F.getEntryBlock())) { 18527 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator())) 18528 continue; 18529 18530 // Start new block - clear the list of reduction roots. 18531 R.clearReductionData(); 18532 collectSeedInstructions(BB); 18533 18534 // Vectorize trees that end at stores. 18535 if (!Stores.empty()) { 18536 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() 18537 << " underlying objects.\n"); 18538 Changed |= vectorizeStoreChains(R); 18539 } 18540 18541 // Vectorize trees that end at reductions. 18542 Changed |= vectorizeChainsInBlock(BB, R); 18543 18544 // Vectorize the index computations of getelementptr instructions. This 18545 // is primarily intended to catch gather-like idioms ending at 18546 // non-consecutive loads. 18547 if (!GEPs.empty()) { 18548 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() 18549 << " underlying objects.\n"); 18550 Changed |= vectorizeGEPIndices(BB, R); 18551 } 18552 } 18553 18554 if (Changed) { 18555 R.optimizeGatherSequence(); 18556 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 18557 } 18558 return Changed; 18559 } 18560 18561 std::optional<bool> 18562 SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, 18563 unsigned Idx, unsigned MinVF, 18564 unsigned &Size) { 18565 Size = 0; 18566 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() 18567 << "\n"); 18568 const unsigned Sz = R.getVectorElementSize(Chain[0]); 18569 unsigned VF = Chain.size(); 18570 18571 if (!has_single_bit(Sz) || 18572 !hasFullVectorsOrPowerOf2( 18573 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(), 18574 VF) || 18575 VF < 2 || VF < MinVF) { 18576 // Check if vectorizing with a non-power-of-2 VF should be considered. At 18577 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost 18578 // all vector lanes are used. 18579 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) 18580 return false; 18581 } 18582 18583 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx 18584 << "\n"); 18585 18586 SetVector<Value *> ValOps; 18587 for (Value *V : Chain) 18588 ValOps.insert(cast<StoreInst>(V)->getValueOperand()); 18589 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. 
18590 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); 18591 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) { 18592 DenseSet<Value *> Stores(Chain.begin(), Chain.end()); 18593 bool IsAllowedSize = 18594 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), 18595 ValOps.size()) || 18596 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); 18597 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && 18598 (!S.getMainOp()->isSafeToRemove() || 18599 any_of(ValOps.getArrayRef(), 18600 [&](Value *V) { 18601 return !isa<ExtractElementInst>(V) && 18602 (V->getNumUses() > Chain.size() || 18603 any_of(V->users(), [&](User *U) { 18604 return !Stores.contains(U); 18605 })); 18606 }))) || 18607 (ValOps.size() > Chain.size() / 2 && !S)) { 18608 Size = (!IsAllowedSize && S) ? 1 : 2; 18609 return false; 18610 } 18611 } 18612 if (R.isLoadCombineCandidate(Chain)) 18613 return true; 18614 R.buildTree(Chain); 18615 // Check if tree tiny and store itself or its value is not vectorized. 18616 if (R.isTreeTinyAndNotFullyVectorizable()) { 18617 if (R.isGathered(Chain.front()) || 18618 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand())) 18619 return std::nullopt; 18620 Size = R.getCanonicalGraphSize(); 18621 return false; 18622 } 18623 R.reorderTopToBottom(); 18624 R.reorderBottomToTop(); 18625 R.transformNodes(); 18626 R.buildExternalUses(); 18627 18628 R.computeMinimumValueSizes(); 18629 18630 Size = R.getCanonicalGraphSize(); 18631 if (S && S.getOpcode() == Instruction::Load) 18632 Size = 2; // cut off masked gather small trees 18633 InstructionCost Cost = R.getTreeCost(); 18634 18635 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); 18636 if (Cost < -SLPCostThreshold) { 18637 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); 18638 18639 using namespace ore; 18640 18641 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", 18642 cast<StoreInst>(Chain[0])) 18643 << "Stores SLP vectorized with cost " << NV("Cost", Cost) 18644 << " and with tree size " 18645 << NV("TreeSize", R.getTreeSize())); 18646 18647 R.vectorizeTree(); 18648 return true; 18649 } 18650 18651 return false; 18652 } 18653 18654 /// Checks if the quadratic mean deviation is less than 90% of the mean size. 18655 static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes, 18656 bool First) { 18657 unsigned Num = 0; 18658 uint64_t Sum = std::accumulate( 18659 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0), 18660 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { 18661 unsigned Size = First ? Val.first : Val.second; 18662 if (Size == 1) 18663 return V; 18664 ++Num; 18665 return V + Size; 18666 }); 18667 if (Num == 0) 18668 return true; 18669 uint64_t Mean = Sum / Num; 18670 if (Mean == 0) 18671 return true; 18672 uint64_t Dev = std::accumulate( 18673 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0), 18674 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { 18675 unsigned P = First ? Val.first : Val.second; 18676 if (P == 1) 18677 return V; 18678 return V + (P - Mean) * (P - Mean); 18679 }) / 18680 Num; 18681 return Dev * 81 / (Mean * Mean) == 0; 18682 } 18683 18684 bool SLPVectorizerPass::vectorizeStores( 18685 ArrayRef<StoreInst *> Stores, BoUpSLP &R, 18686 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> 18687 &Visited) { 18688 // We may run into multiple chains that merge into a single chain. 
We mark the 18689 // stores that we vectorized so that we don't visit the same store twice. 18690 BoUpSLP::ValueSet VectorizedStores; 18691 bool Changed = false; 18692 18693 struct StoreDistCompare { 18694 bool operator()(const std::pair<unsigned, int> &Op1, 18695 const std::pair<unsigned, int> &Op2) const { 18696 return Op1.second < Op2.second; 18697 } 18698 }; 18699 // A set of pairs (index of store in Stores array ref, Distance of the store 18700 // address relative to base store address in units). 18701 using StoreIndexToDistSet = 18702 std::set<std::pair<unsigned, int>, StoreDistCompare>; 18703 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) { 18704 int PrevDist = -1; 18705 BoUpSLP::ValueList Operands; 18706 // Collect the chain into a list. 18707 for (auto [Idx, Data] : enumerate(Set)) { 18708 if (Operands.empty() || Data.second - PrevDist == 1) { 18709 Operands.push_back(Stores[Data.first]); 18710 PrevDist = Data.second; 18711 if (Idx != Set.size() - 1) 18712 continue; 18713 } 18714 auto E = make_scope_exit([&, &DataVar = Data]() { 18715 Operands.clear(); 18716 Operands.push_back(Stores[DataVar.first]); 18717 PrevDist = DataVar.second; 18718 }); 18719 18720 if (Operands.size() <= 1 || 18721 !Visited 18722 .insert({Operands.front(), 18723 cast<StoreInst>(Operands.front())->getValueOperand(), 18724 Operands.back(), 18725 cast<StoreInst>(Operands.back())->getValueOperand(), 18726 Operands.size()}) 18727 .second) 18728 continue; 18729 18730 unsigned MaxVecRegSize = R.getMaxVecRegSize(); 18731 unsigned EltSize = R.getVectorElementSize(Operands[0]); 18732 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); 18733 18734 unsigned MaxVF = 18735 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); 18736 auto *Store = cast<StoreInst>(Operands[0]); 18737 Type *StoreTy = Store->getValueOperand()->getType(); 18738 Type *ValueTy = StoreTy; 18739 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) 18740 ValueTy = Trunc->getSrcTy(); 18741 unsigned MinVF = std::max<unsigned>( 18742 2, PowerOf2Ceil(TTI->getStoreMinimumVF( 18743 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, 18744 ValueTy))); 18745 18746 if (MaxVF < MinVF) { 18747 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF 18748 << ") < " 18749 << "MinVF (" << MinVF << ")\n"); 18750 continue; 18751 } 18752 18753 unsigned NonPowerOf2VF = 0; 18754 if (VectorizeNonPowerOf2) { 18755 // First try vectorizing with a non-power-of-2 VF. At the moment, only 18756 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector 18757 // lanes are used. 18758 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF); 18759 if (has_single_bit(CandVF + 1)) { 18760 NonPowerOf2VF = CandVF; 18761 assert(NonPowerOf2VF != MaxVF && 18762 "Non-power-of-2 VF should not be equal to MaxVF"); 18763 } 18764 } 18765 18766 unsigned MaxRegVF = MaxVF; 18767 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size())); 18768 if (MaxVF < MinVF) { 18769 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF 18770 << ") < " 18771 << "MinVF (" << MinVF << ")\n"); 18772 continue; 18773 } 18774 18775 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); 18776 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0)); 18777 unsigned Size = MinVF; 18778 for_each(reverse(CandidateVFs), [&](unsigned &VF) { 18779 VF = Size > MaxVF ? 
NonPowerOf2VF : Size; 18780 Size *= 2; 18781 }); 18782 unsigned End = Operands.size(); 18783 unsigned Repeat = 0; 18784 constexpr unsigned MaxAttempts = 4; 18785 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size()); 18786 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) { 18787 P.first = P.second = 1; 18788 }); 18789 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable; 18790 auto IsNotVectorized = [](bool First, 18791 const std::pair<unsigned, unsigned> &P) { 18792 return First ? P.first > 0 : P.second > 0; 18793 }; 18794 auto IsVectorized = [](bool First, 18795 const std::pair<unsigned, unsigned> &P) { 18796 return First ? P.first == 0 : P.second == 0; 18797 }; 18798 auto VFIsProfitable = [](bool First, unsigned Size, 18799 const std::pair<unsigned, unsigned> &P) { 18800 return First ? Size >= P.first : Size >= P.second; 18801 }; 18802 auto FirstSizeSame = [](unsigned Size, 18803 const std::pair<unsigned, unsigned> &P) { 18804 return Size == P.first; 18805 }; 18806 while (true) { 18807 ++Repeat; 18808 bool RepeatChanged = false; 18809 bool AnyProfitableGraph = false; 18810 for (unsigned Size : CandidateVFs) { 18811 AnyProfitableGraph = false; 18812 unsigned StartIdx = std::distance( 18813 RangeSizes.begin(), 18814 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF, 18815 std::placeholders::_1))); 18816 while (StartIdx < End) { 18817 unsigned EndIdx = 18818 std::distance(RangeSizes.begin(), 18819 find_if(RangeSizes.drop_front(StartIdx), 18820 std::bind(IsVectorized, Size >= MaxRegVF, 18821 std::placeholders::_1))); 18822 unsigned Sz = EndIdx >= End ? End : EndIdx; 18823 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) { 18824 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size), 18825 Size >= MaxRegVF)) { 18826 ++Cnt; 18827 continue; 18828 } 18829 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); 18830 assert(all_of(Slice, 18831 [&](Value *V) { 18832 return cast<StoreInst>(V) 18833 ->getValueOperand() 18834 ->getType() == 18835 cast<StoreInst>(Slice.front()) 18836 ->getValueOperand() 18837 ->getType(); 18838 }) && 18839 "Expected all operands of same type."); 18840 if (!NonSchedulable.empty()) { 18841 auto [NonSchedSizeMax, NonSchedSizeMin] = 18842 NonSchedulable.lookup(Slice.front()); 18843 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) { 18844 Cnt += NonSchedSizeMax; 18845 continue; 18846 } 18847 } 18848 unsigned TreeSize; 18849 std::optional<bool> Res = 18850 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize); 18851 if (!Res) { 18852 NonSchedulable 18853 .try_emplace(Slice.front(), std::make_pair(Size, Size)) 18854 .first->getSecond() 18855 .second = Size; 18856 } else if (*Res) { 18857 // Mark the vectorized stores so that we don't vectorize them 18858 // again. 18859 VectorizedStores.insert(Slice.begin(), Slice.end()); 18860 // Mark the vectorized stores so that we don't vectorize them 18861 // again. 18862 AnyProfitableGraph = RepeatChanged = Changed = true; 18863 // If we vectorized initial block, no need to try to vectorize 18864 // it again. 
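// Worked example (illustrative): with MinVF = 2, StartIdx = 4 and a freshly
// vectorized slice [Cnt, Cnt + Size) = [5, 9) (Size = 4), entries 5..8 are
// zeroed below; only one store (index 4) precedes the slice, which is fewer
// than MinVF, so entry 4 is zeroed as well and StartIdx jumps to 9.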
18865 for_each(RangeSizes.slice(Cnt, Size), 18866 [](std::pair<unsigned, unsigned> &P) { 18867 P.first = P.second = 0; 18868 }); 18869 if (Cnt < StartIdx + MinVF) { 18870 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx), 18871 [](std::pair<unsigned, unsigned> &P) { 18872 P.first = P.second = 0; 18873 }); 18874 StartIdx = Cnt + Size; 18875 } 18876 if (Cnt > Sz - Size - MinVF) { 18877 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)), 18878 [](std::pair<unsigned, unsigned> &P) { 18879 P.first = P.second = 0; 18880 }); 18881 if (Sz == End) 18882 End = Cnt; 18883 Sz = Cnt; 18884 } 18885 Cnt += Size; 18886 continue; 18887 } 18888 if (Size > 2 && Res && 18889 !all_of(RangeSizes.slice(Cnt, Size), 18890 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize, 18891 std::placeholders::_1))) { 18892 Cnt += Size; 18893 continue; 18894 } 18895 // Check for the very big VFs that we're not rebuilding same 18896 // trees, just with larger number of elements. 18897 if (Size > MaxRegVF && TreeSize > 1 && 18898 all_of(RangeSizes.slice(Cnt, Size), 18899 std::bind(FirstSizeSame, TreeSize, 18900 std::placeholders::_1))) { 18901 Cnt += Size; 18902 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize) 18903 ++Cnt; 18904 continue; 18905 } 18906 if (TreeSize > 1) 18907 for_each(RangeSizes.slice(Cnt, Size), 18908 [&](std::pair<unsigned, unsigned> &P) { 18909 if (Size >= MaxRegVF) 18910 P.second = std::max(P.second, TreeSize); 18911 else 18912 P.first = std::max(P.first, TreeSize); 18913 }); 18914 ++Cnt; 18915 AnyProfitableGraph = true; 18916 } 18917 if (StartIdx >= End) 18918 break; 18919 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF) 18920 AnyProfitableGraph = true; 18921 StartIdx = std::distance( 18922 RangeSizes.begin(), 18923 find_if(RangeSizes.drop_front(Sz), 18924 std::bind(IsNotVectorized, Size >= MaxRegVF, 18925 std::placeholders::_1))); 18926 } 18927 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size)) 18928 break; 18929 } 18930 // All values vectorized - exit. 18931 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) { 18932 return P.first == 0 && P.second == 0; 18933 })) 18934 break; 18935 // Check if tried all attempts or no need for the last attempts at all. 18936 if (Repeat >= MaxAttempts || 18937 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph))) 18938 break; 18939 constexpr unsigned StoresLimit = 64; 18940 const unsigned MaxTotalNum = std::min<unsigned>( 18941 Operands.size(), 18942 static_cast<unsigned>( 18943 End - 18944 std::distance( 18945 RangeSizes.begin(), 18946 find_if(RangeSizes, std::bind(IsNotVectorized, true, 18947 std::placeholders::_1))) + 18948 1)); 18949 unsigned VF = bit_ceil(CandidateVFs.front()) * 2; 18950 unsigned Limit = 18951 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum); 18952 CandidateVFs.clear(); 18953 if (bit_floor(Limit) == VF) 18954 CandidateVFs.push_back(Limit); 18955 if (VF > MaxTotalNum || VF >= StoresLimit) 18956 break; 18957 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) { 18958 if (P.first != 0) 18959 P.first = std::max(P.second, P.first); 18960 }); 18961 // Last attempt to vectorize max number of elements, if all previous 18962 // attempts were unsuccessful because of the cost issues. 
18963 CandidateVFs.push_back(VF); 18964 } 18965 } 18966 }; 18967 18968 // SortedStores holds pairs (first: index of the store in the Stores array ref 18969 // whose address is taken as the base, second: sorted set of pairs {index, 18970 // dist}, which are the indices of stores in the set and their store location 18971 // distances relative to the base address). 18972 18973 // The index of the very first store is kept separately, since the set may be 18974 // reordered after an insertion and the first store may be moved. This 18975 // container also allows reducing the number of calls to getPointersDiff(). 18976 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores; 18977 // Inserts the store SI with the given index Idx into the set of stores. If a 18978 // store with the same distance has already been found, stop the insertion and 18979 // try to vectorize the stores found so far. If some stores from this 18980 // sequence were not vectorized, try to vectorize them together with the new 18981 // store later. This logic is applied only to the stores that come before the 18982 // previous store with the same distance. 18983 // Example: 18984 // 1. store x, %p 18985 // 2. store y, %p+1 18986 // 3. store z, %p+2 18987 // 4. store a, %p 18988 // 5. store b, %p+3 18989 // - Scan this from the last store to the first. The very first bunch of stores 18990 // is {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in the SortedStores 18991 // vector). 18992 // - The next store in the list, #1, has the same distance from store #5 as 18993 // store #4. 18994 // - Try to vectorize the sequence of stores 4,2,3,5. 18995 // - If all these stores are vectorized, just drop them. 18996 // - If some of them are not vectorized (say, #3 and #5), do extra analysis. 18997 // - Start a new store sequence. 18998 // The new bunch of stores is {1, {1, 0}}. 18999 // - Add the stores from the previous sequence that were not vectorized. 19000 // Here the stores are considered in reversed order relative to how they appear 19001 // in the IR (Stores is already reversed, see vectorizeStoreChains()). 19002 // Store #3 can be added - it comes after store #4, which has the same 19003 // distance as store #1. 19004 // Store #5 cannot be added - it comes before store #4. 19005 // This logic improves compile time: we assume that the stores coming after the 19006 // previous store with the same distance most likely have memory dependencies, 19007 // so there is no need to waste compile time trying to vectorize them. 19008 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 19009 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { 19010 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) { 19011 std::optional<int> Diff = getPointersDiff( 19012 Stores[Set.first]->getValueOperand()->getType(), 19013 Stores[Set.first]->getPointerOperand(), 19014 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, 19015 /*StrictCheck=*/true); 19016 if (!Diff) 19017 continue; 19018 auto It = Set.second.find(std::make_pair(Idx, *Diff)); 19019 if (It == Set.second.end()) { 19020 Set.second.emplace(Idx, *Diff); 19021 return; 19022 } 19023 // Try to vectorize the first found set to avoid duplicate analysis.
19024 TryToVectorize(Set.second); 19025 unsigned ItIdx = It->first; 19026 int ItDist = It->second; 19027 StoreIndexToDistSet PrevSet; 19028 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()), 19029 [&](const std::pair<unsigned, int> &Pair) { 19030 return Pair.first > ItIdx; 19031 }); 19032 Set.second.clear(); 19033 Set.first = Idx; 19034 Set.second.emplace(Idx, 0); 19035 // Insert stores that followed previous match to try to vectorize them 19036 // with this store. 19037 unsigned StartIdx = ItIdx + 1; 19038 SmallBitVector UsedStores(Idx - StartIdx); 19039 // Distances to previously found dup store (or this store, since they 19040 // store to the same addresses). 19041 SmallVector<int> Dists(Idx - StartIdx, 0); 19042 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) { 19043 // Do not try to vectorize sequences, we already tried. 19044 if (VectorizedStores.contains(Stores[Pair.first])) 19045 break; 19046 unsigned BI = Pair.first - StartIdx; 19047 UsedStores.set(BI); 19048 Dists[BI] = Pair.second - ItDist; 19049 } 19050 for (unsigned I = StartIdx; I < Idx; ++I) { 19051 unsigned BI = I - StartIdx; 19052 if (UsedStores.test(BI)) 19053 Set.second.emplace(I, Dists[BI]); 19054 } 19055 return; 19056 } 19057 auto &Res = SortedStores.emplace_back(); 19058 Res.first = Idx; 19059 Res.second.emplace(Idx, 0); 19060 }; 19061 Type *PrevValTy = nullptr; 19062 for (auto [I, SI] : enumerate(Stores)) { 19063 if (R.isDeleted(SI)) 19064 continue; 19065 if (!PrevValTy) 19066 PrevValTy = SI->getValueOperand()->getType(); 19067 // Check that we do not try to vectorize stores of different types. 19068 if (PrevValTy != SI->getValueOperand()->getType()) { 19069 for (auto &Set : SortedStores) 19070 TryToVectorize(Set.second); 19071 SortedStores.clear(); 19072 PrevValTy = SI->getValueOperand()->getType(); 19073 } 19074 FillStoresSet(I, SI); 19075 } 19076 19077 // Final vectorization attempt. 19078 for (auto &Set : SortedStores) 19079 TryToVectorize(Set.second); 19080 19081 return Changed; 19082 } 19083 19084 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { 19085 // Initialize the collections. We will make a single pass over the block. 19086 Stores.clear(); 19087 GEPs.clear(); 19088 19089 // Visit the store and getelementptr instructions in BB and organize them in 19090 // Stores and GEPs according to the underlying objects of their pointer 19091 // operands. 19092 for (Instruction &I : *BB) { 19093 // Ignore store instructions that are volatile or have a pointer operand 19094 // that doesn't point to a scalar type. 19095 if (auto *SI = dyn_cast<StoreInst>(&I)) { 19096 if (!SI->isSimple()) 19097 continue; 19098 if (!isValidElementType(SI->getValueOperand()->getType())) 19099 continue; 19100 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); 19101 } 19102 19103 // Ignore getelementptr instructions that have more than one index, a 19104 // constant index, or a pointer operand that doesn't point to a scalar 19105 // type. 
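// For illustration only (hypothetical IR, not taken from the sources): a GEP
// such as
//   %g = getelementptr inbounds i32, ptr %base, i64 %i
// is collected as a seed because it has a single, non-constant index and a
// scalar result, whereas
//   %h = getelementptr inbounds i32, ptr %base, i64 4
// is skipped below because its only index is a constant.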
19106 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { 19107 if (GEP->getNumIndices() != 1) 19108 continue; 19109 Value *Idx = GEP->idx_begin()->get(); 19110 if (isa<Constant>(Idx)) 19111 continue; 19112 if (!isValidElementType(Idx->getType())) 19113 continue; 19114 if (GEP->getType()->isVectorTy()) 19115 continue; 19116 GEPs[GEP->getPointerOperand()].push_back(GEP); 19117 } 19118 } 19119 } 19120 19121 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 19122 bool MaxVFOnly) { 19123 if (VL.size() < 2) 19124 return false; 19125 19126 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " 19127 << VL.size() << ".\n"); 19128 19129 // Check that all of the parts are instructions of the same type, 19130 // we permit an alternate opcode via InstructionsState. 19131 InstructionsState S = getSameOpcode(VL, *TLI); 19132 if (!S) 19133 return false; 19134 19135 Instruction *I0 = S.getMainOp(); 19136 // Make sure invalid types (including vector type) are rejected before 19137 // determining vectorization factor for scalar instructions. 19138 for (Value *V : VL) { 19139 Type *Ty = V->getType(); 19140 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) { 19141 // NOTE: the following will give user internal llvm type name, which may 19142 // not be useful. 19143 R.getORE()->emit([&]() { 19144 std::string TypeStr; 19145 llvm::raw_string_ostream rso(TypeStr); 19146 Ty->print(rso); 19147 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) 19148 << "Cannot SLP vectorize list: type " 19149 << TypeStr + " is unsupported by vectorizer"; 19150 }); 19151 return false; 19152 } 19153 } 19154 19155 Type *ScalarTy = getValueType(VL[0]); 19156 unsigned Sz = R.getVectorElementSize(I0); 19157 unsigned MinVF = R.getMinVF(Sz); 19158 unsigned MaxVF = std::max<unsigned>( 19159 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF); 19160 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); 19161 if (MaxVF < 2) { 19162 R.getORE()->emit([&]() { 19163 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) 19164 << "Cannot SLP vectorize list: vectorization factor " 19165 << "less than 2 is not supported"; 19166 }); 19167 return false; 19168 } 19169 19170 bool Changed = false; 19171 bool CandidateFound = false; 19172 InstructionCost MinCost = SLPCostThreshold.getValue(); 19173 19174 unsigned NextInst = 0, MaxInst = VL.size(); 19175 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; 19176 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) { 19177 // No actual vectorization should happen, if number of parts is the same as 19178 // provided vectorization factor (i.e. the scalar type is used for vector 19179 // code during codegen). 19180 auto *VecTy = getWidenedType(ScalarTy, VF); 19181 if (TTI->getNumberOfParts(VecTy) == VF) 19182 continue; 19183 for (unsigned I = NextInst; I < MaxInst; ++I) { 19184 unsigned ActualVF = std::min(MaxInst - I, VF); 19185 19186 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF)) 19187 continue; 19188 19189 if (MaxVFOnly && ActualVF < MaxVF) 19190 break; 19191 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2)) 19192 break; 19193 19194 SmallVector<Value *> Ops(ActualVF, nullptr); 19195 unsigned Idx = 0; 19196 for (Value *V : VL.drop_front(I)) { 19197 // Check that a previous iteration of this loop did not delete the 19198 // Value. 
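// Illustrative example (hypothetical values): with VL = {a, b, c, d}, an
// ActualVF of 2 and 'b' already erased by an earlier vectorization attempt,
// the bundle collected below is {a, c} - deleted scalars are skipped and the
// window keeps filling from the remaining values until ActualVF operands
// have been gathered.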
19199 if (auto *Inst = dyn_cast<Instruction>(V); 19200 !Inst || !R.isDeleted(Inst)) { 19201 Ops[Idx] = V; 19202 ++Idx; 19203 if (Idx == ActualVF) 19204 break; 19205 } 19206 } 19207 // Not enough vectorizable instructions - exit. 19208 if (Idx != ActualVF) 19209 break; 19210 19211 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations " 19212 << "\n"); 19213 19214 R.buildTree(Ops); 19215 if (R.isTreeTinyAndNotFullyVectorizable()) 19216 continue; 19217 R.reorderTopToBottom(); 19218 R.reorderBottomToTop( 19219 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) && 19220 !R.doesRootHaveInTreeUses()); 19221 R.transformNodes(); 19222 R.buildExternalUses(); 19223 19224 R.computeMinimumValueSizes(); 19225 InstructionCost Cost = R.getTreeCost(); 19226 CandidateFound = true; 19227 MinCost = std::min(MinCost, Cost); 19228 19229 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost 19230 << " for VF=" << ActualVF << "\n"); 19231 if (Cost < -SLPCostThreshold) { 19232 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 19233 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", 19234 cast<Instruction>(Ops[0])) 19235 << "SLP vectorized with cost " << ore::NV("Cost", Cost) 19236 << " and with tree size " 19237 << ore::NV("TreeSize", R.getTreeSize())); 19238 19239 R.vectorizeTree(); 19240 // Move to the next bundle. 19241 I += VF - 1; 19242 NextInst = I + 1; 19243 Changed = true; 19244 } 19245 } 19246 } 19247 19248 if (!Changed && CandidateFound) { 19249 R.getORE()->emit([&]() { 19250 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0) 19251 << "List vectorization was possible but not beneficial with cost " 19252 << ore::NV("Cost", MinCost) << " >= " 19253 << ore::NV("Treshold", -SLPCostThreshold); 19254 }); 19255 } else if (!Changed) { 19256 R.getORE()->emit([&]() { 19257 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0) 19258 << "Cannot SLP vectorize list: vectorization was impossible" 19259 << " with available vectorization factors"; 19260 }); 19261 } 19262 return Changed; 19263 } 19264 19265 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { 19266 if (!I) 19267 return false; 19268 19269 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) 19270 return false; 19271 19272 Value *P = I->getParent(); 19273 19274 // Vectorize in current basic block only. 19275 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); 19276 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); 19277 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P || 19278 R.isDeleted(Op0) || R.isDeleted(Op1)) 19279 return false; 19280 19281 // First collect all possible candidates 19282 SmallVector<std::pair<Value *, Value *>, 4> Candidates; 19283 Candidates.emplace_back(Op0, Op1); 19284 19285 auto *A = dyn_cast<BinaryOperator>(Op0); 19286 auto *B = dyn_cast<BinaryOperator>(Op1); 19287 // Try to skip B. 19288 if (A && B && B->hasOneUse()) { 19289 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 19290 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 19291 if (B0 && B0->getParent() == P && !R.isDeleted(B0)) 19292 Candidates.emplace_back(A, B0); 19293 if (B1 && B1->getParent() == P && !R.isDeleted(B1)) 19294 Candidates.emplace_back(A, B1); 19295 } 19296 // Try to skip A. 
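// Illustrative example (hypothetical IR): in
//   %s0 = add i32 %p, %q
//   %a  = mul i32 %s0, %y   ; A
//   %b  = add i32 %r, %t    ; B
//   %root = add i32 %a, %b
// the root pair (A, B) mixes mul and add, but skipping A exposes the pair
// (%s0, B) - two adds that are far more likely to form a vectorizable
// bundle - so that pair is recorded as an extra candidate and the best
// candidate is selected below via findBestRootPair().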
19297 if (B && A && A->hasOneUse()) { 19298 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 19299 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 19300 if (A0 && A0->getParent() == P && !R.isDeleted(A0)) 19301 Candidates.emplace_back(A0, B); 19302 if (A1 && A1->getParent() == P && !R.isDeleted(A1)) 19303 Candidates.emplace_back(A1, B); 19304 } 19305 19306 if (Candidates.size() == 1) 19307 return tryToVectorizeList({Op0, Op1}, R); 19308 19309 // We have multiple options. Try to pick the single best. 19310 std::optional<int> BestCandidate = R.findBestRootPair(Candidates); 19311 if (!BestCandidate) 19312 return false; 19313 return tryToVectorizeList( 19314 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); 19315 } 19316 19317 namespace { 19318 19319 /// Model horizontal reductions. 19320 /// 19321 /// A horizontal reduction is a tree of reduction instructions that has values 19322 /// that can be put into a vector as its leaves. For example: 19323 /// 19324 /// mul mul mul mul 19325 /// \ / \ / 19326 /// + + 19327 /// \ / 19328 /// + 19329 /// This tree has "mul" as its leaf values and "+" as its reduction 19330 /// instructions. A reduction can feed into a store or a binary operation 19331 /// feeding a phi. 19332 /// ... 19333 /// \ / 19334 /// + 19335 /// | 19336 /// phi += 19337 /// 19338 /// Or: 19339 /// ... 19340 /// \ / 19341 /// + 19342 /// | 19343 /// *p = 19344 /// 19345 class HorizontalReduction { 19346 using ReductionOpsType = SmallVector<Value *, 16>; 19347 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; 19348 ReductionOpsListType ReductionOps; 19349 /// List of possibly reduced values. 19350 SmallVector<SmallVector<Value *>> ReducedVals; 19351 /// Maps reduced value to the corresponding reduction operation. 19352 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps; 19353 WeakTrackingVH ReductionRoot; 19354 /// The type of reduction operation. 19355 RecurKind RdxKind; 19356 /// Checks if the optimization of original scalar identity operations on 19357 /// matched horizontal reductions is enabled and allowed. 19358 bool IsSupportedHorRdxIdentityOp = false; 19359 19360 static bool isCmpSelMinMax(Instruction *I) { 19361 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && 19362 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); 19363 } 19364 19365 // And/or are potentially poison-safe logical patterns like: 19366 // select x, y, false 19367 // select x, true, y 19368 static bool isBoolLogicOp(Instruction *I) { 19369 return isa<SelectInst>(I) && 19370 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr())); 19371 } 19372 19373 /// Checks if instruction is associative and can be vectorized. 19374 static bool isVectorizable(RecurKind Kind, Instruction *I) { 19375 if (Kind == RecurKind::None) 19376 return false; 19377 19378 // Integer ops that map to select instructions or intrinsics are fine. 19379 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || 19380 isBoolLogicOp(I)) 19381 return true; 19382 19383 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { 19384 // FP min/max are associative except for NaN and -0.0. We do not 19385 // have to rule out -0.0 here because the intrinsic semantics do not 19386 // specify a fixed result for it. 
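// E.g. (illustrative): a chain of llvm.maxnum.f32 calls is accepted as a
// vectorizable reduction only if the calls carry the 'nnan' fast-math flag.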
19387 return I->getFastMathFlags().noNaNs(); 19388 } 19389 19390 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum) 19391 return true; 19392 19393 return I->isAssociative(); 19394 } 19395 19396 static Value *getRdxOperand(Instruction *I, unsigned Index) { 19397 // Poison-safe 'or' takes the form: select X, true, Y 19398 // To make that work with the normal operand processing, we skip the 19399 // true value operand. 19400 // TODO: Change the code and data structures to handle this without a hack. 19401 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) 19402 return I->getOperand(2); 19403 return I->getOperand(Index); 19404 } 19405 19406 /// Creates reduction operation with the current opcode. 19407 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS, 19408 Value *RHS, const Twine &Name, bool UseSelect) { 19409 switch (Kind) { 19410 case RecurKind::Or: { 19411 if (UseSelect && 19412 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 19413 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); 19414 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 19415 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 19416 Name); 19417 } 19418 case RecurKind::And: { 19419 if (UseSelect && 19420 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 19421 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); 19422 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 19423 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 19424 Name); 19425 } 19426 case RecurKind::Add: 19427 case RecurKind::Mul: 19428 case RecurKind::Xor: 19429 case RecurKind::FAdd: 19430 case RecurKind::FMul: { 19431 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 19432 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 19433 Name); 19434 } 19435 case RecurKind::SMax: 19436 case RecurKind::SMin: 19437 case RecurKind::UMax: 19438 case RecurKind::UMin: 19439 if (UseSelect) { 19440 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); 19441 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); 19442 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 19443 } 19444 [[fallthrough]]; 19445 case RecurKind::FMax: 19446 case RecurKind::FMin: 19447 case RecurKind::FMaximum: 19448 case RecurKind::FMinimum: { 19449 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind); 19450 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS); 19451 } 19452 default: 19453 llvm_unreachable("Unknown reduction operation."); 19454 } 19455 } 19456 19457 /// Creates reduction operation with the current opcode with the IR flags 19458 /// from \p ReductionOps, dropping nuw/nsw flags. 19459 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS, 19460 Value *RHS, const Twine &Name, 19461 const ReductionOpsListType &ReductionOps) { 19462 bool UseSelect = ReductionOps.size() == 2 || 19463 // Logical or/and. 
19464 (ReductionOps.size() == 1 && 19465 any_of(ReductionOps.front(), IsaPred<SelectInst>)); 19466 assert((!UseSelect || ReductionOps.size() != 2 || 19467 isa<SelectInst>(ReductionOps[1][0])) && 19468 "Expected cmp + select pairs for reduction"); 19469 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); 19470 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { 19471 if (auto *Sel = dyn_cast<SelectInst>(Op)) { 19472 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, 19473 /*IncludeWrapFlags=*/false); 19474 propagateIRFlags(Op, ReductionOps[1], nullptr, 19475 /*IncludeWrapFlags=*/false); 19476 return Op; 19477 } 19478 } 19479 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); 19480 return Op; 19481 } 19482 19483 public: 19484 static RecurKind getRdxKind(Value *V) { 19485 auto *I = dyn_cast<Instruction>(V); 19486 if (!I) 19487 return RecurKind::None; 19488 if (match(I, m_Add(m_Value(), m_Value()))) 19489 return RecurKind::Add; 19490 if (match(I, m_Mul(m_Value(), m_Value()))) 19491 return RecurKind::Mul; 19492 if (match(I, m_And(m_Value(), m_Value())) || 19493 match(I, m_LogicalAnd(m_Value(), m_Value()))) 19494 return RecurKind::And; 19495 if (match(I, m_Or(m_Value(), m_Value())) || 19496 match(I, m_LogicalOr(m_Value(), m_Value()))) 19497 return RecurKind::Or; 19498 if (match(I, m_Xor(m_Value(), m_Value()))) 19499 return RecurKind::Xor; 19500 if (match(I, m_FAdd(m_Value(), m_Value()))) 19501 return RecurKind::FAdd; 19502 if (match(I, m_FMul(m_Value(), m_Value()))) 19503 return RecurKind::FMul; 19504 19505 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) 19506 return RecurKind::FMax; 19507 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) 19508 return RecurKind::FMin; 19509 19510 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value()))) 19511 return RecurKind::FMaximum; 19512 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value()))) 19513 return RecurKind::FMinimum; 19514 // This matches either cmp+select or intrinsics. SLP is expected to handle 19515 // either form. 19516 // TODO: If we are canonicalizing to intrinsics, we can remove several 19517 // special-case paths that deal with selects. 19518 if (match(I, m_SMax(m_Value(), m_Value()))) 19519 return RecurKind::SMax; 19520 if (match(I, m_SMin(m_Value(), m_Value()))) 19521 return RecurKind::SMin; 19522 if (match(I, m_UMax(m_Value(), m_Value()))) 19523 return RecurKind::UMax; 19524 if (match(I, m_UMin(m_Value(), m_Value()))) 19525 return RecurKind::UMin; 19526 19527 if (auto *Select = dyn_cast<SelectInst>(I)) { 19528 // Try harder: look for min/max pattern based on instructions producing 19529 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 19530 // During the intermediate stages of SLP, it's very common to have 19531 // pattern like this (since optimizeGatherSequence is run only once 19532 // at the end): 19533 // %1 = extractelement <2 x i32> %a, i32 0 19534 // %2 = extractelement <2 x i32> %a, i32 1 19535 // %cond = icmp sgt i32 %1, %2 19536 // %3 = extractelement <2 x i32> %a, i32 0 19537 // %4 = extractelement <2 x i32> %a, i32 1 19538 // %select = select i1 %cond, i32 %3, i32 %4 19539 CmpPredicate Pred; 19540 Instruction *L1; 19541 Instruction *L2; 19542 19543 Value *LHS = Select->getTrueValue(); 19544 Value *RHS = Select->getFalseValue(); 19545 Value *Cond = Select->getCondition(); 19546 19547 // TODO: Support inverse predicates. 
19548 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { 19549 if (!isa<ExtractElementInst>(RHS) || 19550 !L2->isIdenticalTo(cast<Instruction>(RHS))) 19551 return RecurKind::None; 19552 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { 19553 if (!isa<ExtractElementInst>(LHS) || 19554 !L1->isIdenticalTo(cast<Instruction>(LHS))) 19555 return RecurKind::None; 19556 } else { 19557 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) 19558 return RecurKind::None; 19559 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || 19560 !L1->isIdenticalTo(cast<Instruction>(LHS)) || 19561 !L2->isIdenticalTo(cast<Instruction>(RHS))) 19562 return RecurKind::None; 19563 } 19564 19565 switch (Pred) { 19566 default: 19567 return RecurKind::None; 19568 case CmpInst::ICMP_SGT: 19569 case CmpInst::ICMP_SGE: 19570 return RecurKind::SMax; 19571 case CmpInst::ICMP_SLT: 19572 case CmpInst::ICMP_SLE: 19573 return RecurKind::SMin; 19574 case CmpInst::ICMP_UGT: 19575 case CmpInst::ICMP_UGE: 19576 return RecurKind::UMax; 19577 case CmpInst::ICMP_ULT: 19578 case CmpInst::ICMP_ULE: 19579 return RecurKind::UMin; 19580 } 19581 } 19582 return RecurKind::None; 19583 } 19584 19585 /// Get the index of the first operand. 19586 static unsigned getFirstOperandIndex(Instruction *I) { 19587 return isCmpSelMinMax(I) ? 1 : 0; 19588 } 19589 19590 private: 19591 /// Total number of operands in the reduction operation. 19592 static unsigned getNumberOfOperands(Instruction *I) { 19593 return isCmpSelMinMax(I) ? 3 : 2; 19594 } 19595 19596 /// Checks if the instruction is in basic block \p BB. 19597 /// For a cmp+sel min/max reduction check that both ops are in \p BB. 19598 static bool hasSameParent(Instruction *I, BasicBlock *BB) { 19599 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) { 19600 auto *Sel = cast<SelectInst>(I); 19601 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); 19602 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; 19603 } 19604 return I->getParent() == BB; 19605 } 19606 19607 /// Expected number of uses for reduction operations/reduced values. 19608 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { 19609 if (IsCmpSelMinMax) { 19610 // SelectInst must be used twice while the condition op must have single 19611 // use only. 19612 if (auto *Sel = dyn_cast<SelectInst>(I)) 19613 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse(); 19614 return I->hasNUses(2); 19615 } 19616 19617 // Arithmetic reduction operation must be used once only. 19618 return I->hasOneUse(); 19619 } 19620 19621 /// Initializes the list of reduction operations. 19622 void initReductionOps(Instruction *I) { 19623 if (isCmpSelMinMax(I)) 19624 ReductionOps.assign(2, ReductionOpsType()); 19625 else 19626 ReductionOps.assign(1, ReductionOpsType()); 19627 } 19628 19629 /// Add all reduction operations for the reduction instruction \p I. 
19630 void addReductionOps(Instruction *I) {
19631 if (isCmpSelMinMax(I)) {
19632 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19633 ReductionOps[1].emplace_back(I);
19634 } else {
19635 ReductionOps[0].emplace_back(I);
19636 }
19637 }
19638 
19639 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19640 int Sz = Data.size();
19641 auto *I = dyn_cast<Instruction>(Data.front());
19642 return Sz > 1 || isConstant(Data.front()) ||
19643 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19644 }
19645 
19646 public:
19647 HorizontalReduction() = default;
19648 
19649 /// Try to find a reduction tree.
19650 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19651 ScalarEvolution &SE, const DataLayout &DL,
19652 const TargetLibraryInfo &TLI) {
19653 RdxKind = HorizontalReduction::getRdxKind(Root);
19654 if (!isVectorizable(RdxKind, Root))
19655 return false;
19656 
19657 // Analyze "regular" integer/FP types for reductions - no target-specific
19658 // types or pointers.
19659 Type *Ty = Root->getType();
19660 if (!isValidElementType(Ty) || Ty->isPointerTy())
19661 return false;
19662 
19663 // Though the ultimate reduction may have multiple uses, its condition must
19664 // have only a single use.
19665 if (auto *Sel = dyn_cast<SelectInst>(Root))
19666 if (!Sel->getCondition()->hasOneUse())
19667 return false;
19668 
19669 ReductionRoot = Root;
19670 
19671 // Iterate through all the operands of the possible reduction tree and
19672 // gather all the reduced values, sorting them by their value id.
19673 BasicBlock *BB = Root->getParent();
19674 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19675 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19676 1, std::make_pair(Root, 0));
19677 // Checks if the operands of the \p TreeN instruction are also reduction
19678 // operations or should be treated as reduced values or an extra argument,
19679 // which is not part of the reduction.
19680 auto CheckOperands = [&](Instruction *TreeN,
19681 SmallVectorImpl<Value *> &PossibleReducedVals,
19682 SmallVectorImpl<Instruction *> &ReductionOps,
19683 unsigned Level) {
19684 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19685 getNumberOfOperands(TreeN)))) {
19686 Value *EdgeVal = getRdxOperand(TreeN, I);
19687 ReducedValsToOps[EdgeVal].push_back(TreeN);
19688 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19689 // If the edge is not an instruction, or it differs from the main
19690 // reduction opcode or has too many uses, treat it as a possible reduced
19691 // value. Also, do not try to reduce constant values if the operation is
19692 // not foldable.
19693 if (!EdgeInst || Level > RecursionMaxDepth ||
19694 getRdxKind(EdgeInst) != RdxKind ||
19695 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19696 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19697 !isVectorizable(RdxKind, EdgeInst) ||
19698 (R.isAnalyzedReductionRoot(EdgeInst) &&
19699 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19700 PossibleReducedVals.push_back(EdgeVal);
19701 continue;
19702 }
19703 ReductionOps.push_back(EdgeInst);
19704 }
19705 };
19706 // Try to regroup the reduced values so that it becomes more profitable to
19707 // reduce them. Values are grouped by their value ids, instructions by their
19708 // opcode and/or alternate opcode, with extra analysis for
19709 // loads (grouping them by the distance between pointers) and cmp
19710 // instructions (grouping them by the predicate).
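// Illustrative example (hypothetical values): given reduced values
//   { load from %p, load from %p + 4, %x, load from %q }
// the two loads from the %p object receive the same key/subkey and form one
// group (a good candidate for a single wide load), while %x and the load
// from %q land in separate groups.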
19711 SmallMapVector< 19712 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>, 19713 8> 19714 PossibleReducedVals; 19715 initReductionOps(Root); 19716 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap; 19717 SmallSet<size_t, 2> LoadKeyUsed; 19718 19719 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { 19720 Key = hash_combine(hash_value(LI->getParent()), Key); 19721 Value *Ptr = 19722 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth); 19723 if (!LoadKeyUsed.insert(Key).second) { 19724 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr)); 19725 if (LIt != LoadsMap.end()) { 19726 for (LoadInst *RLI : LIt->second) { 19727 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), 19728 LI->getType(), LI->getPointerOperand(), DL, SE, 19729 /*StrictCheck=*/true)) 19730 return hash_value(RLI->getPointerOperand()); 19731 } 19732 for (LoadInst *RLI : LIt->second) { 19733 if (arePointersCompatible(RLI->getPointerOperand(), 19734 LI->getPointerOperand(), TLI)) { 19735 hash_code SubKey = hash_value(RLI->getPointerOperand()); 19736 return SubKey; 19737 } 19738 } 19739 if (LIt->second.size() > 2) { 19740 hash_code SubKey = 19741 hash_value(LIt->second.back()->getPointerOperand()); 19742 return SubKey; 19743 } 19744 } 19745 } 19746 LoadsMap.try_emplace(std::make_pair(Key, Ptr)) 19747 .first->second.push_back(LI); 19748 return hash_value(LI->getPointerOperand()); 19749 }; 19750 19751 while (!Worklist.empty()) { 19752 auto [TreeN, Level] = Worklist.pop_back_val(); 19753 SmallVector<Value *> PossibleRedVals; 19754 SmallVector<Instruction *> PossibleReductionOps; 19755 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level); 19756 addReductionOps(TreeN); 19757 // Add reduction values. The values are sorted for better vectorization 19758 // results. 19759 for (Value *V : PossibleRedVals) { 19760 size_t Key, Idx; 19761 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, 19762 /*AllowAlternate=*/false); 19763 ++PossibleReducedVals[Key][Idx] 19764 .insert(std::make_pair(V, 0)) 19765 .first->second; 19766 } 19767 for (Instruction *I : reverse(PossibleReductionOps)) 19768 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1); 19769 } 19770 auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); 19771 // Sort values by the total number of values kinds to start the reduction 19772 // from the longest possible reduced values sequences. 
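// E.g. (illustrative): candidate groups of sizes {2, 5, 3} end up ordered
// {5, 3, 2}, so the longest run of compatible reduced values is tried first.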
19773 for (auto &PossibleReducedVals : PossibleReducedValsVect) { 19774 auto PossibleRedVals = PossibleReducedVals.second.takeVector(); 19775 SmallVector<SmallVector<Value *>> PossibleRedValsVect; 19776 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); 19777 It != E; ++It) { 19778 PossibleRedValsVect.emplace_back(); 19779 auto RedValsVect = It->second.takeVector(); 19780 stable_sort(RedValsVect, llvm::less_second()); 19781 for (const std::pair<Value *, unsigned> &Data : RedValsVect) 19782 PossibleRedValsVect.back().append(Data.second, Data.first); 19783 } 19784 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { 19785 return P1.size() > P2.size(); 19786 }); 19787 int NewIdx = -1; 19788 for (ArrayRef<Value *> Data : PossibleRedValsVect) { 19789 if (NewIdx < 0 || 19790 (!isGoodForReduction(Data) && 19791 (!isa<LoadInst>(Data.front()) || 19792 !isa<LoadInst>(ReducedVals[NewIdx].front()) || 19793 getUnderlyingObject( 19794 cast<LoadInst>(Data.front())->getPointerOperand()) != 19795 getUnderlyingObject( 19796 cast<LoadInst>(ReducedVals[NewIdx].front()) 19797 ->getPointerOperand())))) { 19798 NewIdx = ReducedVals.size(); 19799 ReducedVals.emplace_back(); 19800 } 19801 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); 19802 } 19803 } 19804 // Sort the reduced values by number of same/alternate opcode and/or pointer 19805 // operand. 19806 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { 19807 return P1.size() > P2.size(); 19808 }); 19809 return true; 19810 } 19811 19812 /// Attempt to vectorize the tree found by matchAssociativeReduction. 19813 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, 19814 const TargetLibraryInfo &TLI, AssumptionCache *AC) { 19815 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4; 19816 constexpr unsigned RegMaxNumber = 4; 19817 constexpr unsigned RedValsMaxNumber = 128; 19818 // If there are a sufficient number of reduction values, reduce 19819 // to a nearby power-of-2. We can safely generate oversized 19820 // vectors and rely on the backend to split them to legal sizes. 19821 if (unsigned NumReducedVals = std::accumulate( 19822 ReducedVals.begin(), ReducedVals.end(), 0, 19823 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned { 19824 if (!isGoodForReduction(Vals)) 19825 return Num; 19826 return Num + Vals.size(); 19827 }); 19828 NumReducedVals < ReductionLimit && 19829 all_of(ReducedVals, [](ArrayRef<Value *> RedV) { 19830 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV); 19831 })) { 19832 for (ReductionOpsType &RdxOps : ReductionOps) 19833 for (Value *RdxOp : RdxOps) 19834 V.analyzedReductionRoot(cast<Instruction>(RdxOp)); 19835 return nullptr; 19836 } 19837 19838 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(), 19839 TargetFolder(DL)); 19840 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); 19841 19842 // Track the reduced values in case if they are replaced by extractelement 19843 // because of the vectorization. 19844 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() * 19845 ReducedVals.front().size()); 19846 19847 // The compare instruction of a min/max is the insertion point for new 19848 // instructions and may be replaced with a new compare instruction. 
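// Illustrative example (hypothetical IR): for the min/max idiom
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// the lambda below returns %c, so new vector code is inserted at the compare
// rather than at the select.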
19849 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { 19850 assert(isa<SelectInst>(RdxRootInst) && 19851 "Expected min/max reduction to have select root instruction"); 19852 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); 19853 assert(isa<Instruction>(ScalarCond) && 19854 "Expected min/max reduction to have compare condition"); 19855 return cast<Instruction>(ScalarCond); 19856 }; 19857 19858 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) { 19859 return isBoolLogicOp(cast<Instruction>(V)); 19860 }); 19861 // Return new VectorizedTree, based on previous value. 19862 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { 19863 if (VectorizedTree) { 19864 // Update the final value in the reduction. 19865 Builder.SetCurrentDebugLocation( 19866 cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); 19867 if (AnyBoolLogicOp) { 19868 auto It = ReducedValsToOps.find(VectorizedTree); 19869 auto It1 = ReducedValsToOps.find(Res); 19870 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) || 19871 isGuaranteedNotToBePoison(VectorizedTree, AC) || 19872 (It != ReducedValsToOps.end() && 19873 any_of(It->getSecond(), [&](Instruction *I) { 19874 return isBoolLogicOp(I) && 19875 getRdxOperand(I, 0) == VectorizedTree; 19876 }))) { 19877 ; 19878 } else if (isGuaranteedNotToBePoison(Res, AC) || 19879 (It1 != ReducedValsToOps.end() && 19880 any_of(It1->getSecond(), [&](Instruction *I) { 19881 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res; 19882 }))) { 19883 std::swap(VectorizedTree, Res); 19884 } else { 19885 VectorizedTree = Builder.CreateFreeze(VectorizedTree); 19886 } 19887 } 19888 19889 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", 19890 ReductionOps); 19891 } 19892 // Initialize the final value in the reduction. 19893 return Res; 19894 }; 19895 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() * 19896 ReductionOps.front().size()); 19897 for (ReductionOpsType &RdxOps : ReductionOps) 19898 for (Value *RdxOp : RdxOps) { 19899 if (!RdxOp) 19900 continue; 19901 IgnoreList.insert(RdxOp); 19902 } 19903 // Intersect the fast-math-flags from all reduction operations. 19904 FastMathFlags RdxFMF; 19905 RdxFMF.set(); 19906 for (Value *U : IgnoreList) 19907 if (auto *FPMO = dyn_cast<FPMathOperator>(U)) 19908 RdxFMF &= FPMO->getFastMathFlags(); 19909 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); 19910 19911 // Need to track reduced vals, they may be changed during vectorization of 19912 // subvectors. 19913 for (ArrayRef<Value *> Candidates : ReducedVals) 19914 for (Value *V : Candidates) 19915 TrackedVals.try_emplace(V, V); 19916 19917 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV, 19918 Value *V) -> unsigned & { 19919 auto *It = MV.find(V); 19920 assert(It != MV.end() && "Unable to find given key."); 19921 return It->second; 19922 }; 19923 19924 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size()); 19925 // List of the values that were reduced in other trees as part of gather 19926 // nodes and thus requiring extract if fully vectorized in other trees. 19927 SmallPtrSet<Value *, 4> RequiredExtract; 19928 WeakTrackingVH VectorizedTree = nullptr; 19929 bool CheckForReusedReductionOps = false; 19930 // Try to vectorize elements based on their type. 
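// E.g. (illustrative): a group made up solely of integer 'add' instructions
// yields a state whose main opcode is Instruction::Add, whereas a group with
// no common (or alternate) opcode yields an invalid state; both cases are
// handled by the per-candidate checks below.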
19931 SmallVector<InstructionsState> States;
19932 for (ArrayRef<Value *> RV : ReducedVals)
19933 States.push_back(getSameOpcode(RV, TLI));
19934 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19935 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19936 InstructionsState S = States[I];
19937 SmallVector<Value *> Candidates;
19938 Candidates.reserve(2 * OrigReducedVals.size());
19939 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19940 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19941 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19942 // Check whether the reduction value was overridden by an extractelement
19943 // instruction because of the vectorization, and exclude it if it is not
19944 // compatible with the other values.
19945 // Also check whether the instruction was folded to a constant/other value.
19946 auto *Inst = dyn_cast<Instruction>(RdxVal);
19947 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19948 (!S || !S.isOpcodeOrAlt(Inst))) ||
19949 (S && !Inst))
19950 continue;
19951 Candidates.push_back(RdxVal);
19952 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19953 }
19954 bool ShuffledExtracts = false;
19955 // Try to handle shuffled extractelements.
19956 if (S && S.getOpcode() == Instruction::ExtractElement &&
19957 !S.isAltShuffle() && I + 1 < E) {
19958 SmallVector<Value *> CommonCandidates(Candidates);
19959 for (Value *RV : ReducedVals[I + 1]) {
19960 Value *RdxVal = TrackedVals.at(RV);
19961 // Check whether the reduction value was overridden by an
19962 // extractelement instruction because of the vectorization, and
19963 // exclude it if it is not compatible with the other values.
19964 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19965 if (!Inst)
19966 continue;
19967 CommonCandidates.push_back(RdxVal);
19968 TrackedToOrig.try_emplace(RdxVal, RV);
19969 }
19970 SmallVector<int> Mask;
19971 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19972 ++I;
19973 Candidates.swap(CommonCandidates);
19974 ShuffledExtracts = true;
19975 }
19976 }
19977 
19978 // Emit code for constant values.
19979 if (Candidates.size() > 1 && allConstant(Candidates)) {
19980 Value *Res = Candidates.front();
19981 Value *OrigV = TrackedToOrig.at(Candidates.front());
19982 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19983 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19984 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19985 Value *OrigV = TrackedToOrig.at(VC);
19986 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19987 if (auto *ResI = dyn_cast<Instruction>(Res))
19988 V.analyzedReductionRoot(ResI);
19989 }
19990 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19991 continue;
19992 }
19993 
19994 unsigned NumReducedVals = Candidates.size();
19995 if (NumReducedVals < ReductionLimit &&
19996 (NumReducedVals < 2 || !isSplat(Candidates)))
19997 continue;
19998 
19999 // Check if we support repeated scalar values processing (optimization of
20000 // original scalar identity operations on matched horizontal reductions).
20001 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20002 RdxKind != RecurKind::FMul &&
20003 RdxKind != RecurKind::FMulAdd;
20004 // Gather same values.
20005 SmallMapVector<Value *, unsigned, 16> SameValuesCounter; 20006 if (IsSupportedHorRdxIdentityOp) 20007 for (Value *V : Candidates) { 20008 Value *OrigV = TrackedToOrig.at(V); 20009 ++SameValuesCounter.try_emplace(OrigV).first->second; 20010 } 20011 // Used to check if the reduced values used same number of times. In this 20012 // case the compiler may produce better code. E.g. if reduced values are 20013 // aabbccdd (8 x values), then the first node of the tree will have a node 20014 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 20015 // Plus, the final reduction will be performed on <8 x aabbccdd>. 20016 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4 20017 // x abcd) * 2. 20018 // Currently it only handles add/fadd/xor. and/or/min/max do not require 20019 // this analysis, other operations may require an extra estimation of 20020 // the profitability. 20021 bool SameScaleFactor = false; 20022 bool OptReusedScalars = IsSupportedHorRdxIdentityOp && 20023 SameValuesCounter.size() != Candidates.size(); 20024 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; 20025 if (OptReusedScalars) { 20026 SameScaleFactor = 20027 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || 20028 RdxKind == RecurKind::Xor) && 20029 all_of(drop_begin(SameValuesCounter), 20030 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { 20031 return P.second == SameValuesCounter.front().second; 20032 }); 20033 Candidates.resize(SameValuesCounter.size()); 20034 transform(SameValuesCounter, Candidates.begin(), 20035 [&](const auto &P) { return TrackedVals.at(P.first); }); 20036 NumReducedVals = Candidates.size(); 20037 // Have a reduction of the same element. 20038 if (NumReducedVals == 1) { 20039 Value *OrigV = TrackedToOrig.at(Candidates.front()); 20040 unsigned Cnt = At(SameValuesCounter, OrigV); 20041 Value *RedVal = 20042 emitScaleForReusedOps(Candidates.front(), Builder, Cnt); 20043 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); 20044 VectorizedVals.try_emplace(OrigV, Cnt); 20045 ExternallyUsedValues.insert(OrigV); 20046 continue; 20047 } 20048 } 20049 20050 unsigned MaxVecRegSize = V.getMaxVecRegSize(); 20051 unsigned EltSize = V.getVectorElementSize(Candidates[0]); 20052 const unsigned MaxElts = std::clamp<unsigned>( 20053 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber, 20054 RegMaxNumber * RedValsMaxNumber); 20055 20056 unsigned ReduxWidth = NumReducedVals; 20057 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) { 20058 unsigned NumParts, NumRegs; 20059 Type *ScalarTy = Candidates.front()->getType(); 20060 ReduxWidth = 20061 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth); 20062 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); 20063 NumParts = ::getNumberOfParts(TTI, Tp); 20064 NumRegs = 20065 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); 20066 while (NumParts > NumRegs) { 20067 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0."); 20068 ReduxWidth = bit_floor(ReduxWidth - 1); 20069 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); 20070 NumParts = ::getNumberOfParts(TTI, Tp); 20071 NumRegs = 20072 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); 20073 } 20074 if (NumParts > NumRegs / 2) 20075 ReduxWidth = bit_floor(ReduxWidth); 20076 return ReduxWidth; 20077 }; 20078 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1)) 20079 ReduxWidth = GetVectorFactor(ReduxWidth); 20080 ReduxWidth = std::min(ReduxWidth, MaxElts); 20081 20082 unsigned 
Start = 0; 20083 unsigned Pos = Start; 20084 // Restarts vectorization attempt with lower vector factor. 20085 unsigned PrevReduxWidth = ReduxWidth; 20086 bool CheckForReusedReductionOpsLocal = false; 20087 auto AdjustReducedVals = [&](bool IgnoreVL = false) { 20088 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); 20089 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { 20090 // Check if any of the reduction ops are gathered. If so, worth 20091 // trying again with less number of reduction ops. 20092 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; 20093 } 20094 ++Pos; 20095 if (Pos < NumReducedVals - ReduxWidth + 1) 20096 return IsAnyRedOpGathered; 20097 Pos = Start; 20098 --ReduxWidth; 20099 if (ReduxWidth > 1) 20100 ReduxWidth = GetVectorFactor(ReduxWidth); 20101 return IsAnyRedOpGathered; 20102 }; 20103 bool AnyVectorized = false; 20104 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates; 20105 while (Pos < NumReducedVals - ReduxWidth + 1 && 20106 ReduxWidth >= ReductionLimit) { 20107 // Dependency in tree of the reduction ops - drop this attempt, try 20108 // later. 20109 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && 20110 Start == 0) { 20111 CheckForReusedReductionOps = true; 20112 break; 20113 } 20114 PrevReduxWidth = ReduxWidth; 20115 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); 20116 // Been analyzed already - skip. 20117 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) || 20118 (!has_single_bit(ReduxWidth) && 20119 (IgnoredCandidates.contains( 20120 std::make_pair(Pos, bit_floor(ReduxWidth))) || 20121 IgnoredCandidates.contains( 20122 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)), 20123 bit_floor(ReduxWidth))))) || 20124 V.areAnalyzedReductionVals(VL)) { 20125 (void)AdjustReducedVals(/*IgnoreVL=*/true); 20126 continue; 20127 } 20128 // Early exit if any of the reduction values were deleted during 20129 // previous vectorization attempts. 20130 if (any_of(VL, [&V](Value *RedVal) { 20131 auto *RedValI = dyn_cast<Instruction>(RedVal); 20132 if (!RedValI) 20133 return false; 20134 return V.isDeleted(RedValI); 20135 })) 20136 break; 20137 V.buildTree(VL, IgnoreList); 20138 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { 20139 if (!AdjustReducedVals()) 20140 V.analyzedReductionVals(VL); 20141 continue; 20142 } 20143 if (V.isLoadCombineReductionCandidate(RdxKind)) { 20144 if (!AdjustReducedVals()) 20145 V.analyzedReductionVals(VL); 20146 continue; 20147 } 20148 V.reorderTopToBottom(); 20149 // No need to reorder the root node at all. 20150 V.reorderBottomToTop(/*IgnoreReorder=*/true); 20151 // Keep extracted other reduction values, if they are used in the 20152 // vectorization trees. 20153 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( 20154 ExternallyUsedValues); 20155 // The reduction root is used as the insertion point for new 20156 // instructions, so set it as externally used to prevent it from being 20157 // deleted. 20158 LocalExternallyUsedValues.insert(ReductionRoot); 20159 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { 20160 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) 20161 continue; 20162 for (Value *V : ReducedVals[Cnt]) 20163 if (isa<Instruction>(V)) 20164 LocalExternallyUsedValues.insert(TrackedVals[V]); 20165 } 20166 if (!IsSupportedHorRdxIdentityOp) { 20167 // Number of uses of the candidates in the vector of values. 
20168 assert(SameValuesCounter.empty() && 20169 "Reused values counter map is not empty"); 20170 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { 20171 if (Cnt >= Pos && Cnt < Pos + ReduxWidth) 20172 continue; 20173 Value *V = Candidates[Cnt]; 20174 Value *OrigV = TrackedToOrig.at(V); 20175 ++SameValuesCounter.try_emplace(OrigV).first->second; 20176 } 20177 } 20178 V.transformNodes(); 20179 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end()); 20180 // Gather externally used values. 20181 SmallPtrSet<Value *, 4> Visited; 20182 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { 20183 if (Cnt >= Pos && Cnt < Pos + ReduxWidth) 20184 continue; 20185 Value *RdxVal = Candidates[Cnt]; 20186 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end()) 20187 RdxVal = It->second; 20188 if (!Visited.insert(RdxVal).second) 20189 continue; 20190 // Check if the scalar was vectorized as part of the vectorization 20191 // tree but not the top node. 20192 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { 20193 LocalExternallyUsedValues.insert(RdxVal); 20194 continue; 20195 } 20196 Value *OrigV = TrackedToOrig.at(RdxVal); 20197 unsigned NumOps = 20198 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV); 20199 if (NumOps != ReducedValsToOps.at(OrigV).size()) 20200 LocalExternallyUsedValues.insert(RdxVal); 20201 } 20202 // Do not need the list of reused scalars in regular mode anymore. 20203 if (!IsSupportedHorRdxIdentityOp) 20204 SameValuesCounter.clear(); 20205 for (Value *RdxVal : VL) 20206 if (RequiredExtract.contains(RdxVal)) 20207 LocalExternallyUsedValues.insert(RdxVal); 20208 V.buildExternalUses(LocalExternallyUsedValues); 20209 20210 V.computeMinimumValueSizes(); 20211 20212 // Estimate cost. 20213 InstructionCost TreeCost = V.getTreeCost(VL); 20214 InstructionCost ReductionCost = 20215 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V); 20216 InstructionCost Cost = TreeCost + ReductionCost; 20217 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost 20218 << " for reduction\n"); 20219 if (!Cost.isValid()) 20220 break; 20221 if (Cost >= -SLPCostThreshold) { 20222 V.getORE()->emit([&]() { 20223 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", 20224 ReducedValsToOps.at(VL[0]).front()) 20225 << "Vectorizing horizontal reduction is possible " 20226 << "but not beneficial with cost " << ore::NV("Cost", Cost) 20227 << " and threshold " 20228 << ore::NV("Threshold", -SLPCostThreshold); 20229 }); 20230 if (!AdjustReducedVals()) { 20231 V.analyzedReductionVals(VL); 20232 unsigned Offset = Pos == Start ? Pos : Pos - 1; 20233 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) { 20234 // Add subvectors of VL to the list of the analyzed values. 20235 for (unsigned VF = getFloorFullVectorNumberOfElements( 20236 *TTI, VL.front()->getType(), ReduxWidth - 1); 20237 VF >= ReductionLimit; 20238 VF = getFloorFullVectorNumberOfElements( 20239 *TTI, VL.front()->getType(), VF - 1)) { 20240 if (has_single_bit(VF) && 20241 V.getCanonicalGraphSize() != V.getTreeSize()) 20242 continue; 20243 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF)) 20244 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF)); 20245 } 20246 } 20247 } 20248 continue; 20249 } 20250 20251 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" 20252 << Cost << ". 
(HorRdx)\n"); 20253 V.getORE()->emit([&]() { 20254 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", 20255 ReducedValsToOps.at(VL[0]).front()) 20256 << "Vectorized horizontal reduction with cost " 20257 << ore::NV("Cost", Cost) << " and with tree size " 20258 << ore::NV("TreeSize", V.getTreeSize()); 20259 }); 20260 20261 Builder.setFastMathFlags(RdxFMF); 20262 20263 // Emit a reduction. If the root is a select (min/max idiom), the insert 20264 // point is the compare condition of that select. 20265 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); 20266 Instruction *InsertPt = RdxRootInst; 20267 if (IsCmpSelMinMax) 20268 InsertPt = GetCmpForMinMaxReduction(RdxRootInst); 20269 20270 // Vectorize a tree. 20271 Value *VectorizedRoot = 20272 V.vectorizeTree(LocalExternallyUsedValues, InsertPt); 20273 // Update TrackedToOrig mapping, since the tracked values might be 20274 // updated. 20275 for (Value *RdxVal : Candidates) { 20276 Value *OrigVal = TrackedToOrig.at(RdxVal); 20277 Value *TransformedRdxVal = TrackedVals.at(OrigVal); 20278 if (TransformedRdxVal != RdxVal) 20279 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal); 20280 } 20281 20282 Builder.SetInsertPoint(InsertPt); 20283 20284 // To prevent poison from leaking across what used to be sequential, 20285 // safe, scalar boolean logic operations, the reduction operand must be 20286 // frozen. 20287 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC)) 20288 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); 20289 20290 // Emit code to correctly handle reused reduced values, if required. 20291 if (OptReusedScalars && !SameScaleFactor) { 20292 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V, 20293 SameValuesCounter, TrackedToOrig); 20294 } 20295 20296 Value *ReducedSubTree; 20297 Type *ScalarTy = VL.front()->getType(); 20298 if (isa<FixedVectorType>(ScalarTy)) { 20299 assert(SLPReVec && "FixedVectorType is not expected."); 20300 unsigned ScalarTyNumElements = getNumElements(ScalarTy); 20301 ReducedSubTree = PoisonValue::get(FixedVectorType::get( 20302 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements)); 20303 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) { 20304 // Do reduction for each lane. 20305 // e.g., do reduce add for 20306 // VL[0] = <4 x Ty> <a, b, c, d> 20307 // VL[1] = <4 x Ty> <e, f, g, h> 20308 // Lane[0] = <2 x Ty> <a, e> 20309 // Lane[1] = <2 x Ty> <b, f> 20310 // Lane[2] = <2 x Ty> <c, g> 20311 // Lane[3] = <2 x Ty> <d, h> 20312 // result[0] = reduce add Lane[0] 20313 // result[1] = reduce add Lane[1] 20314 // result[2] = reduce add Lane[2] 20315 // result[3] = reduce add Lane[3] 20316 SmallVector<int, 16> Mask = 20317 createStrideMask(I, ScalarTyNumElements, VL.size()); 20318 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask); 20319 ReducedSubTree = Builder.CreateInsertElement( 20320 ReducedSubTree, 20321 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I); 20322 } 20323 } else { 20324 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI, 20325 RdxRootInst->getType()); 20326 } 20327 if (ReducedSubTree->getType() != VL.front()->getType()) { 20328 assert(ReducedSubTree->getType() != VL.front()->getType() && 20329 "Expected different reduction type."); 20330 ReducedSubTree = 20331 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(), 20332 V.isSignedMinBitwidthRootNode()); 20333 } 20334 20335 // Improved analysis for add/fadd/xor reductions with same scale factor 20336 // for all operands of reductions. 
We can emit scalar ops for them 20337 // instead. 20338 if (OptReusedScalars && SameScaleFactor) 20339 ReducedSubTree = emitScaleForReusedOps( 20340 ReducedSubTree, Builder, SameValuesCounter.front().second); 20341 20342 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); 20343 // Count vectorized reduced values to exclude them from final reduction. 20344 for (Value *RdxVal : VL) { 20345 Value *OrigV = TrackedToOrig.at(RdxVal); 20346 if (IsSupportedHorRdxIdentityOp) { 20347 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV)); 20348 continue; 20349 } 20350 ++VectorizedVals.try_emplace(OrigV).first->getSecond(); 20351 if (!V.isVectorized(RdxVal)) 20352 RequiredExtract.insert(RdxVal); 20353 } 20354 Pos += ReduxWidth; 20355 Start = Pos; 20356 ReduxWidth = NumReducedVals - Pos; 20357 if (ReduxWidth > 1) 20358 ReduxWidth = GetVectorFactor(NumReducedVals - Pos); 20359 AnyVectorized = true; 20360 } 20361 if (OptReusedScalars && !AnyVectorized) { 20362 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) { 20363 Value *RdxVal = TrackedVals.at(P.first); 20364 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second); 20365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); 20366 VectorizedVals.try_emplace(P.first, P.second); 20367 } 20368 continue; 20369 } 20370 } 20371 if (VectorizedTree) { 20372 // Reorder operands of bool logical op in the natural order to avoid 20373 // possible problem with poison propagation. If not possible to reorder 20374 // (both operands are originally RHS), emit an extra freeze instruction 20375 // for the LHS operand. 20376 // I.e., if we have original code like this: 20377 // RedOp1 = select i1 ?, i1 LHS, i1 false 20378 // RedOp2 = select i1 RHS, i1 ?, i1 false 20379 20380 // Then, we swap LHS/RHS to create a new op that matches the poison 20381 // semantics of the original code. 20382 20383 // If we have original code like this and both values could be poison: 20384 // RedOp1 = select i1 ?, i1 LHS, i1 false 20385 // RedOp2 = select i1 ?, i1 RHS, i1 false 20386 20387 // Then, we must freeze LHS in the new op. 20388 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, 20389 Instruction *RedOp1, 20390 Instruction *RedOp2, 20391 bool InitStep) { 20392 if (!AnyBoolLogicOp) 20393 return; 20394 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) || 20395 getRdxOperand(RedOp1, 0) == LHS || 20396 isGuaranteedNotToBePoison(LHS, AC))) 20397 return; 20398 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || 20399 getRdxOperand(RedOp2, 0) == RHS || 20400 isGuaranteedNotToBePoison(RHS, AC))) { 20401 std::swap(LHS, RHS); 20402 return; 20403 } 20404 if (LHS != VectorizedTree) 20405 LHS = Builder.CreateFreeze(LHS); 20406 }; 20407 // Finish the reduction. 20408 // Need to add extra arguments and not vectorized possible reduction 20409 // values. 20410 // Try to avoid dependencies between the scalar remainders after 20411 // reductions. 
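// Illustrative example (hypothetical remainders): given scalar remainders
// {r0, r1, r2, r3, r4}, each FinalGen round below combines neighbours
// pairwise:
//   round 1: {r0 op r1, r2 op r3, r4}
//   round 2: {(r0 op r1) op (r2 op r3), r4}
//   round 3: the final single value
// producing a log-depth tree of scalar ops instead of one long serial chain.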
20412 auto FinalGen = 20413 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, 20414 bool InitStep) { 20415 unsigned Sz = InstVals.size(); 20416 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + 20417 Sz % 2); 20418 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { 20419 Instruction *RedOp = InstVals[I + 1].first; 20420 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); 20421 Value *RdxVal1 = InstVals[I].second; 20422 Value *StableRdxVal1 = RdxVal1; 20423 auto It1 = TrackedVals.find(RdxVal1); 20424 if (It1 != TrackedVals.end()) 20425 StableRdxVal1 = It1->second; 20426 Value *RdxVal2 = InstVals[I + 1].second; 20427 Value *StableRdxVal2 = RdxVal2; 20428 auto It2 = TrackedVals.find(RdxVal2); 20429 if (It2 != TrackedVals.end()) 20430 StableRdxVal2 = It2->second; 20431 // To prevent poison from leaking across what used to be 20432 // sequential, safe, scalar boolean logic operations, the 20433 // reduction operand must be frozen. 20434 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, 20435 RedOp, InitStep); 20436 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, 20437 StableRdxVal2, "op.rdx", ReductionOps); 20438 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); 20439 } 20440 if (Sz % 2 == 1) 20441 ExtraReds[Sz / 2] = InstVals.back(); 20442 return ExtraReds; 20443 }; 20444 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; 20445 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot), 20446 VectorizedTree); 20447 SmallPtrSet<Value *, 8> Visited; 20448 for (ArrayRef<Value *> Candidates : ReducedVals) { 20449 for (Value *RdxVal : Candidates) { 20450 if (!Visited.insert(RdxVal).second) 20451 continue; 20452 unsigned NumOps = VectorizedVals.lookup(RdxVal); 20453 for (Instruction *RedOp : 20454 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps)) 20455 ExtraReductions.emplace_back(RedOp, RdxVal); 20456 } 20457 } 20458 // Iterate through all not-vectorized reduction values/extra arguments. 20459 bool InitStep = true; 20460 while (ExtraReductions.size() > 1) { 20461 SmallVector<std::pair<Instruction *, Value *>> NewReds = 20462 FinalGen(ExtraReductions, InitStep); 20463 ExtraReductions.swap(NewReds); 20464 InitStep = false; 20465 } 20466 VectorizedTree = ExtraReductions.front().second; 20467 20468 ReductionRoot->replaceAllUsesWith(VectorizedTree); 20469 20470 // The original scalar reduction is expected to have no remaining 20471 // uses outside the reduction tree itself. Assert that we got this 20472 // correct, replace internal uses with undef, and mark for eventual 20473 // deletion. 
20474 #ifndef NDEBUG 20475 SmallSet<Value *, 4> IgnoreSet; 20476 for (ArrayRef<Value *> RdxOps : ReductionOps) 20477 IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); 20478 #endif 20479 for (ArrayRef<Value *> RdxOps : ReductionOps) { 20480 for (Value *Ignore : RdxOps) { 20481 if (!Ignore) 20482 continue; 20483 #ifndef NDEBUG 20484 for (auto *U : Ignore->users()) { 20485 assert(IgnoreSet.count(U) && 20486 "All users must be either in the reduction ops list."); 20487 } 20488 #endif 20489 if (!Ignore->use_empty()) { 20490 Value *P = PoisonValue::get(Ignore->getType()); 20491 Ignore->replaceAllUsesWith(P); 20492 } 20493 } 20494 V.removeInstructionsAndOperands(RdxOps); 20495 } 20496 } else if (!CheckForReusedReductionOps) { 20497 for (ReductionOpsType &RdxOps : ReductionOps) 20498 for (Value *RdxOp : RdxOps) 20499 V.analyzedReductionRoot(cast<Instruction>(RdxOp)); 20500 } 20501 return VectorizedTree; 20502 } 20503 20504 private: 20505 /// Calculate the cost of a reduction. 20506 InstructionCost getReductionCost(TargetTransformInfo *TTI, 20507 ArrayRef<Value *> ReducedVals, 20508 bool IsCmpSelMinMax, FastMathFlags FMF, 20509 const BoUpSLP &R) { 20510 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 20511 Type *ScalarTy = ReducedVals.front()->getType(); 20512 unsigned ReduxWidth = ReducedVals.size(); 20513 FixedVectorType *VectorTy = R.getReductionType(); 20514 InstructionCost VectorCost = 0, ScalarCost; 20515 // If all of the reduced values are constant, the vector cost is 0, since 20516 // the reduction value can be calculated at the compile time. 20517 bool AllConsts = allConstant(ReducedVals); 20518 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) { 20519 InstructionCost Cost = 0; 20520 // Scalar cost is repeated for N-1 elements. 20521 int Cnt = ReducedVals.size(); 20522 for (Value *RdxVal : ReducedVals) { 20523 if (Cnt == 1) 20524 break; 20525 --Cnt; 20526 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 
3 : 2)) { 20527 Cost += GenCostFn(); 20528 continue; 20529 } 20530 InstructionCost ScalarCost = 0; 20531 for (User *U : RdxVal->users()) { 20532 auto *RdxOp = cast<Instruction>(U); 20533 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { 20534 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); 20535 continue; 20536 } 20537 ScalarCost = InstructionCost::getInvalid(); 20538 break; 20539 } 20540 if (ScalarCost.isValid()) 20541 Cost += ScalarCost; 20542 else 20543 Cost += GenCostFn(); 20544 } 20545 return Cost; 20546 }; 20547 switch (RdxKind) { 20548 case RecurKind::Add: 20549 case RecurKind::Mul: 20550 case RecurKind::Or: 20551 case RecurKind::And: 20552 case RecurKind::Xor: 20553 case RecurKind::FAdd: 20554 case RecurKind::FMul: { 20555 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); 20556 if (!AllConsts) { 20557 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) { 20558 assert(SLPReVec && "FixedVectorType is not expected."); 20559 unsigned ScalarTyNumElements = VecTy->getNumElements(); 20560 for (unsigned I : seq<unsigned>(ReducedVals.size())) { 20561 VectorCost += TTI->getShuffleCost( 20562 TTI::SK_PermuteSingleSrc, VectorTy, 20563 createStrideMask(I, ScalarTyNumElements, ReducedVals.size())); 20564 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF, 20565 CostKind); 20566 } 20567 VectorCost += TTI->getScalarizationOverhead( 20568 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true, 20569 /*Extract*/ false, TTI::TCK_RecipThroughput); 20570 } else { 20571 Type *RedTy = VectorTy->getElementType(); 20572 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( 20573 std::make_pair(RedTy, true)); 20574 if (RType == RedTy) { 20575 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, 20576 FMF, CostKind); 20577 } else { 20578 VectorCost = TTI->getExtendedReductionCost( 20579 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth), 20580 FMF, CostKind); 20581 } 20582 } 20583 } 20584 ScalarCost = EvaluateScalarCost([&]() { 20585 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); 20586 }); 20587 break; 20588 } 20589 case RecurKind::FMax: 20590 case RecurKind::FMin: 20591 case RecurKind::FMaximum: 20592 case RecurKind::FMinimum: 20593 case RecurKind::SMax: 20594 case RecurKind::SMin: 20595 case RecurKind::UMax: 20596 case RecurKind::UMin: { 20597 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 20598 if (!AllConsts) 20599 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind); 20600 ScalarCost = EvaluateScalarCost([&]() { 20601 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF); 20602 return TTI->getIntrinsicInstrCost(ICA, CostKind); 20603 }); 20604 break; 20605 } 20606 default: 20607 llvm_unreachable("Expected arithmetic or min/max reduction operation"); 20608 } 20609 20610 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost 20611 << " for reduction of " << shortBundleName(ReducedVals) 20612 << " (It is a splitting reduction)\n"); 20613 return VectorCost - ScalarCost; 20614 } 20615 20616 /// Emit a horizontal reduction of the vectorized value. 
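  /// For most reduction kinds this lowers to a single reduction intrinsic,
  /// e.g. (illustrative):
  ///   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  /// The special case of an add reduction of i1 values that is widened to a
  /// larger scalar type is emitted as ctpop of the bitcast mask instead, as
  /// handled below.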
20617 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder, 20618 const TargetTransformInfo *TTI, Type *DestTy) { 20619 assert(VectorizedValue && "Need to have a vectorized tree node"); 20620 assert(RdxKind != RecurKind::FMulAdd && 20621 "A call to the llvm.fmuladd intrinsic is not handled yet"); 20622 20623 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType()); 20624 if (FTy->getScalarType() == Builder.getInt1Ty() && 20625 RdxKind == RecurKind::Add && 20626 DestTy->getScalarType() != FTy->getScalarType()) { 20627 // Convert vector_reduce_add(ZExt(<n x i1>)) to 20628 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)). 20629 Value *V = Builder.CreateBitCast( 20630 VectorizedValue, Builder.getIntNTy(FTy->getNumElements())); 20631 ++NumVectorInstructions; 20632 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V); 20633 } 20634 ++NumVectorInstructions; 20635 return createSimpleReduction(Builder, VectorizedValue, RdxKind); 20636 } 20637 20638 /// Emits optimized code for unique scalar value reused \p Cnt times. 20639 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, 20640 unsigned Cnt) { 20641 assert(IsSupportedHorRdxIdentityOp && 20642 "The optimization of matched scalar identity horizontal reductions " 20643 "must be supported."); 20644 if (Cnt == 1) 20645 return VectorizedValue; 20646 switch (RdxKind) { 20647 case RecurKind::Add: { 20648 // res = mul vv, n 20649 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt); 20650 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " 20651 << VectorizedValue << ". (HorRdx)\n"); 20652 return Builder.CreateMul(VectorizedValue, Scale); 20653 } 20654 case RecurKind::Xor: { 20655 // res = n % 2 ? 0 : vv 20656 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue 20657 << ". (HorRdx)\n"); 20658 if (Cnt % 2 == 0) 20659 return Constant::getNullValue(VectorizedValue->getType()); 20660 return VectorizedValue; 20661 } 20662 case RecurKind::FAdd: { 20663 // res = fmul v, n 20664 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt); 20665 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " 20666 << VectorizedValue << ". (HorRdx)\n"); 20667 return Builder.CreateFMul(VectorizedValue, Scale); 20668 } 20669 case RecurKind::And: 20670 case RecurKind::Or: 20671 case RecurKind::SMax: 20672 case RecurKind::SMin: 20673 case RecurKind::UMax: 20674 case RecurKind::UMin: 20675 case RecurKind::FMax: 20676 case RecurKind::FMin: 20677 case RecurKind::FMaximum: 20678 case RecurKind::FMinimum: 20679 // res = vv 20680 return VectorizedValue; 20681 case RecurKind::Mul: 20682 case RecurKind::FMul: 20683 case RecurKind::FMulAdd: 20684 case RecurKind::IAnyOf: 20685 case RecurKind::FAnyOf: 20686 case RecurKind::IFindLastIV: 20687 case RecurKind::FFindLastIV: 20688 case RecurKind::None: 20689 llvm_unreachable("Unexpected reduction kind for repeated scalar."); 20690 } 20691 return nullptr; 20692 } 20693 20694 /// Emits actual operation for the scalar identity values, found during 20695 /// horizontal reduction analysis. 
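  /// E.g. (illustrative), for an add reduction in which the scalar feeding
  /// lane 2 was matched n times, the vectorized value is multiplied lane-wise
  /// by the constant vector <1, 1, n, 1> before the final reduction, as the
  /// per-kind cases below show.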
20696 Value * 20697 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R, 20698 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter, 20699 const DenseMap<Value *, Value *> &TrackedToOrig) { 20700 assert(IsSupportedHorRdxIdentityOp && 20701 "The optimization of matched scalar identity horizontal reductions " 20702 "must be supported."); 20703 ArrayRef<Value *> VL = R.getRootNodeScalars(); 20704 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType()); 20705 if (VTy->getElementType() != VL.front()->getType()) { 20706 VectorizedValue = Builder.CreateIntCast( 20707 VectorizedValue, 20708 getWidenedType(VL.front()->getType(), VTy->getNumElements()), 20709 R.isSignedMinBitwidthRootNode()); 20710 } 20711 switch (RdxKind) { 20712 case RecurKind::Add: { 20713 // root = mul prev_root, <1, 1, n, 1> 20714 SmallVector<Constant *> Vals; 20715 for (Value *V : VL) { 20716 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); 20717 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); 20718 } 20719 auto *Scale = ConstantVector::get(Vals); 20720 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of " 20721 << VectorizedValue << ". (HorRdx)\n"); 20722 return Builder.CreateMul(VectorizedValue, Scale); 20723 } 20724 case RecurKind::And: 20725 case RecurKind::Or: 20726 // No need for multiple or/and(s). 20727 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue 20728 << ". (HorRdx)\n"); 20729 return VectorizedValue; 20730 case RecurKind::SMax: 20731 case RecurKind::SMin: 20732 case RecurKind::UMax: 20733 case RecurKind::UMin: 20734 case RecurKind::FMax: 20735 case RecurKind::FMin: 20736 case RecurKind::FMaximum: 20737 case RecurKind::FMinimum: 20738 // No need for multiple min/max(s) of the same value. 20739 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue 20740 << ". (HorRdx)\n"); 20741 return VectorizedValue; 20742 case RecurKind::Xor: { 20743 // Replace values with even number of repeats with 0, since 20744 // x xor x = 0. 20745 // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6, 20746 // 7>, if elements 4th and 6th elements have even number of repeats. 20747 SmallVector<int> Mask( 20748 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(), 20749 PoisonMaskElem); 20750 std::iota(Mask.begin(), Mask.end(), 0); 20751 bool NeedShuffle = false; 20752 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { 20753 Value *V = VL[I]; 20754 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); 20755 if (Cnt % 2 == 0) { 20756 Mask[I] = VF; 20757 NeedShuffle = true; 20758 } 20759 } 20760 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I 20761 : Mask) dbgs() 20762 << I << " "; 20763 dbgs() << "> of " << VectorizedValue << ". 
(HorRdx)\n"); 20764 if (NeedShuffle) 20765 VectorizedValue = Builder.CreateShuffleVector( 20766 VectorizedValue, 20767 ConstantVector::getNullValue(VectorizedValue->getType()), Mask); 20768 return VectorizedValue; 20769 } 20770 case RecurKind::FAdd: { 20771 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> 20772 SmallVector<Constant *> Vals; 20773 for (Value *V : VL) { 20774 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); 20775 Vals.push_back(ConstantFP::get(V->getType(), Cnt)); 20776 } 20777 auto *Scale = ConstantVector::get(Vals); 20778 return Builder.CreateFMul(VectorizedValue, Scale); 20779 } 20780 case RecurKind::Mul: 20781 case RecurKind::FMul: 20782 case RecurKind::FMulAdd: 20783 case RecurKind::IAnyOf: 20784 case RecurKind::FAnyOf: 20785 case RecurKind::IFindLastIV: 20786 case RecurKind::FFindLastIV: 20787 case RecurKind::None: 20788 llvm_unreachable("Unexpected reduction kind for reused scalars."); 20789 } 20790 return nullptr; 20791 } 20792 }; 20793 } // end anonymous namespace 20794 20795 /// Gets recurrence kind from the specified value. 20796 static RecurKind getRdxKind(Value *V) { 20797 return HorizontalReduction::getRdxKind(V); 20798 } 20799 static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) { 20800 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) 20801 return cast<FixedVectorType>(IE->getType())->getNumElements(); 20802 20803 unsigned AggregateSize = 1; 20804 auto *IV = cast<InsertValueInst>(InsertInst); 20805 Type *CurrentType = IV->getType(); 20806 do { 20807 if (auto *ST = dyn_cast<StructType>(CurrentType)) { 20808 for (auto *Elt : ST->elements()) 20809 if (Elt != ST->getElementType(0)) // check homogeneity 20810 return std::nullopt; 20811 AggregateSize *= ST->getNumElements(); 20812 CurrentType = ST->getElementType(0); 20813 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { 20814 AggregateSize *= AT->getNumElements(); 20815 CurrentType = AT->getElementType(); 20816 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { 20817 AggregateSize *= VT->getNumElements(); 20818 return AggregateSize; 20819 } else if (CurrentType->isSingleValueType()) { 20820 return AggregateSize; 20821 } else { 20822 return std::nullopt; 20823 } 20824 } while (true); 20825 } 20826 20827 static void findBuildAggregate_rec(Instruction *LastInsertInst, 20828 TargetTransformInfo *TTI, 20829 SmallVectorImpl<Value *> &BuildVectorOpds, 20830 SmallVectorImpl<Value *> &InsertElts, 20831 unsigned OperandOffset, const BoUpSLP &R) { 20832 do { 20833 Value *InsertedOperand = LastInsertInst->getOperand(1); 20834 std::optional<unsigned> OperandIndex = 20835 getElementIndex(LastInsertInst, OperandOffset); 20836 if (!OperandIndex || R.isDeleted(LastInsertInst)) 20837 return; 20838 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) { 20839 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, 20840 BuildVectorOpds, InsertElts, *OperandIndex, R); 20841 20842 } else { 20843 BuildVectorOpds[*OperandIndex] = InsertedOperand; 20844 InsertElts[*OperandIndex] = LastInsertInst; 20845 } 20846 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); 20847 } while (LastInsertInst != nullptr && 20848 isa<InsertValueInst, InsertElementInst>(LastInsertInst) && 20849 LastInsertInst->hasOneUse()); 20850 } 20851 20852 /// Recognize construction of vectors like 20853 /// %ra = insertelement <4 x float> poison, float %s0, i32 0 20854 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 20855 /// %rc = insertelement <4 x float> %rb, float 
%s2, i32 2 20856 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 20857 /// starting from the last insertelement or insertvalue instruction. 20858 /// 20859 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, 20860 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. 20861 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. 20862 /// 20863 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. 20864 /// 20865 /// \return true if it matches. 20866 static bool findBuildAggregate(Instruction *LastInsertInst, 20867 TargetTransformInfo *TTI, 20868 SmallVectorImpl<Value *> &BuildVectorOpds, 20869 SmallVectorImpl<Value *> &InsertElts, 20870 const BoUpSLP &R) { 20871 20872 assert((isa<InsertElementInst>(LastInsertInst) || 20873 isa<InsertValueInst>(LastInsertInst)) && 20874 "Expected insertelement or insertvalue instruction!"); 20875 20876 assert((BuildVectorOpds.empty() && InsertElts.empty()) && 20877 "Expected empty result vectors!"); 20878 20879 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); 20880 if (!AggregateSize) 20881 return false; 20882 BuildVectorOpds.resize(*AggregateSize); 20883 InsertElts.resize(*AggregateSize); 20884 20885 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, 20886 R); 20887 llvm::erase(BuildVectorOpds, nullptr); 20888 llvm::erase(InsertElts, nullptr); 20889 if (BuildVectorOpds.size() >= 2) 20890 return true; 20891 20892 return false; 20893 } 20894 20895 /// Try and get a reduction instruction from a phi node. 20896 /// 20897 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions 20898 /// if they come from either \p ParentBB or a containing loop latch. 20899 /// 20900 /// \returns A candidate reduction value if possible, or \code nullptr \endcode 20901 /// if not possible. 20902 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P, 20903 BasicBlock *ParentBB, LoopInfo *LI) { 20904 // There are situations where the reduction value is not dominated by the 20905 // reduction phi. Vectorizing such cases has been reported to cause 20906 // miscompiles. See PR25787. 20907 auto DominatedReduxValue = [&](Value *R) { 20908 return isa<Instruction>(R) && 20909 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent()); 20910 }; 20911 20912 Instruction *Rdx = nullptr; 20913 20914 // Return the incoming value if it comes from the same BB as the phi node. 20915 if (P->getIncomingBlock(0) == ParentBB) { 20916 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); 20917 } else if (P->getIncomingBlock(1) == ParentBB) { 20918 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); 20919 } 20920 20921 if (Rdx && DominatedReduxValue(Rdx)) 20922 return Rdx; 20923 20924 // Otherwise, check whether we have a loop latch to look at. 20925 Loop *BBL = LI->getLoopFor(ParentBB); 20926 if (!BBL) 20927 return nullptr; 20928 BasicBlock *BBLatch = BBL->getLoopLatch(); 20929 if (!BBLatch) 20930 return nullptr; 20931 20932 // There is a loop latch, return the incoming value if it comes from 20933 // that. This reduction pattern occasionally turns up. 
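  // E.g. (illustrative), a loop-carried sum whose reduced value reaches the
  // header phi through the latch:
  //   header: %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
  //   latch:  %sum.next = add i32 %sum, %x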
20934 if (P->getIncomingBlock(0) == BBLatch) { 20935 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); 20936 } else if (P->getIncomingBlock(1) == BBLatch) { 20937 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); 20938 } 20939 20940 if (Rdx && DominatedReduxValue(Rdx)) 20941 return Rdx; 20942 20943 return nullptr; 20944 } 20945 20946 static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { 20947 if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) 20948 return true; 20949 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) 20950 return true; 20951 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) 20952 return true; 20953 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1)))) 20954 return true; 20955 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1)))) 20956 return true; 20957 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1)))) 20958 return true; 20959 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1)))) 20960 return true; 20961 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1)))) 20962 return true; 20963 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1)))) 20964 return true; 20965 return false; 20966 } 20967 20968 /// We could have an initial reduction that is not an add. 20969 /// r *= v1 + v2 + v3 + v4 20970 /// In such a case start looking for a tree rooted in the first '+'. 20971 /// \Returns the new root if found, which may be nullptr if not an instruction. 20972 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi, 20973 Instruction *Root) { 20974 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) || 20975 isa<IntrinsicInst>(Root)) && 20976 "Expected binop, select, or intrinsic for reduction matching"); 20977 Value *LHS = 20978 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root)); 20979 Value *RHS = 20980 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1); 20981 if (LHS == Phi) 20982 return dyn_cast<Instruction>(RHS); 20983 if (RHS == Phi) 20984 return dyn_cast<Instruction>(LHS); 20985 return nullptr; 20986 } 20987 20988 /// \p Returns the first operand of \p I that does not match \p Phi. If 20989 /// operand is not an instruction it returns nullptr. 20990 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) { 20991 Value *Op0 = nullptr; 20992 Value *Op1 = nullptr; 20993 if (!matchRdxBop(I, Op0, Op1)) 20994 return nullptr; 20995 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0); 20996 } 20997 20998 /// \Returns true if \p I is a candidate instruction for reduction vectorization. 20999 static bool isReductionCandidate(Instruction *I) { 21000 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value())); 21001 Value *B0 = nullptr, *B1 = nullptr; 21002 bool IsBinop = matchRdxBop(I, B0, B1); 21003 return IsBinop || IsSelect; 21004 } 21005 21006 bool SLPVectorizerPass::vectorizeHorReduction( 21007 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, 21008 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) { 21009 if (!ShouldVectorizeHor) 21010 return false; 21011 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root); 21012 21013 if (Root->getParent() != BB || isa<PHINode>(Root)) 21014 return false; 21015 21016 // If we can find a secondary reduction root, use that instead. 
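  // I.e., for the `r *= v1 + v2 + v3 + v4` pattern documented above, seed the
  // reduction match at the first '+' rather than at the binary operator that
  // feeds the phi.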
21017 auto SelectRoot = [&]() { 21018 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) && 21019 HorizontalReduction::getRdxKind(Root) != RecurKind::None) 21020 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root)) 21021 return NewRoot; 21022 return Root; 21023 }; 21024 21025 // Start analysis starting from Root instruction. If horizontal reduction is 21026 // found, try to vectorize it. If it is not a horizontal reduction or 21027 // vectorization is not possible or not effective, and currently analyzed 21028 // instruction is a binary operation, try to vectorize the operands, using 21029 // pre-order DFS traversal order. If the operands were not vectorized, repeat 21030 // the same procedure considering each operand as a possible root of the 21031 // horizontal reduction. 21032 // Interrupt the process if the Root instruction itself was vectorized or all 21033 // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 21034 // If a horizintal reduction was not matched or vectorized we collect 21035 // instructions for possible later attempts for vectorization. 21036 std::queue<std::pair<Instruction *, unsigned>> Stack; 21037 Stack.emplace(SelectRoot(), 0); 21038 SmallPtrSet<Value *, 8> VisitedInstrs; 21039 bool Res = false; 21040 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { 21041 if (R.isAnalyzedReductionRoot(Inst)) 21042 return nullptr; 21043 if (!isReductionCandidate(Inst)) 21044 return nullptr; 21045 HorizontalReduction HorRdx; 21046 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) 21047 return nullptr; 21048 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); 21049 }; 21050 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { 21051 if (TryOperandsAsNewSeeds && FutureSeed == Root) { 21052 FutureSeed = getNonPhiOperand(Root, P); 21053 if (!FutureSeed) 21054 return false; 21055 } 21056 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their 21057 // analysis is done separately. 21058 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed)) 21059 PostponedInsts.push_back(FutureSeed); 21060 return true; 21061 }; 21062 21063 while (!Stack.empty()) { 21064 Instruction *Inst; 21065 unsigned Level; 21066 std::tie(Inst, Level) = Stack.front(); 21067 Stack.pop(); 21068 // Do not try to analyze instruction that has already been vectorized. 21069 // This may happen when we vectorize instruction operands on a previous 21070 // iteration while stack was populated before that happened. 21071 if (R.isDeleted(Inst)) 21072 continue; 21073 if (Value *VectorizedV = TryToReduce(Inst)) { 21074 Res = true; 21075 if (auto *I = dyn_cast<Instruction>(VectorizedV)) { 21076 // Try to find another reduction. 21077 Stack.emplace(I, Level); 21078 continue; 21079 } 21080 if (R.isDeleted(Inst)) 21081 continue; 21082 } else { 21083 // We could not vectorize `Inst` so try to use it as a future seed. 21084 if (!TryAppendToPostponedInsts(Inst)) { 21085 assert(Stack.empty() && "Expected empty stack"); 21086 break; 21087 } 21088 } 21089 21090 // Try to vectorize operands. 21091 // Continue analysis for the instruction from the same basic block only to 21092 // save compile time. 21093 if (++Level < RecursionMaxDepth) 21094 for (auto *Op : Inst->operand_values()) 21095 if (VisitedInstrs.insert(Op).second) 21096 if (auto *I = dyn_cast<Instruction>(Op)) 21097 // Do not try to vectorize CmpInst operands, this is done 21098 // separately. 
21099 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) && 21100 !R.isDeleted(I) && I->getParent() == BB) 21101 Stack.emplace(I, Level); 21102 } 21103 return Res; 21104 } 21105 21106 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, 21107 BasicBlock *BB, BoUpSLP &R) { 21108 SmallVector<WeakTrackingVH> PostponedInsts; 21109 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts); 21110 Res |= tryToVectorize(PostponedInsts, R); 21111 return Res; 21112 } 21113 21114 bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, 21115 BoUpSLP &R) { 21116 bool Res = false; 21117 for (Value *V : Insts) 21118 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst)) 21119 Res |= tryToVectorize(Inst, R); 21120 return Res; 21121 } 21122 21123 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, 21124 BasicBlock *BB, BoUpSLP &R, 21125 bool MaxVFOnly) { 21126 if (!R.canMapToVector(IVI->getType())) 21127 return false; 21128 21129 SmallVector<Value *, 16> BuildVectorOpds; 21130 SmallVector<Value *, 16> BuildVectorInsts; 21131 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R)) 21132 return false; 21133 21134 if (MaxVFOnly && BuildVectorOpds.size() == 2) { 21135 R.getORE()->emit([&]() { 21136 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI) 21137 << "Cannot SLP vectorize list: only 2 elements of buildvalue, " 21138 "trying reduction first."; 21139 }); 21140 return false; 21141 } 21142 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); 21143 // Aggregate value is unlikely to be processed in vector register. 21144 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly); 21145 } 21146 21147 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, 21148 BasicBlock *BB, BoUpSLP &R, 21149 bool MaxVFOnly) { 21150 SmallVector<Value *, 16> BuildVectorInsts; 21151 SmallVector<Value *, 16> BuildVectorOpds; 21152 SmallVector<int> Mask; 21153 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) || 21154 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) && 21155 isFixedVectorShuffle(BuildVectorOpds, Mask, AC))) 21156 return false; 21157 21158 if (MaxVFOnly && BuildVectorInsts.size() == 2) { 21159 R.getORE()->emit([&]() { 21160 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI) 21161 << "Cannot SLP vectorize list: only 2 elements of buildvector, " 21162 "trying reduction first."; 21163 }); 21164 return false; 21165 } 21166 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); 21167 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly); 21168 } 21169 21170 template <typename T> 21171 static bool tryToVectorizeSequence( 21172 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, 21173 function_ref<bool(T *, T *)> AreCompatible, 21174 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, 21175 bool MaxVFOnly, BoUpSLP &R) { 21176 bool Changed = false; 21177 // Sort by type, parent, operands. 21178 stable_sort(Incoming, Comparator); 21179 21180 // Try to vectorize elements base on their type. 21181 SmallVector<T *> Candidates; 21182 SmallVector<T *> VL; 21183 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E; 21184 VL.clear()) { 21185 // Look for the next elements with the same type, parent and operand 21186 // kinds. 
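    // The loop below advances SameTypeIt over the run of entries that are
    // either dead/non-instructions or AreCompatible with *IncIt, collecting
    // the still-alive instructions of that run into VL.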
21187 auto *I = dyn_cast<Instruction>(*IncIt); 21188 if (!I || R.isDeleted(I)) { 21189 ++IncIt; 21190 continue; 21191 } 21192 auto *SameTypeIt = IncIt; 21193 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || 21194 R.isDeleted(cast<Instruction>(*SameTypeIt)) || 21195 AreCompatible(*SameTypeIt, *IncIt))) { 21196 auto *I = dyn_cast<Instruction>(*SameTypeIt); 21197 ++SameTypeIt; 21198 if (I && !R.isDeleted(I)) 21199 VL.push_back(cast<T>(I)); 21200 } 21201 21202 // Try to vectorize them. 21203 unsigned NumElts = VL.size(); 21204 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" 21205 << NumElts << ")\n"); 21206 // The vectorization is a 3-state attempt: 21207 // 1. Try to vectorize instructions with the same/alternate opcodes with the 21208 // size of maximal register at first. 21209 // 2. Try to vectorize remaining instructions with the same type, if 21210 // possible. This may result in the better vectorization results rather than 21211 // if we try just to vectorize instructions with the same/alternate opcodes. 21212 // 3. Final attempt to try to vectorize all instructions with the 21213 // same/alternate ops only, this may result in some extra final 21214 // vectorization. 21215 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) { 21216 // Success start over because instructions might have been changed. 21217 Changed = true; 21218 VL.swap(Candidates); 21219 Candidates.clear(); 21220 for (T *V : VL) { 21221 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) 21222 Candidates.push_back(V); 21223 } 21224 } else { 21225 /// \Returns the minimum number of elements that we will attempt to 21226 /// vectorize. 21227 auto GetMinNumElements = [&R](Value *V) { 21228 unsigned EltSize = R.getVectorElementSize(V); 21229 return std::max(2U, R.getMaxVecRegSize() / EltSize); 21230 }; 21231 if (NumElts < GetMinNumElements(*IncIt) && 21232 (Candidates.empty() || 21233 Candidates.front()->getType() == (*IncIt)->getType())) { 21234 for (T *V : VL) { 21235 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) 21236 Candidates.push_back(V); 21237 } 21238 } 21239 } 21240 // Final attempt to vectorize instructions with the same types. 21241 if (Candidates.size() > 1 && 21242 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { 21243 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) { 21244 // Success start over because instructions might have been changed. 21245 Changed = true; 21246 } else if (MaxVFOnly) { 21247 // Try to vectorize using small vectors. 21248 SmallVector<T *> VL; 21249 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End; 21250 VL.clear()) { 21251 auto *I = dyn_cast<Instruction>(*It); 21252 if (!I || R.isDeleted(I)) { 21253 ++It; 21254 continue; 21255 } 21256 auto *SameTypeIt = It; 21257 while (SameTypeIt != End && 21258 (!isa<Instruction>(*SameTypeIt) || 21259 R.isDeleted(cast<Instruction>(*SameTypeIt)) || 21260 AreCompatible(*SameTypeIt, *It))) { 21261 auto *I = dyn_cast<Instruction>(*SameTypeIt); 21262 ++SameTypeIt; 21263 if (I && !R.isDeleted(I)) 21264 VL.push_back(cast<T>(I)); 21265 } 21266 unsigned NumElts = VL.size(); 21267 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), 21268 /*MaxVFOnly=*/false)) 21269 Changed = true; 21270 It = SameTypeIt; 21271 } 21272 } 21273 Candidates.clear(); 21274 } 21275 21276 // Start over at the next instruction of a different type (or the end). 
21277 IncIt = SameTypeIt;
21278 }
21279 return Changed;
21280 }
21281 
21282 /// Compare two cmp instructions. If IsCompatibility is true, the function returns
21283 /// true if 2 cmps have same/swapped predicates and most compatible corresponding
21284 /// operands. If IsCompatibility is false, the function implements a strict weak
21285 /// ordering relation between two cmp instructions, returning true if the first
21286 /// instruction is "less" than the second, i.e. its predicate is less than the
21287 /// predicate of the second or its operand IDs are less than the operand IDs
21288 /// of the second cmp instruction.
21289 template <bool IsCompatibility>
21290 static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21291 const DominatorTree &DT) {
21292 assert(isValidElementType(V->getType()) &&
21293 isValidElementType(V2->getType()) &&
21294 "Expected valid element types only.");
21295 if (V == V2)
21296 return IsCompatibility;
21297 auto *CI1 = cast<CmpInst>(V);
21298 auto *CI2 = cast<CmpInst>(V2);
21299 if (CI1->getOperand(0)->getType()->getTypeID() <
21300 CI2->getOperand(0)->getType()->getTypeID())
21301 return !IsCompatibility;
21302 if (CI1->getOperand(0)->getType()->getTypeID() >
21303 CI2->getOperand(0)->getType()->getTypeID())
21304 return false;
21305 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21306 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21307 return !IsCompatibility;
21308 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21309 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21310 return false;
21311 CmpInst::Predicate Pred1 = CI1->getPredicate();
21312 CmpInst::Predicate Pred2 = CI2->getPredicate();
21313 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21314 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21315 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21316 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21317 if (BasePred1 < BasePred2)
21318 return !IsCompatibility;
21319 if (BasePred1 > BasePred2)
21320 return false;
21321 // Compare operands.
21322 bool CI1Preds = Pred1 == BasePred1;
21323 bool CI2Preds = Pred2 == BasePred1;
21324 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21325 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21326 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21327 if (Op1 == Op2)
21328 continue;
21329 if (Op1->getValueID() < Op2->getValueID())
21330 return !IsCompatibility;
21331 if (Op1->getValueID() > Op2->getValueID())
21332 return false;
21333 if (auto *I1 = dyn_cast<Instruction>(Op1))
21334 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21335 if (IsCompatibility) {
21336 if (I1->getParent() != I2->getParent())
21337 return false;
21338 } else {
21339 // Try to compare nodes with same parent.
21340 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent()); 21341 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent()); 21342 if (!NodeI1) 21343 return NodeI2 != nullptr; 21344 if (!NodeI2) 21345 return false; 21346 assert((NodeI1 == NodeI2) == 21347 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 21348 "Different nodes should have different DFS numbers"); 21349 if (NodeI1 != NodeI2) 21350 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 21351 } 21352 InstructionsState S = getSameOpcode({I1, I2}, TLI); 21353 if (S && (IsCompatibility || !S.isAltShuffle())) 21354 continue; 21355 if (IsCompatibility) 21356 return false; 21357 if (I1->getOpcode() != I2->getOpcode()) 21358 return I1->getOpcode() < I2->getOpcode(); 21359 } 21360 } 21361 return IsCompatibility; 21362 } 21363 21364 template <typename ItT> 21365 bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, 21366 BasicBlock *BB, BoUpSLP &R) { 21367 bool Changed = false; 21368 // Try to find reductions first. 21369 for (CmpInst *I : CmpInsts) { 21370 if (R.isDeleted(I)) 21371 continue; 21372 for (Value *Op : I->operands()) 21373 if (auto *RootOp = dyn_cast<Instruction>(Op)) { 21374 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R); 21375 if (R.isDeleted(I)) 21376 break; 21377 } 21378 } 21379 // Try to vectorize operands as vector bundles. 21380 for (CmpInst *I : CmpInsts) { 21381 if (R.isDeleted(I)) 21382 continue; 21383 Changed |= tryToVectorize(I, R); 21384 } 21385 // Try to vectorize list of compares. 21386 // Sort by type, compare predicate, etc. 21387 auto CompareSorter = [&](Value *V, Value *V2) { 21388 if (V == V2) 21389 return false; 21390 return compareCmp<false>(V, V2, *TLI, *DT); 21391 }; 21392 21393 auto AreCompatibleCompares = [&](Value *V1, Value *V2) { 21394 if (V1 == V2) 21395 return true; 21396 return compareCmp<true>(V1, V2, *TLI, *DT); 21397 }; 21398 21399 SmallVector<Value *> Vals; 21400 for (Instruction *V : CmpInsts) 21401 if (!R.isDeleted(V) && isValidElementType(getValueType(V))) 21402 Vals.push_back(V); 21403 if (Vals.size() <= 1) 21404 return Changed; 21405 Changed |= tryToVectorizeSequence<Value>( 21406 Vals, CompareSorter, AreCompatibleCompares, 21407 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { 21408 // Exclude possible reductions from other blocks. 21409 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) { 21410 return any_of(V->users(), [V](User *U) { 21411 auto *Select = dyn_cast<SelectInst>(U); 21412 return Select && 21413 Select->getParent() != cast<Instruction>(V)->getParent(); 21414 }); 21415 }); 21416 if (ArePossiblyReducedInOtherBlock) 21417 return false; 21418 return tryToVectorizeList(Candidates, R, MaxVFOnly); 21419 }, 21420 /*MaxVFOnly=*/true, R); 21421 return Changed; 21422 } 21423 21424 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, 21425 BasicBlock *BB, BoUpSLP &R) { 21426 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) && 21427 "This function only accepts Insert instructions"); 21428 bool OpsChanged = false; 21429 SmallVector<WeakTrackingVH> PostponedInsts; 21430 for (auto *I : reverse(Instructions)) { 21431 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only. 
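    // (pass2 below retries horizontal reductions rooted at the same
    // instruction, and pass3 repeats the buildvector match without the MaxVF
    // restriction.)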
21432 if (R.isDeleted(I) || isa<CmpInst>(I)) 21433 continue; 21434 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { 21435 OpsChanged |= 21436 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true); 21437 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { 21438 OpsChanged |= 21439 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true); 21440 } 21441 // pass2 - try to vectorize reductions only 21442 if (R.isDeleted(I)) 21443 continue; 21444 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts); 21445 if (R.isDeleted(I) || isa<CmpInst>(I)) 21446 continue; 21447 // pass3 - try to match and vectorize a buildvector sequence. 21448 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { 21449 OpsChanged |= 21450 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false); 21451 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { 21452 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R, 21453 /*MaxVFOnly=*/false); 21454 } 21455 } 21456 // Now try to vectorize postponed instructions. 21457 OpsChanged |= tryToVectorize(PostponedInsts, R); 21458 21459 Instructions.clear(); 21460 return OpsChanged; 21461 } 21462 21463 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { 21464 bool Changed = false; 21465 SmallVector<Value *, 4> Incoming; 21466 SmallPtrSet<Value *, 16> VisitedInstrs; 21467 // Maps phi nodes to the non-phi nodes found in the use tree for each phi 21468 // node. Allows better to identify the chains that can be vectorized in the 21469 // better way. 21470 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; 21471 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { 21472 assert(isValidElementType(V1->getType()) && 21473 isValidElementType(V2->getType()) && 21474 "Expected vectorizable types only."); 21475 // It is fine to compare type IDs here, since we expect only vectorizable 21476 // types, like ints, floats and pointers, we don't care about other type. 21477 if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) 21478 return true; 21479 if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) 21480 return false; 21481 if (V1->getType()->getScalarSizeInBits() < 21482 V2->getType()->getScalarSizeInBits()) 21483 return true; 21484 if (V1->getType()->getScalarSizeInBits() > 21485 V2->getType()->getScalarSizeInBits()) 21486 return false; 21487 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 21488 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 21489 if (Opcodes1.size() < Opcodes2.size()) 21490 return true; 21491 if (Opcodes1.size() > Opcodes2.size()) 21492 return false; 21493 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 21494 { 21495 // Instructions come first. 
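        // The overall order is: instructions (by dominance DFS number, then
        // opcode), then non-undef constants, then other non-undef values (by
        // value ID), with undefs last.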
21496 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]); 21497 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]); 21498 if (I1 && I2) { 21499 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); 21500 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); 21501 if (!NodeI1) 21502 return NodeI2 != nullptr; 21503 if (!NodeI2) 21504 return false; 21505 assert((NodeI1 == NodeI2) == 21506 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 21507 "Different nodes should have different DFS numbers"); 21508 if (NodeI1 != NodeI2) 21509 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 21510 InstructionsState S = getSameOpcode({I1, I2}, *TLI); 21511 if (S && !S.isAltShuffle()) 21512 continue; 21513 return I1->getOpcode() < I2->getOpcode(); 21514 } 21515 if (I1) 21516 return true; 21517 if (I2) 21518 return false; 21519 } 21520 { 21521 // Non-undef constants come next. 21522 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]); 21523 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]); 21524 if (C1 && C2) 21525 continue; 21526 if (C1) 21527 return true; 21528 if (C2) 21529 return false; 21530 } 21531 bool U1 = isa<UndefValue>(Opcodes1[I]); 21532 bool U2 = isa<UndefValue>(Opcodes2[I]); 21533 { 21534 // Non-constant non-instructions come next. 21535 if (!U1 && !U2) { 21536 auto ValID1 = Opcodes1[I]->getValueID(); 21537 auto ValID2 = Opcodes2[I]->getValueID(); 21538 if (ValID1 == ValID2) 21539 continue; 21540 if (ValID1 < ValID2) 21541 return true; 21542 if (ValID1 > ValID2) 21543 return false; 21544 } 21545 if (!U1) 21546 return true; 21547 if (!U2) 21548 return false; 21549 } 21550 // Undefs come last. 21551 assert(U1 && U2 && "The only thing left should be undef & undef."); 21552 } 21553 return false; 21554 }; 21555 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { 21556 if (V1 == V2) 21557 return true; 21558 if (V1->getType() != V2->getType()) 21559 return false; 21560 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 21561 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 21562 if (Opcodes1.size() != Opcodes2.size()) 21563 return false; 21564 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 21565 // Undefs are compatible with any other value. 21566 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) 21567 continue; 21568 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) 21569 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { 21570 if (R.isDeleted(I1) || R.isDeleted(I2)) 21571 return false; 21572 if (I1->getParent() != I2->getParent()) 21573 return false; 21574 if (getSameOpcode({I1, I2}, *TLI)) 21575 continue; 21576 return false; 21577 } 21578 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) 21579 continue; 21580 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) 21581 return false; 21582 } 21583 return true; 21584 }; 21585 21586 bool HaveVectorizedPhiNodes = false; 21587 do { 21588 // Collect the incoming values from the PHIs. 21589 Incoming.clear(); 21590 for (Instruction &I : *BB) { 21591 auto *P = dyn_cast<PHINode>(&I); 21592 if (!P || P->getNumIncomingValues() > MaxPHINumOperands) 21593 break; 21594 21595 // No need to analyze deleted, vectorized and non-vectorizable 21596 // instructions. 
21597 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21598 isValidElementType(P->getType()))
21599 Incoming.push_back(P);
21600 }
21601 
21602 if (Incoming.size() <= 1)
21603 break;
21604 
21605 // Find the corresponding non-phi nodes for better matching when trying to
21606 // build the tree.
21607 for (Value *V : Incoming) {
21608 SmallVectorImpl<Value *> &Opcodes =
21609 PHIToOpcodes.try_emplace(V).first->getSecond();
21610 if (!Opcodes.empty())
21611 continue;
21612 SmallVector<Value *, 4> Nodes(1, V);
21613 SmallPtrSet<Value *, 4> Visited;
21614 while (!Nodes.empty()) {
21615 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21616 if (!Visited.insert(PHI).second)
21617 continue;
21618 for (Value *V : PHI->incoming_values()) {
21619 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21620 Nodes.push_back(PHI1);
21621 continue;
21622 }
21623 Opcodes.emplace_back(V);
21624 }
21625 }
21626 }
21627 
21628 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21629 Incoming, PHICompare, AreCompatiblePHIs,
21630 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21631 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21632 },
21633 /*MaxVFOnly=*/true, R);
21634 Changed |= HaveVectorizedPhiNodes;
21635 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21636 auto *PHI = dyn_cast<PHINode>(P.first);
21637 return !PHI || R.isDeleted(PHI);
21638 }))
21639 PHIToOpcodes.clear();
21640 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21641 } while (HaveVectorizedPhiNodes);
21642 
21643 VisitedInstrs.clear();
21644 
21645 InstSetVector PostProcessInserts;
21646 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21647 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
21648 // also vectorizes `PostProcessCmps`.
21649 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21650 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21651 if (VectorizeCmps) {
21652 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21653 PostProcessCmps.clear();
21654 }
21655 PostProcessInserts.clear();
21656 return Changed;
21657 };
21658 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21659 auto IsInPostProcessInstrs = [&](Instruction *I) {
21660 if (auto *Cmp = dyn_cast<CmpInst>(I))
21661 return PostProcessCmps.contains(Cmp);
21662 return isa<InsertElementInst, InsertValueInst>(I) &&
21663 PostProcessInserts.contains(I);
21664 };
21665 // Returns true if `I` is an instruction without users, like a terminator, a
21666 // store, or a call/invoke whose return value is ignored. This is determined
21667 // from the instruction type (a void result), except for CallInst and InvokeInst, which may have an unused non-void result.
21668 auto HasNoUsers = [](Instruction *I) {
21669 return I->use_empty() &&
21670 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21671 };
21672 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21673 // Skip instructions with scalable type. The number of elements is unknown at
21674 // compile time for scalable types.
21675 if (isa<ScalableVectorType>(It->getType()))
21676 continue;
21677 
21678 // Skip instructions marked for deletion.
21679 if (R.isDeleted(&*It))
21680 continue;
21681 // We may go through BB multiple times, so skip instructions we have already checked.
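    // Already-visited instructions are not re-analyzed, but if they have no
    // users they may still trigger vectorization of the postponed
    // insertelement/insertvalue instructions (and, at a terminator, the
    // postponed compares) collected so far.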
21682 if (!VisitedInstrs.insert(&*It).second) { 21683 if (HasNoUsers(&*It) && 21684 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { 21685 // We would like to start over since some instructions are deleted 21686 // and the iterator may become invalid value. 21687 Changed = true; 21688 It = BB->begin(); 21689 E = BB->end(); 21690 } 21691 continue; 21692 } 21693 21694 if (isa<DbgInfoIntrinsic>(It)) 21695 continue; 21696 21697 // Try to vectorize reductions that use PHINodes. 21698 if (PHINode *P = dyn_cast<PHINode>(It)) { 21699 // Check that the PHI is a reduction PHI. 21700 if (P->getNumIncomingValues() == 2) { 21701 // Try to match and vectorize a horizontal reduction. 21702 Instruction *Root = getReductionInstr(DT, P, BB, LI); 21703 if (Root && vectorizeRootInstruction(P, Root, BB, R)) { 21704 Changed = true; 21705 It = BB->begin(); 21706 E = BB->end(); 21707 continue; 21708 } 21709 } 21710 // Try to vectorize the incoming values of the PHI, to catch reductions 21711 // that feed into PHIs. 21712 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) { 21713 // Skip if the incoming block is the current BB for now. Also, bypass 21714 // unreachable IR for efficiency and to avoid crashing. 21715 // TODO: Collect the skipped incoming values and try to vectorize them 21716 // after processing BB. 21717 if (BB == P->getIncomingBlock(I) || 21718 !DT->isReachableFromEntry(P->getIncomingBlock(I))) 21719 continue; 21720 21721 // Postponed instructions should not be vectorized here, delay their 21722 // vectorization. 21723 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I)); 21724 PI && !IsInPostProcessInstrs(PI)) { 21725 bool Res = 21726 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R); 21727 Changed |= Res; 21728 if (Res && R.isDeleted(P)) { 21729 It = BB->begin(); 21730 E = BB->end(); 21731 break; 21732 } 21733 } 21734 } 21735 continue; 21736 } 21737 21738 if (HasNoUsers(&*It)) { 21739 bool OpsChanged = false; 21740 auto *SI = dyn_cast<StoreInst>(It); 21741 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; 21742 if (SI) { 21743 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); 21744 // Try to vectorize chain in store, if this is the only store to the 21745 // address in the block. 21746 // TODO: This is just a temporarily solution to save compile time. Need 21747 // to investigate if we can safely turn on slp-vectorize-hor-store 21748 // instead to allow lookup for reduction chains in all non-vectorized 21749 // stores (need to check side effects and compile time). 21750 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) && 21751 SI->getValueOperand()->hasOneUse(); 21752 } 21753 if (TryToVectorizeRoot) { 21754 for (auto *V : It->operand_values()) { 21755 // Postponed instructions should not be vectorized here, delay their 21756 // vectorization. 21757 if (auto *VI = dyn_cast<Instruction>(V); 21758 VI && !IsInPostProcessInstrs(VI)) 21759 // Try to match and vectorize a horizontal reduction. 21760 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R); 21761 } 21762 } 21763 // Start vectorization of post-process list of instructions from the 21764 // top-tree instructions to try to vectorize as many instructions as 21765 // possible. 21766 OpsChanged |= 21767 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator()); 21768 if (OpsChanged) { 21769 // We would like to start over since some instructions are deleted 21770 // and the iterator may become invalid value. 
21771 Changed = true;
21772 It = BB->begin();
21773 E = BB->end();
21774 continue;
21775 }
21776 }
21777 
21778 if (isa<InsertElementInst, InsertValueInst>(It))
21779 PostProcessInserts.insert(&*It);
21780 else if (isa<CmpInst>(It))
21781 PostProcessCmps.insert(cast<CmpInst>(&*It));
21782 }
21783 
21784 return Changed;
21785 }
21786 
21787 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21788 auto Changed = false;
21789 for (auto &Entry : GEPs) {
21790 // If the getelementptr list has fewer than two elements, there's nothing
21791 // to do.
21792 if (Entry.second.size() < 2)
21793 continue;
21794 
21795 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21796 << Entry.second.size() << ".\n");
21797 
21798 // Process the GEP list in chunks suitable for the target's supported
21799 // vector size. If a vector register can't hold 1 element, we are done. We
21800 // are trying to vectorize the index computations, so the maximum number of
21801 // elements is based on the size of the index expression, rather than the
21802 // size of the GEP itself (the target's pointer size).
21803 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21804 return !R.isDeleted(GEP);
21805 });
21806 if (It == Entry.second.end())
21807 continue;
21808 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21809 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21810 if (MaxVecRegSize < EltSize)
21811 continue;
21812 
21813 unsigned MaxElts = MaxVecRegSize / EltSize;
21814 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21815 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21816 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21817 
21818 // Initialize a set of candidate getelementptrs. Note that we use a
21819 // SetVector here to preserve program order. If the index computations
21820 // are vectorizable and begin with loads, we want to minimize the chance
21821 // of having to reorder them later.
21822 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21823 
21824 // Some of the candidates may have already been vectorized after we
21825 // initially collected them or their index was optimized to a constant value.
21826 // If so, they are marked as deleted, so remove them from the set of
21827 // candidates.
21828 Candidates.remove_if([&R](Value *I) {
21829 return R.isDeleted(cast<Instruction>(I)) ||
21830 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21831 });
21832 
21833 // Remove from the set of candidates all pairs of getelementptrs with
21834 // constant differences. Such getelementptrs are likely not good
21835 // candidates for vectorization in a bottom-up phase since one can be
21836 // computed from the other. We also ensure all candidate getelementptr
21837 // indices are unique.
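        // E.g. (illustrative), two getelementptrs off the same base whose
        // indices differ only by a constant produce addresses whose SCEVs
        // differ by a constant, so the pair is dropped from the candidate set
        // by the loop below.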
21838 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { 21839 auto *GEPI = GEPList[I]; 21840 if (!Candidates.count(GEPI)) 21841 continue; 21842 const SCEV *SCEVI = SE->getSCEV(GEPList[I]); 21843 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { 21844 auto *GEPJ = GEPList[J]; 21845 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]); 21846 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) { 21847 Candidates.remove(GEPI); 21848 Candidates.remove(GEPJ); 21849 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { 21850 Candidates.remove(GEPJ); 21851 } 21852 } 21853 } 21854 21855 // We break out of the above computation as soon as we know there are 21856 // fewer than two candidates remaining. 21857 if (Candidates.size() < 2) 21858 continue; 21859 21860 // Add the single, non-constant index of each candidate to the bundle. We 21861 // ensured the indices met these constraints when we originally collected 21862 // the getelementptrs. 21863 SmallVector<Value *, 16> Bundle(Candidates.size()); 21864 auto BundleIndex = 0u; 21865 for (auto *V : Candidates) { 21866 auto *GEP = cast<GetElementPtrInst>(V); 21867 auto *GEPIdx = GEP->idx_begin()->get(); 21868 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx)); 21869 Bundle[BundleIndex++] = GEPIdx; 21870 } 21871 21872 // Try and vectorize the indices. We are currently only interested in 21873 // gather-like cases of the form: 21874 // 21875 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... 21876 // 21877 // where the loads of "a", the loads of "b", and the subtractions can be 21878 // performed in parallel. It's likely that detecting this pattern in a 21879 // bottom-up phase will be simpler and less costly than building a 21880 // full-blown top-down phase beginning at the consecutive loads. 21881 Changed |= tryToVectorizeList(Bundle, R); 21882 } 21883 } 21884 return Changed; 21885 } 21886 21887 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { 21888 bool Changed = false; 21889 // Sort by type, base pointers and values operand. Value operands must be 21890 // compatible (have the same opcode, same parent), otherwise it is 21891 // definitely not profitable to try to vectorize them. 21892 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { 21893 if (V->getValueOperand()->getType()->getTypeID() < 21894 V2->getValueOperand()->getType()->getTypeID()) 21895 return true; 21896 if (V->getValueOperand()->getType()->getTypeID() > 21897 V2->getValueOperand()->getType()->getTypeID()) 21898 return false; 21899 if (V->getPointerOperandType()->getTypeID() < 21900 V2->getPointerOperandType()->getTypeID()) 21901 return true; 21902 if (V->getPointerOperandType()->getTypeID() > 21903 V2->getPointerOperandType()->getTypeID()) 21904 return false; 21905 if (V->getValueOperand()->getType()->getScalarSizeInBits() < 21906 V2->getValueOperand()->getType()->getScalarSizeInBits()) 21907 return true; 21908 if (V->getValueOperand()->getType()->getScalarSizeInBits() > 21909 V2->getValueOperand()->getType()->getScalarSizeInBits()) 21910 return false; 21911 // UndefValues are compatible with all other values. 
21912 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand())) 21913 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { 21914 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = 21915 DT->getNode(I1->getParent()); 21916 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = 21917 DT->getNode(I2->getParent()); 21918 assert(NodeI1 && "Should only process reachable instructions"); 21919 assert(NodeI2 && "Should only process reachable instructions"); 21920 assert((NodeI1 == NodeI2) == 21921 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 21922 "Different nodes should have different DFS numbers"); 21923 if (NodeI1 != NodeI2) 21924 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 21925 return I1->getOpcode() < I2->getOpcode(); 21926 } 21927 return V->getValueOperand()->getValueID() < 21928 V2->getValueOperand()->getValueID(); 21929 }; 21930 21931 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { 21932 if (V1 == V2) 21933 return true; 21934 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) 21935 return false; 21936 if (V1->getPointerOperandType() != V2->getPointerOperandType()) 21937 return false; 21938 // Undefs are compatible with any other value. 21939 if (isa<UndefValue>(V1->getValueOperand()) || 21940 isa<UndefValue>(V2->getValueOperand())) 21941 return true; 21942 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand())) 21943 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { 21944 if (I1->getParent() != I2->getParent()) 21945 return false; 21946 return getSameOpcode({I1, I2}, *TLI).valid(); 21947 } 21948 if (isa<Constant>(V1->getValueOperand()) && 21949 isa<Constant>(V2->getValueOperand())) 21950 return true; 21951 return V1->getValueOperand()->getValueID() == 21952 V2->getValueOperand()->getValueID(); 21953 }; 21954 21955 // Attempt to sort and vectorize each of the store-groups. 21956 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted; 21957 for (auto &Pair : Stores) { 21958 if (Pair.second.size() < 2) 21959 continue; 21960 21961 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " 21962 << Pair.second.size() << ".\n"); 21963 21964 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) 21965 continue; 21966 21967 // Reverse stores to do bottom-to-top analysis. This is important if the 21968 // values are stores to the same addresses several times, in this case need 21969 // to follow the stores order (reversed to meet the memory dependecies). 21970 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(), 21971 Pair.second.rend()); 21972 Changed |= tryToVectorizeSequence<StoreInst>( 21973 ReversedStores, StoreSorter, AreCompatibleStores, 21974 [&](ArrayRef<StoreInst *> Candidates, bool) { 21975 return vectorizeStores(Candidates, R, Attempted); 21976 }, 21977 /*MaxVFOnly=*/false, R); 21978 } 21979 return Changed; 21980 } 21981