1 //===------- VectorCombine.cpp - Optimize partial vector operations -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass optimizes scalar/vector interactions using target cost models. The 10 // transforms implemented here may not fit in traditional loop-based or SLP 11 // vectorization passes. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/Vectorize/VectorCombine.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/ScopeExit.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/Analysis/AssumptionCache.h" 21 #include "llvm/Analysis/BasicAliasAnalysis.h" 22 #include "llvm/Analysis/GlobalsModRef.h" 23 #include "llvm/Analysis/Loads.h" 24 #include "llvm/Analysis/TargetTransformInfo.h" 25 #include "llvm/Analysis/ValueTracking.h" 26 #include "llvm/Analysis/VectorUtils.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/PatternMatch.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Transforms/Utils/Local.h" 33 #include "llvm/Transforms/Utils/LoopUtils.h" 34 #include <numeric> 35 #include <queue> 36 37 #define DEBUG_TYPE "vector-combine" 38 #include "llvm/Transforms/Utils/InstructionWorklist.h" 39 40 using namespace llvm; 41 using namespace llvm::PatternMatch; 42 43 STATISTIC(NumVecLoad, "Number of vector loads formed"); 44 STATISTIC(NumVecCmp, "Number of vector compares formed"); 45 STATISTIC(NumVecBO, "Number of vector binops formed"); 46 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); 47 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); 48 STATISTIC(NumScalarBO, "Number of scalar binops formed"); 49 STATISTIC(NumScalarCmp, "Number of scalar compares formed"); 50 51 static cl::opt<bool> DisableVectorCombine( 52 "disable-vector-combine", cl::init(false), cl::Hidden, 53 cl::desc("Disable all vector combine transforms")); 54 55 static cl::opt<bool> DisableBinopExtractShuffle( 56 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden, 57 cl::desc("Disable binop extract to shuffle transforms")); 58 59 static cl::opt<unsigned> MaxInstrsToScan( 60 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, 61 cl::desc("Max number of instructions to scan for vector combining.")); 62 63 static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); 64 65 namespace { 66 class VectorCombine { 67 public: 68 VectorCombine(Function &F, const TargetTransformInfo &TTI, 69 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, 70 const DataLayout *DL, bool TryEarlyFoldsOnly) 71 : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL), 72 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} 73 74 bool run(); 75 76 private: 77 Function &F; 78 IRBuilder<> Builder; 79 const TargetTransformInfo &TTI; 80 const DominatorTree &DT; 81 AAResults &AA; 82 AssumptionCache &AC; 83 const DataLayout *DL; 84 85 /// If true, only perform beneficial early IR transforms. Do not introduce new 86 /// vector operations. 87 bool TryEarlyFoldsOnly; 88 89 InstructionWorklist Worklist; 90 91 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" 92 // parameter. 
That should be updated to specific sub-classes because the 93 // run loop was changed to dispatch on opcode. 94 bool vectorizeLoadInsert(Instruction &I); 95 bool widenSubvectorLoad(Instruction &I); 96 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, 97 ExtractElementInst *Ext1, 98 unsigned PreferredExtractIndex) const; 99 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 100 const Instruction &I, 101 ExtractElementInst *&ConvertToShuffle, 102 unsigned PreferredExtractIndex); 103 void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 104 Instruction &I); 105 void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 106 Instruction &I); 107 bool foldExtractExtract(Instruction &I); 108 bool foldInsExtFNeg(Instruction &I); 109 bool foldBitcastShuffle(Instruction &I); 110 bool scalarizeBinopOrCmp(Instruction &I); 111 bool scalarizeVPIntrinsic(Instruction &I); 112 bool foldExtractedCmps(Instruction &I); 113 bool foldSingleElementStore(Instruction &I); 114 bool scalarizeLoadExtract(Instruction &I); 115 bool foldShuffleOfBinops(Instruction &I); 116 bool foldShuffleOfCastops(Instruction &I); 117 bool foldShuffleOfShuffles(Instruction &I); 118 bool foldShuffleToIdentity(Instruction &I); 119 bool foldShuffleFromReductions(Instruction &I); 120 bool foldCastFromReductions(Instruction &I); 121 bool foldSelectShuffle(Instruction &I, bool FromReduction = false); 122 123 void replaceValue(Value &Old, Value &New) { 124 Old.replaceAllUsesWith(&New); 125 if (auto *NewI = dyn_cast<Instruction>(&New)) { 126 New.takeName(&Old); 127 Worklist.pushUsersToWorkList(*NewI); 128 Worklist.pushValue(NewI); 129 } 130 Worklist.pushValue(&Old); 131 } 132 133 void eraseInstruction(Instruction &I) { 134 for (Value *Op : I.operands()) 135 Worklist.pushValue(Op); 136 Worklist.remove(&I); 137 I.eraseFromParent(); 138 } 139 }; 140 } // namespace 141 142 /// Return the source operand of a potentially bitcasted value. If there is no 143 /// bitcast, return the input value itself. 144 static Value *peekThroughBitcasts(Value *V) { 145 while (auto *BitCast = dyn_cast<BitCastInst>(V)) 146 V = BitCast->getOperand(0); 147 return V; 148 } 149 150 static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { 151 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. 152 // The widened load may load data from dirty regions or create data races 153 // non-existent in the source. 154 if (!Load || !Load->isSimple() || !Load->hasOneUse() || 155 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || 156 mustSuppressSpeculation(*Load)) 157 return false; 158 159 // We are potentially transforming byte-sized (8-bit) memory accesses, so make 160 // sure we have all of our type-based constraints in place for this target. 161 Type *ScalarTy = Load->getType()->getScalarType(); 162 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); 163 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); 164 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || 165 ScalarSize % 8 != 0) 166 return false; 167 168 return true; 169 } 170 171 bool VectorCombine::vectorizeLoadInsert(Instruction &I) { 172 // Match insert into fixed vector of scalar value. 173 // TODO: Handle non-zero insert index. 174 Value *Scalar; 175 if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || 176 !Scalar->hasOneUse()) 177 return false; 178 179 // Optionally match an extract from another vector. 
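  // Illustrative shapes of the matched input (hypothetical IR, for exposition
  // only):
  //   %s = load float, ptr %p
  //   %i = insertelement <4 x float> poison, float %s, i64 0
  // or, with the optional extract:
  //   %v = load <2 x float>, ptr %p
  //   %s = extractelement <2 x float> %v, i64 0
  //   %i = insertelement <4 x float> poison, float %s, i64 0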
180 Value *X; 181 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); 182 if (!HasExtract) 183 X = Scalar; 184 185 auto *Load = dyn_cast<LoadInst>(X); 186 if (!canWidenLoad(Load, TTI)) 187 return false; 188 189 Type *ScalarTy = Scalar->getType(); 190 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); 191 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); 192 193 // Check safety of replacing the scalar load with a larger vector load. 194 // We use minimal alignment (maximum flexibility) because we only care about 195 // the dereferenceable region. When calculating cost and creating a new op, 196 // we may use a larger value based on alignment attributes. 197 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); 198 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); 199 200 unsigned MinVecNumElts = MinVectorSize / ScalarSize; 201 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); 202 unsigned OffsetEltIndex = 0; 203 Align Alignment = Load->getAlign(); 204 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC, 205 &DT)) { 206 // It is not safe to load directly from the pointer, but we can still peek 207 // through gep offsets and check if it safe to load from a base address with 208 // updated alignment. If it is, we can shuffle the element(s) into place 209 // after loading. 210 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType()); 211 APInt Offset(OffsetBitWidth, 0); 212 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); 213 214 // We want to shuffle the result down from a high element of a vector, so 215 // the offset must be positive. 216 if (Offset.isNegative()) 217 return false; 218 219 // The offset must be a multiple of the scalar element to shuffle cleanly 220 // in the element's size. 221 uint64_t ScalarSizeInBytes = ScalarSize / 8; 222 if (Offset.urem(ScalarSizeInBytes) != 0) 223 return false; 224 225 // If we load MinVecNumElts, will our target element still be loaded? 226 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); 227 if (OffsetEltIndex >= MinVecNumElts) 228 return false; 229 230 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC, 231 &DT)) 232 return false; 233 234 // Update alignment with offset value. Note that the offset could be negated 235 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but 236 // negation does not change the result of the alignment calculation. 237 Alignment = commonAlignment(Alignment, Offset.getZExtValue()); 238 } 239 240 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 241 // Use the greater of the alignment on the load or its source pointer. 242 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment); 243 Type *LoadTy = Load->getType(); 244 unsigned AS = Load->getPointerAddressSpace(); 245 InstructionCost OldCost = 246 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); 247 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); 248 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 249 OldCost += 250 TTI.getScalarizationOverhead(MinVecTy, DemandedElts, 251 /* Insert */ true, HasExtract, CostKind); 252 253 // New pattern: load VecPtr 254 InstructionCost NewCost = 255 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); 256 // Optionally, we are shuffling the loaded vector element(s) into place. 
257 // For the mask set everything but element 0 to undef to prevent poison from 258 // propagating from the extra loaded memory. This will also optionally 259 // shrink/grow the vector from the loaded size to the output size. 260 // We assume this operation has no cost in codegen if there was no offset. 261 // Note that we could use freeze to avoid poison problems, but then we might 262 // still need a shuffle to change the vector size. 263 auto *Ty = cast<FixedVectorType>(I.getType()); 264 unsigned OutputNumElts = Ty->getNumElements(); 265 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem); 266 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); 267 Mask[0] = OffsetEltIndex; 268 if (OffsetEltIndex) 269 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask); 270 271 // We can aggressively convert to the vector form because the backend can 272 // invert this transform if it does not result in a performance win. 273 if (OldCost < NewCost || !NewCost.isValid()) 274 return false; 275 276 // It is safe and potentially profitable to load a vector directly: 277 // inselt undef, load Scalar, 0 --> load VecPtr 278 IRBuilder<> Builder(Load); 279 Value *CastedPtr = 280 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); 281 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); 282 VecLd = Builder.CreateShuffleVector(VecLd, Mask); 283 284 replaceValue(I, *VecLd); 285 ++NumVecLoad; 286 return true; 287 } 288 289 /// If we are loading a vector and then inserting it into a larger vector with 290 /// undefined elements, try to load the larger vector and eliminate the insert. 291 /// This removes a shuffle in IR and may allow combining of other loaded values. 292 bool VectorCombine::widenSubvectorLoad(Instruction &I) { 293 // Match subvector insert of fixed vector. 294 auto *Shuf = cast<ShuffleVectorInst>(&I); 295 if (!Shuf->isIdentityWithPadding()) 296 return false; 297 298 // Allow a non-canonical shuffle mask that is choosing elements from op1. 299 unsigned NumOpElts = 300 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements(); 301 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) { 302 return M >= (int)(NumOpElts); 303 }); 304 305 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex)); 306 if (!canWidenLoad(Load, TTI)) 307 return false; 308 309 // We use minimal alignment (maximum flexibility) because we only care about 310 // the dereferenceable region. When calculating cost and creating a new op, 311 // we may use a larger value based on alignment attributes. 312 auto *Ty = cast<FixedVectorType>(I.getType()); 313 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); 314 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); 315 Align Alignment = Load->getAlign(); 316 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT)) 317 return false; 318 319 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment); 320 Type *LoadTy = Load->getType(); 321 unsigned AS = Load->getPointerAddressSpace(); 322 323 // Original pattern: insert_subvector (load PtrOp) 324 // This conservatively assumes that the cost of a subvector insert into an 325 // undef value is 0. We could add that cost if the cost model accurately 326 // reflects the real cost of that operation. 
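  // Illustrative example of the rewrite being costed (hypothetical IR):
  //   %v = load <2 x float>, ptr %p
  //   %w = shufflevector <2 x float> %v, <2 x float> poison,
  //                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  // -->
  //   %w = load <4 x float>, ptr %p
  // This only fires when the wider load is known dereferenceable and is not
  // more expensive than the original load.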
327 InstructionCost OldCost = 328 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); 329 330 // New pattern: load PtrOp 331 InstructionCost NewCost = 332 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS); 333 334 // We can aggressively convert to the vector form because the backend can 335 // invert this transform if it does not result in a performance win. 336 if (OldCost < NewCost || !NewCost.isValid()) 337 return false; 338 339 IRBuilder<> Builder(Load); 340 Value *CastedPtr = 341 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); 342 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); 343 replaceValue(I, *VecLd); 344 ++NumVecLoad; 345 return true; 346 } 347 348 /// Determine which, if any, of the inputs should be replaced by a shuffle 349 /// followed by extract from a different index. 350 ExtractElementInst *VectorCombine::getShuffleExtract( 351 ExtractElementInst *Ext0, ExtractElementInst *Ext1, 352 unsigned PreferredExtractIndex = InvalidIndex) const { 353 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); 354 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); 355 assert(Index0C && Index1C && "Expected constant extract indexes"); 356 357 unsigned Index0 = Index0C->getZExtValue(); 358 unsigned Index1 = Index1C->getZExtValue(); 359 360 // If the extract indexes are identical, no shuffle is needed. 361 if (Index0 == Index1) 362 return nullptr; 363 364 Type *VecTy = Ext0->getVectorOperand()->getType(); 365 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 366 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); 367 InstructionCost Cost0 = 368 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 369 InstructionCost Cost1 = 370 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 371 372 // If both costs are invalid no shuffle is needed 373 if (!Cost0.isValid() && !Cost1.isValid()) 374 return nullptr; 375 376 // We are extracting from 2 different indexes, so one operand must be shuffled 377 // before performing a vector operation and/or extract. The more expensive 378 // extract will be replaced by a shuffle. 379 if (Cost0 > Cost1) 380 return Ext0; 381 if (Cost1 > Cost0) 382 return Ext1; 383 384 // If the costs are equal and there is a preferred extract index, shuffle the 385 // opposite operand. 386 if (PreferredExtractIndex == Index0) 387 return Ext1; 388 if (PreferredExtractIndex == Index1) 389 return Ext0; 390 391 // Otherwise, replace the extract with the higher index. 392 return Index0 > Index1 ? Ext0 : Ext1; 393 } 394 395 /// Compare the relative costs of 2 extracts followed by scalar operation vs. 396 /// vector operation(s) followed by extract. Return true if the existing 397 /// instructions are cheaper than a vector alternative. Otherwise, return false 398 /// and if one of the extracts should be transformed to a shufflevector, set 399 /// \p ConvertToShuffle to that extract instruction. 
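/// For example (illustrative only), the existing scalar sequence
///   add (extelt V0, 0), (extelt V1, 3)
/// is weighed against a vector alternative along the lines of
///   extelt (add V0, (shuffle V1, <3, poison, poison, poison>)), 0
/// where the extract with the higher (or more expensive) index is the one
/// replaced by a shuffle.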
400 bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, 401 ExtractElementInst *Ext1, 402 const Instruction &I, 403 ExtractElementInst *&ConvertToShuffle, 404 unsigned PreferredExtractIndex) { 405 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); 406 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); 407 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); 408 409 unsigned Opcode = I.getOpcode(); 410 Type *ScalarTy = Ext0->getType(); 411 auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); 412 InstructionCost ScalarOpCost, VectorOpCost; 413 414 // Get cost estimates for scalar and vector versions of the operation. 415 bool IsBinOp = Instruction::isBinaryOp(Opcode); 416 if (IsBinOp) { 417 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); 418 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 419 } else { 420 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && 421 "Expected a compare"); 422 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 423 ScalarOpCost = TTI.getCmpSelInstrCost( 424 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); 425 VectorOpCost = TTI.getCmpSelInstrCost( 426 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); 427 } 428 429 // Get cost estimates for the extract elements. These costs will factor into 430 // both sequences. 431 unsigned Ext0Index = Ext0IndexC->getZExtValue(); 432 unsigned Ext1Index = Ext1IndexC->getZExtValue(); 433 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 434 435 InstructionCost Extract0Cost = 436 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); 437 InstructionCost Extract1Cost = 438 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index); 439 440 // A more expensive extract will always be replaced by a splat shuffle. 441 // For example, if Ext0 is more expensive: 442 // opcode (extelt V0, Ext0), (ext V1, Ext1) --> 443 // extelt (opcode (splat V0, Ext0), V1), Ext1 444 // TODO: Evaluate whether that always results in lowest cost. Alternatively, 445 // check the cost of creating a broadcast shuffle and shuffling both 446 // operands to element 0. 447 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); 448 449 // Extra uses of the extracts mean that we include those costs in the 450 // vector total because those instructions will not be eliminated. 451 InstructionCost OldCost, NewCost; 452 if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { 453 // Handle a special case. If the 2 extracts are identical, adjust the 454 // formulas to account for that. The extra use charge allows for either the 455 // CSE'd pattern or an unoptimized form with identical values: 456 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C 457 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) 458 : !Ext0->hasOneUse() || !Ext1->hasOneUse(); 459 OldCost = CheapExtractCost + ScalarOpCost; 460 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; 461 } else { 462 // Handle the general case. 
Each extract is actually a different value: 463 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C 464 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; 465 NewCost = VectorOpCost + CheapExtractCost + 466 !Ext0->hasOneUse() * Extract0Cost + 467 !Ext1->hasOneUse() * Extract1Cost; 468 } 469 470 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); 471 if (ConvertToShuffle) { 472 if (IsBinOp && DisableBinopExtractShuffle) 473 return true; 474 475 // If we are extracting from 2 different indexes, then one operand must be 476 // shuffled before performing the vector operation. The shuffle mask is 477 // poison except for 1 lane that is being translated to the remaining 478 // extraction lane. Therefore, it is a splat shuffle. Ex: 479 // ShufMask = { poison, poison, 0, poison } 480 // TODO: The cost model has an option for a "broadcast" shuffle 481 // (splat-from-element-0), but no option for a more general splat. 482 NewCost += 483 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); 484 } 485 486 // Aggressively form a vector op if the cost is equal because the transform 487 // may enable further optimization. 488 // Codegen can reverse this transform (scalarize) if it was not profitable. 489 return OldCost < NewCost; 490 } 491 492 /// Create a shuffle that translates (shifts) 1 element from the input vector 493 /// to a new element location. 494 static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, 495 unsigned NewIndex, IRBuilder<> &Builder) { 496 // The shuffle mask is poison except for 1 lane that is being translated 497 // to the new element index. Example for OldIndex == 2 and NewIndex == 0: 498 // ShufMask = { 2, poison, poison, poison } 499 auto *VecTy = cast<FixedVectorType>(Vec->getType()); 500 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 501 ShufMask[NewIndex] = OldIndex; 502 return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); 503 } 504 505 /// Given an extract element instruction with constant index operand, shuffle 506 /// the source vector (shift the scalar element) to a NewIndex for extraction. 507 /// Return null if the input can be constant folded, so that we are not creating 508 /// unnecessary instructions. 509 static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, 510 unsigned NewIndex, 511 IRBuilder<> &Builder) { 512 // Shufflevectors can only be created for fixed-width vectors. 513 if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) 514 return nullptr; 515 516 // If the extract can be constant-folded, this code is unsimplified. Defer 517 // to other passes to handle that. 518 Value *X = ExtElt->getVectorOperand(); 519 Value *C = ExtElt->getIndexOperand(); 520 assert(isa<ConstantInt>(C) && "Expected a constant index operand"); 521 if (isa<Constant>(X)) 522 return nullptr; 523 524 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), 525 NewIndex, Builder); 526 return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex)); 527 } 528 529 /// Try to reduce extract element costs by converting scalar compares to vector 530 /// compares followed by extract. 
531 /// cmp (ext0 V0, C), (ext1 V1, C) 532 void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, 533 ExtractElementInst *Ext1, Instruction &I) { 534 assert(isa<CmpInst>(&I) && "Expected a compare"); 535 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 536 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 537 "Expected matching constant extract indexes"); 538 539 // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C 540 ++NumVecCmp; 541 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); 542 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 543 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); 544 Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); 545 replaceValue(I, *NewExt); 546 } 547 548 /// Try to reduce extract element costs by converting scalar binops to vector 549 /// binops followed by extract. 550 /// bo (ext0 V0, C), (ext1 V1, C) 551 void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, 552 ExtractElementInst *Ext1, Instruction &I) { 553 assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); 554 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 555 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 556 "Expected matching constant extract indexes"); 557 558 // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C 559 ++NumVecBO; 560 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 561 Value *VecBO = 562 Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); 563 564 // All IR flags are safe to back-propagate because any potential poison 565 // created in unused vector elements is discarded by the extract. 566 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) 567 VecBOInst->copyIRFlags(&I); 568 569 Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); 570 replaceValue(I, *NewExt); 571 } 572 573 /// Match an instruction with extracted vector operands. 574 bool VectorCombine::foldExtractExtract(Instruction &I) { 575 // It is not safe to transform things like div, urem, etc. because we may 576 // create undefined behavior when executing those on unknown vector elements. 577 if (!isSafeToSpeculativelyExecute(&I)) 578 return false; 579 580 Instruction *I0, *I1; 581 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 582 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && 583 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) 584 return false; 585 586 Value *V0, *V1; 587 uint64_t C0, C1; 588 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) || 589 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) || 590 V0->getType() != V1->getType()) 591 return false; 592 593 // If the scalar value 'I' is going to be re-inserted into a vector, then try 594 // to create an extract to that same element. The extract/insert can be 595 // reduced to a "select shuffle". 596 // TODO: If we add a larger pattern match that starts from an insert, this 597 // probably becomes unnecessary. 
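  // Illustrative shape of that situation (hypothetical IR):
  //   %a = extractelement <4 x i32> %v0, i64 2
  //   %b = extractelement <4 x i32> %v1, i64 0
  //   %s = add i32 %a, %b
  //   %r = insertelement <4 x i32> %dst, i32 %s, i64 2
  // The preferred extract index is 2, so (assuming equal per-lane extract
  // costs) %v1 is the operand that would be shuffled before the vector add.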
598 auto *Ext0 = cast<ExtractElementInst>(I0); 599 auto *Ext1 = cast<ExtractElementInst>(I1); 600 uint64_t InsertIndex = InvalidIndex; 601 if (I.hasOneUse()) 602 match(I.user_back(), 603 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); 604 605 ExtractElementInst *ExtractToChange; 606 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) 607 return false; 608 609 if (ExtractToChange) { 610 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; 611 ExtractElementInst *NewExtract = 612 translateExtract(ExtractToChange, CheapExtractIdx, Builder); 613 if (!NewExtract) 614 return false; 615 if (ExtractToChange == Ext0) 616 Ext0 = NewExtract; 617 else 618 Ext1 = NewExtract; 619 } 620 621 if (Pred != CmpInst::BAD_ICMP_PREDICATE) 622 foldExtExtCmp(Ext0, Ext1, I); 623 else 624 foldExtExtBinop(Ext0, Ext1, I); 625 626 Worklist.push(Ext0); 627 Worklist.push(Ext1); 628 return true; 629 } 630 631 /// Try to replace an extract + scalar fneg + insert with a vector fneg + 632 /// shuffle. 633 bool VectorCombine::foldInsExtFNeg(Instruction &I) { 634 // Match an insert (op (extract)) pattern. 635 Value *DestVec; 636 uint64_t Index; 637 Instruction *FNeg; 638 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), 639 m_ConstantInt(Index)))) 640 return false; 641 642 // Note: This handles the canonical fneg instruction and "fsub -0.0, X". 643 Value *SrcVec; 644 Instruction *Extract; 645 if (!match(FNeg, m_FNeg(m_CombineAnd( 646 m_Instruction(Extract), 647 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) 648 return false; 649 650 // TODO: We could handle this with a length-changing shuffle. 651 auto *VecTy = cast<FixedVectorType>(I.getType()); 652 if (SrcVec->getType() != VecTy) 653 return false; 654 655 // Ignore bogus insert/extract index. 656 unsigned NumElts = VecTy->getNumElements(); 657 if (Index >= NumElts) 658 return false; 659 660 // We are inserting the negated element into the same lane that we extracted 661 // from. This is equivalent to a select-shuffle that chooses all but the 662 // negated element from the destination vector. 663 SmallVector<int> Mask(NumElts); 664 std::iota(Mask.begin(), Mask.end(), 0); 665 Mask[Index] = Index + NumElts; 666 667 Type *ScalarTy = VecTy->getScalarType(); 668 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 669 InstructionCost OldCost = 670 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) + 671 TTI.getVectorInstrCost(I, VecTy, CostKind, Index); 672 673 // If the extract has one use, it will be eliminated, so count it in the 674 // original cost. If it has more than one use, ignore the cost because it will 675 // be the same before/after. 676 if (Extract->hasOneUse()) 677 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); 678 679 InstructionCost NewCost = 680 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) + 681 TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); 682 683 if (NewCost > OldCost) 684 return false; 685 686 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> 687 // shuffle DestVec, (fneg SrcVec), Mask 688 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); 689 Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); 690 replaceValue(I, *Shuf); 691 return true; 692 } 693 694 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the 695 /// destination type followed by shuffle. This can enable further transforms by 696 /// moving bitcasts or shuffles together. 
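/// Illustrative example (hypothetical IR):
///   bitcast (shuffle <2 x i64> X, poison, <1, 0>) to <4 x i32>
/// -->
///   shuffle (bitcast <2 x i64> X to <4 x i32>), poison, <2, 3, 0, 1>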
697 bool VectorCombine::foldBitcastShuffle(Instruction &I) { 698 Value *V0, *V1; 699 ArrayRef<int> Mask; 700 if (!match(&I, m_BitCast(m_OneUse( 701 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask)))))) 702 return false; 703 704 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for 705 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle 706 // mask for scalable type is a splat or not. 707 // 2) Disallow non-vector casts. 708 // TODO: We could allow any shuffle. 709 auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); 710 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType()); 711 if (!DestTy || !SrcTy) 712 return false; 713 714 unsigned DestEltSize = DestTy->getScalarSizeInBits(); 715 unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); 716 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) 717 return false; 718 719 bool IsUnary = isa<UndefValue>(V1); 720 721 // For binary shuffles, only fold bitcast(shuffle(X,Y)) 722 // if it won't increase the number of bitcasts. 723 if (!IsUnary) { 724 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType()); 725 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType()); 726 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) && 727 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType())) 728 return false; 729 } 730 731 SmallVector<int, 16> NewMask; 732 if (DestEltSize <= SrcEltSize) { 733 // The bitcast is from wide to narrow/equal elements. The shuffle mask can 734 // always be expanded to the equivalent form choosing narrower elements. 735 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask"); 736 unsigned ScaleFactor = SrcEltSize / DestEltSize; 737 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); 738 } else { 739 // The bitcast is from narrow elements to wide elements. The shuffle mask 740 // must choose consecutive elements to allow casting first. 741 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask"); 742 unsigned ScaleFactor = DestEltSize / SrcEltSize; 743 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) 744 return false; 745 } 746 747 // Bitcast the shuffle src - keep its original width but using the destination 748 // scalar type. 749 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; 750 auto *NewShuffleTy = 751 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts); 752 auto *OldShuffleTy = 753 FixedVectorType::get(SrcTy->getScalarType(), Mask.size()); 754 unsigned NumOps = IsUnary ? 1 : 2; 755 756 // The new shuffle must not cost more than the old shuffle. 757 TargetTransformInfo::TargetCostKind CK = 758 TargetTransformInfo::TCK_RecipThroughput; 759 TargetTransformInfo::ShuffleKind SK = 760 IsUnary ? 
TargetTransformInfo::SK_PermuteSingleSrc 761 : TargetTransformInfo::SK_PermuteTwoSrc; 762 763 InstructionCost DestCost = 764 TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) + 765 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy, 766 TargetTransformInfo::CastContextHint::None, 767 CK)); 768 InstructionCost SrcCost = 769 TTI.getShuffleCost(SK, SrcTy, Mask, CK) + 770 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy, 771 TargetTransformInfo::CastContextHint::None, CK); 772 if (DestCost > SrcCost || !DestCost.isValid()) 773 return false; 774 775 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC' 776 ++NumShufOfBitcast; 777 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy); 778 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy); 779 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask); 780 replaceValue(I, *Shuf); 781 return true; 782 } 783 784 /// VP Intrinsics whose vector operands are both splat values may be simplified 785 /// into the scalar version of the operation and the result splatted. This 786 /// can lead to scalarization down the line. 787 bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { 788 if (!isa<VPIntrinsic>(I)) 789 return false; 790 VPIntrinsic &VPI = cast<VPIntrinsic>(I); 791 Value *Op0 = VPI.getArgOperand(0); 792 Value *Op1 = VPI.getArgOperand(1); 793 794 if (!isSplatValue(Op0) || !isSplatValue(Op1)) 795 return false; 796 797 // Check getSplatValue early in this function, to avoid doing unnecessary 798 // work. 799 Value *ScalarOp0 = getSplatValue(Op0); 800 Value *ScalarOp1 = getSplatValue(Op1); 801 if (!ScalarOp0 || !ScalarOp1) 802 return false; 803 804 // For the binary VP intrinsics supported here, the result on disabled lanes 805 // is a poison value. For now, only do this simplification if all lanes 806 // are active. 807 // TODO: Relax the condition that all lanes are active by using insertelement 808 // on inactive lanes. 
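  // Illustrative example of the intended rewrite (informal IR, for exposition
  // only):
  //   %r = call <4 x i32> @llvm.vp.add.v4i32(splat(%a), splat(%b),
  //                                          <all-true mask>, i32 %evl)
  // -->
  //   %s = add i32 %a, %b
  //   %r = splat(%s)
  // subject to the cost and speculation-safety checks below.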
809 auto IsAllTrueMask = [](Value *MaskVal) { 810 if (Value *SplattedVal = getSplatValue(MaskVal)) 811 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) 812 return ConstValue->isAllOnesValue(); 813 return false; 814 }; 815 if (!IsAllTrueMask(VPI.getArgOperand(2))) 816 return false; 817 818 // Check to make sure we support scalarization of the intrinsic 819 Intrinsic::ID IntrID = VPI.getIntrinsicID(); 820 if (!VPBinOpIntrinsic::isVPBinOp(IntrID)) 821 return false; 822 823 // Calculate cost of splatting both operands into vectors and the vector 824 // intrinsic 825 VectorType *VecTy = cast<VectorType>(VPI.getType()); 826 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 827 SmallVector<int> Mask; 828 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy)) 829 Mask.resize(FVTy->getNumElements(), 0); 830 InstructionCost SplatCost = 831 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + 832 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask); 833 834 // Calculate the cost of the VP Intrinsic 835 SmallVector<Type *, 4> Args; 836 for (Value *V : VPI.args()) 837 Args.push_back(V->getType()); 838 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); 839 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 840 InstructionCost OldCost = 2 * SplatCost + VectorOpCost; 841 842 // Determine scalar opcode 843 std::optional<unsigned> FunctionalOpcode = 844 VPI.getFunctionalOpcode(); 845 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; 846 if (!FunctionalOpcode) { 847 ScalarIntrID = VPI.getFunctionalIntrinsicID(); 848 if (!ScalarIntrID) 849 return false; 850 } 851 852 // Calculate cost of scalarizing 853 InstructionCost ScalarOpCost = 0; 854 if (ScalarIntrID) { 855 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); 856 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 857 } else { 858 ScalarOpCost = 859 TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType()); 860 } 861 862 // The existing splats may be kept around if other instructions use them. 863 InstructionCost CostToKeepSplats = 864 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); 865 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; 866 867 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI 868 << "\n"); 869 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost 870 << ", Cost of scalarizing:" << NewCost << "\n"); 871 872 // We want to scalarize unless the vector variant actually has lower cost. 873 if (OldCost < NewCost || !NewCost.isValid()) 874 return false; 875 876 // Scalarize the intrinsic 877 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount(); 878 Value *EVL = VPI.getArgOperand(3); 879 880 // If the VP op might introduce UB or poison, we can scalarize it provided 881 // that we know the EVL > 0: If the EVL is zero, then the original VP op 882 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by 883 // scalarizing it. 884 bool SafeToSpeculate; 885 if (ScalarIntrID) 886 SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID) 887 .hasFnAttr(Attribute::AttrKind::Speculatable); 888 else 889 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( 890 *FunctionalOpcode, &VPI, nullptr, &AC, &DT); 891 if (!SafeToSpeculate && 892 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI))) 893 return false; 894 895 Value *ScalarVal = 896 ScalarIntrID 897 ? 
Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID, 898 {ScalarOp0, ScalarOp1}) 899 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode), 900 ScalarOp0, ScalarOp1); 901 902 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal)); 903 return true; 904 } 905 906 /// Match a vector binop or compare instruction with at least one inserted 907 /// scalar operand and convert to scalar binop/cmp followed by insertelement. 908 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { 909 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 910 Value *Ins0, *Ins1; 911 if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && 912 !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) 913 return false; 914 915 // Do not convert the vector condition of a vector select into a scalar 916 // condition. That may cause problems for codegen because of differences in 917 // boolean formats and register-file transfers. 918 // TODO: Can we account for that in the cost model? 919 bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; 920 if (IsCmp) 921 for (User *U : I.users()) 922 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) 923 return false; 924 925 // Match against one or both scalar values being inserted into constant 926 // vectors: 927 // vec_op VecC0, (inselt VecC1, V1, Index) 928 // vec_op (inselt VecC0, V0, Index), VecC1 929 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) 930 // TODO: Deal with mismatched index constants and variable indexes? 931 Constant *VecC0 = nullptr, *VecC1 = nullptr; 932 Value *V0 = nullptr, *V1 = nullptr; 933 uint64_t Index0 = 0, Index1 = 0; 934 if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), 935 m_ConstantInt(Index0))) && 936 !match(Ins0, m_Constant(VecC0))) 937 return false; 938 if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), 939 m_ConstantInt(Index1))) && 940 !match(Ins1, m_Constant(VecC1))) 941 return false; 942 943 bool IsConst0 = !V0; 944 bool IsConst1 = !V1; 945 if (IsConst0 && IsConst1) 946 return false; 947 if (!IsConst0 && !IsConst1 && Index0 != Index1) 948 return false; 949 950 // Bail for single insertion if it is a load. 951 // TODO: Handle this once getVectorInstrCost can cost for load/stores. 952 auto *I0 = dyn_cast_or_null<Instruction>(V0); 953 auto *I1 = dyn_cast_or_null<Instruction>(V1); 954 if ((IsConst0 && I1 && I1->mayReadFromMemory()) || 955 (IsConst1 && I0 && I0->mayReadFromMemory())) 956 return false; 957 958 uint64_t Index = IsConst0 ? Index1 : Index0; 959 Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); 960 Type *VecTy = I.getType(); 961 assert(VecTy->isVectorTy() && 962 (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && 963 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || 964 ScalarTy->isPointerTy()) && 965 "Unexpected types for insert element into binop or cmp"); 966 967 unsigned Opcode = I.getOpcode(); 968 InstructionCost ScalarOpCost, VectorOpCost; 969 if (IsCmp) { 970 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 971 ScalarOpCost = TTI.getCmpSelInstrCost( 972 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); 973 VectorOpCost = TTI.getCmpSelInstrCost( 974 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); 975 } else { 976 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); 977 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 978 } 979 980 // Get cost estimate for the insert element. This cost will factor into 981 // both sequences. 
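  // Illustrative example of the rewrite whose cost is being compared here
  // (hypothetical IR):
  //   %i = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 1
  //   %r = add <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  // -->
  //   %s = add i32 %x, 10
  //   %r = insertelement <4 x i32> <i32 11, i32 12, i32 13, i32 14>, i32 %s, i64 1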
982 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 983 InstructionCost InsertCost = TTI.getVectorInstrCost( 984 Instruction::InsertElement, VecTy, CostKind, Index); 985 InstructionCost OldCost = 986 (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; 987 InstructionCost NewCost = ScalarOpCost + InsertCost + 988 (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + 989 (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); 990 991 // We want to scalarize unless the vector variant actually has lower cost. 992 if (OldCost < NewCost || !NewCost.isValid()) 993 return false; 994 995 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> 996 // inselt NewVecC, (scalar_op V0, V1), Index 997 if (IsCmp) 998 ++NumScalarCmp; 999 else 1000 ++NumScalarBO; 1001 1002 // For constant cases, extract the scalar element, this should constant fold. 1003 if (IsConst0) 1004 V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); 1005 if (IsConst1) 1006 V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); 1007 1008 Value *Scalar = 1009 IsCmp ? Builder.CreateCmp(Pred, V0, V1) 1010 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); 1011 1012 Scalar->setName(I.getName() + ".scalar"); 1013 1014 // All IR flags are safe to back-propagate. There is no potential for extra 1015 // poison to be created by the scalar instruction. 1016 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) 1017 ScalarInst->copyIRFlags(&I); 1018 1019 // Fold the vector constants in the original vectors into a new base vector. 1020 Value *NewVecC = 1021 IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) 1022 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); 1023 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); 1024 replaceValue(I, *Insert); 1025 return true; 1026 } 1027 1028 /// Try to combine a scalar binop + 2 scalar compares of extracted elements of 1029 /// a vector into vector operations followed by extract. Note: The SLP pass 1030 /// may miss this pattern because of implementation problems. 1031 bool VectorCombine::foldExtractedCmps(Instruction &I) { 1032 // We are looking for a scalar binop of booleans. 1033 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) 1034 if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1)) 1035 return false; 1036 1037 // The compare predicates should match, and each compare should have a 1038 // constant operand. 1039 // TODO: Relax the one-use constraints. 1040 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); 1041 Instruction *I0, *I1; 1042 Constant *C0, *C1; 1043 CmpInst::Predicate P0, P1; 1044 if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || 1045 !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || 1046 P0 != P1) 1047 return false; 1048 1049 // The compare operands must be extracts of the same vector with constant 1050 // extract indexes. 1051 // TODO: Relax the one-use constraints. 
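  // Illustrative instance of the full scalar pattern (hypothetical IR):
  //   %e0 = extractelement <4 x i32> %x, i64 0
  //   %e1 = extractelement <4 x i32> %x, i64 1
  //   %c0 = icmp sgt i32 %e0, 42
  //   %c1 = icmp sgt i32 %e1, 99
  //   %r  = and i1 %c0, %c1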
1052 Value *X; 1053 uint64_t Index0, Index1; 1054 if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || 1055 !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) 1056 return false; 1057 1058 auto *Ext0 = cast<ExtractElementInst>(I0); 1059 auto *Ext1 = cast<ExtractElementInst>(I1); 1060 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); 1061 if (!ConvertToShuf) 1062 return false; 1063 1064 // The original scalar pattern is: 1065 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) 1066 CmpInst::Predicate Pred = P0; 1067 unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp 1068 : Instruction::ICmp; 1069 auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); 1070 if (!VecTy) 1071 return false; 1072 1073 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1074 InstructionCost OldCost = 1075 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 1076 OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 1077 OldCost += 1078 TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), 1079 CmpInst::makeCmpResultType(I0->getType()), Pred) * 1080 2; 1081 OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); 1082 1083 // The proposed vector pattern is: 1084 // vcmp = cmp Pred X, VecC 1085 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 1086 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; 1087 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; 1088 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); 1089 InstructionCost NewCost = TTI.getCmpSelInstrCost( 1090 CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); 1091 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 1092 ShufMask[CheapIndex] = ExpensiveIndex; 1093 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, 1094 ShufMask); 1095 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); 1096 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); 1097 1098 // Aggressively form vector ops if the cost is equal because the transform 1099 // may enable further optimization. 1100 // Codegen can reverse this transform (scalarize) if it was not profitable. 1101 if (OldCost < NewCost || !NewCost.isValid()) 1102 return false; 1103 1104 // Create a vector constant from the 2 scalar constants. 
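  // e.g. for a <4 x i32> source with Index0 == 0, C0 == 42, Index1 == 2 and
  // C1 == 7, this builds <i32 42, i32 poison, i32 7, i32 poison>
  // (illustrative values only).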
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   PoisonValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  ++NumVecCmpBO;
  return true;
}

// Check if the memory location is modified between the two given instructions
// (which must be in the same basic block).
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}

namespace {
/// Helper class to indicate whether a vector index can be safely scalarized
/// and if a freeze needs to be inserted.
class ScalarizationResult {
  enum class StatusTy { Unsafe, Safe, SafeWithFreeze };

  StatusTy Status;
  Value *ToFreeze;

  ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
      : Status(Status), ToFreeze(ToFreeze) {}

public:
  ScalarizationResult(const ScalarizationResult &Other) = default;
  ~ScalarizationResult() {
    assert(!ToFreeze && "freeze() not called with ToFreeze being set");
  }

  static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
  static ScalarizationResult safe() { return {StatusTy::Safe}; }
  static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
    return {StatusTy::SafeWithFreeze, ToFreeze};
  }

  /// Returns true if the index can be scalarized without requiring a freeze.
  bool isSafe() const { return Status == StatusTy::Safe; }
  /// Returns true if the index cannot be scalarized.
  bool isUnsafe() const { return Status == StatusTy::Unsafe; }
  /// Returns true if the index can be scalarized, but requires inserting a
  /// freeze.
  bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }

  /// Reset the state to Unsafe and clear ToFreeze if set.
  void discard() {
    ToFreeze = nullptr;
    Status = StatusTy::Unsafe;
  }

  /// Freeze ToFreeze and update the use in \p UserI to use it.
  void freeze(IRBuilder<> &Builder, Instruction &UserI) {
    assert(isSafeWithFreeze() &&
           "should only be used when freezing is required");
    assert(is_contained(ToFreeze->users(), &UserI) &&
           "UserI must be a user of ToFreeze");
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(cast<Instruction>(&UserI));
    Value *Frozen =
        Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
    for (Use &U : make_early_inc_range((UserI.operands())))
      if (U.get() == ToFreeze)
        U.set(Frozen);

    ToFreeze = nullptr;
  }
};
} // namespace

/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
                                              Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) {
  // We do checks for both fixed vector types and scalable vector types.
1195 // This is the number of elements of fixed vector types, 1196 // or the minimum number of elements of scalable vector types. 1197 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); 1198 1199 if (auto *C = dyn_cast<ConstantInt>(Idx)) { 1200 if (C->getValue().ult(NumElements)) 1201 return ScalarizationResult::safe(); 1202 return ScalarizationResult::unsafe(); 1203 } 1204 1205 unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); 1206 APInt Zero(IntWidth, 0); 1207 APInt MaxElts(IntWidth, NumElements); 1208 ConstantRange ValidIndices(Zero, MaxElts); 1209 ConstantRange IdxRange(IntWidth, true); 1210 1211 if (isGuaranteedNotToBePoison(Idx, &AC)) { 1212 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, 1213 true, &AC, CtxI, &DT))) 1214 return ScalarizationResult::safe(); 1215 return ScalarizationResult::unsafe(); 1216 } 1217 1218 // If the index may be poison, check if we can insert a freeze before the 1219 // range of the index is restricted. 1220 Value *IdxBase; 1221 ConstantInt *CI; 1222 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) { 1223 IdxRange = IdxRange.binaryAnd(CI->getValue()); 1224 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) { 1225 IdxRange = IdxRange.urem(CI->getValue()); 1226 } 1227 1228 if (ValidIndices.contains(IdxRange)) 1229 return ScalarizationResult::safeWithFreeze(IdxBase); 1230 return ScalarizationResult::unsafe(); 1231 } 1232 1233 /// The memory operation on a vector of \p ScalarType had alignment of 1234 /// \p VectorAlignment. Compute the maximal, but conservatively correct, 1235 /// alignment that will be valid for the memory operation on a single scalar 1236 /// element of the same type with index \p Idx. 1237 static Align computeAlignmentAfterScalarization(Align VectorAlignment, 1238 Type *ScalarType, Value *Idx, 1239 const DataLayout &DL) { 1240 if (auto *C = dyn_cast<ConstantInt>(Idx)) 1241 return commonAlignment(VectorAlignment, 1242 C->getZExtValue() * DL.getTypeStoreSize(ScalarType)); 1243 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType)); 1244 } 1245 1246 // Combine patterns like: 1247 // %0 = load <4 x i32>, <4 x i32>* %a 1248 // %1 = insertelement <4 x i32> %0, i32 %b, i32 1 1249 // store <4 x i32> %1, <4 x i32>* %a 1250 // to: 1251 // %0 = bitcast <4 x i32>* %a to i32* 1252 // %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1 1253 // store i32 %b, i32* %1 1254 bool VectorCombine::foldSingleElementStore(Instruction &I) { 1255 auto *SI = cast<StoreInst>(&I); 1256 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType())) 1257 return false; 1258 1259 // TODO: Combine more complicated patterns (multiple insert) by referencing 1260 // TargetTransformInfo. 1261 Instruction *Source; 1262 Value *NewElement; 1263 Value *Idx; 1264 if (!match(SI->getValueOperand(), 1265 m_InsertElt(m_Instruction(Source), m_Value(NewElement), 1266 m_Value(Idx)))) 1267 return false; 1268 1269 if (auto *Load = dyn_cast<LoadInst>(Source)) { 1270 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType()); 1271 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); 1272 // Don't optimize for atomic/volatile load or store. Ensure memory is not 1273 // modified between, vector type matches store size, and index is inbounds. 
1274 if (!Load->isSimple() || Load->getParent() != SI->getParent() || 1275 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || 1276 SrcAddr != SI->getPointerOperand()->stripPointerCasts()) 1277 return false; 1278 1279 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT); 1280 if (ScalarizableIdx.isUnsafe() || 1281 isMemModifiedBetween(Load->getIterator(), SI->getIterator(), 1282 MemoryLocation::get(SI), AA)) 1283 return false; 1284 1285 if (ScalarizableIdx.isSafeWithFreeze()) 1286 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); 1287 Value *GEP = Builder.CreateInBoundsGEP( 1288 SI->getValueOperand()->getType(), SI->getPointerOperand(), 1289 {ConstantInt::get(Idx->getType(), 0), Idx}); 1290 StoreInst *NSI = Builder.CreateStore(NewElement, GEP); 1291 NSI->copyMetadata(*SI); 1292 Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1293 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx, 1294 *DL); 1295 NSI->setAlignment(ScalarOpAlignment); 1296 replaceValue(I, *NSI); 1297 eraseInstruction(I); 1298 return true; 1299 } 1300 1301 return false; 1302 } 1303 1304 /// Try to scalarize vector loads feeding extractelement instructions. 1305 bool VectorCombine::scalarizeLoadExtract(Instruction &I) { 1306 Value *Ptr; 1307 if (!match(&I, m_Load(m_Value(Ptr)))) 1308 return false; 1309 1310 auto *VecTy = cast<VectorType>(I.getType()); 1311 auto *LI = cast<LoadInst>(&I); 1312 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) 1313 return false; 1314 1315 InstructionCost OriginalCost = 1316 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), 1317 LI->getPointerAddressSpace()); 1318 InstructionCost ScalarizedCost = 0; 1319 1320 Instruction *LastCheckedInst = LI; 1321 unsigned NumInstChecked = 0; 1322 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; 1323 auto FailureGuard = make_scope_exit([&]() { 1324 // If the transform is aborted, discard the ScalarizationResults. 1325 for (auto &Pair : NeedFreeze) 1326 Pair.second.discard(); 1327 }); 1328 1329 // Check if all users of the load are extracts with no memory modifications 1330 // between the load and the extract. Compute the cost of both the original 1331 // code and the scalarized version. 1332 for (User *U : LI->users()) { 1333 auto *UI = dyn_cast<ExtractElementInst>(U); 1334 if (!UI || UI->getParent() != LI->getParent()) 1335 return false; 1336 1337 // Check if any instruction between the load and the extract may modify 1338 // memory. 1339 if (LastCheckedInst->comesBefore(UI)) { 1340 for (Instruction &I : 1341 make_range(std::next(LI->getIterator()), UI->getIterator())) { 1342 // Bail out if we reached the check limit or the instruction may write 1343 // to memory. 1344 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) 1345 return false; 1346 NumInstChecked++; 1347 } 1348 LastCheckedInst = UI; 1349 } 1350 1351 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); 1352 if (ScalarIdx.isUnsafe()) 1353 return false; 1354 if (ScalarIdx.isSafeWithFreeze()) { 1355 NeedFreeze.try_emplace(UI, ScalarIdx); 1356 ScalarIdx.discard(); 1357 } 1358 1359 auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); 1360 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1361 OriginalCost += 1362 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, 1363 Index ? 
Index->getZExtValue() : -1); 1364 ScalarizedCost += 1365 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), 1366 Align(1), LI->getPointerAddressSpace()); 1367 ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); 1368 } 1369 1370 if (ScalarizedCost >= OriginalCost) 1371 return false; 1372 1373 // Replace extracts with narrow scalar loads. 1374 for (User *U : LI->users()) { 1375 auto *EI = cast<ExtractElementInst>(U); 1376 Value *Idx = EI->getOperand(1); 1377 1378 // Insert 'freeze' for poison indexes. 1379 auto It = NeedFreeze.find(EI); 1380 if (It != NeedFreeze.end()) 1381 It->second.freeze(Builder, *cast<Instruction>(Idx)); 1382 1383 Builder.SetInsertPoint(EI); 1384 Value *GEP = 1385 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); 1386 auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( 1387 VecTy->getElementType(), GEP, EI->getName() + ".scalar")); 1388 1389 Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1390 LI->getAlign(), VecTy->getElementType(), Idx, *DL); 1391 NewLoad->setAlignment(ScalarOpAlignment); 1392 1393 replaceValue(*EI, *NewLoad); 1394 } 1395 1396 FailureGuard.release(); 1397 return true; 1398 } 1399 1400 /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". 1401 bool VectorCombine::foldShuffleOfBinops(Instruction &I) { 1402 BinaryOperator *B0, *B1; 1403 ArrayRef<int> OldMask; 1404 if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), 1405 m_Mask(OldMask)))) 1406 return false; 1407 1408 // Don't introduce poison into div/rem. 1409 if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) && 1410 B0->isIntDivRem()) 1411 return false; 1412 1413 // TODO: Add support for addlike etc. 1414 Instruction::BinaryOps Opcode = B0->getOpcode(); 1415 if (Opcode != B1->getOpcode()) 1416 return false; 1417 1418 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1419 auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType()); 1420 if (!ShuffleDstTy || !BinOpTy) 1421 return false; 1422 1423 unsigned NumSrcElts = BinOpTy->getNumElements(); 1424 1425 // If we have something like "add X, Y" and "add Z, X", swap ops to match. 1426 Value *X = B0->getOperand(0), *Y = B0->getOperand(1); 1427 Value *Z = B1->getOperand(0), *W = B1->getOperand(1); 1428 if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W && 1429 (X == W || Y == Z)) 1430 std::swap(X, Y); 1431 1432 auto ConvertToUnary = [NumSrcElts](int &M) { 1433 if (M >= (int)NumSrcElts) 1434 M -= NumSrcElts; 1435 }; 1436 1437 SmallVector<int> NewMask0(OldMask.begin(), OldMask.end()); 1438 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc; 1439 if (X == Z) { 1440 llvm::for_each(NewMask0, ConvertToUnary); 1441 SK0 = TargetTransformInfo::SK_PermuteSingleSrc; 1442 Z = PoisonValue::get(BinOpTy); 1443 } 1444 1445 SmallVector<int> NewMask1(OldMask.begin(), OldMask.end()); 1446 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc; 1447 if (Y == W) { 1448 llvm::for_each(NewMask1, ConvertToUnary); 1449 SK1 = TargetTransformInfo::SK_PermuteSingleSrc; 1450 W = PoisonValue::get(BinOpTy); 1451 } 1452 1453 // Try to replace a binop with a shuffle if the shuffle is not costly. 
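  // Illustrative IR for the rewrite being costed here (value names made up):
  //   %a = add <4 x i32> %x, %y
  //   %b = add <4 x i32> %z, %w
  //   %r = shufflevector <4 x i32> %a, <4 x i32> %b,
  //                      <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  // -->
  //   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z,
  //                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  //   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w,
  //                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  //   %r  = add <4 x i32> %s0, %s1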
1454 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1455 1456 InstructionCost OldCost = 1457 TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) + 1458 TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) + 1459 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, 1460 OldMask, CostKind, 0, nullptr, {B0, B1}, &I); 1461 1462 InstructionCost NewCost = 1463 TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + 1464 TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) + 1465 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); 1466 1467 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I 1468 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1469 << "\n"); 1470 if (NewCost >= OldCost) 1471 return false; 1472 1473 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); 1474 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); 1475 Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); 1476 1477 // Intersect flags from the old binops. 1478 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { 1479 NewInst->copyIRFlags(B0); 1480 NewInst->andIRFlags(B1); 1481 } 1482 1483 Worklist.pushValue(Shuf0); 1484 Worklist.pushValue(Shuf1); 1485 replaceValue(I, *NewBO); 1486 return true; 1487 } 1488 1489 /// Try to convert "shuffle (castop), (castop)" with a shared castop operand 1490 /// into "castop (shuffle)". 1491 bool VectorCombine::foldShuffleOfCastops(Instruction &I) { 1492 Value *V0, *V1; 1493 ArrayRef<int> OldMask; 1494 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask)))) 1495 return false; 1496 1497 auto *C0 = dyn_cast<CastInst>(V0); 1498 auto *C1 = dyn_cast<CastInst>(V1); 1499 if (!C0 || !C1) 1500 return false; 1501 1502 Instruction::CastOps Opcode = C0->getOpcode(); 1503 if (C0->getSrcTy() != C1->getSrcTy()) 1504 return false; 1505 1506 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. 1507 if (Opcode != C1->getOpcode()) { 1508 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) 1509 Opcode = Instruction::SExt; 1510 else 1511 return false; 1512 } 1513 1514 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1515 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy()); 1516 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy()); 1517 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy) 1518 return false; 1519 1520 unsigned NumSrcElts = CastSrcTy->getNumElements(); 1521 unsigned NumDstElts = CastDstTy->getNumElements(); 1522 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) && 1523 "Only bitcasts expected to alter src/dst element counts"); 1524 1525 // Check for bitcasting of unscalable vector types. 1526 // e.g. <32 x i40> -> <40 x i32> 1527 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 && 1528 (NumDstElts % NumSrcElts) != 0) 1529 return false; 1530 1531 SmallVector<int, 16> NewMask; 1532 if (NumSrcElts >= NumDstElts) { 1533 // The bitcast is from wide to narrow/equal elements. The shuffle mask can 1534 // always be expanded to the equivalent form choosing narrower elements. 1535 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask"); 1536 unsigned ScaleFactor = NumSrcElts / NumDstElts; 1537 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask); 1538 } else { 1539 // The bitcast is from narrow elements to wide elements. The shuffle mask 1540 // must choose consecutive elements to allow casting first. 
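    // Illustrative example (value names made up; lanes shown in the order the
    // mask utilities assume):
    //   %c0 = bitcast <2 x i64> %a to <4 x i32>
    //   %c1 = bitcast <2 x i64> %b to <4 x i32>
    //   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1,
    //                       <4 x i32> <i32 2, i32 3, i32 4, i32 5>
    // the i32 mask <2,3,4,5> widens to the i64 mask <1,2>, giving
    //   %s = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
    //   %r = bitcast <2 x i64> %s to <4 x i32>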
1541 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask"); 1542 unsigned ScaleFactor = NumDstElts / NumSrcElts; 1543 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask)) 1544 return false; 1545 } 1546 1547 auto *NewShuffleDstTy = 1548 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size()); 1549 1550 // Try to replace a castop with a shuffle if the shuffle is not costly. 1551 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1552 1553 InstructionCost CostC0 = 1554 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, 1555 TTI::CastContextHint::None, CostKind); 1556 InstructionCost CostC1 = 1557 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, 1558 TTI::CastContextHint::None, CostKind); 1559 InstructionCost OldCost = CostC0 + CostC1; 1560 OldCost += 1561 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy, 1562 OldMask, CostKind, 0, nullptr, std::nullopt, &I); 1563 1564 InstructionCost NewCost = TTI.getShuffleCost( 1565 TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind); 1566 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, 1567 TTI::CastContextHint::None, CostKind); 1568 if (!C0->hasOneUse()) 1569 NewCost += CostC0; 1570 if (!C1->hasOneUse()) 1571 NewCost += CostC1; 1572 1573 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I 1574 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1575 << "\n"); 1576 if (NewCost > OldCost) 1577 return false; 1578 1579 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0), 1580 C1->getOperand(0), NewMask); 1581 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); 1582 1583 // Intersect flags from the old casts. 1584 if (auto *NewInst = dyn_cast<Instruction>(Cast)) { 1585 NewInst->copyIRFlags(C0); 1586 NewInst->andIRFlags(C1); 1587 } 1588 1589 Worklist.pushValue(Shuf); 1590 replaceValue(I, *Cast); 1591 return true; 1592 } 1593 1594 /// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)" 1595 /// into "shuffle x, y". 1596 bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { 1597 Value *V0, *V1; 1598 UndefValue *U0, *U1; 1599 ArrayRef<int> OuterMask, InnerMask0, InnerMask1; 1600 if (!match(&I, m_Shuffle(m_OneUse(m_Shuffle(m_Value(V0), m_UndefValue(U0), 1601 m_Mask(InnerMask0))), 1602 m_OneUse(m_Shuffle(m_Value(V1), m_UndefValue(U1), 1603 m_Mask(InnerMask1))), 1604 m_Mask(OuterMask)))) 1605 return false; 1606 1607 auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0)); 1608 auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1)); 1609 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1610 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType()); 1611 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType()); 1612 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy || 1613 V0->getType() != V1->getType()) 1614 return false; 1615 1616 unsigned NumSrcElts = ShuffleSrcTy->getNumElements(); 1617 unsigned NumImmElts = ShuffleImmTy->getNumElements(); 1618 1619 // Bail if either inner masks reference a RHS undef arg. 
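  // (Those references are rewritten to PoisonMaskElem below; if the RHS arg is
  // undef rather than poison, that rewrite would strengthen undef lanes into
  // poison, so be conservative and bail.)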
1620 if ((!isa<PoisonValue>(U0) && 1621 any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) || 1622 (!isa<PoisonValue>(U1) && 1623 any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; }))) 1624 return false; 1625 1626 // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem, 1627 SmallVector<int, 16> NewMask(OuterMask.begin(), OuterMask.end()); 1628 for (int &M : NewMask) { 1629 if (0 <= M && M < (int)NumImmElts) { 1630 M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M]; 1631 } else if (M >= (int)NumImmElts) { 1632 if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts) 1633 M = PoisonMaskElem; 1634 else 1635 M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts); 1636 } 1637 } 1638 1639 // Have we folded to an Identity shuffle? 1640 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) { 1641 replaceValue(I, *V0); 1642 return true; 1643 } 1644 1645 // Try to merge the shuffles if the new shuffle is not costly. 1646 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1647 1648 InstructionCost OldCost = 1649 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, 1650 InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) + 1651 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, 1652 InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) + 1653 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, 1654 OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I); 1655 1656 InstructionCost NewCost = 1657 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, 1658 NewMask, CostKind, 0, nullptr, {V0, V1}); 1659 1660 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I 1661 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1662 << "\n"); 1663 if (NewCost > OldCost) 1664 return false; 1665 1666 // Clear unused sources to poison. 1667 if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; })) 1668 V0 = PoisonValue::get(ShuffleSrcTy); 1669 if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; })) 1670 V1 = PoisonValue::get(ShuffleSrcTy); 1671 1672 Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask); 1673 replaceValue(I, *Shuf); 1674 return true; 1675 } 1676 1677 using InstLane = std::pair<Use *, int>; 1678 1679 static InstLane lookThroughShuffles(Use *U, int Lane) { 1680 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) { 1681 unsigned NumElts = 1682 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); 1683 int M = SV->getMaskValue(Lane); 1684 if (M < 0) 1685 return {nullptr, PoisonMaskElem}; 1686 if (static_cast<unsigned>(M) < NumElts) { 1687 U = &SV->getOperandUse(0); 1688 Lane = M; 1689 } else { 1690 U = &SV->getOperandUse(1); 1691 Lane = M - NumElts; 1692 } 1693 } 1694 return InstLane{U, Lane}; 1695 } 1696 1697 static SmallVector<InstLane> 1698 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { 1699 SmallVector<InstLane> NItem; 1700 for (InstLane IL : Item) { 1701 auto [U, Lane] = IL; 1702 InstLane OpLane = 1703 U ? 
lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op), 1704 Lane) 1705 : InstLane{nullptr, PoisonMaskElem}; 1706 NItem.emplace_back(OpLane); 1707 } 1708 return NItem; 1709 } 1710 1711 /// Detect concat of multiple values into a vector 1712 static bool isFreeConcat(ArrayRef<InstLane> Item, 1713 const TargetTransformInfo &TTI) { 1714 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType()); 1715 unsigned NumElts = Ty->getNumElements(); 1716 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0) 1717 return false; 1718 1719 // Check that the concat is free, usually meaning that the type will be split 1720 // during legalization. 1721 SmallVector<int, 16> ConcatMask(NumElts * 2); 1722 std::iota(ConcatMask.begin(), ConcatMask.end(), 0); 1723 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, 1724 TTI::TCK_RecipThroughput) != 0) 1725 return false; 1726 1727 unsigned NumSlices = Item.size() / NumElts; 1728 // Currently we generate a tree of shuffles for the concats, which limits us 1729 // to a power2. 1730 if (!isPowerOf2_32(NumSlices)) 1731 return false; 1732 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) { 1733 Use *SliceV = Item[Slice * NumElts].first; 1734 if (!SliceV || SliceV->get()->getType() != Ty) 1735 return false; 1736 for (unsigned Elt = 0; Elt < NumElts; ++Elt) { 1737 auto [V, Lane] = Item[Slice * NumElts + Elt]; 1738 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get()) 1739 return false; 1740 } 1741 } 1742 return true; 1743 } 1744 1745 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty, 1746 const SmallPtrSet<Use *, 4> &IdentityLeafs, 1747 const SmallPtrSet<Use *, 4> &SplatLeafs, 1748 const SmallPtrSet<Use *, 4> &ConcatLeafs, 1749 IRBuilder<> &Builder) { 1750 auto [FrontU, FrontLane] = Item.front(); 1751 1752 if (IdentityLeafs.contains(FrontU)) { 1753 return FrontU->get(); 1754 } 1755 if (SplatLeafs.contains(FrontU)) { 1756 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane); 1757 return Builder.CreateShuffleVector(FrontU->get(), Mask); 1758 } 1759 if (ConcatLeafs.contains(FrontU)) { 1760 unsigned NumElts = 1761 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements(); 1762 SmallVector<Value *> Values(Item.size() / NumElts, nullptr); 1763 for (unsigned S = 0; S < Values.size(); ++S) 1764 Values[S] = Item[S * NumElts].first->get(); 1765 1766 while (Values.size() > 1) { 1767 NumElts *= 2; 1768 SmallVector<int, 16> Mask(NumElts, 0); 1769 std::iota(Mask.begin(), Mask.end(), 0); 1770 SmallVector<Value *> NewValues(Values.size() / 2, nullptr); 1771 for (unsigned S = 0; S < NewValues.size(); ++S) 1772 NewValues[S] = 1773 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask); 1774 Values = NewValues; 1775 } 1776 return Values[0]; 1777 } 1778 1779 auto *I = cast<Instruction>(FrontU->get()); 1780 auto *II = dyn_cast<IntrinsicInst>(I); 1781 unsigned NumOps = I->getNumOperands() - (II ? 
1 : 0); 1782 SmallVector<Value *> Ops(NumOps); 1783 for (unsigned Idx = 0; Idx < NumOps; Idx++) { 1784 if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) { 1785 Ops[Idx] = II->getOperand(Idx); 1786 continue; 1787 } 1788 Ops[Idx] = 1789 generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty, 1790 IdentityLeafs, SplatLeafs, ConcatLeafs, Builder); 1791 } 1792 1793 SmallVector<Value *, 8> ValueList; 1794 for (const auto &Lane : Item) 1795 if (Lane.first) 1796 ValueList.push_back(Lane.first->get()); 1797 1798 Type *DstTy = 1799 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements()); 1800 if (auto *BI = dyn_cast<BinaryOperator>(I)) { 1801 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), 1802 Ops[0], Ops[1]); 1803 propagateIRFlags(Value, ValueList); 1804 return Value; 1805 } 1806 if (auto *CI = dyn_cast<CmpInst>(I)) { 1807 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]); 1808 propagateIRFlags(Value, ValueList); 1809 return Value; 1810 } 1811 if (auto *SI = dyn_cast<SelectInst>(I)) { 1812 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI); 1813 propagateIRFlags(Value, ValueList); 1814 return Value; 1815 } 1816 if (auto *CI = dyn_cast<CastInst>(I)) { 1817 auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), 1818 Ops[0], DstTy); 1819 propagateIRFlags(Value, ValueList); 1820 return Value; 1821 } 1822 if (II) { 1823 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops); 1824 propagateIRFlags(Value, ValueList); 1825 return Value; 1826 } 1827 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate"); 1828 auto *Value = 1829 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]); 1830 propagateIRFlags(Value, ValueList); 1831 return Value; 1832 } 1833 1834 // Starting from a shuffle, look up through operands tracking the shuffled index 1835 // of each lane. If we can simplify away the shuffles to identities then 1836 // do so. 1837 bool VectorCombine::foldShuffleToIdentity(Instruction &I) { 1838 auto *Ty = dyn_cast<FixedVectorType>(I.getType()); 1839 if (!Ty || I.use_empty()) 1840 return false; 1841 1842 SmallVector<InstLane> Start(Ty->getNumElements()); 1843 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M) 1844 Start[M] = lookThroughShuffles(&*I.use_begin(), M); 1845 1846 SmallVector<SmallVector<InstLane>> Worklist; 1847 Worklist.push_back(Start); 1848 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs; 1849 unsigned NumVisited = 0; 1850 1851 while (!Worklist.empty()) { 1852 if (++NumVisited > MaxInstrsToScan) 1853 return false; 1854 1855 SmallVector<InstLane> Item = Worklist.pop_back_val(); 1856 auto [FrontU, FrontLane] = Item.front(); 1857 1858 // If we found an undef first lane then bail out to keep things simple. 1859 if (!FrontU) 1860 return false; 1861 1862 // Helper to peek through bitcasts to the same value. 1863 auto IsEquiv = [&](Value *X, Value *Y) { 1864 return X->getType() == Y->getType() && 1865 peekThroughBitcasts(X) == peekThroughBitcasts(Y); 1866 }; 1867 1868 // Look for an identity value. 
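    // Illustrative: with Ty == <4 x i32>, an Item whose lanes are
    // (%x,0), (%x,1), (%x,2), (%x,3) for a single <4 x i32> value %x is an
    // identity leaf and needs no shuffle at all.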
1869 if (FrontLane == 0 && 1870 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() == 1871 Ty->getNumElements() && 1872 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) { 1873 Value *FrontV = Item.front().first->get(); 1874 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) && 1875 E.value().second == (int)E.index()); 1876 })) { 1877 IdentityLeafs.insert(FrontU); 1878 continue; 1879 } 1880 // Look for constants, for the moment only supporting constant splats. 1881 if (auto *C = dyn_cast<Constant>(FrontU); 1882 C && C->getSplatValue() && 1883 all_of(drop_begin(Item), [Item](InstLane &IL) { 1884 Value *FrontV = Item.front().first->get(); 1885 Use *U = IL.first; 1886 return !U || U->get() == FrontV; 1887 })) { 1888 SplatLeafs.insert(FrontU); 1889 continue; 1890 } 1891 // Look for a splat value. 1892 if (all_of(drop_begin(Item), [Item](InstLane &IL) { 1893 auto [FrontU, FrontLane] = Item.front(); 1894 auto [U, Lane] = IL; 1895 return !U || (U->get() == FrontU->get() && Lane == FrontLane); 1896 })) { 1897 SplatLeafs.insert(FrontU); 1898 continue; 1899 } 1900 1901 // We need each element to be the same type of value, and check that each 1902 // element has a single use. 1903 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) { 1904 Value *FrontV = Item.front().first->get(); 1905 if (!IL.first) 1906 return true; 1907 Value *V = IL.first->get(); 1908 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse()) 1909 return false; 1910 if (V->getValueID() != FrontV->getValueID()) 1911 return false; 1912 if (auto *CI = dyn_cast<CmpInst>(V)) 1913 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate()) 1914 return false; 1915 if (auto *CI = dyn_cast<CastInst>(V)) 1916 if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy()) 1917 return false; 1918 if (auto *SI = dyn_cast<SelectInst>(V)) 1919 if (!isa<VectorType>(SI->getOperand(0)->getType()) || 1920 SI->getOperand(0)->getType() != 1921 cast<SelectInst>(FrontV)->getOperand(0)->getType()) 1922 return false; 1923 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V)) 1924 return false; 1925 auto *II = dyn_cast<IntrinsicInst>(V); 1926 return !II || (isa<IntrinsicInst>(FrontV) && 1927 II->getIntrinsicID() == 1928 cast<IntrinsicInst>(FrontV)->getIntrinsicID() && 1929 !II->hasOperandBundles()); 1930 }; 1931 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) { 1932 // Check the operator is one that we support. 1933 if (isa<BinaryOperator, CmpInst>(FrontU)) { 1934 // We exclude div/rem in case they hit UB from poison lanes. 1935 if (auto *BO = dyn_cast<BinaryOperator>(FrontU); 1936 BO && BO->isIntDivRem()) 1937 return false; 1938 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 1939 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 1940 continue; 1941 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) { 1942 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 1943 continue; 1944 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) { 1945 // TODO: Handle vector widening/narrowing bitcasts. 
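        // Only element-count-preserving bitcasts are handled for now, e.g.
        // <4 x i32> <-> <4 x float>; a widening/narrowing bitcast such as
        // <4 x i32> -> <8 x i16> is skipped here.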
1946 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy()); 1947 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy()); 1948 if (DstTy && SrcTy && 1949 SrcTy->getNumElements() == DstTy->getNumElements()) { 1950 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 1951 continue; 1952 } 1953 } else if (isa<SelectInst>(FrontU)) { 1954 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 1955 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 1956 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); 1957 continue; 1958 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU); 1959 II && isTriviallyVectorizable(II->getIntrinsicID()) && 1960 !II->hasOperandBundles()) { 1961 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { 1962 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { 1963 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { 1964 Value *FrontV = Item.front().first->get(); 1965 Use *U = IL.first; 1966 return !U || (cast<Instruction>(U->get())->getOperand(Op) == 1967 cast<Instruction>(FrontV)->getOperand(Op)); 1968 })) 1969 return false; 1970 continue; 1971 } 1972 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op)); 1973 } 1974 continue; 1975 } 1976 } 1977 1978 if (isFreeConcat(Item, TTI)) { 1979 ConcatLeafs.insert(FrontU); 1980 continue; 1981 } 1982 1983 return false; 1984 } 1985 1986 if (NumVisited <= 1) 1987 return false; 1988 1989 // If we got this far, we know the shuffles are superfluous and can be 1990 // removed. Scan through again and generate the new tree of instructions. 1991 Builder.SetInsertPoint(&I); 1992 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, 1993 ConcatLeafs, Builder); 1994 replaceValue(I, *V); 1995 return true; 1996 } 1997 1998 /// Given a commutative reduction, the order of the input lanes does not alter 1999 /// the results. We can use this to remove certain shuffles feeding the 2000 /// reduction, removing the need to shuffle at all. 2001 bool VectorCombine::foldShuffleFromReductions(Instruction &I) { 2002 auto *II = dyn_cast<IntrinsicInst>(&I); 2003 if (!II) 2004 return false; 2005 switch (II->getIntrinsicID()) { 2006 case Intrinsic::vector_reduce_add: 2007 case Intrinsic::vector_reduce_mul: 2008 case Intrinsic::vector_reduce_and: 2009 case Intrinsic::vector_reduce_or: 2010 case Intrinsic::vector_reduce_xor: 2011 case Intrinsic::vector_reduce_smin: 2012 case Intrinsic::vector_reduce_smax: 2013 case Intrinsic::vector_reduce_umin: 2014 case Intrinsic::vector_reduce_umax: 2015 break; 2016 default: 2017 return false; 2018 } 2019 2020 // Find all the inputs when looking through operations that do not alter the 2021 // lane order (binops, for example). Currently we look for a single shuffle, 2022 // and can ignore splat values. 2023 std::queue<Value *> Worklist; 2024 SmallPtrSet<Value *, 4> Visited; 2025 ShuffleVectorInst *Shuffle = nullptr; 2026 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) 2027 Worklist.push(Op); 2028 2029 while (!Worklist.empty()) { 2030 Value *CV = Worklist.front(); 2031 Worklist.pop(); 2032 if (Visited.contains(CV)) 2033 continue; 2034 2035 // Splats don't change the order, so can be safely ignored. 
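    // (A splat contributes the same value to every lane, so reordering the
    // lanes of the other operands commutes with combining them with the
    // splat.)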
2036     if (isSplatValue(CV))
2037       continue;
2038 
2039     Visited.insert(CV);
2040 
2041     if (auto *CI = dyn_cast<Instruction>(CV)) {
2042       if (CI->isBinaryOp()) {
2043         for (auto *Op : CI->operand_values())
2044           Worklist.push(Op);
2045         continue;
2046       } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
2047         if (Shuffle && Shuffle != SV)
2048           return false;
2049         Shuffle = SV;
2050         continue;
2051       }
2052     }
2053 
2054     // Anything else is currently an unknown node.
2055     return false;
2056   }
2057 
2058   if (!Shuffle)
2059     return false;
2060 
2061   // Check that all uses of the binary ops and shuffles are also included in
2062   // the lane-invariant operations (Visited should be the list of lanewise
2063   // instructions, including the shuffle that we found).
2064   for (auto *V : Visited)
2065     for (auto *U : V->users())
2066       if (!Visited.contains(U) && U != &I)
2067         return false;
2068 
2069   FixedVectorType *VecType =
2070       dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
2071   if (!VecType)
2072     return false;
2073   FixedVectorType *ShuffleInputType =
2074       dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
2075   if (!ShuffleInputType)
2076     return false;
2077   unsigned NumInputElts = ShuffleInputType->getNumElements();
2078 
2079   // Find the mask from sorting the lanes into order. This is most likely to
2080   // become an identity or concat mask. Undef elements are pushed to the end.
2081   SmallVector<int> ConcatMask;
2082   Shuffle->getShuffleMask(ConcatMask);
2083   sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
2084   // In the case of a truncating shuffle it's possible for the mask
2085   // to have an index greater than the size of the resulting vector.
2086   // This requires special handling.
2087   bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
2088   bool UsesSecondVec =
2089       any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
2090 
2091   FixedVectorType *VecTyForCost =
2092       (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
2093   InstructionCost OldCost = TTI.getShuffleCost(
2094       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2095       VecTyForCost, Shuffle->getShuffleMask());
2096   InstructionCost NewCost = TTI.getShuffleCost(
2097       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2098       VecTyForCost, ConcatMask);
2099 
2100   LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
2101                     << "\n");
2102   LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
2103                     << "\n");
2104   if (NewCost < OldCost) {
2105     Builder.SetInsertPoint(Shuffle);
2106     Value *NewShuffle = Builder.CreateShuffleVector(
2107         Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
2108     LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
2109     replaceValue(*Shuffle, *NewShuffle);
2110   }
2111 
2112   // See if we can re-use foldSelectShuffle, getting it to reduce the size of
2113   // the shuffle into a nicer order, as it can ignore the order of the shuffles.
2114   return foldSelectShuffle(*Shuffle, true);
2115 }
2116 
2117 /// Determine if it's more efficient to fold:
2118 /// reduce(trunc(x)) -> trunc(reduce(x)).
2119 /// reduce(sext(x)) -> sext(reduce(x)).
2120 /// reduce(zext(x)) -> zext(reduce(x)).
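/// Illustrative IR (value names made up), subject to the cost checks below:
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// -->
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i16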
2121 bool VectorCombine::foldCastFromReductions(Instruction &I) { 2122 auto *II = dyn_cast<IntrinsicInst>(&I); 2123 if (!II) 2124 return false; 2125 2126 bool TruncOnly = false; 2127 Intrinsic::ID IID = II->getIntrinsicID(); 2128 switch (IID) { 2129 case Intrinsic::vector_reduce_add: 2130 case Intrinsic::vector_reduce_mul: 2131 TruncOnly = true; 2132 break; 2133 case Intrinsic::vector_reduce_and: 2134 case Intrinsic::vector_reduce_or: 2135 case Intrinsic::vector_reduce_xor: 2136 break; 2137 default: 2138 return false; 2139 } 2140 2141 unsigned ReductionOpc = getArithmeticReductionInstruction(IID); 2142 Value *ReductionSrc = I.getOperand(0); 2143 2144 Value *Src; 2145 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) && 2146 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src)))))) 2147 return false; 2148 2149 auto CastOpc = 2150 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode(); 2151 2152 auto *SrcTy = cast<VectorType>(Src->getType()); 2153 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType()); 2154 Type *ResultTy = I.getType(); 2155 2156 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2157 InstructionCost OldCost = TTI.getArithmeticReductionCost( 2158 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind); 2159 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy, 2160 TTI::CastContextHint::None, CostKind, 2161 cast<CastInst>(ReductionSrc)); 2162 InstructionCost NewCost = 2163 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt, 2164 CostKind) + 2165 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(), 2166 TTI::CastContextHint::None, CostKind); 2167 2168 if (OldCost <= NewCost || !NewCost.isValid()) 2169 return false; 2170 2171 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(), 2172 II->getIntrinsicID(), {Src}); 2173 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy); 2174 replaceValue(I, *NewCast); 2175 return true; 2176 } 2177 2178 /// This method looks for groups of shuffles acting on binops, of the form: 2179 /// %x = shuffle ... 2180 /// %y = shuffle ... 2181 /// %a = binop %x, %y 2182 /// %b = binop %x, %y 2183 /// shuffle %a, %b, selectmask 2184 /// We may, especially if the shuffle is wider than legal, be able to convert 2185 /// the shuffle to a form where only parts of a and b need to be computed. On 2186 /// architectures with no obvious "select" shuffle, this can reduce the total 2187 /// number of operations if the target reports them as cheaper. 
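/// Illustrative sketch (made-up values): with a <4 x i32> select mask
/// <0, 5, 2, 7>, only lanes {0, 2} of %a and lanes {1, 3} of %b are used, so
/// %x and %y can be re-packed into narrower groups, the binops performed on
/// just the packed lanes, and the original lane order restored by the final
/// shuffles, subject to the cost comparison below.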
2188 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { 2189 auto *SVI = cast<ShuffleVectorInst>(&I); 2190 auto *VT = cast<FixedVectorType>(I.getType()); 2191 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); 2192 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); 2193 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || 2194 VT != Op0->getType()) 2195 return false; 2196 2197 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); 2198 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); 2199 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); 2200 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); 2201 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); 2202 auto checkSVNonOpUses = [&](Instruction *I) { 2203 if (!I || I->getOperand(0)->getType() != VT) 2204 return true; 2205 return any_of(I->users(), [&](User *U) { 2206 return U != Op0 && U != Op1 && 2207 !(isa<ShuffleVectorInst>(U) && 2208 (InputShuffles.contains(cast<Instruction>(U)) || 2209 isInstructionTriviallyDead(cast<Instruction>(U)))); 2210 }); 2211 }; 2212 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || 2213 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) 2214 return false; 2215 2216 // Collect all the uses that are shuffles that we can transform together. We 2217 // may not have a single shuffle, but a group that can all be transformed 2218 // together profitably. 2219 SmallVector<ShuffleVectorInst *> Shuffles; 2220 auto collectShuffles = [&](Instruction *I) { 2221 for (auto *U : I->users()) { 2222 auto *SV = dyn_cast<ShuffleVectorInst>(U); 2223 if (!SV || SV->getType() != VT) 2224 return false; 2225 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || 2226 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) 2227 return false; 2228 if (!llvm::is_contained(Shuffles, SV)) 2229 Shuffles.push_back(SV); 2230 } 2231 return true; 2232 }; 2233 if (!collectShuffles(Op0) || !collectShuffles(Op1)) 2234 return false; 2235 // From a reduction, we need to be processing a single shuffle, otherwise the 2236 // other uses will not be lane-invariant. 2237 if (FromReduction && Shuffles.size() > 1) 2238 return false; 2239 2240 // Add any shuffle uses for the shuffles we have found, to include them in our 2241 // cost calculations. 2242 if (!FromReduction) { 2243 for (ShuffleVectorInst *SV : Shuffles) { 2244 for (auto *U : SV->users()) { 2245 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); 2246 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT) 2247 Shuffles.push_back(SSV); 2248 } 2249 } 2250 } 2251 2252 // For each of the output shuffles, we try to sort all the first vector 2253 // elements to the beginning, followed by the second array elements at the 2254 // end. If the binops are legalized to smaller vectors, this may reduce total 2255 // number of binops. We compute the ReconstructMask mask needed to convert 2256 // back to the original lane order. 2257 SmallVector<std::pair<int, int>> V1, V2; 2258 SmallVector<SmallVector<int>> OrigReconstructMasks; 2259 int MaxV1Elt = 0, MaxV2Elt = 0; 2260 unsigned NumElts = VT->getNumElements(); 2261 for (ShuffleVectorInst *SVN : Shuffles) { 2262 SmallVector<int> Mask; 2263 SVN->getShuffleMask(Mask); 2264 2265 // Check the operands are the same as the original, or reversed (in which 2266 // case we need to commute the mask). 
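    // e.g. (illustrative) shufflevector %Op1, %Op0, <i32 0, i32 5, i32 2, i32 7>
    // is handled as shufflevector %Op0, %Op1, <i32 4, i32 1, i32 6, i32 3>.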
2267     Value *SVOp0 = SVN->getOperand(0);
2268     Value *SVOp1 = SVN->getOperand(1);
2269     if (isa<UndefValue>(SVOp1)) {
2270       auto *SSV = cast<ShuffleVectorInst>(SVOp0);
2271       SVOp0 = SSV->getOperand(0);
2272       SVOp1 = SSV->getOperand(1);
2273       for (unsigned I = 0, E = Mask.size(); I != E; I++) {
2274         if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
2275           return false;
2276         Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
2277       }
2278     }
2279     if (SVOp0 == Op1 && SVOp1 == Op0) {
2280       std::swap(SVOp0, SVOp1);
2281       ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
2282     }
2283     if (SVOp0 != Op0 || SVOp1 != Op1)
2284       return false;
2285 
2286     // Calculate the reconstruction mask for this shuffle, as the mask needed
2287     // to take the packed values from Op0/Op1 and reconstruct the original
2288     // order.
2289     SmallVector<int> ReconstructMask;
2290     for (unsigned I = 0; I < Mask.size(); I++) {
2291       if (Mask[I] < 0) {
2292         ReconstructMask.push_back(-1);
2293       } else if (Mask[I] < static_cast<int>(NumElts)) {
2294         MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
2295         auto It = find_if(V1, [&](const std::pair<int, int> &A) {
2296           return Mask[I] == A.first;
2297         });
2298         if (It != V1.end())
2299           ReconstructMask.push_back(It - V1.begin());
2300         else {
2301           ReconstructMask.push_back(V1.size());
2302           V1.emplace_back(Mask[I], V1.size());
2303         }
2304       } else {
2305         MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
2306         auto It = find_if(V2, [&](const std::pair<int, int> &A) {
2307           return Mask[I] - static_cast<int>(NumElts) == A.first;
2308         });
2309         if (It != V2.end())
2310           ReconstructMask.push_back(NumElts + It - V2.begin());
2311         else {
2312           ReconstructMask.push_back(NumElts + V2.size());
2313           V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
2314         }
2315       }
2316     }
2317 
2318     // For reductions, we know that the output lane ordering doesn't alter the
2319     // result. An in-order mask can help simplify the shuffle away.
2320     if (FromReduction)
2321       sort(ReconstructMask);
2322     OrigReconstructMasks.push_back(std::move(ReconstructMask));
2323   }
2324 
2325   // If the maximum elements used from V1 and V2 are not larger than the new
2326   // vectors, the vectors are already packed and performing the optimization
2327   // again will likely not help any further. This also prevents us from getting
2328   // stuck in a cycle in case the costs do not also rule it out.
2329   if (V1.empty() || V2.empty() ||
2330       (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
2331        MaxV2Elt == static_cast<int>(V2.size()) - 1))
2332     return false;
2333 
2334   // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2335   // shuffle of another shuffle, or not a shuffle (that is treated like an
2336   // identity shuffle).
2337   auto GetBaseMaskValue = [&](Instruction *I, int M) {
2338     auto *SV = dyn_cast<ShuffleVectorInst>(I);
2339     if (!SV)
2340       return M;
2341     if (isa<UndefValue>(SV->getOperand(1)))
2342       if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
2343         if (InputShuffles.contains(SSV))
2344           return SSV->getMaskValue(SV->getMaskValue(M));
2345     return SV->getMaskValue(M);
2346   };
2347 
2348   // Attempt to sort the inputs by ascending mask values to make simpler input
2349   // shuffles and push complex shuffles down to the uses. We sort on the first
2350   // of the two input shuffle orders, to try and get at least one input into a
2351   // nice order.
2352 auto SortBase = [&](Instruction *A, std::pair<int, int> X, 2353 std::pair<int, int> Y) { 2354 int MXA = GetBaseMaskValue(A, X.first); 2355 int MYA = GetBaseMaskValue(A, Y.first); 2356 return MXA < MYA; 2357 }; 2358 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { 2359 return SortBase(SVI0A, A, B); 2360 }); 2361 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { 2362 return SortBase(SVI1A, A, B); 2363 }); 2364 // Calculate our ReconstructMasks from the OrigReconstructMasks and the 2365 // modified order of the input shuffles. 2366 SmallVector<SmallVector<int>> ReconstructMasks; 2367 for (const auto &Mask : OrigReconstructMasks) { 2368 SmallVector<int> ReconstructMask; 2369 for (int M : Mask) { 2370 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { 2371 auto It = find_if(V, [M](auto A) { return A.second == M; }); 2372 assert(It != V.end() && "Expected all entries in Mask"); 2373 return std::distance(V.begin(), It); 2374 }; 2375 if (M < 0) 2376 ReconstructMask.push_back(-1); 2377 else if (M < static_cast<int>(NumElts)) { 2378 ReconstructMask.push_back(FindIndex(V1, M)); 2379 } else { 2380 ReconstructMask.push_back(NumElts + FindIndex(V2, M)); 2381 } 2382 } 2383 ReconstructMasks.push_back(std::move(ReconstructMask)); 2384 } 2385 2386 // Calculate the masks needed for the new input shuffles, which get padded 2387 // with undef 2388 SmallVector<int> V1A, V1B, V2A, V2B; 2389 for (unsigned I = 0; I < V1.size(); I++) { 2390 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); 2391 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); 2392 } 2393 for (unsigned I = 0; I < V2.size(); I++) { 2394 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); 2395 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); 2396 } 2397 while (V1A.size() < NumElts) { 2398 V1A.push_back(PoisonMaskElem); 2399 V1B.push_back(PoisonMaskElem); 2400 } 2401 while (V2A.size() < NumElts) { 2402 V2A.push_back(PoisonMaskElem); 2403 V2B.push_back(PoisonMaskElem); 2404 } 2405 2406 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { 2407 auto *SV = dyn_cast<ShuffleVectorInst>(I); 2408 if (!SV) 2409 return C; 2410 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) 2411 ? TTI::SK_PermuteSingleSrc 2412 : TTI::SK_PermuteTwoSrc, 2413 VT, SV->getShuffleMask()); 2414 }; 2415 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { 2416 return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); 2417 }; 2418 2419 // Get the costs of the shuffles + binops before and after with the new 2420 // shuffle masks. 2421 InstructionCost CostBefore = 2422 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + 2423 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); 2424 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), 2425 InstructionCost(0), AddShuffleCost); 2426 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), 2427 InstructionCost(0), AddShuffleCost); 2428 2429 // The new binops will be unused for lanes past the used shuffle lengths. 2430 // These types attempt to get the correct cost for that from the target. 
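  // e.g. if only two of eight lanes were packed into V1, the first binop is
  // costed as a two-element operation, which the target may legalize more
  // cheaply than the full-width op.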
2431 FixedVectorType *Op0SmallVT = 2432 FixedVectorType::get(VT->getScalarType(), V1.size()); 2433 FixedVectorType *Op1SmallVT = 2434 FixedVectorType::get(VT->getScalarType(), V2.size()); 2435 InstructionCost CostAfter = 2436 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + 2437 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); 2438 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), 2439 InstructionCost(0), AddShuffleMaskCost); 2440 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); 2441 CostAfter += 2442 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), 2443 InstructionCost(0), AddShuffleMaskCost); 2444 2445 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); 2446 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore 2447 << " vs CostAfter: " << CostAfter << "\n"); 2448 if (CostBefore <= CostAfter) 2449 return false; 2450 2451 // The cost model has passed, create the new instructions. 2452 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { 2453 auto *SV = dyn_cast<ShuffleVectorInst>(I); 2454 if (!SV) 2455 return I; 2456 if (isa<UndefValue>(SV->getOperand(1))) 2457 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) 2458 if (InputShuffles.contains(SSV)) 2459 return SSV->getOperand(Op); 2460 return SV->getOperand(Op); 2461 }; 2462 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); 2463 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), 2464 GetShuffleOperand(SVI0A, 1), V1A); 2465 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); 2466 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), 2467 GetShuffleOperand(SVI0B, 1), V1B); 2468 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); 2469 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), 2470 GetShuffleOperand(SVI1A, 1), V2A); 2471 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); 2472 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), 2473 GetShuffleOperand(SVI1B, 1), V2B); 2474 Builder.SetInsertPoint(Op0); 2475 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), 2476 NSV0A, NSV0B); 2477 if (auto *I = dyn_cast<Instruction>(NOp0)) 2478 I->copyIRFlags(Op0, true); 2479 Builder.SetInsertPoint(Op1); 2480 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), 2481 NSV1A, NSV1B); 2482 if (auto *I = dyn_cast<Instruction>(NOp1)) 2483 I->copyIRFlags(Op1, true); 2484 2485 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { 2486 Builder.SetInsertPoint(Shuffles[S]); 2487 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); 2488 replaceValue(*Shuffles[S], *NSV); 2489 } 2490 2491 Worklist.pushValue(NSV0A); 2492 Worklist.pushValue(NSV0B); 2493 Worklist.pushValue(NSV1A); 2494 Worklist.pushValue(NSV1B); 2495 for (auto *S : Shuffles) 2496 Worklist.add(S); 2497 return true; 2498 } 2499 2500 /// This is the entry point for all transforms. Pass manager differences are 2501 /// handled in the callers of this function. 2502 bool VectorCombine::run() { 2503 if (DisableVectorCombine) 2504 return false; 2505 2506 // Don't attempt vectorization if the target does not support vectors. 
2507 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) 2508 return false; 2509 2510 bool MadeChange = false; 2511 auto FoldInst = [this, &MadeChange](Instruction &I) { 2512 Builder.SetInsertPoint(&I); 2513 bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); 2514 auto Opcode = I.getOpcode(); 2515 2516 // These folds should be beneficial regardless of when this pass is run 2517 // in the optimization pipeline. 2518 // The type checking is for run-time efficiency. We can avoid wasting time 2519 // dispatching to folding functions if there's no chance of matching. 2520 if (IsFixedVectorType) { 2521 switch (Opcode) { 2522 case Instruction::InsertElement: 2523 MadeChange |= vectorizeLoadInsert(I); 2524 break; 2525 case Instruction::ShuffleVector: 2526 MadeChange |= widenSubvectorLoad(I); 2527 break; 2528 default: 2529 break; 2530 } 2531 } 2532 2533 // This transform works with scalable and fixed vectors 2534 // TODO: Identify and allow other scalable transforms 2535 if (isa<VectorType>(I.getType())) { 2536 MadeChange |= scalarizeBinopOrCmp(I); 2537 MadeChange |= scalarizeLoadExtract(I); 2538 MadeChange |= scalarizeVPIntrinsic(I); 2539 } 2540 2541 if (Opcode == Instruction::Store) 2542 MadeChange |= foldSingleElementStore(I); 2543 2544 // If this is an early pipeline invocation of this pass, we are done. 2545 if (TryEarlyFoldsOnly) 2546 return; 2547 2548 // Otherwise, try folds that improve codegen but may interfere with 2549 // early IR canonicalizations. 2550 // The type checking is for run-time efficiency. We can avoid wasting time 2551 // dispatching to folding functions if there's no chance of matching. 2552 if (IsFixedVectorType) { 2553 switch (Opcode) { 2554 case Instruction::InsertElement: 2555 MadeChange |= foldInsExtFNeg(I); 2556 break; 2557 case Instruction::ShuffleVector: 2558 MadeChange |= foldShuffleOfBinops(I); 2559 MadeChange |= foldShuffleOfCastops(I); 2560 MadeChange |= foldShuffleOfShuffles(I); 2561 MadeChange |= foldSelectShuffle(I); 2562 MadeChange |= foldShuffleToIdentity(I); 2563 break; 2564 case Instruction::BitCast: 2565 MadeChange |= foldBitcastShuffle(I); 2566 break; 2567 } 2568 } else { 2569 switch (Opcode) { 2570 case Instruction::Call: 2571 MadeChange |= foldShuffleFromReductions(I); 2572 MadeChange |= foldCastFromReductions(I); 2573 break; 2574 case Instruction::ICmp: 2575 case Instruction::FCmp: 2576 MadeChange |= foldExtractExtract(I); 2577 break; 2578 default: 2579 if (Instruction::isBinaryOp(Opcode)) { 2580 MadeChange |= foldExtractExtract(I); 2581 MadeChange |= foldExtractedCmps(I); 2582 } 2583 break; 2584 } 2585 } 2586 }; 2587 2588 for (BasicBlock &BB : F) { 2589 // Ignore unreachable basic blocks. 2590 if (!DT.isReachableFromEntry(&BB)) 2591 continue; 2592 // Use early increment range so that we can erase instructions in loop. 
2593 for (Instruction &I : make_early_inc_range(BB)) { 2594 if (I.isDebugOrPseudoInst()) 2595 continue; 2596 FoldInst(I); 2597 } 2598 } 2599 2600 while (!Worklist.isEmpty()) { 2601 Instruction *I = Worklist.removeOne(); 2602 if (!I) 2603 continue; 2604 2605 if (isInstructionTriviallyDead(I)) { 2606 eraseInstruction(*I); 2607 continue; 2608 } 2609 2610 FoldInst(*I); 2611 } 2612 2613 return MadeChange; 2614 } 2615 2616 PreservedAnalyses VectorCombinePass::run(Function &F, 2617 FunctionAnalysisManager &FAM) { 2618 auto &AC = FAM.getResult<AssumptionAnalysis>(F); 2619 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); 2620 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); 2621 AAResults &AA = FAM.getResult<AAManager>(F); 2622 const DataLayout *DL = &F.getDataLayout(); 2623 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly); 2624 if (!Combiner.run()) 2625 return PreservedAnalyses::all(); 2626 PreservedAnalyses PA; 2627 PA.preserveSet<CFGAnalyses>(); 2628 return PA; 2629 } 2630