1 //===------- VectorCombine.cpp - Optimize partial vector operations -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass optimizes scalar/vector interactions using target cost models. The 10 // transforms implemented here may not fit in traditional loop-based or SLP 11 // vectorization passes. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/Vectorize/VectorCombine.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/ScopeExit.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/Analysis/AssumptionCache.h" 21 #include "llvm/Analysis/BasicAliasAnalysis.h" 22 #include "llvm/Analysis/GlobalsModRef.h" 23 #include "llvm/Analysis/Loads.h" 24 #include "llvm/Analysis/TargetTransformInfo.h" 25 #include "llvm/Analysis/ValueTracking.h" 26 #include "llvm/Analysis/VectorUtils.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/PatternMatch.h" 31 #include "llvm/Support/CommandLine.h" 32 #include "llvm/Transforms/Utils/Local.h" 33 #include "llvm/Transforms/Utils/LoopUtils.h" 34 #include <numeric> 35 #include <queue> 36 37 #define DEBUG_TYPE "vector-combine" 38 #include "llvm/Transforms/Utils/InstructionWorklist.h" 39 40 using namespace llvm; 41 using namespace llvm::PatternMatch; 42 43 STATISTIC(NumVecLoad, "Number of vector loads formed"); 44 STATISTIC(NumVecCmp, "Number of vector compares formed"); 45 STATISTIC(NumVecBO, "Number of vector binops formed"); 46 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); 47 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); 48 STATISTIC(NumScalarBO, "Number of scalar binops formed"); 49 STATISTIC(NumScalarCmp, "Number of scalar compares formed"); 50 51 static cl::opt<bool> DisableVectorCombine( 52 "disable-vector-combine", cl::init(false), cl::Hidden, 53 cl::desc("Disable all vector combine transforms")); 54 55 static cl::opt<bool> DisableBinopExtractShuffle( 56 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden, 57 cl::desc("Disable binop extract to shuffle transforms")); 58 59 static cl::opt<unsigned> MaxInstrsToScan( 60 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, 61 cl::desc("Max number of instructions to scan for vector combining.")); 62 63 static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); 64 65 namespace { 66 class VectorCombine { 67 public: 68 VectorCombine(Function &F, const TargetTransformInfo &TTI, 69 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, 70 const DataLayout *DL, TTI::TargetCostKind CostKind, 71 bool TryEarlyFoldsOnly) 72 : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL), 73 CostKind(CostKind), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} 74 75 bool run(); 76 77 private: 78 Function &F; 79 IRBuilder<> Builder; 80 const TargetTransformInfo &TTI; 81 const DominatorTree &DT; 82 AAResults &AA; 83 AssumptionCache &AC; 84 const DataLayout *DL; 85 TTI::TargetCostKind CostKind; 86 87 /// If true, only perform beneficial early IR transforms. Do not introduce new 88 /// vector operations. 
89 bool TryEarlyFoldsOnly; 90 91 InstructionWorklist Worklist; 92 93 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" 94 // parameter. That should be updated to specific sub-classes because the 95 // run loop was changed to dispatch on opcode. 96 bool vectorizeLoadInsert(Instruction &I); 97 bool widenSubvectorLoad(Instruction &I); 98 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, 99 ExtractElementInst *Ext1, 100 unsigned PreferredExtractIndex) const; 101 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 102 const Instruction &I, 103 ExtractElementInst *&ConvertToShuffle, 104 unsigned PreferredExtractIndex); 105 void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 106 Instruction &I); 107 void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 108 Instruction &I); 109 bool foldExtractExtract(Instruction &I); 110 bool foldInsExtFNeg(Instruction &I); 111 bool foldInsExtVectorToShuffle(Instruction &I); 112 bool foldBitcastShuffle(Instruction &I); 113 bool scalarizeBinopOrCmp(Instruction &I); 114 bool scalarizeVPIntrinsic(Instruction &I); 115 bool foldExtractedCmps(Instruction &I); 116 bool foldSingleElementStore(Instruction &I); 117 bool scalarizeLoadExtract(Instruction &I); 118 bool foldConcatOfBoolMasks(Instruction &I); 119 bool foldPermuteOfBinops(Instruction &I); 120 bool foldShuffleOfBinops(Instruction &I); 121 bool foldShuffleOfCastops(Instruction &I); 122 bool foldShuffleOfShuffles(Instruction &I); 123 bool foldShuffleOfIntrinsics(Instruction &I); 124 bool foldShuffleToIdentity(Instruction &I); 125 bool foldShuffleFromReductions(Instruction &I); 126 bool foldCastFromReductions(Instruction &I); 127 bool foldSelectShuffle(Instruction &I, bool FromReduction = false); 128 bool shrinkType(Instruction &I); 129 130 void replaceValue(Value &Old, Value &New) { 131 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); 132 LLVM_DEBUG(dbgs() << " With: " << New << '\n'); 133 Old.replaceAllUsesWith(&New); 134 if (auto *NewI = dyn_cast<Instruction>(&New)) { 135 New.takeName(&Old); 136 Worklist.pushUsersToWorkList(*NewI); 137 Worklist.pushValue(NewI); 138 } 139 Worklist.pushValue(&Old); 140 } 141 142 void eraseInstruction(Instruction &I) { 143 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); 144 SmallVector<Value *> Ops(I.operands()); 145 Worklist.remove(&I); 146 I.eraseFromParent(); 147 148 // Push remaining users of the operands and then the operand itself - allows 149 // further folds that were hindered by OneUse limits. 150 for (Value *Op : Ops) 151 if (auto *OpI = dyn_cast<Instruction>(Op)) { 152 Worklist.pushUsersToWorkList(*OpI); 153 Worklist.pushValue(OpI); 154 } 155 } 156 }; 157 } // namespace 158 159 /// Return the source operand of a potentially bitcasted value. If there is no 160 /// bitcast, return the input value itself. 161 static Value *peekThroughBitcasts(Value *V) { 162 while (auto *BitCast = dyn_cast<BitCastInst>(V)) 163 V = BitCast->getOperand(0); 164 return V; 165 } 166 167 static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { 168 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. 169 // The widened load may load data from dirty regions or create data races 170 // non-existent in the source. 
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Load->getType()->getScalarType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  return true;
}

bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  Value *Scalar;
  if (!match(&I,
             m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt())))
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  auto *Load = dyn_cast<LoadInst>(X);
  if (!canWidenLoad(Load, TTI))
    return false;

  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                   &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with updated alignment. If it is, we can shuffle the element(s) into
    // place after loading.
    unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                     &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be negated
    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
    // negation does not change the result of the alignment calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  OldCost +=
      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                   /* Insert */ true, HasExtract, CostKind);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask, set everything but element 0 to undef to prevent poison from
  // propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  auto *Ty = cast<FixedVectorType>(I.getType());
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost +=
        TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
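  // Illustrative example (values chosen only for this sketch): with <2 x i32>
  // operands, the padding mask <0, 1, poison, poison> takes the subvector from
  // operand 0, while <2, 3, poison, poison> takes it from operand 1; both are
  // identity-with-padding shuffles.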
316 unsigned NumOpElts = 317 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements(); 318 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) { 319 return M >= (int)(NumOpElts); 320 }); 321 322 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex)); 323 if (!canWidenLoad(Load, TTI)) 324 return false; 325 326 // We use minimal alignment (maximum flexibility) because we only care about 327 // the dereferenceable region. When calculating cost and creating a new op, 328 // we may use a larger value based on alignment attributes. 329 auto *Ty = cast<FixedVectorType>(I.getType()); 330 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); 331 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); 332 Align Alignment = Load->getAlign(); 333 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT)) 334 return false; 335 336 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment); 337 Type *LoadTy = Load->getType(); 338 unsigned AS = Load->getPointerAddressSpace(); 339 340 // Original pattern: insert_subvector (load PtrOp) 341 // This conservatively assumes that the cost of a subvector insert into an 342 // undef value is 0. We could add that cost if the cost model accurately 343 // reflects the real cost of that operation. 344 InstructionCost OldCost = 345 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind); 346 347 // New pattern: load PtrOp 348 InstructionCost NewCost = 349 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind); 350 351 // We can aggressively convert to the vector form because the backend can 352 // invert this transform if it does not result in a performance win. 353 if (OldCost < NewCost || !NewCost.isValid()) 354 return false; 355 356 IRBuilder<> Builder(Load); 357 Value *CastedPtr = 358 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); 359 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); 360 replaceValue(I, *VecLd); 361 ++NumVecLoad; 362 return true; 363 } 364 365 /// Determine which, if any, of the inputs should be replaced by a shuffle 366 /// followed by extract from a different index. 367 ExtractElementInst *VectorCombine::getShuffleExtract( 368 ExtractElementInst *Ext0, ExtractElementInst *Ext1, 369 unsigned PreferredExtractIndex = InvalidIndex) const { 370 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); 371 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); 372 assert(Index0C && Index1C && "Expected constant extract indexes"); 373 374 unsigned Index0 = Index0C->getZExtValue(); 375 unsigned Index1 = Index1C->getZExtValue(); 376 377 // If the extract indexes are identical, no shuffle is needed. 378 if (Index0 == Index1) 379 return nullptr; 380 381 Type *VecTy = Ext0->getVectorOperand()->getType(); 382 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); 383 InstructionCost Cost0 = 384 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 385 InstructionCost Cost1 = 386 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 387 388 // If both costs are invalid no shuffle is needed 389 if (!Cost0.isValid() && !Cost1.isValid()) 390 return nullptr; 391 392 // We are extracting from 2 different indexes, so one operand must be shuffled 393 // before performing a vector operation and/or extract. The more expensive 394 // extract will be replaced by a shuffle. 
395 if (Cost0 > Cost1) 396 return Ext0; 397 if (Cost1 > Cost0) 398 return Ext1; 399 400 // If the costs are equal and there is a preferred extract index, shuffle the 401 // opposite operand. 402 if (PreferredExtractIndex == Index0) 403 return Ext1; 404 if (PreferredExtractIndex == Index1) 405 return Ext0; 406 407 // Otherwise, replace the extract with the higher index. 408 return Index0 > Index1 ? Ext0 : Ext1; 409 } 410 411 /// Compare the relative costs of 2 extracts followed by scalar operation vs. 412 /// vector operation(s) followed by extract. Return true if the existing 413 /// instructions are cheaper than a vector alternative. Otherwise, return false 414 /// and if one of the extracts should be transformed to a shufflevector, set 415 /// \p ConvertToShuffle to that extract instruction. 416 bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, 417 ExtractElementInst *Ext1, 418 const Instruction &I, 419 ExtractElementInst *&ConvertToShuffle, 420 unsigned PreferredExtractIndex) { 421 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); 422 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); 423 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); 424 425 unsigned Opcode = I.getOpcode(); 426 Value *Ext0Src = Ext0->getVectorOperand(); 427 Value *Ext1Src = Ext1->getVectorOperand(); 428 Type *ScalarTy = Ext0->getType(); 429 auto *VecTy = cast<VectorType>(Ext0Src->getType()); 430 InstructionCost ScalarOpCost, VectorOpCost; 431 432 // Get cost estimates for scalar and vector versions of the operation. 433 bool IsBinOp = Instruction::isBinaryOp(Opcode); 434 if (IsBinOp) { 435 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); 436 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); 437 } else { 438 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && 439 "Expected a compare"); 440 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 441 ScalarOpCost = TTI.getCmpSelInstrCost( 442 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind); 443 VectorOpCost = TTI.getCmpSelInstrCost( 444 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind); 445 } 446 447 // Get cost estimates for the extract elements. These costs will factor into 448 // both sequences. 449 unsigned Ext0Index = Ext0IndexC->getZExtValue(); 450 unsigned Ext1Index = Ext1IndexC->getZExtValue(); 451 452 InstructionCost Extract0Cost = 453 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); 454 InstructionCost Extract1Cost = 455 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index); 456 457 // A more expensive extract will always be replaced by a splat shuffle. 458 // For example, if Ext0 is more expensive: 459 // opcode (extelt V0, Ext0), (ext V1, Ext1) --> 460 // extelt (opcode (splat V0, Ext0), V1), Ext1 461 // TODO: Evaluate whether that always results in lowest cost. Alternatively, 462 // check the cost of creating a broadcast shuffle and shuffling both 463 // operands to element 0. 464 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index; 465 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index; 466 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); 467 468 // Extra uses of the extracts mean that we include those costs in the 469 // vector total because those instructions will not be eliminated. 
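  // Illustrative cost sketch (actual numbers are target-dependent): if the
  // scalar op, the vector op, and each extract all cost 1, then for two
  // single-use extracts of the same lane from two different vectors,
  // OldCost = 1 + 1 + 1 = 3 and NewCost = 1 + 1 = 2, so the vector form wins.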
470 InstructionCost OldCost, NewCost; 471 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) { 472 // Handle a special case. If the 2 extracts are identical, adjust the 473 // formulas to account for that. The extra use charge allows for either the 474 // CSE'd pattern or an unoptimized form with identical values: 475 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C 476 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) 477 : !Ext0->hasOneUse() || !Ext1->hasOneUse(); 478 OldCost = CheapExtractCost + ScalarOpCost; 479 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; 480 } else { 481 // Handle the general case. Each extract is actually a different value: 482 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C 483 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; 484 NewCost = VectorOpCost + CheapExtractCost + 485 !Ext0->hasOneUse() * Extract0Cost + 486 !Ext1->hasOneUse() * Extract1Cost; 487 } 488 489 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); 490 if (ConvertToShuffle) { 491 if (IsBinOp && DisableBinopExtractShuffle) 492 return true; 493 494 // If we are extracting from 2 different indexes, then one operand must be 495 // shuffled before performing the vector operation. The shuffle mask is 496 // poison except for 1 lane that is being translated to the remaining 497 // extraction lane. Therefore, it is a splat shuffle. Ex: 498 // ShufMask = { poison, poison, 0, poison } 499 // TODO: The cost model has an option for a "broadcast" shuffle 500 // (splat-from-element-0), but no option for a more general splat. 501 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) { 502 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(), 503 PoisonMaskElem); 504 ShuffleMask[BestInsIndex] = BestExtIndex; 505 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 506 VecTy, ShuffleMask, CostKind, 0, nullptr, 507 {ConvertToShuffle}); 508 } else { 509 NewCost += 510 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 511 {}, CostKind, 0, nullptr, {ConvertToShuffle}); 512 } 513 } 514 515 // Aggressively form a vector op if the cost is equal because the transform 516 // may enable further optimization. 517 // Codegen can reverse this transform (scalarize) if it was not profitable. 518 return OldCost < NewCost; 519 } 520 521 /// Create a shuffle that translates (shifts) 1 element from the input vector 522 /// to a new element location. 523 static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, 524 unsigned NewIndex, IRBuilder<> &Builder) { 525 // The shuffle mask is poison except for 1 lane that is being translated 526 // to the new element index. Example for OldIndex == 2 and NewIndex == 0: 527 // ShufMask = { 2, poison, poison, poison } 528 auto *VecTy = cast<FixedVectorType>(Vec->getType()); 529 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 530 ShufMask[NewIndex] = OldIndex; 531 return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); 532 } 533 534 /// Given an extract element instruction with constant index operand, shuffle 535 /// the source vector (shift the scalar element) to a NewIndex for extraction. 536 /// Return null if the input can be constant folded, so that we are not creating 537 /// unnecessary instructions. 538 static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, 539 unsigned NewIndex, 540 IRBuilder<> &Builder) { 541 // Shufflevectors can only be created for fixed-width vectors. 
542 Value *X = ExtElt->getVectorOperand(); 543 if (!isa<FixedVectorType>(X->getType())) 544 return nullptr; 545 546 // If the extract can be constant-folded, this code is unsimplified. Defer 547 // to other passes to handle that. 548 Value *C = ExtElt->getIndexOperand(); 549 assert(isa<ConstantInt>(C) && "Expected a constant index operand"); 550 if (isa<Constant>(X)) 551 return nullptr; 552 553 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), 554 NewIndex, Builder); 555 return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex)); 556 } 557 558 /// Try to reduce extract element costs by converting scalar compares to vector 559 /// compares followed by extract. 560 /// cmp (ext0 V0, C), (ext1 V1, C) 561 void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, 562 ExtractElementInst *Ext1, Instruction &I) { 563 assert(isa<CmpInst>(&I) && "Expected a compare"); 564 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 565 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 566 "Expected matching constant extract indexes"); 567 568 // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C 569 ++NumVecCmp; 570 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); 571 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 572 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); 573 Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); 574 replaceValue(I, *NewExt); 575 } 576 577 /// Try to reduce extract element costs by converting scalar binops to vector 578 /// binops followed by extract. 579 /// bo (ext0 V0, C), (ext1 V1, C) 580 void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, 581 ExtractElementInst *Ext1, Instruction &I) { 582 assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); 583 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 584 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 585 "Expected matching constant extract indexes"); 586 587 // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C 588 ++NumVecBO; 589 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 590 Value *VecBO = 591 Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); 592 593 // All IR flags are safe to back-propagate because any potential poison 594 // created in unused vector elements is discarded by the extract. 595 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) 596 VecBOInst->copyIRFlags(&I); 597 598 Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); 599 replaceValue(I, *NewExt); 600 } 601 602 /// Match an instruction with extracted vector operands. 603 bool VectorCombine::foldExtractExtract(Instruction &I) { 604 // It is not safe to transform things like div, urem, etc. because we may 605 // create undefined behavior when executing those on unknown vector elements. 
606 if (!isSafeToSpeculativelyExecute(&I)) 607 return false; 608 609 Instruction *I0, *I1; 610 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; 611 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && 612 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) 613 return false; 614 615 Value *V0, *V1; 616 uint64_t C0, C1; 617 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) || 618 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) || 619 V0->getType() != V1->getType()) 620 return false; 621 622 // If the scalar value 'I' is going to be re-inserted into a vector, then try 623 // to create an extract to that same element. The extract/insert can be 624 // reduced to a "select shuffle". 625 // TODO: If we add a larger pattern match that starts from an insert, this 626 // probably becomes unnecessary. 627 auto *Ext0 = cast<ExtractElementInst>(I0); 628 auto *Ext1 = cast<ExtractElementInst>(I1); 629 uint64_t InsertIndex = InvalidIndex; 630 if (I.hasOneUse()) 631 match(I.user_back(), 632 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); 633 634 ExtractElementInst *ExtractToChange; 635 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) 636 return false; 637 638 if (ExtractToChange) { 639 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; 640 ExtractElementInst *NewExtract = 641 translateExtract(ExtractToChange, CheapExtractIdx, Builder); 642 if (!NewExtract) 643 return false; 644 if (ExtractToChange == Ext0) 645 Ext0 = NewExtract; 646 else 647 Ext1 = NewExtract; 648 } 649 650 if (Pred != CmpInst::BAD_ICMP_PREDICATE) 651 foldExtExtCmp(Ext0, Ext1, I); 652 else 653 foldExtExtBinop(Ext0, Ext1, I); 654 655 Worklist.push(Ext0); 656 Worklist.push(Ext1); 657 return true; 658 } 659 660 /// Try to replace an extract + scalar fneg + insert with a vector fneg + 661 /// shuffle. 662 bool VectorCombine::foldInsExtFNeg(Instruction &I) { 663 // Match an insert (op (extract)) pattern. 664 Value *DestVec; 665 uint64_t Index; 666 Instruction *FNeg; 667 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), 668 m_ConstantInt(Index)))) 669 return false; 670 671 // Note: This handles the canonical fneg instruction and "fsub -0.0, X". 672 Value *SrcVec; 673 Instruction *Extract; 674 if (!match(FNeg, m_FNeg(m_CombineAnd( 675 m_Instruction(Extract), 676 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) 677 return false; 678 679 auto *VecTy = cast<FixedVectorType>(I.getType()); 680 auto *ScalarTy = VecTy->getScalarType(); 681 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType()); 682 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) 683 return false; 684 685 // Ignore bogus insert/extract index. 686 unsigned NumElts = VecTy->getNumElements(); 687 if (Index >= NumElts) 688 return false; 689 690 // We are inserting the negated element into the same lane that we extracted 691 // from. This is equivalent to a select-shuffle that chooses all but the 692 // negated element from the destination vector. 693 SmallVector<int> Mask(NumElts); 694 std::iota(Mask.begin(), Mask.end(), 0); 695 Mask[Index] = Index + NumElts; 696 InstructionCost OldCost = 697 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + 698 TTI.getVectorInstrCost(I, VecTy, CostKind, Index); 699 700 // If the extract has one use, it will be eliminated, so count it in the 701 // original cost. If it has more than one use, ignore the cost because it will 702 // be the same before/after. 
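  // Illustrative replacement (values only for this sketch): for a <4 x float>
  // DestVec with Index == 2, the new sequence is a vector fneg of SrcVec
  // followed by a two-source shuffle with mask {0, 1, 6, 3}.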
703 if (Extract->hasOneUse()) 704 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); 705 706 InstructionCost NewCost = 707 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + 708 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, 709 CostKind); 710 711 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; 712 // If the lengths of the two vectors are not equal, 713 // we need to add a length-change vector. Add this cost. 714 SmallVector<int> SrcMask; 715 if (NeedLenChg) { 716 SrcMask.assign(NumElts, PoisonMaskElem); 717 SrcMask[Index] = Index; 718 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 719 SrcVecTy, SrcMask, CostKind); 720 } 721 722 if (NewCost > OldCost) 723 return false; 724 725 Value *NewShuf; 726 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index 727 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); 728 if (NeedLenChg) { 729 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask 730 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); 731 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); 732 } else { 733 // shuffle DestVec, (fneg SrcVec), Mask 734 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); 735 } 736 737 replaceValue(I, *NewShuf); 738 return true; 739 } 740 741 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the 742 /// destination type followed by shuffle. This can enable further transforms by 743 /// moving bitcasts or shuffles together. 744 bool VectorCombine::foldBitcastShuffle(Instruction &I) { 745 Value *V0, *V1; 746 ArrayRef<int> Mask; 747 if (!match(&I, m_BitCast(m_OneUse( 748 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask)))))) 749 return false; 750 751 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for 752 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle 753 // mask for scalable type is a splat or not. 754 // 2) Disallow non-vector casts. 755 // TODO: We could allow any shuffle. 756 auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); 757 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType()); 758 if (!DestTy || !SrcTy) 759 return false; 760 761 unsigned DestEltSize = DestTy->getScalarSizeInBits(); 762 unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); 763 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) 764 return false; 765 766 bool IsUnary = isa<UndefValue>(V1); 767 768 // For binary shuffles, only fold bitcast(shuffle(X,Y)) 769 // if it won't increase the number of bitcasts. 770 if (!IsUnary) { 771 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType()); 772 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType()); 773 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) && 774 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType())) 775 return false; 776 } 777 778 SmallVector<int, 16> NewMask; 779 if (DestEltSize <= SrcEltSize) { 780 // The bitcast is from wide to narrow/equal elements. The shuffle mask can 781 // always be expanded to the equivalent form choosing narrower elements. 782 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask"); 783 unsigned ScaleFactor = SrcEltSize / DestEltSize; 784 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); 785 } else { 786 // The bitcast is from narrow elements to wide elements. The shuffle mask 787 // must choose consecutive elements to allow casting first. 
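    // Illustrative example (assumed values): widening an i8->i32 shuffle by a
    // ScaleFactor of 4 turns mask {0, 1, 2, 3, 8, 9, 10, 11} into {0, 2},
    // while a misaligned mask such as {1, 2, 3, 4} cannot be widened and is
    // rejected below.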
788 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask"); 789 unsigned ScaleFactor = DestEltSize / SrcEltSize; 790 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) 791 return false; 792 } 793 794 // Bitcast the shuffle src - keep its original width but using the destination 795 // scalar type. 796 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; 797 auto *NewShuffleTy = 798 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts); 799 auto *OldShuffleTy = 800 FixedVectorType::get(SrcTy->getScalarType(), Mask.size()); 801 unsigned NumOps = IsUnary ? 1 : 2; 802 803 // The new shuffle must not cost more than the old shuffle. 804 TargetTransformInfo::ShuffleKind SK = 805 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc 806 : TargetTransformInfo::SK_PermuteTwoSrc; 807 808 InstructionCost NewCost = 809 TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) + 810 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy, 811 TargetTransformInfo::CastContextHint::None, 812 CostKind)); 813 InstructionCost OldCost = 814 TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) + 815 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy, 816 TargetTransformInfo::CastContextHint::None, 817 CostKind); 818 819 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: " 820 << OldCost << " vs NewCost: " << NewCost << "\n"); 821 822 if (NewCost > OldCost || !NewCost.isValid()) 823 return false; 824 825 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC' 826 ++NumShufOfBitcast; 827 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy); 828 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy); 829 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask); 830 replaceValue(I, *Shuf); 831 return true; 832 } 833 834 /// VP Intrinsics whose vector operands are both splat values may be simplified 835 /// into the scalar version of the operation and the result splatted. This 836 /// can lead to scalarization down the line. 837 bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { 838 if (!isa<VPIntrinsic>(I)) 839 return false; 840 VPIntrinsic &VPI = cast<VPIntrinsic>(I); 841 Value *Op0 = VPI.getArgOperand(0); 842 Value *Op1 = VPI.getArgOperand(1); 843 844 if (!isSplatValue(Op0) || !isSplatValue(Op1)) 845 return false; 846 847 // Check getSplatValue early in this function, to avoid doing unnecessary 848 // work. 849 Value *ScalarOp0 = getSplatValue(Op0); 850 Value *ScalarOp1 = getSplatValue(Op1); 851 if (!ScalarOp0 || !ScalarOp1) 852 return false; 853 854 // For the binary VP intrinsics supported here, the result on disabled lanes 855 // is a poison value. For now, only do this simplification if all lanes 856 // are active. 857 // TODO: Relax the condition that all lanes are active by using insertelement 858 // on inactive lanes. 
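  // Illustrative IR sketch (names invented for the example):
  //   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %splat.x,
  //          <4 x i32> %splat.y, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
  //          i32 %evl)
  // can become "add i32 %x, %y" splatted back to <4 x i32>, when the cost
  // model agrees and speculation is safe (or %evl is known non-zero).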
859 auto IsAllTrueMask = [](Value *MaskVal) { 860 if (Value *SplattedVal = getSplatValue(MaskVal)) 861 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) 862 return ConstValue->isAllOnesValue(); 863 return false; 864 }; 865 if (!IsAllTrueMask(VPI.getArgOperand(2))) 866 return false; 867 868 // Check to make sure we support scalarization of the intrinsic 869 Intrinsic::ID IntrID = VPI.getIntrinsicID(); 870 if (!VPBinOpIntrinsic::isVPBinOp(IntrID)) 871 return false; 872 873 // Calculate cost of splatting both operands into vectors and the vector 874 // intrinsic 875 VectorType *VecTy = cast<VectorType>(VPI.getType()); 876 SmallVector<int> Mask; 877 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy)) 878 Mask.resize(FVTy->getNumElements(), 0); 879 InstructionCost SplatCost = 880 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + 881 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask, 882 CostKind); 883 884 // Calculate the cost of the VP Intrinsic 885 SmallVector<Type *, 4> Args; 886 for (Value *V : VPI.args()) 887 Args.push_back(V->getType()); 888 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); 889 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 890 InstructionCost OldCost = 2 * SplatCost + VectorOpCost; 891 892 // Determine scalar opcode 893 std::optional<unsigned> FunctionalOpcode = 894 VPI.getFunctionalOpcode(); 895 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; 896 if (!FunctionalOpcode) { 897 ScalarIntrID = VPI.getFunctionalIntrinsicID(); 898 if (!ScalarIntrID) 899 return false; 900 } 901 902 // Calculate cost of scalarizing 903 InstructionCost ScalarOpCost = 0; 904 if (ScalarIntrID) { 905 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); 906 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 907 } else { 908 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode, 909 VecTy->getScalarType(), CostKind); 910 } 911 912 // The existing splats may be kept around if other instructions use them. 913 InstructionCost CostToKeepSplats = 914 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); 915 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; 916 917 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI 918 << "\n"); 919 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost 920 << ", Cost of scalarizing:" << NewCost << "\n"); 921 922 // We want to scalarize unless the vector variant actually has lower cost. 923 if (OldCost < NewCost || !NewCost.isValid()) 924 return false; 925 926 // Scalarize the intrinsic 927 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount(); 928 Value *EVL = VPI.getArgOperand(3); 929 930 // If the VP op might introduce UB or poison, we can scalarize it provided 931 // that we know the EVL > 0: If the EVL is zero, then the original VP op 932 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by 933 // scalarizing it. 934 bool SafeToSpeculate; 935 if (ScalarIntrID) 936 SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID) 937 .hasFnAttr(Attribute::AttrKind::Speculatable); 938 else 939 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( 940 *FunctionalOpcode, &VPI, nullptr, &AC, &DT); 941 if (!SafeToSpeculate && 942 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI))) 943 return false; 944 945 Value *ScalarVal = 946 ScalarIntrID 947 ? 
Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID, 948 {ScalarOp0, ScalarOp1}) 949 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode), 950 ScalarOp0, ScalarOp1); 951 952 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal)); 953 return true; 954 } 955 956 /// Match a vector binop or compare instruction with at least one inserted 957 /// scalar operand and convert to scalar binop/cmp followed by insertelement. 958 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { 959 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; 960 Value *Ins0, *Ins1; 961 if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && 962 !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) 963 return false; 964 965 // Do not convert the vector condition of a vector select into a scalar 966 // condition. That may cause problems for codegen because of differences in 967 // boolean formats and register-file transfers. 968 // TODO: Can we account for that in the cost model? 969 bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; 970 if (IsCmp) 971 for (User *U : I.users()) 972 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) 973 return false; 974 975 // Match against one or both scalar values being inserted into constant 976 // vectors: 977 // vec_op VecC0, (inselt VecC1, V1, Index) 978 // vec_op (inselt VecC0, V0, Index), VecC1 979 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) 980 // TODO: Deal with mismatched index constants and variable indexes? 981 Constant *VecC0 = nullptr, *VecC1 = nullptr; 982 Value *V0 = nullptr, *V1 = nullptr; 983 uint64_t Index0 = 0, Index1 = 0; 984 if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), 985 m_ConstantInt(Index0))) && 986 !match(Ins0, m_Constant(VecC0))) 987 return false; 988 if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), 989 m_ConstantInt(Index1))) && 990 !match(Ins1, m_Constant(VecC1))) 991 return false; 992 993 bool IsConst0 = !V0; 994 bool IsConst1 = !V1; 995 if (IsConst0 && IsConst1) 996 return false; 997 if (!IsConst0 && !IsConst1 && Index0 != Index1) 998 return false; 999 1000 auto *VecTy0 = cast<VectorType>(Ins0->getType()); 1001 auto *VecTy1 = cast<VectorType>(Ins1->getType()); 1002 if (VecTy0->getElementCount().getKnownMinValue() <= Index0 || 1003 VecTy1->getElementCount().getKnownMinValue() <= Index1) 1004 return false; 1005 1006 // Bail for single insertion if it is a load. 1007 // TODO: Handle this once getVectorInstrCost can cost for load/stores. 1008 auto *I0 = dyn_cast_or_null<Instruction>(V0); 1009 auto *I1 = dyn_cast_or_null<Instruction>(V1); 1010 if ((IsConst0 && I1 && I1->mayReadFromMemory()) || 1011 (IsConst1 && I0 && I0->mayReadFromMemory())) 1012 return false; 1013 1014 uint64_t Index = IsConst0 ? Index1 : Index0; 1015 Type *ScalarTy = IsConst0 ? 
V1->getType() : V0->getType(); 1016 Type *VecTy = I.getType(); 1017 assert(VecTy->isVectorTy() && 1018 (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && 1019 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || 1020 ScalarTy->isPointerTy()) && 1021 "Unexpected types for insert element into binop or cmp"); 1022 1023 unsigned Opcode = I.getOpcode(); 1024 InstructionCost ScalarOpCost, VectorOpCost; 1025 if (IsCmp) { 1026 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 1027 ScalarOpCost = TTI.getCmpSelInstrCost( 1028 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind); 1029 VectorOpCost = TTI.getCmpSelInstrCost( 1030 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind); 1031 } else { 1032 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); 1033 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); 1034 } 1035 1036 // Get cost estimate for the insert element. This cost will factor into 1037 // both sequences. 1038 InstructionCost InsertCost = TTI.getVectorInstrCost( 1039 Instruction::InsertElement, VecTy, CostKind, Index); 1040 InstructionCost OldCost = 1041 (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; 1042 InstructionCost NewCost = ScalarOpCost + InsertCost + 1043 (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + 1044 (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); 1045 1046 // We want to scalarize unless the vector variant actually has lower cost. 1047 if (OldCost < NewCost || !NewCost.isValid()) 1048 return false; 1049 1050 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> 1051 // inselt NewVecC, (scalar_op V0, V1), Index 1052 if (IsCmp) 1053 ++NumScalarCmp; 1054 else 1055 ++NumScalarBO; 1056 1057 // For constant cases, extract the scalar element, this should constant fold. 1058 if (IsConst0) 1059 V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); 1060 if (IsConst1) 1061 V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); 1062 1063 Value *Scalar = 1064 IsCmp ? Builder.CreateCmp(Pred, V0, V1) 1065 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); 1066 1067 Scalar->setName(I.getName() + ".scalar"); 1068 1069 // All IR flags are safe to back-propagate. There is no potential for extra 1070 // poison to be created by the scalar instruction. 1071 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) 1072 ScalarInst->copyIRFlags(&I); 1073 1074 // Fold the vector constants in the original vectors into a new base vector. 1075 Value *NewVecC = 1076 IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) 1077 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); 1078 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); 1079 replaceValue(I, *Insert); 1080 return true; 1081 } 1082 1083 /// Try to combine a scalar binop + 2 scalar compares of extracted elements of 1084 /// a vector into vector operations followed by extract. Note: The SLP pass 1085 /// may miss this pattern because of implementation problems. 1086 bool VectorCombine::foldExtractedCmps(Instruction &I) { 1087 auto *BI = dyn_cast<BinaryOperator>(&I); 1088 1089 // We are looking for a scalar binop of booleans. 1090 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) 1091 if (!BI || !I.getType()->isIntegerTy(1)) 1092 return false; 1093 1094 // The compare predicates should match, and each compare should have a 1095 // constant operand. 
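  // Illustrative input (names invented for the example):
  //   %e0 = extractelement <4 x i32> %x, i32 0
  //   %e1 = extractelement <4 x i32> %x, i32 2
  //   %c0 = icmp sgt i32 %e0, 42
  //   %c1 = icmp sgt i32 %e1, 7
  //   %r  = and i1 %c0, %c1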
1096 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); 1097 Instruction *I0, *I1; 1098 Constant *C0, *C1; 1099 CmpPredicate P0, P1; 1100 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) || 1101 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) 1102 return false; 1103 1104 auto MatchingPred = CmpPredicate::getMatching(P0, P1); 1105 if (!MatchingPred) 1106 return false; 1107 1108 // The compare operands must be extracts of the same vector with constant 1109 // extract indexes. 1110 Value *X; 1111 uint64_t Index0, Index1; 1112 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) || 1113 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))) 1114 return false; 1115 1116 auto *Ext0 = cast<ExtractElementInst>(I0); 1117 auto *Ext1 = cast<ExtractElementInst>(I1); 1118 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind); 1119 if (!ConvertToShuf) 1120 return false; 1121 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) && 1122 "Unknown ExtractElementInst"); 1123 1124 // The original scalar pattern is: 1125 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) 1126 CmpInst::Predicate Pred = *MatchingPred; 1127 unsigned CmpOpcode = 1128 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp; 1129 auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); 1130 if (!VecTy) 1131 return false; 1132 1133 InstructionCost Ext0Cost = 1134 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 1135 InstructionCost Ext1Cost = 1136 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 1137 InstructionCost CmpCost = TTI.getCmpSelInstrCost( 1138 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred, 1139 CostKind); 1140 1141 InstructionCost OldCost = 1142 Ext0Cost + Ext1Cost + CmpCost * 2 + 1143 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind); 1144 1145 // The proposed vector pattern is: 1146 // vcmp = cmp Pred X, VecC 1147 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 1148 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; 1149 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; 1150 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); 1151 InstructionCost NewCost = TTI.getCmpSelInstrCost( 1152 CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred, 1153 CostKind); 1154 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 1155 ShufMask[CheapIndex] = ExpensiveIndex; 1156 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, 1157 ShufMask, CostKind); 1158 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind); 1159 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); 1160 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost; 1161 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost; 1162 1163 // Aggressively form vector ops if the cost is equal because the transform 1164 // may enable further optimization. 1165 // Codegen can reverse this transform (scalarize) if it was not profitable. 1166 if (OldCost < NewCost || !NewCost.isValid()) 1167 return false; 1168 1169 // Create a vector constant from the 2 scalar constants. 
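  // For example (sketch only), with a <4 x i32> source, Index0 == 0 and
  // Index1 == 2, CmpC becomes {C0, poison, C1, poison}.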
1170 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), 1171 PoisonValue::get(VecTy->getElementType())); 1172 CmpC[Index0] = C0; 1173 CmpC[Index1] = C1; 1174 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC)); 1175 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder); 1176 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp; 1177 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf; 1178 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS); 1179 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex); 1180 replaceValue(I, *NewExt); 1181 ++NumVecCmpBO; 1182 return true; 1183 } 1184 1185 // Check if memory loc modified between two instrs in the same BB 1186 static bool isMemModifiedBetween(BasicBlock::iterator Begin, 1187 BasicBlock::iterator End, 1188 const MemoryLocation &Loc, AAResults &AA) { 1189 unsigned NumScanned = 0; 1190 return std::any_of(Begin, End, [&](const Instruction &Instr) { 1191 return isModSet(AA.getModRefInfo(&Instr, Loc)) || 1192 ++NumScanned > MaxInstrsToScan; 1193 }); 1194 } 1195 1196 namespace { 1197 /// Helper class to indicate whether a vector index can be safely scalarized and 1198 /// if a freeze needs to be inserted. 1199 class ScalarizationResult { 1200 enum class StatusTy { Unsafe, Safe, SafeWithFreeze }; 1201 1202 StatusTy Status; 1203 Value *ToFreeze; 1204 1205 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr) 1206 : Status(Status), ToFreeze(ToFreeze) {} 1207 1208 public: 1209 ScalarizationResult(const ScalarizationResult &Other) = default; 1210 ~ScalarizationResult() { 1211 assert(!ToFreeze && "freeze() not called with ToFreeze being set"); 1212 } 1213 1214 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; } 1215 static ScalarizationResult safe() { return {StatusTy::Safe}; } 1216 static ScalarizationResult safeWithFreeze(Value *ToFreeze) { 1217 return {StatusTy::SafeWithFreeze, ToFreeze}; 1218 } 1219 1220 /// Returns true if the index can be scalarize without requiring a freeze. 1221 bool isSafe() const { return Status == StatusTy::Safe; } 1222 /// Returns true if the index cannot be scalarized. 1223 bool isUnsafe() const { return Status == StatusTy::Unsafe; } 1224 /// Returns true if the index can be scalarize, but requires inserting a 1225 /// freeze. 1226 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; } 1227 1228 /// Reset the state of Unsafe and clear ToFreze if set. 1229 void discard() { 1230 ToFreeze = nullptr; 1231 Status = StatusTy::Unsafe; 1232 } 1233 1234 /// Freeze the ToFreeze and update the use in \p User to use it. 1235 void freeze(IRBuilder<> &Builder, Instruction &UserI) { 1236 assert(isSafeWithFreeze() && 1237 "should only be used when freezing is required"); 1238 assert(is_contained(ToFreeze->users(), &UserI) && 1239 "UserI must be a user of ToFreeze"); 1240 IRBuilder<>::InsertPointGuard Guard(Builder); 1241 Builder.SetInsertPoint(cast<Instruction>(&UserI)); 1242 Value *Frozen = 1243 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen"); 1244 for (Use &U : make_early_inc_range((UserI.operands()))) 1245 if (U.get() == ToFreeze) 1246 U.set(Frozen); 1247 1248 ToFreeze = nullptr; 1249 } 1250 }; 1251 } // namespace 1252 1253 /// Check if it is legal to scalarize a memory access to \p VecTy at index \p 1254 /// Idx. \p Idx must access a valid vector element. 
1255 static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, 1256 Instruction *CtxI, 1257 AssumptionCache &AC, 1258 const DominatorTree &DT) { 1259 // We do checks for both fixed vector types and scalable vector types. 1260 // This is the number of elements of fixed vector types, 1261 // or the minimum number of elements of scalable vector types. 1262 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); 1263 1264 if (auto *C = dyn_cast<ConstantInt>(Idx)) { 1265 if (C->getValue().ult(NumElements)) 1266 return ScalarizationResult::safe(); 1267 return ScalarizationResult::unsafe(); 1268 } 1269 1270 unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); 1271 APInt Zero(IntWidth, 0); 1272 APInt MaxElts(IntWidth, NumElements); 1273 ConstantRange ValidIndices(Zero, MaxElts); 1274 ConstantRange IdxRange(IntWidth, true); 1275 1276 if (isGuaranteedNotToBePoison(Idx, &AC)) { 1277 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, 1278 true, &AC, CtxI, &DT))) 1279 return ScalarizationResult::safe(); 1280 return ScalarizationResult::unsafe(); 1281 } 1282 1283 // If the index may be poison, check if we can insert a freeze before the 1284 // range of the index is restricted. 1285 Value *IdxBase; 1286 ConstantInt *CI; 1287 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) { 1288 IdxRange = IdxRange.binaryAnd(CI->getValue()); 1289 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) { 1290 IdxRange = IdxRange.urem(CI->getValue()); 1291 } 1292 1293 if (ValidIndices.contains(IdxRange)) 1294 return ScalarizationResult::safeWithFreeze(IdxBase); 1295 return ScalarizationResult::unsafe(); 1296 } 1297 1298 /// The memory operation on a vector of \p ScalarType had alignment of 1299 /// \p VectorAlignment. Compute the maximal, but conservatively correct, 1300 /// alignment that will be valid for the memory operation on a single scalar 1301 /// element of the same type with index \p Idx. 1302 static Align computeAlignmentAfterScalarization(Align VectorAlignment, 1303 Type *ScalarType, Value *Idx, 1304 const DataLayout &DL) { 1305 if (auto *C = dyn_cast<ConstantInt>(Idx)) 1306 return commonAlignment(VectorAlignment, 1307 C->getZExtValue() * DL.getTypeStoreSize(ScalarType)); 1308 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType)); 1309 } 1310 1311 // Combine patterns like: 1312 // %0 = load <4 x i32>, <4 x i32>* %a 1313 // %1 = insertelement <4 x i32> %0, i32 %b, i32 1 1314 // store <4 x i32> %1, <4 x i32>* %a 1315 // to: 1316 // %0 = bitcast <4 x i32>* %a to i32* 1317 // %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1 1318 // store i32 %b, i32* %1 1319 bool VectorCombine::foldSingleElementStore(Instruction &I) { 1320 auto *SI = cast<StoreInst>(&I); 1321 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType())) 1322 return false; 1323 1324 // TODO: Combine more complicated patterns (multiple insert) by referencing 1325 // TargetTransformInfo. 1326 Instruction *Source; 1327 Value *NewElement; 1328 Value *Idx; 1329 if (!match(SI->getValueOperand(), 1330 m_InsertElt(m_Instruction(Source), m_Value(NewElement), 1331 m_Value(Idx)))) 1332 return false; 1333 1334 if (auto *Load = dyn_cast<LoadInst>(Source)) { 1335 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType()); 1336 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); 1337 // Don't optimize for atomic/volatile load or store. 
Ensure memory is not 1338 // modified between, vector type matches store size, and index is inbounds. 1339 if (!Load->isSimple() || Load->getParent() != SI->getParent() || 1340 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || 1341 SrcAddr != SI->getPointerOperand()->stripPointerCasts()) 1342 return false; 1343 1344 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT); 1345 if (ScalarizableIdx.isUnsafe() || 1346 isMemModifiedBetween(Load->getIterator(), SI->getIterator(), 1347 MemoryLocation::get(SI), AA)) 1348 return false; 1349 1350 // Ensure we add the load back to the worklist BEFORE its users so they can 1351 // erased in the correct order. 1352 Worklist.push(Load); 1353 1354 if (ScalarizableIdx.isSafeWithFreeze()) 1355 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); 1356 Value *GEP = Builder.CreateInBoundsGEP( 1357 SI->getValueOperand()->getType(), SI->getPointerOperand(), 1358 {ConstantInt::get(Idx->getType(), 0), Idx}); 1359 StoreInst *NSI = Builder.CreateStore(NewElement, GEP); 1360 NSI->copyMetadata(*SI); 1361 Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1362 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx, 1363 *DL); 1364 NSI->setAlignment(ScalarOpAlignment); 1365 replaceValue(I, *NSI); 1366 eraseInstruction(I); 1367 return true; 1368 } 1369 1370 return false; 1371 } 1372 1373 /// Try to scalarize vector loads feeding extractelement instructions. 1374 bool VectorCombine::scalarizeLoadExtract(Instruction &I) { 1375 Value *Ptr; 1376 if (!match(&I, m_Load(m_Value(Ptr)))) 1377 return false; 1378 1379 auto *LI = cast<LoadInst>(&I); 1380 auto *VecTy = cast<VectorType>(LI->getType()); 1381 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) 1382 return false; 1383 1384 InstructionCost OriginalCost = 1385 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), 1386 LI->getPointerAddressSpace(), CostKind); 1387 InstructionCost ScalarizedCost = 0; 1388 1389 Instruction *LastCheckedInst = LI; 1390 unsigned NumInstChecked = 0; 1391 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; 1392 auto FailureGuard = make_scope_exit([&]() { 1393 // If the transform is aborted, discard the ScalarizationResults. 1394 for (auto &Pair : NeedFreeze) 1395 Pair.second.discard(); 1396 }); 1397 1398 // Check if all users of the load are extracts with no memory modifications 1399 // between the load and the extract. Compute the cost of both the original 1400 // code and the scalarized version. 1401 for (User *U : LI->users()) { 1402 auto *UI = dyn_cast<ExtractElementInst>(U); 1403 if (!UI || UI->getParent() != LI->getParent()) 1404 return false; 1405 1406 // Check if any instruction between the load and the extract may modify 1407 // memory. 1408 if (LastCheckedInst->comesBefore(UI)) { 1409 for (Instruction &I : 1410 make_range(std::next(LI->getIterator()), UI->getIterator())) { 1411 // Bail out if we reached the check limit or the instruction may write 1412 // to memory. 
1413 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1414 return false;
1415 NumInstChecked++;
1416 }
1417 LastCheckedInst = UI;
1418 }
1419
1420 auto ScalarIdx =
1421 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1422 if (ScalarIdx.isUnsafe())
1423 return false;
1424 if (ScalarIdx.isSafeWithFreeze()) {
1425 NeedFreeze.try_emplace(UI, ScalarIdx);
1426 ScalarIdx.discard();
1427 }
1428
1429 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1430 OriginalCost +=
1431 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1432 Index ? Index->getZExtValue() : -1);
1433 ScalarizedCost +=
1434 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1435 Align(1), LI->getPointerAddressSpace(), CostKind);
1436 ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
1437 }
1438
1439 if (ScalarizedCost >= OriginalCost)
1440 return false;
1441
1442 // Ensure we add the load back to the worklist BEFORE its users so they can
1443 // be erased in the correct order.
1444 Worklist.push(LI);
1445
1446 // Replace extracts with narrow scalar loads.
1447 for (User *U : LI->users()) {
1448 auto *EI = cast<ExtractElementInst>(U);
1449 Value *Idx = EI->getIndexOperand();
1450
1451 // Insert 'freeze' for poison indexes.
1452 auto It = NeedFreeze.find(EI);
1453 if (It != NeedFreeze.end())
1454 It->second.freeze(Builder, *cast<Instruction>(Idx));
1455
1456 Builder.SetInsertPoint(EI);
1457 Value *GEP =
1458 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1459 auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
1460 VecTy->getElementType(), GEP, EI->getName() + ".scalar"));
1461
1462 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1463 LI->getAlign(), VecTy->getElementType(), Idx, *DL);
1464 NewLoad->setAlignment(ScalarOpAlignment);
1465
1466 replaceValue(*EI, *NewLoad);
1467 }
1468
1469 FailureGuard.release();
1470 return true;
1471 }
1472
1473 /// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
1474 /// to "(bitcast (concat X, Y))"
1475 /// where X/Y are bitcasted from i1 mask vectors.
1476 bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
1477 Type *Ty = I.getType();
1478 if (!Ty->isIntegerTy())
1479 return false;
1480
1481 // TODO: Add big endian test coverage
1482 if (DL->isBigEndian())
1483 return false;
1484
1485 // Restrict to disjoint cases so the mask vectors aren't overlapping.
1486 Instruction *X, *Y;
1487 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
1488 return false;
1489
1490 // Allow both sources to contain shl, to handle a more generic pattern:
1491 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
1492 Value *SrcX;
1493 uint64_t ShAmtX = 0;
1494 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
1495 !match(X, m_OneUse(
1496 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
1497 m_ConstantInt(ShAmtX)))))
1498 return false;
1499
1500 Value *SrcY;
1501 uint64_t ShAmtY = 0;
1502 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
1503 !match(Y, m_OneUse(
1504 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
1505 m_ConstantInt(ShAmtY)))))
1506 return false;
1507
1508 // Canonicalize larger shift to the RHS.
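// (Illustrative: e.g. with ShAmtX = 8 and ShAmtY = 0 the operands are swapped
// so that, from here on, SrcX always supplies the low, lesser-shifted half of
// the result and SrcY the high half.)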
1509 if (ShAmtX > ShAmtY) { 1510 std::swap(X, Y); 1511 std::swap(SrcX, SrcY); 1512 std::swap(ShAmtX, ShAmtY); 1513 } 1514 1515 // Ensure both sources are matching vXi1 bool mask types, and that the shift 1516 // difference is the mask width so they can be easily concatenated together. 1517 uint64_t ShAmtDiff = ShAmtY - ShAmtX; 1518 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0); 1519 unsigned BitWidth = Ty->getPrimitiveSizeInBits(); 1520 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType()); 1521 if (!MaskTy || SrcX->getType() != SrcY->getType() || 1522 !MaskTy->getElementType()->isIntegerTy(1) || 1523 MaskTy->getNumElements() != ShAmtDiff || 1524 MaskTy->getNumElements() > (BitWidth / 2)) 1525 return false; 1526 1527 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy); 1528 auto *ConcatIntTy = 1529 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements()); 1530 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff); 1531 1532 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements()); 1533 std::iota(ConcatMask.begin(), ConcatMask.end(), 0); 1534 1535 // TODO: Is it worth supporting multi use cases? 1536 InstructionCost OldCost = 0; 1537 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind); 1538 OldCost += 1539 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); 1540 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy, 1541 TTI::CastContextHint::None, CostKind); 1542 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy, 1543 TTI::CastContextHint::None, CostKind); 1544 1545 InstructionCost NewCost = 0; 1546 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy, 1547 ConcatMask, CostKind); 1548 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy, 1549 TTI::CastContextHint::None, CostKind); 1550 if (Ty != ConcatIntTy) 1551 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy, 1552 TTI::CastContextHint::None, CostKind); 1553 if (ShAmtX > 0) 1554 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); 1555 1556 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I 1557 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1558 << "\n"); 1559 1560 if (NewCost > OldCost) 1561 return false; 1562 1563 // Build bool mask concatenation, bitcast back to scalar integer, and perform 1564 // any residual zero-extension or shifting. 1565 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask); 1566 Worklist.pushValue(Concat); 1567 1568 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy); 1569 1570 if (Ty != ConcatIntTy) { 1571 Worklist.pushValue(Result); 1572 Result = Builder.CreateZExt(Result, Ty); 1573 } 1574 1575 if (ShAmtX > 0) { 1576 Worklist.pushValue(Result); 1577 Result = Builder.CreateShl(Result, ShAmtX); 1578 } 1579 1580 replaceValue(I, *Result); 1581 return true; 1582 } 1583 1584 /// Try to convert "shuffle (binop (shuffle, shuffle)), undef" 1585 /// --> "binop (shuffle), (shuffle)". 1586 bool VectorCombine::foldPermuteOfBinops(Instruction &I) { 1587 BinaryOperator *BinOp; 1588 ArrayRef<int> OuterMask; 1589 if (!match(&I, 1590 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask)))) 1591 return false; 1592 1593 // Don't introduce poison into div/rem. 
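// (A poison element in the outer mask would become a poison lane in the new
// binop's operands; for udiv/sdiv/urem/srem a poison divisor lane can be
// immediate UB, so conservatively bail out.)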
1594 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem)) 1595 return false; 1596 1597 Value *Op00, *Op01, *Op10, *Op11; 1598 ArrayRef<int> Mask0, Mask1; 1599 bool Match0 = 1600 match(BinOp->getOperand(0), 1601 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))); 1602 bool Match1 = 1603 match(BinOp->getOperand(1), 1604 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))); 1605 if (!Match0 && !Match1) 1606 return false; 1607 1608 Op00 = Match0 ? Op00 : BinOp->getOperand(0); 1609 Op01 = Match0 ? Op01 : BinOp->getOperand(0); 1610 Op10 = Match1 ? Op10 : BinOp->getOperand(1); 1611 Op11 = Match1 ? Op11 : BinOp->getOperand(1); 1612 1613 Instruction::BinaryOps Opcode = BinOp->getOpcode(); 1614 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1615 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType()); 1616 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType()); 1617 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType()); 1618 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty) 1619 return false; 1620 1621 unsigned NumSrcElts = BinOpTy->getNumElements(); 1622 1623 // Don't accept shuffles that reference the second operand in 1624 // div/rem or if its an undef arg. 1625 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) && 1626 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; })) 1627 return false; 1628 1629 // Merge outer / inner (or identity if no match) shuffles. 1630 SmallVector<int> NewMask0, NewMask1; 1631 for (int M : OuterMask) { 1632 if (M < 0 || M >= (int)NumSrcElts) { 1633 NewMask0.push_back(PoisonMaskElem); 1634 NewMask1.push_back(PoisonMaskElem); 1635 } else { 1636 NewMask0.push_back(Match0 ? Mask0[M] : M); 1637 NewMask1.push_back(Match1 ? Mask1[M] : M); 1638 } 1639 } 1640 1641 unsigned NumOpElts = Op0Ty->getNumElements(); 1642 bool IsIdentity0 = ShuffleDstTy == Op0Ty && 1643 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) && 1644 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts); 1645 bool IsIdentity1 = ShuffleDstTy == Op1Ty && 1646 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) && 1647 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts); 1648 1649 // Try to merge shuffles across the binop if the new shuffles are not costly. 
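// Illustrative example (hypothetical values; the fold only fires if the cost
// comparison below allows it):
//   %s0 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %s1 = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %bo = add <4 x i32> %s0, %s1
//   %r  = shufflevector <4 x i32> %bo, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// Here the merged masks are identities, so this becomes a single binop:
//   %r  = add <4 x i32> %a, %b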
1650 InstructionCost OldCost = 1651 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) + 1652 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy, 1653 OuterMask, CostKind, 0, nullptr, {BinOp}, &I); 1654 if (Match0) 1655 OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, 1656 Mask0, CostKind, 0, nullptr, {Op00, Op01}, 1657 cast<Instruction>(BinOp->getOperand(0))); 1658 if (Match1) 1659 OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, 1660 Mask1, CostKind, 0, nullptr, {Op10, Op11}, 1661 cast<Instruction>(BinOp->getOperand(1))); 1662 1663 InstructionCost NewCost = 1664 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); 1665 1666 if (!IsIdentity0) 1667 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, 1668 NewMask0, CostKind, 0, nullptr, {Op00, Op01}); 1669 if (!IsIdentity1) 1670 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, 1671 NewMask1, CostKind, 0, nullptr, {Op10, Op11}); 1672 1673 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I 1674 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1675 << "\n"); 1676 1677 // If costs are equal, still fold as we reduce instruction count. 1678 if (NewCost > OldCost) 1679 return false; 1680 1681 Value *LHS = 1682 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0); 1683 Value *RHS = 1684 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1); 1685 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS); 1686 1687 // Intersect flags from the old binops. 1688 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) 1689 NewInst->copyIRFlags(BinOp); 1690 1691 Worklist.pushValue(LHS); 1692 Worklist.pushValue(RHS); 1693 replaceValue(I, *NewBO); 1694 return true; 1695 } 1696 1697 /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". 1698 /// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)". 1699 bool VectorCombine::foldShuffleOfBinops(Instruction &I) { 1700 ArrayRef<int> OldMask; 1701 Instruction *LHS, *RHS; 1702 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)), 1703 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask)))) 1704 return false; 1705 1706 // TODO: Add support for addlike etc. 1707 if (LHS->getOpcode() != RHS->getOpcode()) 1708 return false; 1709 1710 Value *X, *Y, *Z, *W; 1711 bool IsCommutative = false; 1712 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE; 1713 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE; 1714 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) && 1715 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) { 1716 auto *BO = cast<BinaryOperator>(LHS); 1717 // Don't introduce poison into div/rem. 
1718 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem()) 1719 return false; 1720 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode()); 1721 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) && 1722 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) && 1723 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) { 1724 IsCommutative = cast<CmpInst>(LHS)->isCommutative(); 1725 } else 1726 return false; 1727 1728 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1729 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType()); 1730 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType()); 1731 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType()) 1732 return false; 1733 1734 unsigned NumSrcElts = BinOpTy->getNumElements(); 1735 1736 // If we have something like "add X, Y" and "add Z, X", swap ops to match. 1737 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z)) 1738 std::swap(X, Y); 1739 1740 auto ConvertToUnary = [NumSrcElts](int &M) { 1741 if (M >= (int)NumSrcElts) 1742 M -= NumSrcElts; 1743 }; 1744 1745 SmallVector<int> NewMask0(OldMask); 1746 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc; 1747 if (X == Z) { 1748 llvm::for_each(NewMask0, ConvertToUnary); 1749 SK0 = TargetTransformInfo::SK_PermuteSingleSrc; 1750 Z = PoisonValue::get(BinOpTy); 1751 } 1752 1753 SmallVector<int> NewMask1(OldMask); 1754 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc; 1755 if (Y == W) { 1756 llvm::for_each(NewMask1, ConvertToUnary); 1757 SK1 = TargetTransformInfo::SK_PermuteSingleSrc; 1758 W = PoisonValue::get(BinOpTy); 1759 } 1760 1761 // Try to replace a binop with a shuffle if the shuffle is not costly. 1762 InstructionCost OldCost = 1763 TTI.getInstructionCost(LHS, CostKind) + 1764 TTI.getInstructionCost(RHS, CostKind) + 1765 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, 1766 OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); 1767 1768 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns 1769 // where one use shuffles have gotten split across the binop/cmp. These 1770 // often allow a major reduction in total cost that wouldn't happen as 1771 // individual folds. 1772 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask, 1773 TTI::TargetCostKind CostKind) -> bool { 1774 Value *InnerOp; 1775 ArrayRef<int> InnerMask; 1776 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(), 1777 m_Mask(InnerMask)))) && 1778 InnerOp->getType() == Op->getType() && 1779 all_of(InnerMask, 1780 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) { 1781 for (int &M : Mask) 1782 if (Offset <= M && M < (int)(Offset + NumSrcElts)) { 1783 M = InnerMask[M - Offset]; 1784 M = 0 <= M ? 
M + Offset : M; 1785 } 1786 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind); 1787 Op = InnerOp; 1788 return true; 1789 } 1790 return false; 1791 }; 1792 bool ReducedInstCount = false; 1793 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind); 1794 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind); 1795 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind); 1796 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind); 1797 1798 InstructionCost NewCost = 1799 TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + 1800 TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); 1801 1802 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) { 1803 NewCost += 1804 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind); 1805 } else { 1806 auto *ShuffleCmpTy = 1807 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy); 1808 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, 1809 ShuffleDstTy, PredLHS, CostKind); 1810 } 1811 1812 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I 1813 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1814 << "\n"); 1815 1816 // If either shuffle will constant fold away, then fold for the same cost as 1817 // we will reduce the instruction count. 1818 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) || 1819 (isa<Constant>(Y) && isa<Constant>(W)); 1820 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) 1821 return false; 1822 1823 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); 1824 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); 1825 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE 1826 ? Builder.CreateBinOp( 1827 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1) 1828 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1); 1829 1830 // Intersect flags from the old binops. 1831 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { 1832 NewInst->copyIRFlags(LHS); 1833 NewInst->andIRFlags(RHS); 1834 } 1835 1836 Worklist.pushValue(Shuf0); 1837 Worklist.pushValue(Shuf1); 1838 replaceValue(I, *NewBO); 1839 return true; 1840 } 1841 1842 /// Try to convert "shuffle (castop), (castop)" with a shared castop operand 1843 /// into "castop (shuffle)". 1844 bool VectorCombine::foldShuffleOfCastops(Instruction &I) { 1845 Value *V0, *V1; 1846 ArrayRef<int> OldMask; 1847 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask)))) 1848 return false; 1849 1850 auto *C0 = dyn_cast<CastInst>(V0); 1851 auto *C1 = dyn_cast<CastInst>(V1); 1852 if (!C0 || !C1) 1853 return false; 1854 1855 Instruction::CastOps Opcode = C0->getOpcode(); 1856 if (C0->getSrcTy() != C1->getSrcTy()) 1857 return false; 1858 1859 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. 
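// (A zext nneg only ever sees a non-negative source value, so it produces the
// same result as sext; m_SExtLike accepts either form, which lets both casts
// be merged as a single SExt.)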
1860 if (Opcode != C1->getOpcode()) { 1861 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) 1862 Opcode = Instruction::SExt; 1863 else 1864 return false; 1865 } 1866 1867 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 1868 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy()); 1869 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy()); 1870 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy) 1871 return false; 1872 1873 unsigned NumSrcElts = CastSrcTy->getNumElements(); 1874 unsigned NumDstElts = CastDstTy->getNumElements(); 1875 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) && 1876 "Only bitcasts expected to alter src/dst element counts"); 1877 1878 // Check for bitcasting of unscalable vector types. 1879 // e.g. <32 x i40> -> <40 x i32> 1880 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 && 1881 (NumDstElts % NumSrcElts) != 0) 1882 return false; 1883 1884 SmallVector<int, 16> NewMask; 1885 if (NumSrcElts >= NumDstElts) { 1886 // The bitcast is from wide to narrow/equal elements. The shuffle mask can 1887 // always be expanded to the equivalent form choosing narrower elements. 1888 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask"); 1889 unsigned ScaleFactor = NumSrcElts / NumDstElts; 1890 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask); 1891 } else { 1892 // The bitcast is from narrow elements to wide elements. The shuffle mask 1893 // must choose consecutive elements to allow casting first. 1894 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask"); 1895 unsigned ScaleFactor = NumDstElts / NumSrcElts; 1896 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask)) 1897 return false; 1898 } 1899 1900 auto *NewShuffleDstTy = 1901 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size()); 1902 1903 // Try to replace a castop with a shuffle if the shuffle is not costly. 1904 InstructionCost CostC0 = 1905 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, 1906 TTI::CastContextHint::None, CostKind); 1907 InstructionCost CostC1 = 1908 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, 1909 TTI::CastContextHint::None, CostKind); 1910 InstructionCost OldCost = CostC0 + CostC1; 1911 OldCost += 1912 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy, 1913 OldMask, CostKind, 0, nullptr, {}, &I); 1914 1915 InstructionCost NewCost = TTI.getShuffleCost( 1916 TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind); 1917 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, 1918 TTI::CastContextHint::None, CostKind); 1919 if (!C0->hasOneUse()) 1920 NewCost += CostC0; 1921 if (!C1->hasOneUse()) 1922 NewCost += CostC1; 1923 1924 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I 1925 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 1926 << "\n"); 1927 if (NewCost > OldCost) 1928 return false; 1929 1930 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0), 1931 C1->getOperand(0), NewMask); 1932 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); 1933 1934 // Intersect flags from the old casts. 
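// (copyIRFlags followed by andIRFlags keeps only the poison-generating flags,
// e.g. nneg, that both of the original casts carried.)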
1935 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
1936 NewInst->copyIRFlags(C0);
1937 NewInst->andIRFlags(C1);
1938 }
1939
1940 Worklist.pushValue(Shuf);
1941 replaceValue(I, *Cast);
1942 return true;
1943 }
1944
1945 /// Try to convert any of:
1946 /// "shuffle (shuffle x, y), (shuffle y, x)"
1947 /// "shuffle (shuffle x, undef), (shuffle y, undef)"
1948 /// "shuffle (shuffle x, undef), y"
1949 /// "shuffle x, (shuffle y, undef)"
1950 /// into "shuffle x, y".
1951 bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
1952 ArrayRef<int> OuterMask;
1953 Value *OuterV0, *OuterV1;
1954 if (!match(&I,
1955 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
1956 return false;
1957
1958 ArrayRef<int> InnerMask0, InnerMask1;
1959 Value *X0, *X1, *Y0, *Y1;
1960 bool Match0 =
1961 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
1962 bool Match1 =
1963 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
1964 if (!Match0 && !Match1)
1965 return false;
1966
1967 X0 = Match0 ? X0 : OuterV0;
1968 Y0 = Match0 ? Y0 : OuterV0;
1969 X1 = Match1 ? X1 : OuterV1;
1970 Y1 = Match1 ? Y1 : OuterV1;
1971 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
1972 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
1973 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
1974 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
1975 X0->getType() != X1->getType())
1976 return false;
1977
1978 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
1979 unsigned NumImmElts = ShuffleImmTy->getNumElements();
1980
1981 // Attempt to merge shuffles, matching up to 2 source operands.
1982 // Replace index to a poison arg with PoisonMaskElem.
1983 // Bail if either inner mask references a (non-poison) undef arg.
1984 SmallVector<int, 16> NewMask(OuterMask);
1985 Value *NewX = nullptr, *NewY = nullptr;
1986 for (int &M : NewMask) {
1987 Value *Src = nullptr;
1988 if (0 <= M && M < (int)NumImmElts) {
1989 Src = OuterV0;
1990 if (Match0) {
1991 M = InnerMask0[M];
1992 Src = M >= (int)NumSrcElts ? Y0 : X0;
1993 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
1994 }
1995 } else if (M >= (int)NumImmElts) {
1996 Src = OuterV1;
1997 M -= NumImmElts;
1998 if (Match1) {
1999 M = InnerMask1[M];
2000 Src = M >= (int)NumSrcElts ? Y1 : X1;
2001 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2002 }
2003 }
2004 if (Src && M != PoisonMaskElem) {
2005 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2006 if (isa<UndefValue>(Src)) {
2007 // We've referenced an undef element - if it's poison, update the shuffle
2008 // mask, else bail.
2009 if (!isa<PoisonValue>(Src))
2010 return false;
2011 M = PoisonMaskElem;
2012 continue;
2013 }
2014 if (!NewX || NewX == Src) {
2015 NewX = Src;
2016 continue;
2017 }
2018 if (!NewY || NewY == Src) {
2019 M += NumSrcElts;
2020 NewY = Src;
2021 continue;
2022 }
2023 return false;
2024 }
2025 }
2026
2027 if (!NewX) { // Every result lane is poison - fold to a poison vector.
2028 replaceValue(I, *PoisonValue::get(ShuffleDstTy));
 return true;
 }
2029 if (!NewY)
2030 NewY = PoisonValue::get(ShuffleSrcTy);
2031
2032 // Have we folded to an Identity shuffle?
2033 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2034 replaceValue(I, *NewX);
2035 return true;
2036 }
2037
2038 // Try to merge the shuffles if the new shuffle is not costly.
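// Illustrative example (hypothetical values), assuming the cost comparison
// below does not reject it:
//   %i0 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %i1 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
//   %r  = shufflevector <4 x i32> %i0, <4 x i32> %i1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// merges into a single two-source shuffle:
//   %r  = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 7>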
2039 InstructionCost InnerCost0 = 0; 2040 if (Match0) 2041 InnerCost0 = TTI.getInstructionCost(cast<Instruction>(OuterV0), CostKind); 2042 2043 InstructionCost InnerCost1 = 0; 2044 if (Match1) 2045 InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); 2046 2047 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind); 2048 2049 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost; 2050 2051 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; }); 2052 TargetTransformInfo::ShuffleKind SK = 2053 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc 2054 : TargetTransformInfo::SK_PermuteTwoSrc; 2055 InstructionCost NewCost = TTI.getShuffleCost( 2056 SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY}); 2057 if (!OuterV0->hasOneUse()) 2058 NewCost += InnerCost0; 2059 if (!OuterV1->hasOneUse()) 2060 NewCost += InnerCost1; 2061 2062 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I 2063 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 2064 << "\n"); 2065 if (NewCost > OldCost) 2066 return false; 2067 2068 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask); 2069 replaceValue(I, *Shuf); 2070 return true; 2071 } 2072 2073 /// Try to convert 2074 /// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)". 2075 bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) { 2076 Value *V0, *V1; 2077 ArrayRef<int> OldMask; 2078 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)), 2079 m_Mask(OldMask)))) 2080 return false; 2081 2082 auto *II0 = dyn_cast<IntrinsicInst>(V0); 2083 auto *II1 = dyn_cast<IntrinsicInst>(V1); 2084 if (!II0 || !II1) 2085 return false; 2086 2087 Intrinsic::ID IID = II0->getIntrinsicID(); 2088 if (IID != II1->getIntrinsicID()) 2089 return false; 2090 2091 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); 2092 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType()); 2093 if (!ShuffleDstTy || !II0Ty) 2094 return false; 2095 2096 if (!isTriviallyVectorizable(IID)) 2097 return false; 2098 2099 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) 2100 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) && 2101 II0->getArgOperand(I) != II1->getArgOperand(I)) 2102 return false; 2103 2104 InstructionCost OldCost = 2105 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) + 2106 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) + 2107 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask, 2108 CostKind, 0, nullptr, {II0, II1}, &I); 2109 2110 SmallVector<Type *> NewArgsTy; 2111 InstructionCost NewCost = 0; 2112 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) 2113 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) { 2114 NewArgsTy.push_back(II0->getArgOperand(I)->getType()); 2115 } else { 2116 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType()); 2117 NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(), 2118 VecTy->getNumElements() * 2)); 2119 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, 2120 VecTy, OldMask, CostKind); 2121 } 2122 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy); 2123 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind); 2124 2125 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I 2126 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost 2127 << "\n"); 2128 2129 if (NewCost > OldCost) 2130 return false; 2131 2132 SmallVector<Value *> NewArgs; 2133 for (unsigned I = 
0, E = II0->arg_size(); I != E; ++I) 2134 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) { 2135 NewArgs.push_back(II0->getArgOperand(I)); 2136 } else { 2137 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), 2138 II1->getArgOperand(I), OldMask); 2139 NewArgs.push_back(Shuf); 2140 Worklist.pushValue(Shuf); 2141 } 2142 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs); 2143 2144 // Intersect flags from the old intrinsics. 2145 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) { 2146 NewInst->copyIRFlags(II0); 2147 NewInst->andIRFlags(II1); 2148 } 2149 2150 replaceValue(I, *NewIntrinsic); 2151 return true; 2152 } 2153 2154 using InstLane = std::pair<Use *, int>; 2155 2156 static InstLane lookThroughShuffles(Use *U, int Lane) { 2157 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) { 2158 unsigned NumElts = 2159 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); 2160 int M = SV->getMaskValue(Lane); 2161 if (M < 0) 2162 return {nullptr, PoisonMaskElem}; 2163 if (static_cast<unsigned>(M) < NumElts) { 2164 U = &SV->getOperandUse(0); 2165 Lane = M; 2166 } else { 2167 U = &SV->getOperandUse(1); 2168 Lane = M - NumElts; 2169 } 2170 } 2171 return InstLane{U, Lane}; 2172 } 2173 2174 static SmallVector<InstLane> 2175 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { 2176 SmallVector<InstLane> NItem; 2177 for (InstLane IL : Item) { 2178 auto [U, Lane] = IL; 2179 InstLane OpLane = 2180 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op), 2181 Lane) 2182 : InstLane{nullptr, PoisonMaskElem}; 2183 NItem.emplace_back(OpLane); 2184 } 2185 return NItem; 2186 } 2187 2188 /// Detect concat of multiple values into a vector 2189 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind, 2190 const TargetTransformInfo &TTI) { 2191 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType()); 2192 unsigned NumElts = Ty->getNumElements(); 2193 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0) 2194 return false; 2195 2196 // Check that the concat is free, usually meaning that the type will be split 2197 // during legalization. 2198 SmallVector<int, 16> ConcatMask(NumElts * 2); 2199 std::iota(ConcatMask.begin(), ConcatMask.end(), 0); 2200 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0) 2201 return false; 2202 2203 unsigned NumSlices = Item.size() / NumElts; 2204 // Currently we generate a tree of shuffles for the concats, which limits us 2205 // to a power2. 
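// (e.g. four <4 x i32> slices are concatenated into a <16 x i32> through two
// levels of pairwise shuffles)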
2206 if (!isPowerOf2_32(NumSlices)) 2207 return false; 2208 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) { 2209 Use *SliceV = Item[Slice * NumElts].first; 2210 if (!SliceV || SliceV->get()->getType() != Ty) 2211 return false; 2212 for (unsigned Elt = 0; Elt < NumElts; ++Elt) { 2213 auto [V, Lane] = Item[Slice * NumElts + Elt]; 2214 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get()) 2215 return false; 2216 } 2217 } 2218 return true; 2219 } 2220 2221 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty, 2222 const SmallPtrSet<Use *, 4> &IdentityLeafs, 2223 const SmallPtrSet<Use *, 4> &SplatLeafs, 2224 const SmallPtrSet<Use *, 4> &ConcatLeafs, 2225 IRBuilder<> &Builder, 2226 const TargetTransformInfo *TTI) { 2227 auto [FrontU, FrontLane] = Item.front(); 2228 2229 if (IdentityLeafs.contains(FrontU)) { 2230 return FrontU->get(); 2231 } 2232 if (SplatLeafs.contains(FrontU)) { 2233 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane); 2234 return Builder.CreateShuffleVector(FrontU->get(), Mask); 2235 } 2236 if (ConcatLeafs.contains(FrontU)) { 2237 unsigned NumElts = 2238 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements(); 2239 SmallVector<Value *> Values(Item.size() / NumElts, nullptr); 2240 for (unsigned S = 0; S < Values.size(); ++S) 2241 Values[S] = Item[S * NumElts].first->get(); 2242 2243 while (Values.size() > 1) { 2244 NumElts *= 2; 2245 SmallVector<int, 16> Mask(NumElts, 0); 2246 std::iota(Mask.begin(), Mask.end(), 0); 2247 SmallVector<Value *> NewValues(Values.size() / 2, nullptr); 2248 for (unsigned S = 0; S < NewValues.size(); ++S) 2249 NewValues[S] = 2250 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask); 2251 Values = NewValues; 2252 } 2253 return Values[0]; 2254 } 2255 2256 auto *I = cast<Instruction>(FrontU->get()); 2257 auto *II = dyn_cast<IntrinsicInst>(I); 2258 unsigned NumOps = I->getNumOperands() - (II ? 
1 : 0); 2259 SmallVector<Value *> Ops(NumOps); 2260 for (unsigned Idx = 0; Idx < NumOps; Idx++) { 2261 if (II && 2262 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) { 2263 Ops[Idx] = II->getOperand(Idx); 2264 continue; 2265 } 2266 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), 2267 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs, 2268 Builder, TTI); 2269 } 2270 2271 SmallVector<Value *, 8> ValueList; 2272 for (const auto &Lane : Item) 2273 if (Lane.first) 2274 ValueList.push_back(Lane.first->get()); 2275 2276 Type *DstTy = 2277 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements()); 2278 if (auto *BI = dyn_cast<BinaryOperator>(I)) { 2279 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), 2280 Ops[0], Ops[1]); 2281 propagateIRFlags(Value, ValueList); 2282 return Value; 2283 } 2284 if (auto *CI = dyn_cast<CmpInst>(I)) { 2285 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]); 2286 propagateIRFlags(Value, ValueList); 2287 return Value; 2288 } 2289 if (auto *SI = dyn_cast<SelectInst>(I)) { 2290 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI); 2291 propagateIRFlags(Value, ValueList); 2292 return Value; 2293 } 2294 if (auto *CI = dyn_cast<CastInst>(I)) { 2295 auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), 2296 Ops[0], DstTy); 2297 propagateIRFlags(Value, ValueList); 2298 return Value; 2299 } 2300 if (II) { 2301 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops); 2302 propagateIRFlags(Value, ValueList); 2303 return Value; 2304 } 2305 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate"); 2306 auto *Value = 2307 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]); 2308 propagateIRFlags(Value, ValueList); 2309 return Value; 2310 } 2311 2312 // Starting from a shuffle, look up through operands tracking the shuffled index 2313 // of each lane. If we can simplify away the shuffles to identities then 2314 // do so. 2315 bool VectorCombine::foldShuffleToIdentity(Instruction &I) { 2316 auto *Ty = dyn_cast<FixedVectorType>(I.getType()); 2317 if (!Ty || I.use_empty()) 2318 return false; 2319 2320 SmallVector<InstLane> Start(Ty->getNumElements()); 2321 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M) 2322 Start[M] = lookThroughShuffles(&*I.use_begin(), M); 2323 2324 SmallVector<SmallVector<InstLane>> Worklist; 2325 Worklist.push_back(Start); 2326 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs; 2327 unsigned NumVisited = 0; 2328 2329 while (!Worklist.empty()) { 2330 if (++NumVisited > MaxInstrsToScan) 2331 return false; 2332 2333 SmallVector<InstLane> Item = Worklist.pop_back_val(); 2334 auto [FrontU, FrontLane] = Item.front(); 2335 2336 // If we found an undef first lane then bail out to keep things simple. 2337 if (!FrontU) 2338 return false; 2339 2340 // Helper to peek through bitcasts to the same value. 2341 auto IsEquiv = [&](Value *X, Value *Y) { 2342 return X->getType() == Y->getType() && 2343 peekThroughBitcasts(X) == peekThroughBitcasts(Y); 2344 }; 2345 2346 // Look for an identity value. 
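// That is, lane L of every entry comes from lane L of one common source whose
// element count matches the root type, comparing values through no-op
// bitcasts via IsEquiv above.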
2347 if (FrontLane == 0 && 2348 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() == 2349 Ty->getNumElements() && 2350 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) { 2351 Value *FrontV = Item.front().first->get(); 2352 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) && 2353 E.value().second == (int)E.index()); 2354 })) { 2355 IdentityLeafs.insert(FrontU); 2356 continue; 2357 } 2358 // Look for constants, for the moment only supporting constant splats. 2359 if (auto *C = dyn_cast<Constant>(FrontU); 2360 C && C->getSplatValue() && 2361 all_of(drop_begin(Item), [Item](InstLane &IL) { 2362 Value *FrontV = Item.front().first->get(); 2363 Use *U = IL.first; 2364 return !U || (isa<Constant>(U->get()) && 2365 cast<Constant>(U->get())->getSplatValue() == 2366 cast<Constant>(FrontV)->getSplatValue()); 2367 })) { 2368 SplatLeafs.insert(FrontU); 2369 continue; 2370 } 2371 // Look for a splat value. 2372 if (all_of(drop_begin(Item), [Item](InstLane &IL) { 2373 auto [FrontU, FrontLane] = Item.front(); 2374 auto [U, Lane] = IL; 2375 return !U || (U->get() == FrontU->get() && Lane == FrontLane); 2376 })) { 2377 SplatLeafs.insert(FrontU); 2378 continue; 2379 } 2380 2381 // We need each element to be the same type of value, and check that each 2382 // element has a single use. 2383 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) { 2384 Value *FrontV = Item.front().first->get(); 2385 if (!IL.first) 2386 return true; 2387 Value *V = IL.first->get(); 2388 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUse()) 2389 return false; 2390 if (V->getValueID() != FrontV->getValueID()) 2391 return false; 2392 if (auto *CI = dyn_cast<CmpInst>(V)) 2393 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate()) 2394 return false; 2395 if (auto *CI = dyn_cast<CastInst>(V)) 2396 if (CI->getSrcTy()->getScalarType() != 2397 cast<CastInst>(FrontV)->getSrcTy()->getScalarType()) 2398 return false; 2399 if (auto *SI = dyn_cast<SelectInst>(V)) 2400 if (!isa<VectorType>(SI->getOperand(0)->getType()) || 2401 SI->getOperand(0)->getType() != 2402 cast<SelectInst>(FrontV)->getOperand(0)->getType()) 2403 return false; 2404 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V)) 2405 return false; 2406 auto *II = dyn_cast<IntrinsicInst>(V); 2407 return !II || (isa<IntrinsicInst>(FrontV) && 2408 II->getIntrinsicID() == 2409 cast<IntrinsicInst>(FrontV)->getIntrinsicID() && 2410 !II->hasOperandBundles()); 2411 }; 2412 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) { 2413 // Check the operator is one that we support. 2414 if (isa<BinaryOperator, CmpInst>(FrontU)) { 2415 // We exclude div/rem in case they hit UB from poison lanes. 2416 if (auto *BO = dyn_cast<BinaryOperator>(FrontU); 2417 BO && BO->isIntDivRem()) 2418 return false; 2419 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 2420 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 2421 continue; 2422 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst, 2423 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) { 2424 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 2425 continue; 2426 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) { 2427 // TODO: Handle vector widening/narrowing bitcasts. 
2428 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy()); 2429 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy()); 2430 if (DstTy && SrcTy && 2431 SrcTy->getNumElements() == DstTy->getNumElements()) { 2432 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 2433 continue; 2434 } 2435 } else if (isa<SelectInst>(FrontU)) { 2436 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); 2437 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); 2438 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); 2439 continue; 2440 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU); 2441 II && isTriviallyVectorizable(II->getIntrinsicID()) && 2442 !II->hasOperandBundles()) { 2443 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { 2444 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op, 2445 &TTI)) { 2446 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { 2447 Value *FrontV = Item.front().first->get(); 2448 Use *U = IL.first; 2449 return !U || (cast<Instruction>(U->get())->getOperand(Op) == 2450 cast<Instruction>(FrontV)->getOperand(Op)); 2451 })) 2452 return false; 2453 continue; 2454 } 2455 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op)); 2456 } 2457 continue; 2458 } 2459 } 2460 2461 if (isFreeConcat(Item, CostKind, TTI)) { 2462 ConcatLeafs.insert(FrontU); 2463 continue; 2464 } 2465 2466 return false; 2467 } 2468 2469 if (NumVisited <= 1) 2470 return false; 2471 2472 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n"); 2473 2474 // If we got this far, we know the shuffles are superfluous and can be 2475 // removed. Scan through again and generate the new tree of instructions. 2476 Builder.SetInsertPoint(&I); 2477 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, 2478 ConcatLeafs, Builder, &TTI); 2479 replaceValue(I, *V); 2480 return true; 2481 } 2482 2483 /// Given a commutative reduction, the order of the input lanes does not alter 2484 /// the results. We can use this to remove certain shuffles feeding the 2485 /// reduction, removing the need to shuffle at all. 2486 bool VectorCombine::foldShuffleFromReductions(Instruction &I) { 2487 auto *II = dyn_cast<IntrinsicInst>(&I); 2488 if (!II) 2489 return false; 2490 switch (II->getIntrinsicID()) { 2491 case Intrinsic::vector_reduce_add: 2492 case Intrinsic::vector_reduce_mul: 2493 case Intrinsic::vector_reduce_and: 2494 case Intrinsic::vector_reduce_or: 2495 case Intrinsic::vector_reduce_xor: 2496 case Intrinsic::vector_reduce_smin: 2497 case Intrinsic::vector_reduce_smax: 2498 case Intrinsic::vector_reduce_umin: 2499 case Intrinsic::vector_reduce_umax: 2500 break; 2501 default: 2502 return false; 2503 } 2504 2505 // Find all the inputs when looking through operations that do not alter the 2506 // lane order (binops, for example). Currently we look for a single shuffle, 2507 // and can ignore splat values. 2508 std::queue<Value *> Worklist; 2509 SmallPtrSet<Value *, 4> Visited; 2510 ShuffleVectorInst *Shuffle = nullptr; 2511 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) 2512 Worklist.push(Op); 2513 2514 while (!Worklist.empty()) { 2515 Value *CV = Worklist.front(); 2516 Worklist.pop(); 2517 if (Visited.contains(CV)) 2518 continue; 2519 2520 // Splats don't change the order, so can be safely ignored. 
2521 if (isSplatValue(CV))
2522 continue;
2523
2524 Visited.insert(CV);
2525
2526 if (auto *CI = dyn_cast<Instruction>(CV)) {
2527 if (CI->isBinaryOp()) {
2528 for (auto *Op : CI->operand_values())
2529 Worklist.push(Op);
2530 continue;
2531 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
2532 if (Shuffle && Shuffle != SV)
2533 return false;
2534 Shuffle = SV;
2535 continue;
2536 }
2537 }
2538
2539 // Anything else is currently an unknown node.
2540 return false;
2541 }
2542
2543 if (!Shuffle)
2544 return false;
2545
2546 // Check all uses of the binary ops and shuffles are also included in the
2547 // lane-invariant operations (Visited should be the list of lanewise
2548 // instructions, including the shuffle that we found).
2549 for (auto *V : Visited)
2550 for (auto *U : V->users())
2551 if (!Visited.contains(U) && U != &I)
2552 return false;
2553
2554 FixedVectorType *VecType =
2555 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
2556 if (!VecType)
2557 return false;
2558 FixedVectorType *ShuffleInputType =
2559 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
2560 if (!ShuffleInputType)
2561 return false;
2562 unsigned NumInputElts = ShuffleInputType->getNumElements();
2563
2564 // Find the mask from sorting the lanes into order. This is most likely to
2565 // become an identity or concat mask. Undef elements are pushed to the end.
2566 SmallVector<int> ConcatMask;
2567 Shuffle->getShuffleMask(ConcatMask);
2568 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
2569 // In the case of a truncating shuffle it's possible for the mask
2570 // to have an index greater than the size of the resulting vector.
2571 // This requires special handling.
2572 bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
2573 bool UsesSecondVec =
2574 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
2575
2576 FixedVectorType *VecTyForCost =
2577 (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
2578 InstructionCost OldCost = TTI.getShuffleCost(
2579 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2580 VecTyForCost, Shuffle->getShuffleMask(), CostKind);
2581 InstructionCost NewCost = TTI.getShuffleCost(
2582 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2583 VecTyForCost, ConcatMask, CostKind);
2584
2585 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
2586 << "\n");
2587 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
2588 << "\n");
2589 if (NewCost < OldCost) {
2590 Builder.SetInsertPoint(Shuffle);
2591 Value *NewShuffle = Builder.CreateShuffleVector(
2592 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
2593 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
2594 replaceValue(*Shuffle, *NewShuffle);
2595 }
2596
2597 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
2598 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
2599 return foldSelectShuffle(*Shuffle, true);
2600 }
2601
2602 /// Determine if it's more efficient to fold:
2603 /// reduce(trunc(x)) -> trunc(reduce(x)).
2604 /// reduce(sext(x)) -> sext(reduce(x)).
2605 /// reduce(zext(x)) -> zext(reduce(x)).
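/// For example (illustrative):
///   %e = zext <8 x i16> %x to <8 x i32>
///   %r = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %e)
/// may be cheaper to compute as:
///   %n = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x)
///   %r = zext i16 %n to i32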
2606 bool VectorCombine::foldCastFromReductions(Instruction &I) { 2607 auto *II = dyn_cast<IntrinsicInst>(&I); 2608 if (!II) 2609 return false; 2610 2611 bool TruncOnly = false; 2612 Intrinsic::ID IID = II->getIntrinsicID(); 2613 switch (IID) { 2614 case Intrinsic::vector_reduce_add: 2615 case Intrinsic::vector_reduce_mul: 2616 TruncOnly = true; 2617 break; 2618 case Intrinsic::vector_reduce_and: 2619 case Intrinsic::vector_reduce_or: 2620 case Intrinsic::vector_reduce_xor: 2621 break; 2622 default: 2623 return false; 2624 } 2625 2626 unsigned ReductionOpc = getArithmeticReductionInstruction(IID); 2627 Value *ReductionSrc = I.getOperand(0); 2628 2629 Value *Src; 2630 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) && 2631 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src)))))) 2632 return false; 2633 2634 auto CastOpc = 2635 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode(); 2636 2637 auto *SrcTy = cast<VectorType>(Src->getType()); 2638 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType()); 2639 Type *ResultTy = I.getType(); 2640 2641 InstructionCost OldCost = TTI.getArithmeticReductionCost( 2642 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind); 2643 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy, 2644 TTI::CastContextHint::None, CostKind, 2645 cast<CastInst>(ReductionSrc)); 2646 InstructionCost NewCost = 2647 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt, 2648 CostKind) + 2649 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(), 2650 TTI::CastContextHint::None, CostKind); 2651 2652 if (OldCost <= NewCost || !NewCost.isValid()) 2653 return false; 2654 2655 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(), 2656 II->getIntrinsicID(), {Src}); 2657 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy); 2658 replaceValue(I, *NewCast); 2659 return true; 2660 } 2661 2662 /// This method looks for groups of shuffles acting on binops, of the form: 2663 /// %x = shuffle ... 2664 /// %y = shuffle ... 2665 /// %a = binop %x, %y 2666 /// %b = binop %x, %y 2667 /// shuffle %a, %b, selectmask 2668 /// We may, especially if the shuffle is wider than legal, be able to convert 2669 /// the shuffle to a form where only parts of a and b need to be computed. On 2670 /// architectures with no obvious "select" shuffle, this can reduce the total 2671 /// number of operations if the target reports them as cheaper. 
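/// Only the lanes of %a and %b that the select mask actually uses need to be
/// computed: the inputs are re-packed so that those lanes land in the low
/// elements, the binops are costed at that narrower width, and reconstruction
/// shuffles restore the original lane order for each user.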
2672 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { 2673 auto *SVI = cast<ShuffleVectorInst>(&I); 2674 auto *VT = cast<FixedVectorType>(I.getType()); 2675 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); 2676 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); 2677 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || 2678 VT != Op0->getType()) 2679 return false; 2680 2681 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); 2682 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); 2683 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); 2684 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); 2685 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); 2686 auto checkSVNonOpUses = [&](Instruction *I) { 2687 if (!I || I->getOperand(0)->getType() != VT) 2688 return true; 2689 return any_of(I->users(), [&](User *U) { 2690 return U != Op0 && U != Op1 && 2691 !(isa<ShuffleVectorInst>(U) && 2692 (InputShuffles.contains(cast<Instruction>(U)) || 2693 isInstructionTriviallyDead(cast<Instruction>(U)))); 2694 }); 2695 }; 2696 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || 2697 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) 2698 return false; 2699 2700 // Collect all the uses that are shuffles that we can transform together. We 2701 // may not have a single shuffle, but a group that can all be transformed 2702 // together profitably. 2703 SmallVector<ShuffleVectorInst *> Shuffles; 2704 auto collectShuffles = [&](Instruction *I) { 2705 for (auto *U : I->users()) { 2706 auto *SV = dyn_cast<ShuffleVectorInst>(U); 2707 if (!SV || SV->getType() != VT) 2708 return false; 2709 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || 2710 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) 2711 return false; 2712 if (!llvm::is_contained(Shuffles, SV)) 2713 Shuffles.push_back(SV); 2714 } 2715 return true; 2716 }; 2717 if (!collectShuffles(Op0) || !collectShuffles(Op1)) 2718 return false; 2719 // From a reduction, we need to be processing a single shuffle, otherwise the 2720 // other uses will not be lane-invariant. 2721 if (FromReduction && Shuffles.size() > 1) 2722 return false; 2723 2724 // Add any shuffle uses for the shuffles we have found, to include them in our 2725 // cost calculations. 2726 if (!FromReduction) { 2727 for (ShuffleVectorInst *SV : Shuffles) { 2728 for (auto *U : SV->users()) { 2729 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); 2730 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT) 2731 Shuffles.push_back(SSV); 2732 } 2733 } 2734 } 2735 2736 // For each of the output shuffles, we try to sort all the first vector 2737 // elements to the beginning, followed by the second array elements at the 2738 // end. If the binops are legalized to smaller vectors, this may reduce total 2739 // number of binops. We compute the ReconstructMask mask needed to convert 2740 // back to the original lane order. 2741 SmallVector<std::pair<int, int>> V1, V2; 2742 SmallVector<SmallVector<int>> OrigReconstructMasks; 2743 int MaxV1Elt = 0, MaxV2Elt = 0; 2744 unsigned NumElts = VT->getNumElements(); 2745 for (ShuffleVectorInst *SVN : Shuffles) { 2746 SmallVector<int> Mask; 2747 SVN->getShuffleMask(Mask); 2748 2749 // Check the operands are the same as the original, or reversed (in which 2750 // case we need to commute the mask). 
2751 Value *SVOp0 = SVN->getOperand(0);
2752 Value *SVOp1 = SVN->getOperand(1);
2753 if (isa<UndefValue>(SVOp1)) {
2754 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
2755 SVOp0 = SSV->getOperand(0);
2756 SVOp1 = SSV->getOperand(1);
2757 for (unsigned I = 0, E = Mask.size(); I != E; I++) {
2758 if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
2759 return false;
2760 Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
2761 }
2762 }
2763 if (SVOp0 == Op1 && SVOp1 == Op0) {
2764 std::swap(SVOp0, SVOp1);
2765 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
2766 }
2767 if (SVOp0 != Op0 || SVOp1 != Op1)
2768 return false;
2769
2770 // Calculate the reconstruction mask for this shuffle, as the mask needed to
2771 // take the packed values from Op0/Op1 and reconstruct the original
2772 // order.
2773 SmallVector<int> ReconstructMask;
2774 for (unsigned I = 0; I < Mask.size(); I++) {
2775 if (Mask[I] < 0) {
2776 ReconstructMask.push_back(-1);
2777 } else if (Mask[I] < static_cast<int>(NumElts)) {
2778 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
2779 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
2780 return Mask[I] == A.first;
2781 });
2782 if (It != V1.end())
2783 ReconstructMask.push_back(It - V1.begin());
2784 else {
2785 ReconstructMask.push_back(V1.size());
2786 V1.emplace_back(Mask[I], V1.size());
2787 }
2788 } else {
2789 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
2790 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
2791 return Mask[I] - static_cast<int>(NumElts) == A.first;
2792 });
2793 if (It != V2.end())
2794 ReconstructMask.push_back(NumElts + It - V2.begin());
2795 else {
2796 ReconstructMask.push_back(NumElts + V2.size());
2797 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
2798 }
2799 }
2800 }
2801
2802 // For reductions, we know that the lane ordering out doesn't alter the
2803 // result. In-order can help simplify the shuffle away.
2804 if (FromReduction)
2805 sort(ReconstructMask);
2806 OrigReconstructMasks.push_back(std::move(ReconstructMask));
2807 }
2808
2809 // If the maximum elements used from V1 and V2 are not larger than the new
2810 // vectors, the vectors are already packed and performing the optimization
2811 // again will likely not help any further. This also prevents us from getting
2812 // stuck in a cycle in case the costs do not also rule it out.
2813 if (V1.empty() || V2.empty() ||
2814 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
2815 MaxV2Elt == static_cast<int>(V2.size()) - 1))
2816 return false;
2817
2818 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2819 // shuffle of another shuffle, or not a shuffle (that is treated like an
2820 // identity shuffle).
2821 auto GetBaseMaskValue = [&](Instruction *I, int M) {
2822 auto *SV = dyn_cast<ShuffleVectorInst>(I);
2823 if (!SV)
2824 return M;
2825 if (isa<UndefValue>(SV->getOperand(1)))
2826 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
2827 if (InputShuffles.contains(SSV))
2828 return SSV->getMaskValue(SV->getMaskValue(M));
2829 return SV->getMaskValue(M);
2830 };
2831
2832 // Attempt to sort the inputs by ascending mask values to make simpler input
2833 // shuffles and push complex shuffles down to the uses. We sort on the first
2834 // of the two input shuffle orders, to try and get at least one input into a
2835 // nice order.
2836 auto SortBase = [&](Instruction *A, std::pair<int, int> X, 2837 std::pair<int, int> Y) { 2838 int MXA = GetBaseMaskValue(A, X.first); 2839 int MYA = GetBaseMaskValue(A, Y.first); 2840 return MXA < MYA; 2841 }; 2842 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { 2843 return SortBase(SVI0A, A, B); 2844 }); 2845 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { 2846 return SortBase(SVI1A, A, B); 2847 }); 2848 // Calculate our ReconstructMasks from the OrigReconstructMasks and the 2849 // modified order of the input shuffles. 2850 SmallVector<SmallVector<int>> ReconstructMasks; 2851 for (const auto &Mask : OrigReconstructMasks) { 2852 SmallVector<int> ReconstructMask; 2853 for (int M : Mask) { 2854 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { 2855 auto It = find_if(V, [M](auto A) { return A.second == M; }); 2856 assert(It != V.end() && "Expected all entries in Mask"); 2857 return std::distance(V.begin(), It); 2858 }; 2859 if (M < 0) 2860 ReconstructMask.push_back(-1); 2861 else if (M < static_cast<int>(NumElts)) { 2862 ReconstructMask.push_back(FindIndex(V1, M)); 2863 } else { 2864 ReconstructMask.push_back(NumElts + FindIndex(V2, M)); 2865 } 2866 } 2867 ReconstructMasks.push_back(std::move(ReconstructMask)); 2868 } 2869 2870 // Calculate the masks needed for the new input shuffles, which get padded 2871 // with undef 2872 SmallVector<int> V1A, V1B, V2A, V2B; 2873 for (unsigned I = 0; I < V1.size(); I++) { 2874 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); 2875 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); 2876 } 2877 for (unsigned I = 0; I < V2.size(); I++) { 2878 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); 2879 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); 2880 } 2881 while (V1A.size() < NumElts) { 2882 V1A.push_back(PoisonMaskElem); 2883 V1B.push_back(PoisonMaskElem); 2884 } 2885 while (V2A.size() < NumElts) { 2886 V2A.push_back(PoisonMaskElem); 2887 V2B.push_back(PoisonMaskElem); 2888 } 2889 2890 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { 2891 auto *SV = dyn_cast<ShuffleVectorInst>(I); 2892 if (!SV) 2893 return C; 2894 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) 2895 ? TTI::SK_PermuteSingleSrc 2896 : TTI::SK_PermuteTwoSrc, 2897 VT, SV->getShuffleMask(), CostKind); 2898 }; 2899 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { 2900 return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind); 2901 }; 2902 2903 // Get the costs of the shuffles + binops before and after with the new 2904 // shuffle masks. 2905 InstructionCost CostBefore = 2906 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) + 2907 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind); 2908 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), 2909 InstructionCost(0), AddShuffleCost); 2910 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), 2911 InstructionCost(0), AddShuffleCost); 2912 2913 // The new binops will be unused for lanes past the used shuffle lengths. 2914 // These types attempt to get the correct cost for that from the target. 
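// (e.g. if only 4 of 16 lanes of Op0 are actually used, its binop is costed as
// a 4-element vector operation)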
  FixedVectorType *Op0SmallVT =
      FixedVectorType::get(VT->getScalarType(), V1.size());
  FixedVectorType *Op1SmallVT =
      FixedVectorType::get(VT->getScalarType(), V2.size());
  InstructionCost CostAfter =
      TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
      TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
  CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
                               InstructionCost(0), AddShuffleMaskCost);
  std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
  CostAfter +=
      std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
                      InstructionCost(0), AddShuffleMaskCost);

  LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
  LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
                    << " vs CostAfter: " << CostAfter << "\n");
  if (CostBefore <= CostAfter)
    return false;

  // The cost model has passed; create the new instructions.
  auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return I;
    if (isa<UndefValue>(SV->getOperand(1)))
      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
        if (InputShuffles.contains(SSV))
          return SSV->getOperand(Op);
    return SV->getOperand(Op);
  };
  Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
  Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
                                             GetShuffleOperand(SVI0A, 1), V1A);
  Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
  Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
                                             GetShuffleOperand(SVI0B, 1), V1B);
  Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
  Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
                                             GetShuffleOperand(SVI1A, 1), V2A);
  Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
  Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
                                             GetShuffleOperand(SVI1B, 1), V2B);
  Builder.SetInsertPoint(Op0);
  Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
                                    NSV0A, NSV0B);
  if (auto *I = dyn_cast<Instruction>(NOp0))
    I->copyIRFlags(Op0, true);
  Builder.SetInsertPoint(Op1);
  Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
                                    NSV1A, NSV1B);
  if (auto *I = dyn_cast<Instruction>(NOp1))
    I->copyIRFlags(Op1, true);

  for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
    Builder.SetInsertPoint(Shuffles[S]);
    Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
    replaceValue(*Shuffles[S], *NSV);
  }

  Worklist.pushValue(NSV0A);
  Worklist.pushValue(NSV0B);
  Worklist.pushValue(NSV1A);
  Worklist.pushValue(NSV1B);
  for (auto *S : Shuffles)
    Worklist.add(S);
  return true;
}

/// Check if the instruction depends on a ZExt and whether the ZExt can be
/// moved after the instruction. Move the ZExt if it is profitable. For
/// example:
///     logic(zext(x),y) -> zext(logic(x,trunc(y)))
///     lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
/// The cost model takes into account whether zext(x) has other users and
/// whether the zext can be propagated through them too.
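/// As an illustrative example (types and values are hypothetical), the first
/// pattern can rewrite
///     %zx = zext <4 x i8> %x to <4 x i32>
///     %r  = and <4 x i32> %zx, %y
/// into
///     %ty = trunc <4 x i32> %y to <4 x i8>
///     %na = and <4 x i8> %x, %ty
///     %r  = zext <4 x i8> %na to <4 x i32>
/// when the cost model below judges the narrow form to be no more expensive.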
bool VectorCombine::shrinkType(Instruction &I) {
  Value *ZExted, *OtherOperand;
  if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
                                  m_Value(OtherOperand))) &&
      !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
    return false;

  Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);

  auto *BigTy = cast<FixedVectorType>(I.getType());
  auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
  unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();

  if (I.getOpcode() == Instruction::LShr) {
    // Check that the shift amount is less than the number of bits in the
    // smaller type. Otherwise, the smaller lshr will return a poison value.
    KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
    if (ShAmtKB.getMaxValue().uge(BW))
      return false;
  } else {
    // Check that the expression overall uses at most the same number of bits
    // as ZExted.
    KnownBits KB = computeKnownBits(&I, *DL);
    if (KB.countMaxActiveBits() > BW)
      return false;
  }

  // Calculate the costs of leaving the current IR as it is versus moving the
  // ZExt operation later, along with adding truncates if needed.
  InstructionCost ZExtCost = TTI.getCastInstrCost(
      Instruction::ZExt, BigTy, SmallTy,
      TargetTransformInfo::CastContextHint::None, CostKind);
  InstructionCost CurrentCost = ZExtCost;
  InstructionCost ShrinkCost = 0;

  // Calculate the total cost and check that we can propagate through all ZExt
  // users.
  for (User *U : ZExtOperand->users()) {
    auto *UI = cast<Instruction>(U);
    if (UI == &I) {
      CurrentCost +=
          TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
      ShrinkCost +=
          TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
      ShrinkCost += ZExtCost;
      continue;
    }

    if (!Instruction::isBinaryOp(UI->getOpcode()))
      return false;

    // Check if we can propagate the ZExt through its other users.
    KnownBits KB = computeKnownBits(UI, *DL);
    if (KB.countMaxActiveBits() > BW)
      return false;

    CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
    ShrinkCost +=
        TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
    ShrinkCost += ZExtCost;
  }

  // If the other instruction operand is not a constant, we'll need to generate
  // a truncate instruction, so we have to adjust the cost.
  if (!isa<Constant>(OtherOperand))
    ShrinkCost += TTI.getCastInstrCost(
        Instruction::Trunc, SmallTy, BigTy,
        TargetTransformInfo::CastContextHint::None, CostKind);

  // If the cost of shrinking types and leaving the IR is the same, we'll lean
  // towards modifying the IR because shrinking opens opportunities for other
  // shrinking optimisations.
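  // As an illustrative example with hypothetical unit costs: for a single-use
  // zext with a non-constant OtherOperand, CurrentCost = zext(1) + wide
  // binop(1) = 2 while ShrinkCost = narrow binop(1) + new zext(1) + trunc(1)
  // = 3, so we bail out below; on a tie we prefer to shrink.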
  if (ShrinkCost > CurrentCost)
    return false;

  Builder.SetInsertPoint(&I);
  Value *Op0 = ZExted;
  Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
  // Keep the order of the operands the same.
  if (I.getOperand(0) == OtherOperand)
    std::swap(Op0, Op1);
  Value *NewBinOp =
      Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
  cast<Instruction>(NewBinOp)->copyIRFlags(&I);
  cast<Instruction>(NewBinOp)->copyMetadata(I);
  Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
  replaceValue(I, *NewZExtr);
  return true;
}

/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
/// shuffle (DstVec, SrcVec, Mask)
bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
  Value *DstVec, *SrcVec;
  uint64_t ExtIdx, InsIdx;
  if (!match(&I,
             m_InsertElt(m_Value(DstVec),
                         m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
                         m_ConstantInt(InsIdx))))
    return false;

  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
  if (!VecTy || SrcVec->getType() != VecTy)
    return false;

  unsigned NumElts = VecTy->getNumElements();
  if (ExtIdx >= NumElts || InsIdx >= NumElts)
    return false;

  // Insertion into poison is a cheaper single operand shuffle.
  TargetTransformInfo::ShuffleKind SK;
  SmallVector<int> Mask(NumElts, PoisonMaskElem);
  if (isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
    SK = TargetTransformInfo::SK_PermuteSingleSrc;
    Mask[InsIdx] = ExtIdx;
    std::swap(DstVec, SrcVec);
  } else {
    SK = TargetTransformInfo::SK_PermuteTwoSrc;
    std::iota(Mask.begin(), Mask.end(), 0);
    Mask[InsIdx] = ExtIdx + NumElts;
  }

  // Cost
  auto *Ins = cast<InsertElementInst>(&I);
  auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
  InstructionCost InsCost =
      TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
  InstructionCost ExtCost =
      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
  InstructionCost OldCost = ExtCost + InsCost;

  // Ignore 'free' identity insertion shuffle.
  // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
  InstructionCost NewCost = 0;
  if (!ShuffleVectorInst::isIdentityMask(Mask, NumElts))
    NewCost += TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, nullptr,
                                  {DstVec, SrcVec});
  if (!Ext->hasOneUse())
    NewCost += ExtCost;

  LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");

  if (OldCost < NewCost)
    return false;

  // Canonicalize undef param to RHS to help further folds.
  if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
    ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
    std::swap(DstVec, SrcVec);
  }

  Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
  replaceValue(I, *Shuf);

  return true;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
  if (DisableVectorCombine)
    return false;

  // Don't attempt vectorization if the target does not support vectors.
  if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
    return false;

  LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");

  bool MadeChange = false;
  auto FoldInst = [this, &MadeChange](Instruction &I) {
    Builder.SetInsertPoint(&I);
    bool IsVectorType = isa<VectorType>(I.getType());
    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
    auto Opcode = I.getOpcode();

    LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');

    // These folds should be beneficial regardless of when this pass is run
    // in the optimization pipeline.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= vectorizeLoadInsert(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= widenSubvectorLoad(I);
        break;
      default:
        break;
      }
    }

    // These transforms work with scalable and fixed vectors.
    // TODO: Identify and allow other scalable transforms.
    if (IsVectorType) {
      MadeChange |= scalarizeBinopOrCmp(I);
      MadeChange |= scalarizeLoadExtract(I);
      MadeChange |= scalarizeVPIntrinsic(I);
    }

    if (Opcode == Instruction::Store)
      MadeChange |= foldSingleElementStore(I);

    // If this is an early pipeline invocation of this pass, we are done.
    if (TryEarlyFoldsOnly)
      return;

    // Otherwise, try folds that improve codegen but may interfere with
    // early IR canonicalizations.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= foldInsExtFNeg(I);
        MadeChange |= foldInsExtVectorToShuffle(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= foldPermuteOfBinops(I);
        MadeChange |= foldShuffleOfBinops(I);
        MadeChange |= foldShuffleOfCastops(I);
        MadeChange |= foldShuffleOfShuffles(I);
        MadeChange |= foldShuffleOfIntrinsics(I);
        MadeChange |= foldSelectShuffle(I);
        MadeChange |= foldShuffleToIdentity(I);
        break;
      case Instruction::BitCast:
        MadeChange |= foldBitcastShuffle(I);
        break;
      default:
        MadeChange |= shrinkType(I);
        break;
      }
    } else {
      switch (Opcode) {
      case Instruction::Call:
        MadeChange |= foldShuffleFromReductions(I);
        MadeChange |= foldCastFromReductions(I);
        break;
      case Instruction::ICmp:
      case Instruction::FCmp:
        MadeChange |= foldExtractExtract(I);
        break;
      case Instruction::Or:
        MadeChange |= foldConcatOfBoolMasks(I);
        [[fallthrough]];
      default:
        if (Instruction::isBinaryOp(Opcode)) {
          MadeChange |= foldExtractExtract(I);
          MadeChange |= foldExtractedCmps(I);
        }
        break;
      }
    }
  };

  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // Use an early-increment range so that we can erase instructions in the
    // loop.
    for (Instruction &I : make_early_inc_range(BB)) {
      if (I.isDebugOrPseudoInst())
        continue;
      FoldInst(I);
    }
  }

  while (!Worklist.isEmpty()) {
    Instruction *I = Worklist.removeOne();
    if (!I)
      continue;

    if (isInstructionTriviallyDead(I)) {
      eraseInstruction(*I);
      continue;
    }

    FoldInst(*I);
  }

  return MadeChange;
}

PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  AAResults &AA = FAM.getResult<AAManager>(F);
  const DataLayout *DL = &F.getDataLayout();
  VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
                         TryEarlyFoldsOnly);
  if (!Combiner.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}