//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <numeric>

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}

  bool run();

private:
  Function &F;
  IRBuilder<> Builder;
  const TargetTransformInfo &TTI;
  const DominatorTree &DT;
  AAResults &AA;
  AssumptionCache &AC;

  /// If true, only perform beneficial early IR transforms. Do not introduce new
  /// vector operations.
  bool TryEarlyFoldsOnly;

  InstructionWorklist Worklist;

  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
  // parameter. That should be updated to specific sub-classes because the
  // run loop was changed to dispatch on opcode.
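  // Each of the folds declared below returns true if it rewrote the IR.
  // Replacements go through replaceValue() so that the affected instructions
  // are revisited via the worklist.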
  bool vectorizeLoadInsert(Instruction &I);
  bool widenSubvectorLoad(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             const Instruction &I,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                     Instruction &I);
  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                       Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldInsExtFNeg(Instruction &I);
  bool foldBitcastShuf(Instruction &I);
  bool scalarizeBinopOrCmp(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
  bool foldShuffleOfBinops(Instruction &I);
  bool foldShuffleFromReductions(Instruction &I);
  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);

  void replaceValue(Value &Old, Value &New) {
    Old.replaceAllUsesWith(&New);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
      New.takeName(&Old);
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
    Worklist.pushValue(&Old);
  }

  void eraseInstruction(Instruction &I) {
    for (Value *Op : I.operands())
      Worklist.pushValue(Op);
    Worklist.remove(&I);
    I.eraseFromParent();
  }
};
} // namespace

static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
  // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
  // The widened load may load data from dirty regions or create data races
  // non-existent in the source.
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Load->getType()->getScalarType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  return true;
}

bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  Value *Scalar;
  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
      !Scalar->hasOneUse())
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  auto *Load = dyn_cast<LoadInst>(X);
  if (!canWidenLoad(Load, TTI))
    return false;

  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
                                   &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with updated alignment. If it is, we can shuffle the element(s) into
    // place after loading.
    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
                                     &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be negated
    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
    // negation does not change the result of the alignment calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  OldCost +=
      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                   /* Insert */ true, HasExtract, CostKind);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask, set everything but element 0 to undef to prevent poison from
  // propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  auto *Ty = cast<FixedVectorType>(I.getType());
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
      SrcPtr, MinVecTy->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
  unsigned NumOpElts =
      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
    return M >= (int)(NumOpElts);
  });

  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
  if (!canWidenLoad(Load, TTI))
    return false;

  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  auto *Ty = cast<FixedVectorType>(I.getType());
  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
    return false;

  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();

  // Original pattern: insert_subvector (load PtrOp)
  // This conservatively assumes that the cost of a subvector insert into an
  // undef value is 0. We could add that cost if the cost model accurately
  // reflects the real cost of that operation.
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);

  // New pattern: load PtrOp
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
  auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
  assert(Index0C && Index1C && "Expected constant extract indexes");

  unsigned Index0 = Index0C->getZExtValue();
  unsigned Index1 = Index1C->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);

  // If both costs are invalid, no shuffle is needed.
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.
  if (Cost0 > Cost1)
    return Ext0;
  if (Cost1 > Cost0)
    return Ext1;

  // If the costs are equal and there is a preferred extract index, shuffle the
  // opposite operand.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}

/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
  auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
  assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");

  unsigned Opcode = I.getOpcode();
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // both sequences.
  unsigned Ext0Index = Ext0IndexC->getZExtValue();
  unsigned Ext1Index = Ext1IndexC->getZExtValue();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  // extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  // check the cost of creating a broadcast shuffle and shuffling both
  // operands to element 0.
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // undefined except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { undef, undef, 0, undef }
    // TODO: The cost model has an option for a "broadcast" shuffle
    // (splat-from-element-0), but no option for a more general splat.
    NewCost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) {
  // The shuffle mask is undefined except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, undef, undef, undef }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) {
  // Shufflevectors can only be created for fixed-width vectors.
  if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType()))
    return nullptr;

  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *X = ExtElt->getVectorOperand();
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
  ++NumVecCmp;
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
  ++NumVecBO;
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecBO =
      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) {
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  // probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
    return false;

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    ExtractElementInst *NewExtract =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtract)
      return false;
    if (ExtractToChange == Ext0)
      Ext0 = NewExtract;
    else
      Ext1 = NewExtract;
  }

  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
    foldExtExtCmp(Ext0, Ext1, I);
  else
    foldExtExtBinop(Ext0, Ext1, I);

  Worklist.push(Ext0);
  Worklist.push(Ext1);
  return true;
}

/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
  // Match an insert (op (extract)) pattern.
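  // That is: inselt DestVec, (fneg (extelt SrcVec, Index)), Index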
  Value *DestVec;
  uint64_t Index;
  Instruction *FNeg;
  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
                             m_ConstantInt(Index))))
    return false;

  // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
  Value *SrcVec;
  Instruction *Extract;
  if (!match(FNeg, m_FNeg(m_CombineAnd(
                       m_Instruction(Extract),
                       m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
    return false;

  // TODO: We could handle this with a length-changing shuffle.
  auto *VecTy = cast<FixedVectorType>(I.getType());
  if (SrcVec->getType() != VecTy)
    return false;

  // Ignore bogus insert/extract index.
  unsigned NumElts = VecTy->getNumElements();
  if (Index >= NumElts)
    return false;

  // We are inserting the negated element into the same lane that we extracted
  // from. This is equivalent to a select-shuffle that chooses all but the
  // negated element from the destination vector.
  SmallVector<int> Mask(NumElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[Index] = Index + NumElts;

  Type *ScalarTy = VecTy->getScalarType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);

  // If the extract has one use, it will be eliminated, so count it in the
  // original cost. If it has more than one use, ignore the cost because it
  // will be the same before/after.
  if (Extract->hasOneUse())
    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);

  if (NewCost > OldCost)
    return false;

  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
  // shuffle DestVec, (fneg SrcVec), Mask
  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
  replaceValue(I, *Shuf);
  return true;
}

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuf(Instruction &I) {
  Value *V;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(
                     m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts and length-changing shuffles.
  // TODO: We could allow any shuffle.
  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
  if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
    return false;

  auto *DestTy = cast<FixedVectorType>(I.getType());
  unsigned DestNumElts = DestTy->getNumElements();
  unsigned SrcNumElts = SrcTy->getNumElements();
  SmallVector<int, 16> NewMask;
  if (SrcNumElts <= DestNumElts) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestNumElts / SrcNumElts;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcNumElts / DestNumElts;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // The new shuffle must not cost more than the old shuffle. The bitcast is
  // moved ahead of the shuffle, so assume that it has the same cost as before.
  InstructionCost DestCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
  if (DestCost > SrcCost || !DestCost.isValid())
    return false;

  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
  ++NumShufOfBitcast;
  Value *CastV = Builder.CreateBitCast(V, DestTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
  replaceValue(I, *Shuf);
  return true;
}

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
  if (IsCmp)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match against one or both scalar values being inserted into constant
  // vectors:
  // vec_op VecC0, (inselt VecC1, V1, Index)
  // vec_op (inselt VecC0, V0, Index), VecC1
  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
  // TODO: Deal with mismatched index constants and variable indexes?
  Constant *VecC0 = nullptr, *VecC1 = nullptr;
  Value *V0 = nullptr, *V1 = nullptr;
  uint64_t Index0 = 0, Index1 = 0;
  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
                               m_ConstantInt(Index0))) &&
      !match(Ins0, m_Constant(VecC0)))
    return false;
  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
                               m_ConstantInt(Index1))) &&
      !match(Ins1, m_Constant(VecC1)))
    return false;

  bool IsConst0 = !V0;
  bool IsConst1 = !V1;
  if (IsConst0 && IsConst1)
    return false;
  if (!IsConst0 && !IsConst1 && Index0 != Index1)
    return false;

  // Bail for single insertion if it is a load.
  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
  auto *I0 = dyn_cast_or_null<Instruction>(V0);
  auto *I1 = dyn_cast_or_null<Instruction>(V1);
  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
      (IsConst1 && I0 && I0->mayReadFromMemory()))
    return false;

  uint64_t Index = IsConst0 ? Index1 : Index0;
  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
  Type *VecTy = I.getType();
  assert(VecTy->isVectorTy() &&
         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  }

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = TTI.getVectorInstrCost(
      Instruction::InsertElement, VecTy, CostKind, Index);
  InstructionCost OldCost =
      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
  InstructionCost NewCost = ScalarOpCost + InsertCost +
                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
  if (IsCmp)
    ++NumScalarCmp;
  else
    ++NumScalarBO;

  // For constant cases, extract the scalar element; this should constant fold.
  if (IsConst0)
    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
  if (IsConst1)
    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

  Value *Scalar =
      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  // Fold the vector constants in the original vectors into a new base vector.
  Value *NewVecC =
      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
  replaceValue(I, *Insert);
  return true;
}

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // constant operand.
  // TODO: Relax the one-use constraints.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  CmpInst::Predicate P0, P1;
  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
      P0 != P1)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  // TODO: Relax the one-use constraints.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
    return false;

  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
  if (!ConvertToShuf)
    return false;

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = P0;
  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
                                                    : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
  OldCost +=
      TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                             CmpInst::makeCmpResultType(I0->getType()), Pred) *
      2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
  InstructionCost NewCost = TTI.getCmpSelInstrCost(
      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                ShufMask);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
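  // Lanes other than Index0/Index1 are left undef; only the two compared lanes
  // feed the lane that is ultimately extracted (CheapIndex), so the undef
  // lanes are never observed.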
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   UndefValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  ++NumVecCmpBO;
  return true;
}

// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}

namespace {
/// Helper class to indicate whether a vector index can be safely scalarized and
/// if a freeze needs to be inserted.
class ScalarizationResult {
  enum class StatusTy { Unsafe, Safe, SafeWithFreeze };

  StatusTy Status;
  Value *ToFreeze;

  ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
      : Status(Status), ToFreeze(ToFreeze) {}

public:
  ScalarizationResult(const ScalarizationResult &Other) = default;
  ~ScalarizationResult() {
    assert(!ToFreeze && "freeze() not called with ToFreeze being set");
  }

  static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
  static ScalarizationResult safe() { return {StatusTy::Safe}; }
  static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
    return {StatusTy::SafeWithFreeze, ToFreeze};
  }

  /// Returns true if the index can be scalarized without requiring a freeze.
  bool isSafe() const { return Status == StatusTy::Safe; }
  /// Returns true if the index cannot be scalarized.
  bool isUnsafe() const { return Status == StatusTy::Unsafe; }
  /// Returns true if the index can be scalarized, but requires inserting a
  /// freeze.
  bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }

  /// Reset the state to Unsafe and clear ToFreeze if set.
  void discard() {
    ToFreeze = nullptr;
    Status = StatusTy::Unsafe;
  }

  /// Freeze the ToFreeze value and update the use in \p UserI to use it.
  void freeze(IRBuilder<> &Builder, Instruction &UserI) {
    assert(isSafeWithFreeze() &&
           "should only be used when freezing is required");
    assert(is_contained(ToFreeze->users(), &UserI) &&
           "UserI must be a user of ToFreeze");
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(cast<Instruction>(&UserI));
    Value *Frozen =
        Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
    for (Use &U : make_early_inc_range((UserI.operands())))
      if (U.get() == ToFreeze)
        U.set(Frozen);

    ToFreeze = nullptr;
  }
};
} // namespace

/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
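/// A constant index is accepted only if it is in bounds. For a variable index,
/// this checks whether the value range of the index (possibly after masking
/// with a constant 'and'/'urem') provably stays below the number of elements;
/// if the index may be poison, the returned result requests a freeze of the
/// index base first.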
static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
                                              Value *Idx, Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) {
  if (auto *C = dyn_cast<ConstantInt>(Idx)) {
    if (C->getValue().ult(VecTy->getNumElements()))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
  APInt Zero(IntWidth, 0);
  APInt MaxElts(IntWidth, VecTy->getNumElements());
  ConstantRange ValidIndices(Zero, MaxElts);
  ConstantRange IdxRange(IntWidth, true);

  if (isGuaranteedNotToBePoison(Idx, &AC)) {
    if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
                                                   true, &AC, CtxI, &DT)))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  // If the index may be poison, check if we can insert a freeze before the
  // range of the index is restricted.
  Value *IdxBase;
  ConstantInt *CI;
  if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.binaryAnd(CI->getValue());
  } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.urem(CI->getValue());
  }

  if (ValidIndices.contains(IdxRange))
    return ScalarizationResult::safeWithFreeze(IdxBase);
  return ScalarizationResult::unsafe();
}

/// The memory operation on a vector of \p ScalarType had alignment of
/// \p VectorAlignment. Compute the maximal, but conservatively correct,
/// alignment that will be valid for the memory operation on a single scalar
/// element of the same type with index \p Idx.
static Align computeAlignmentAfterScalarization(Align VectorAlignment,
                                                Type *ScalarType, Value *Idx,
                                                const DataLayout &DL) {
  if (auto *C = dyn_cast<ConstantInt>(Idx))
    return commonAlignment(VectorAlignment,
                           C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
  return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
}

// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
  auto *SI = cast<StoreInst>(&I);
  if (!SI->isSimple() ||
      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
    return false;

  // TODO: Combine more complicated patterns (multiple insert) by referencing
  // TargetTransformInfo.
  Instruction *Source;
  Value *NewElement;
  Value *Idx;
  if (!match(SI->getValueOperand(),
             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
                         m_Value(Idx))))
    return false;

  if (auto *Load = dyn_cast<LoadInst>(Source)) {
    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
    const DataLayout &DL = I.getModule()->getDataLayout();
    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
    // Don't optimize for atomic/volatile load or store. Ensure memory is not
    // modified between, vector type matches store size, and index is inbounds.
    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
        !DL.typeSizeEqualsStoreSize(Load->getType()) ||
        SrcAddr != SI->getPointerOperand()->stripPointerCasts())
      return false;

    auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
    if (ScalarizableIdx.isUnsafe() ||
        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                             MemoryLocation::get(SI), AA))
      return false;

    if (ScalarizableIdx.isSafeWithFreeze())
      ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
    Value *GEP = Builder.CreateInBoundsGEP(
        SI->getValueOperand()->getType(), SI->getPointerOperand(),
        {ConstantInt::get(Idx->getType(), 0), Idx});
    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
    NSI->copyMetadata(*SI);
    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
        DL);
    NSI->setAlignment(ScalarOpAlignment);
    replaceValue(I, *NSI);
    eraseInstruction(I);
    return true;
  }

  return false;
}

/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
  Value *Ptr;
  if (!match(&I, m_Load(m_Value(Ptr))))
    return false;

  auto *FixedVT = cast<FixedVectorType>(I.getType());
  auto *LI = cast<LoadInst>(&I);
  const DataLayout &DL = I.getModule()->getDataLayout();
  if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
    return false;

  InstructionCost OriginalCost =
      TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
                          LI->getPointerAddressSpace());
  InstructionCost ScalarizedCost = 0;

  Instruction *LastCheckedInst = LI;
  unsigned NumInstChecked = 0;
  // Check if all users of the load are extracts with no memory modifications
  // between the load and the extract. Compute the cost of both the original
  // code and the scalarized version.
  for (User *U : LI->users()) {
    auto *UI = dyn_cast<ExtractElementInst>(U);
    if (!UI || UI->getParent() != LI->getParent())
      return false;

    if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT))
      return false;

    // Check if any instruction between the load and the extract may modify
    // memory.
    if (LastCheckedInst->comesBefore(UI)) {
      for (Instruction &I :
           make_range(std::next(LI->getIterator()), UI->getIterator())) {
        // Bail out if we reached the check limit or the instruction may write
        // to memory.
        if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
          return false;
        NumInstChecked++;
      }
      LastCheckedInst = UI;
    }

    auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
    if (!ScalarIdx.isSafe()) {
      // TODO: Freeze index if it is safe to do so.
      ScalarIdx.discard();
      return false;
    }

    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    OriginalCost +=
        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
                               Index ? Index->getZExtValue() : -1);
    ScalarizedCost +=
        TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
                            Align(1), LI->getPointerAddressSpace());
    ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
  }

  if (ScalarizedCost >= OriginalCost)
    return false;

  // Replace extracts with narrow scalar loads.
  for (User *U : LI->users()) {
    auto *EI = cast<ExtractElementInst>(U);
    Builder.SetInsertPoint(EI);

    Value *Idx = EI->getOperand(1);
    Value *GEP =
        Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx});
    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
        FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));

    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        LI->getAlign(), FixedVT->getElementType(), Idx, DL);
    NewLoad->setAlignment(ScalarOpAlignment);

    replaceValue(*EI, *NewLoad);
  }

  return true;
}

/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
/// "binop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
  auto *VecTy = cast<FixedVectorType>(I.getType());
  BinaryOperator *B0, *B1;
  ArrayRef<int> Mask;
  if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
                           m_Mask(Mask))) ||
      B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
    return false;

  // Try to replace a binop with a shuffle if the shuffle is not costly.
  // The new shuffle will choose from a single, common operand, so it may be
  // cheaper than the existing two-operand shuffle.
  SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
  Instruction::BinaryOps Opcode = B0->getOpcode();
  InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  InstructionCost ShufCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
  if (ShufCost > BinopCost)
    return false;

  // If we have something like "add X, Y" and "add Z, X", swap ops to match.
  Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
  Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
  if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
    std::swap(X, Y);

  Value *Shuf0, *Shuf1;
  if (X == Z) {
    // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
    Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
    Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
  } else if (Y == W) {
    // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
    Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
    Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
  } else {
    return false;
  }

  Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
  // Intersect flags from the old binops.
  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
    NewInst->copyIRFlags(B0);
    NewInst->andIRFlags(B1);
  }
  replaceValue(I, *NewBO);
  return true;
}

/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
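/// For example, a shuffle that merely reverses the lanes of the operand of a
/// vector.reduce.add can be dropped, because the sum is the same in any lane
/// order.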
bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
  auto *II = dyn_cast<IntrinsicInst>(&I);
  if (!II)
    return false;
  switch (II->getIntrinsicID()) {
  case Intrinsic::vector_reduce_add:
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_and:
  case Intrinsic::vector_reduce_or:
  case Intrinsic::vector_reduce_xor:
  case Intrinsic::vector_reduce_smin:
  case Intrinsic::vector_reduce_smax:
  case Intrinsic::vector_reduce_umin:
  case Intrinsic::vector_reduce_umax:
    break;
  default:
    return false;
  }

  // Find all the inputs when looking through operations that do not alter the
  // lane order (binops, for example). Currently we look for a single shuffle,
  // and can ignore splat values.
  std::queue<Value *> Worklist;
  SmallPtrSet<Value *, 4> Visited;
  ShuffleVectorInst *Shuffle = nullptr;
  if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
    Worklist.push(Op);

  while (!Worklist.empty()) {
    Value *CV = Worklist.front();
    Worklist.pop();
    if (Visited.contains(CV))
      continue;

    // Splats don't change the order, so can be safely ignored.
    if (isSplatValue(CV))
      continue;

    Visited.insert(CV);

    if (auto *CI = dyn_cast<Instruction>(CV)) {
      if (CI->isBinaryOp()) {
        for (auto *Op : CI->operand_values())
          Worklist.push(Op);
        continue;
      } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
        if (Shuffle && Shuffle != SV)
          return false;
        Shuffle = SV;
        continue;
      }
    }

    // Anything else is currently an unknown node.
    return false;
  }

  if (!Shuffle)
    return false;

  // Check all uses of the binary ops and shuffles are also included in the
  // lane-invariant operations (Visited should be the list of lanewise
  // instructions, including the shuffle that we found).
  for (auto *V : Visited)
    for (auto *U : V->users())
      if (!Visited.contains(U) && U != &I)
        return false;

  FixedVectorType *VecType =
      dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
  if (!VecType)
    return false;
  FixedVectorType *ShuffleInputType =
      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
  if (!ShuffleInputType)
    return false;
  int NumInputElts = ShuffleInputType->getNumElements();

  // Find the mask from sorting the lanes into order. This is most likely to
  // become an identity or concat mask. Undef elements are pushed to the end.
  SmallVector<int> ConcatMask;
  Shuffle->getShuffleMask(ConcatMask);
  sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
  bool UsesSecondVec =
      any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
  InstructionCost OldCost = TTI.getShuffleCost(
      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
      Shuffle->getShuffleMask());
  InstructionCost NewCost = TTI.getShuffleCost(
      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
      ConcatMask);

  LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
                    << "\n");
  LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  if (NewCost < OldCost) {
    Builder.SetInsertPoint(Shuffle);
    Value *NewShuffle = Builder.CreateShuffleVector(
        Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
    LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
    replaceValue(*Shuffle, *NewShuffle);
  }

  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
  // the shuffle into a nicer order, as it can ignore the order of the shuffles.
  return foldSelectShuffle(*Shuffle, true);
}

/// This method looks for groups of shuffles acting on binops, of the form:
///  %x = shuffle ...
///  %y = shuffle ...
///  %a = binop %x, %y
///  %b = binop %x, %y
///  shuffle %a, %b, selectmask
/// We may, especially if the shuffle is wider than legal, be able to convert
/// the shuffle to a form where only parts of a and b need to be computed. On
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
  auto *SVI = cast<ShuffleVectorInst>(&I);
  auto *VT = cast<FixedVectorType>(I.getType());
  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
  if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
      VT != Op0->getType())
    return false;

  auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
  auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
  auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
  auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
  SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
  auto checkSVNonOpUses = [&](Instruction *I) {
    if (!I || I->getOperand(0)->getType() != VT)
      return true;
    return any_of(I->users(), [&](User *U) {
      return U != Op0 && U != Op1 &&
             !(isa<ShuffleVectorInst>(U) &&
               (InputShuffles.contains(cast<Instruction>(U)) ||
                isInstructionTriviallyDead(cast<Instruction>(U))));
    });
  };
  if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
      checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
    return false;

  // Collect all the uses that are shuffles that we can transform together. We
  // may not have a single shuffle, but a group that can all be transformed
  // together profitably.
  SmallVector<ShuffleVectorInst *> Shuffles;
  auto collectShuffles = [&](Instruction *I) {
    for (auto *U : I->users()) {
      auto *SV = dyn_cast<ShuffleVectorInst>(U);
      if (!SV || SV->getType() != VT)
        return false;
      if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
          (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
        return false;
      if (!llvm::is_contained(Shuffles, SV))
        Shuffles.push_back(SV);
    }
    return true;
  };
  if (!collectShuffles(Op0) || !collectShuffles(Op1))
    return false;
  // From a reduction, we need to be processing a single shuffle, otherwise the
  // other uses will not be lane-invariant.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
  auto *SVI = cast<ShuffleVectorInst>(&I);
  auto *VT = cast<FixedVectorType>(I.getType());
  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
  if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
      VT != Op0->getType())
    return false;

  auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
  auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
  auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
  auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
  SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
  auto checkSVNonOpUses = [&](Instruction *I) {
    if (!I || I->getOperand(0)->getType() != VT)
      return true;
    return any_of(I->users(), [&](User *U) {
      return U != Op0 && U != Op1 &&
             !(isa<ShuffleVectorInst>(U) &&
               (InputShuffles.contains(cast<Instruction>(U)) ||
                isInstructionTriviallyDead(cast<Instruction>(U))));
    });
  };
  if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
      checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
    return false;

  // Collect all the uses that are shuffles that we can transform together. We
  // may not have a single shuffle, but a group that can all be transformed
  // together profitably.
  SmallVector<ShuffleVectorInst *> Shuffles;
  auto collectShuffles = [&](Instruction *I) {
    for (auto *U : I->users()) {
      auto *SV = dyn_cast<ShuffleVectorInst>(U);
      if (!SV || SV->getType() != VT)
        return false;
      if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
          (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
        return false;
      if (!llvm::is_contained(Shuffles, SV))
        Shuffles.push_back(SV);
    }
    return true;
  };
  if (!collectShuffles(Op0) || !collectShuffles(Op1))
    return false;
  // From a reduction, we need to be processing a single shuffle, otherwise the
  // other uses will not be lane-invariant.
  if (FromReduction && Shuffles.size() > 1)
    return false;

  // Add any shuffle uses for the shuffles we have found, to include them in
  // our cost calculations. Iterate by index over the original entries, as the
  // loop appends to Shuffles and would otherwise invalidate range-based
  // iterators.
  if (!FromReduction) {
    for (unsigned S = 0, E = Shuffles.size(); S != E; S++) {
      for (auto *U : Shuffles[S]->users()) {
        ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
        if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
          Shuffles.push_back(SSV);
      }
    }
  }

  // For each of the output shuffles, we try to sort all the first vector
  // elements to the beginning, followed by the second vector's elements at
  // the end. If the binops are legalized to smaller vectors, this may reduce
  // the total number of binops. We compute the ReconstructMask mask needed to
  // convert back to the original lane order.
  SmallVector<std::pair<int, int>> V1, V2;
  SmallVector<SmallVector<int>> OrigReconstructMasks;
  int MaxV1Elt = 0, MaxV2Elt = 0;
  unsigned NumElts = VT->getNumElements();
  for (ShuffleVectorInst *SVN : Shuffles) {
    SmallVector<int> Mask;
    SVN->getShuffleMask(Mask);

    // Check that the operands are the same as the original, or reversed (in
    // which case we need to commute the mask).
    Value *SVOp0 = SVN->getOperand(0);
    Value *SVOp1 = SVN->getOperand(1);
    if (isa<UndefValue>(SVOp1)) {
      auto *SSV = cast<ShuffleVectorInst>(SVOp0);
      SVOp0 = SSV->getOperand(0);
      SVOp1 = SSV->getOperand(1);
      for (unsigned I = 0, E = Mask.size(); I != E; I++) {
        if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
          return false;
        Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
      }
    }
    if (SVOp0 == Op1 && SVOp1 == Op0) {
      std::swap(SVOp0, SVOp1);
      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
    }
    if (SVOp0 != Op0 || SVOp1 != Op1)
      return false;

    // Calculate the reconstruction mask for this shuffle, as the mask needed
    // to take the packed values from Op0/Op1 and reconstruct the original
    // lane order.
    SmallVector<int> ReconstructMask;
    for (unsigned I = 0; I < Mask.size(); I++) {
      if (Mask[I] < 0) {
        ReconstructMask.push_back(-1);
      } else if (Mask[I] < static_cast<int>(NumElts)) {
        MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
        auto It = find_if(V1, [&](const std::pair<int, int> &A) {
          return Mask[I] == A.first;
        });
        if (It != V1.end())
          ReconstructMask.push_back(It - V1.begin());
        else {
          ReconstructMask.push_back(V1.size());
          V1.emplace_back(Mask[I], V1.size());
        }
      } else {
        MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
        auto It = find_if(V2, [&](const std::pair<int, int> &A) {
          return Mask[I] - static_cast<int>(NumElts) == A.first;
        });
        if (It != V2.end())
          ReconstructMask.push_back(NumElts + It - V2.begin());
        else {
          ReconstructMask.push_back(NumElts + V2.size());
          V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
        }
      }
    }

    // For reductions, we know that the lane ordering of the output doesn't
    // alter the result, and an in-order mask helps simplify the shuffle away.
    if (FromReduction)
      sort(ReconstructMask);
    OrigReconstructMasks.push_back(std::move(ReconstructMask));
  }
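
  // A small worked illustration of the packing above: with NumElts == 4 and a
  // single output mask of <2, 6, 3, 7>, V1 records that only lanes 2 and 3 of
  // Op0 are used and V2 that only lanes 2 and 3 of Op1 are used, so the
  // packed values occupy just the first two lanes of each new operand and the
  // reconstruction mask becomes <0, 4, 1, 5>.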

  // If the maximum elements used from V1 and V2 are not larger than the new
  // vectors, the vectors are already packed and performing the optimization
  // again will likely not help any further. This also prevents us from
  // getting stuck in a cycle in case the costs do not also rule it out.
  if (V1.empty() || V2.empty() ||
      (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
       MaxV2Elt == static_cast<int>(V2.size()) - 1))
    return false;

  // GetBaseMaskValue takes one of the inputs, which may either be a shuffle,
  // a shuffle of another shuffle, or not a shuffle (which is treated like an
  // identity shuffle).
  auto GetBaseMaskValue = [&](Instruction *I, int M) {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return M;
    if (isa<UndefValue>(SV->getOperand(1)))
      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
        if (InputShuffles.contains(SSV))
          return SSV->getMaskValue(SV->getMaskValue(M));
    return SV->getMaskValue(M);
  };

  // Attempt to sort the inputs by ascending mask values to make simpler input
  // shuffles and push complex shuffles down to the uses. We sort on the first
  // of the two input shuffle orders, to try and get at least one input into a
  // nice order.
  auto SortBase = [&](Instruction *A, std::pair<int, int> X,
                      std::pair<int, int> Y) {
    int MXA = GetBaseMaskValue(A, X.first);
    int MYA = GetBaseMaskValue(A, Y.first);
    return MXA < MYA;
  };
  stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
    return SortBase(SVI0A, A, B);
  });
  stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
    return SortBase(SVI1A, A, B);
  });

  // Calculate our ReconstructMasks from the OrigReconstructMasks and the
  // modified order of the input shuffles.
  SmallVector<SmallVector<int>> ReconstructMasks;
  for (const auto &Mask : OrigReconstructMasks) {
    SmallVector<int> ReconstructMask;
    for (int M : Mask) {
      auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
        auto It = find_if(V, [M](auto A) { return A.second == M; });
        assert(It != V.end() && "Expected all entries in Mask");
        return std::distance(V.begin(), It);
      };
      if (M < 0)
        ReconstructMask.push_back(-1);
      else if (M < static_cast<int>(NumElts)) {
        ReconstructMask.push_back(FindIndex(V1, M));
      } else {
        ReconstructMask.push_back(NumElts + FindIndex(V2, M));
      }
    }
    ReconstructMasks.push_back(std::move(ReconstructMask));
  }

  // Calculate the masks needed for the new input shuffles, which get padded
  // with undef.
  SmallVector<int> V1A, V1B, V2A, V2B;
  for (unsigned I = 0; I < V1.size(); I++) {
    V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
    V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
  }
  for (unsigned I = 0; I < V2.size(); I++) {
    V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
    V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
  }
  while (V1A.size() < NumElts) {
    V1A.push_back(UndefMaskElem);
    V1B.push_back(UndefMaskElem);
  }
  while (V2A.size() < NumElts) {
    V2A.push_back(UndefMaskElem);
    V2B.push_back(UndefMaskElem);
  }
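
  // Cost helpers for the comparison below: AddShuffleCost accumulates the
  // cost of an existing shuffle instruction (single- or two-source, depending
  // on whether its second operand is undef) and adds nothing for non-shuffle
  // inputs, while AddShuffleMaskCost accumulates the cost of a prospective
  // two-source shuffle described only by its mask.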
  auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return C;
    return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
                                      ? TTI::SK_PermuteSingleSrc
                                      : TTI::SK_PermuteTwoSrc,
                                  VT, SV->getShuffleMask());
  };
  auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
  };

  // Get the costs of the shuffles + binops before and after with the new
  // shuffle masks.
  InstructionCost CostBefore =
      TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
      TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
  CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
                                InstructionCost(0), AddShuffleCost);
  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
                                InstructionCost(0), AddShuffleCost);

  // The new binops will be unused for lanes past the used shuffle lengths.
  // These types attempt to get the correct cost for that from the target.
  FixedVectorType *Op0SmallVT =
      FixedVectorType::get(VT->getScalarType(), V1.size());
  FixedVectorType *Op1SmallVT =
      FixedVectorType::get(VT->getScalarType(), V2.size());
  InstructionCost CostAfter =
      TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
      TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
  CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
                               InstructionCost(0), AddShuffleMaskCost);
  std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
  CostAfter +=
      std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
                      InstructionCost(0), AddShuffleMaskCost);

  LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
  LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
                    << " vs CostAfter: " << CostAfter << "\n");
  if (CostBefore <= CostAfter)
    return false;

  // The cost model has passed, create the new instructions.
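  // Note that the replacement shuffles and binops are still created at the
  // full width VT; the narrower Op0SmallVT/Op1SmallVT types above were only
  // used to query the cost of the lanes that are actually populated, and the
  // trailing lanes of the new operands are simply left undef.
  // GetShuffleOperand returns the requested operand of an input shuffle,
  // looking through a single-source shuffle of another input shuffle, and
  // returns non-shuffle inputs unchanged.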
  auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return I;
    if (isa<UndefValue>(SV->getOperand(1)))
      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
        if (InputShuffles.contains(SSV))
          return SSV->getOperand(Op);
    return SV->getOperand(Op);
  };
  Builder.SetInsertPoint(SVI0A->getNextNode());
  Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
                                             GetShuffleOperand(SVI0A, 1), V1A);
  Builder.SetInsertPoint(SVI0B->getNextNode());
  Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
                                             GetShuffleOperand(SVI0B, 1), V1B);
  Builder.SetInsertPoint(SVI1A->getNextNode());
  Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
                                             GetShuffleOperand(SVI1A, 1), V2A);
  Builder.SetInsertPoint(SVI1B->getNextNode());
  Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
                                             GetShuffleOperand(SVI1B, 1), V2B);
  Builder.SetInsertPoint(Op0);
  Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
                                    NSV0A, NSV0B);
  if (auto *I = dyn_cast<Instruction>(NOp0))
    I->copyIRFlags(Op0, true);
  Builder.SetInsertPoint(Op1);
  Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
                                    NSV1A, NSV1B);
  if (auto *I = dyn_cast<Instruction>(NOp1))
    I->copyIRFlags(Op1, true);

  for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
    Builder.SetInsertPoint(Shuffles[S]);
    Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
    replaceValue(*Shuffles[S], *NSV);
  }

  Worklist.pushValue(NSV0A);
  Worklist.pushValue(NSV0B);
  Worklist.pushValue(NSV1A);
  Worklist.pushValue(NSV1B);
  for (auto *S : Shuffles)
    Worklist.add(S);
  return true;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
  if (DisableVectorCombine)
    return false;

  // Don't attempt vectorization if the target does not support vectors.
  if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
    return false;

  bool MadeChange = false;
  auto FoldInst = [this, &MadeChange](Instruction &I) {
    Builder.SetInsertPoint(&I);
    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
    auto Opcode = I.getOpcode();

    // These folds should be beneficial regardless of when this pass is run
    // in the optimization pipeline.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
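    // For example, vectorizeLoadInsert (rewriting an insertelement of a
    // loaded scalar as a wider vector load) is expected to be beneficial no
    // matter where the pass runs, so it is attempted even in the early
    // TryEarlyFoldsOnly invocation below.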
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= vectorizeLoadInsert(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= widenSubvectorLoad(I);
        break;
      case Instruction::Load:
        MadeChange |= scalarizeLoadExtract(I);
        break;
      default:
        break;
      }
    }

    // This transform works with scalable and fixed vectors.
    // TODO: Identify and allow other scalable transforms
    if (isa<VectorType>(I.getType()))
      MadeChange |= scalarizeBinopOrCmp(I);

    if (Opcode == Instruction::Store)
      MadeChange |= foldSingleElementStore(I);

    // If this is an early pipeline invocation of this pass, we are done.
    if (TryEarlyFoldsOnly)
      return;

    // Otherwise, try folds that improve codegen but may interfere with
    // early IR canonicalizations.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= foldInsExtFNeg(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= foldShuffleOfBinops(I);
        MadeChange |= foldSelectShuffle(I);
        break;
      case Instruction::BitCast:
        MadeChange |= foldBitcastShuf(I);
        break;
      }
    } else {
      switch (Opcode) {
      case Instruction::Call:
        MadeChange |= foldShuffleFromReductions(I);
        break;
      case Instruction::ICmp:
      case Instruction::FCmp:
        MadeChange |= foldExtractExtract(I);
        break;
      default:
        if (Instruction::isBinaryOp(Opcode)) {
          MadeChange |= foldExtractExtract(I);
          MadeChange |= foldExtractedCmps(I);
        }
        break;
      }
    }
  };

  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // Use early increment range so that we can erase instructions in loop.
    for (Instruction &I : make_early_inc_range(BB)) {
      if (I.isDebugOrPseudoInst())
        continue;
      FoldInst(I);
    }
  }
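
  // Revisit anything the folds above pushed onto the worklist: erase
  // instructions that have become trivially dead and re-try the folds on the
  // rest until no work remains.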
  while (!Worklist.isEmpty()) {
    Instruction *I = Worklist.removeOne();
    if (!I)
      continue;

    if (isInstructionTriviallyDead(I)) {
      eraseInstruction(*I);
      continue;
    }

    FoldInst(*I);
  }

  return MadeChange;
}

// Pass manager boilerplate below here.

namespace {
class VectorCombineLegacyPass : public FunctionPass {
public:
  static char ID;
  VectorCombineLegacyPass() : FunctionPass(ID) {
    initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesCFG();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    VectorCombine Combiner(F, TTI, DT, AA, AC, /*TryEarlyFoldsOnly=*/false);
    return Combiner.run();
  }
};
} // namespace

char VectorCombineLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
                      "Optimize scalar/vector ops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
                    "Optimize scalar/vector ops", false, false)
Pass *llvm::createVectorCombinePass() {
  return new VectorCombineLegacyPass();
}

PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  AAResults &AA = FAM.getResult<AAManager>(F);
  VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
  if (!Combiner.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
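
// For reference: the pass is registered under the name "vector-combine" (see
// the INITIALIZE_PASS macros above), so its transforms can typically be
// exercised in isolation with `opt -passes=vector-combine`, and disabled
// wholesale with the -disable-vector-combine flag declared at the top of this
// file.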