1 //===- InstCombineVectorOps.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements instcombine for ExtractElement, InsertElement and 10 // ShuffleVector. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "InstCombineInternal.h" 15 #include "llvm/ADT/APInt.h" 16 #include "llvm/ADT/ArrayRef.h" 17 #include "llvm/ADT/DenseMap.h" 18 #include "llvm/ADT/STLExtras.h" 19 #include "llvm/ADT/SmallBitVector.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/InstructionSimplify.h" 23 #include "llvm/Analysis/VectorUtils.h" 24 #include "llvm/IR/BasicBlock.h" 25 #include "llvm/IR/Constant.h" 26 #include "llvm/IR/Constants.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/InstrTypes.h" 29 #include "llvm/IR/Instruction.h" 30 #include "llvm/IR/Instructions.h" 31 #include "llvm/IR/Operator.h" 32 #include "llvm/IR/PatternMatch.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/IR/User.h" 35 #include "llvm/IR/Value.h" 36 #include "llvm/Support/Casting.h" 37 #include "llvm/Support/ErrorHandling.h" 38 #include "llvm/Transforms/InstCombine/InstCombiner.h" 39 #include <cassert> 40 #include <cstdint> 41 #include <iterator> 42 #include <utility> 43 44 #define DEBUG_TYPE "instcombine" 45 46 using namespace llvm; 47 using namespace PatternMatch; 48 49 STATISTIC(NumAggregateReconstructionsSimplified, 50 "Number of aggregate reconstructions turned into reuse of the " 51 "original aggregate"); 52 53 /// Return true if the value is cheaper to scalarize than it is to leave as a 54 /// vector operation. If the extract index \p EI is a constant integer then 55 /// some operations may be cheap to scalarize. 56 /// 57 /// FIXME: It's possible to create more instructions than previously existed. 58 static bool cheapToScalarize(Value *V, Value *EI) { 59 ConstantInt *CEI = dyn_cast<ConstantInt>(EI); 60 61 // If we can pick a scalar constant value out of a vector, that is free. 62 if (auto *C = dyn_cast<Constant>(V)) 63 return CEI || C->getSplatValue(); 64 65 if (CEI && match(V, m_Intrinsic<Intrinsic::stepvector>())) { 66 ElementCount EC = cast<VectorType>(V->getType())->getElementCount(); 67 // Index needs to be lower than the minimum size of the vector, because 68 // for scalable vector, the vector size is known at run time. 69 return CEI->getValue().ult(EC.getKnownMinValue()); 70 } 71 72 // An insertelement to the same constant index as our extract will simplify 73 // to the scalar inserted element. An insertelement to a different constant 74 // index is irrelevant to our extract. 
75 if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt()))) 76 return CEI; 77 78 if (match(V, m_OneUse(m_Load(m_Value())))) 79 return true; 80 81 if (match(V, m_OneUse(m_UnOp()))) 82 return true; 83 84 Value *V0, *V1; 85 if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1))))) 86 if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI)) 87 return true; 88 89 CmpInst::Predicate UnusedPred; 90 if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1))))) 91 if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI)) 92 return true; 93 94 return false; 95 } 96 97 // If we have a PHI node with a vector type that is only used to feed 98 // itself and be an operand of extractelement at a constant location, 99 // try to replace the PHI of the vector type with a PHI of a scalar type. 100 Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, 101 PHINode *PN) { 102 SmallVector<Instruction *, 2> Extracts; 103 // The users we want the PHI to have are: 104 // 1) The EI ExtractElement (we already know this) 105 // 2) Possibly more ExtractElements with the same index. 106 // 3) Another operand, which will feed back into the PHI. 107 Instruction *PHIUser = nullptr; 108 for (auto *U : PN->users()) { 109 if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) { 110 if (EI.getIndexOperand() == EU->getIndexOperand()) 111 Extracts.push_back(EU); 112 else 113 return nullptr; 114 } else if (!PHIUser) { 115 PHIUser = cast<Instruction>(U); 116 } else { 117 return nullptr; 118 } 119 } 120 121 if (!PHIUser) 122 return nullptr; 123 124 // Verify that this PHI user has one use, which is the PHI itself, 125 // and that it is a binary operation which is cheap to scalarize. 126 // otherwise return nullptr. 127 if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || 128 !(isa<BinaryOperator>(PHIUser)) || 129 !cheapToScalarize(PHIUser, EI.getIndexOperand())) 130 return nullptr; 131 132 // Create a scalar PHI node that will replace the vector PHI node 133 // just before the current PHI node. 134 PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith( 135 PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), PN->getIterator())); 136 // Scalarize each PHI operand. 137 for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { 138 Value *PHIInVal = PN->getIncomingValue(i); 139 BasicBlock *inBB = PN->getIncomingBlock(i); 140 Value *Elt = EI.getIndexOperand(); 141 // If the operand is the PHI induction variable: 142 if (PHIInVal == PHIUser) { 143 // Scalarize the binary operation. Its first operand is the 144 // scalar PHI, and the second operand is extracted from the other 145 // vector operand. 146 BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); 147 unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; 148 Value *Op = InsertNewInstWith( 149 ExtractElementInst::Create(B0->getOperand(opId), Elt, 150 B0->getOperand(opId)->getName() + ".Elt"), 151 B0->getIterator()); 152 Value *newPHIUser = InsertNewInstWith( 153 BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(), 154 scalarPHI, Op, B0), B0->getIterator()); 155 scalarPHI->addIncoming(newPHIUser, inBB); 156 } else { 157 // Scalarize PHI input: 158 Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, ""); 159 // Insert the new instruction into the predecessor basic block. 
160 Instruction *pos = dyn_cast<Instruction>(PHIInVal); 161 BasicBlock::iterator InsertPos; 162 if (pos && !isa<PHINode>(pos)) { 163 InsertPos = ++pos->getIterator(); 164 } else { 165 InsertPos = inBB->getFirstInsertionPt(); 166 } 167 168 InsertNewInstWith(newEI, InsertPos); 169 170 scalarPHI->addIncoming(newEI, inBB); 171 } 172 } 173 174 for (auto *E : Extracts) { 175 replaceInstUsesWith(*E, scalarPHI); 176 // Add old extract to worklist for DCE. 177 addToWorklist(E); 178 } 179 180 return &EI; 181 } 182 183 Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { 184 Value *X; 185 uint64_t ExtIndexC; 186 if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) || 187 !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC))) 188 return nullptr; 189 190 ElementCount NumElts = 191 cast<VectorType>(Ext.getVectorOperandType())->getElementCount(); 192 Type *DestTy = Ext.getType(); 193 unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); 194 bool IsBigEndian = DL.isBigEndian(); 195 196 // If we are casting an integer to vector and extracting a portion, that is 197 // a shift-right and truncate. 198 if (X->getType()->isIntegerTy()) { 199 assert(isa<FixedVectorType>(Ext.getVectorOperand()->getType()) && 200 "Expected fixed vector type for bitcast from scalar integer"); 201 202 // Big endian requires adjusting the extract index since MSB is at index 0. 203 // LittleEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 X to i8 204 // BigEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 (X >> 24) to i8 205 if (IsBigEndian) 206 ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC; 207 unsigned ShiftAmountC = ExtIndexC * DestWidth; 208 if (!ShiftAmountC || 209 (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) && 210 Ext.getVectorOperand()->hasOneUse())) { 211 if (ShiftAmountC) 212 X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset"); 213 if (DestTy->isFloatingPointTy()) { 214 Type *DstIntTy = IntegerType::getIntNTy(X->getContext(), DestWidth); 215 Value *Trunc = Builder.CreateTrunc(X, DstIntTy); 216 return new BitCastInst(Trunc, DestTy); 217 } 218 return new TruncInst(X, DestTy); 219 } 220 } 221 222 if (!X->getType()->isVectorTy()) 223 return nullptr; 224 225 // If this extractelement is using a bitcast from a vector of the same number 226 // of elements, see if we can find the source element from the source vector: 227 // extelt (bitcast VecX), IndexC --> bitcast X[IndexC] 228 auto *SrcTy = cast<VectorType>(X->getType()); 229 ElementCount NumSrcElts = SrcTy->getElementCount(); 230 if (NumSrcElts == NumElts) 231 if (Value *Elt = findScalarElement(X, ExtIndexC)) 232 return new BitCastInst(Elt, DestTy); 233 234 assert(NumSrcElts.isScalable() == NumElts.isScalable() && 235 "Src and Dst must be the same sort of vector type"); 236 237 // If the source elements are wider than the destination, try to shift and 238 // truncate a subset of scalar bits of an insert op. 239 if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) { 240 Value *Scalar; 241 Value *Vec; 242 uint64_t InsIndexC; 243 if (!match(X, m_InsertElt(m_Value(Vec), m_Value(Scalar), 244 m_ConstantInt(InsIndexC)))) 245 return nullptr; 246 247 // The extract must be from the subset of vector elements that we inserted 248 // into. Example: if we inserted element 1 of a <2 x i64> and we are 249 // extracting an i16 (narrowing ratio = 4), then this extract must be from 1 250 // of elements 4-7 of the bitcasted vector. 
251 unsigned NarrowingRatio = 252 NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue(); 253 254 if (ExtIndexC / NarrowingRatio != InsIndexC) { 255 // Remove insertelement, if we don't use the inserted element. 256 // extractelement (bitcast (insertelement (Vec, b)), a) -> 257 // extractelement (bitcast (Vec), a) 258 // FIXME: this should be removed to SimplifyDemandedVectorElts, 259 // once scale vectors are supported. 260 if (X->hasOneUse() && Ext.getVectorOperand()->hasOneUse()) { 261 Value *NewBC = Builder.CreateBitCast(Vec, Ext.getVectorOperandType()); 262 return ExtractElementInst::Create(NewBC, Ext.getIndexOperand()); 263 } 264 return nullptr; 265 } 266 267 // We are extracting part of the original scalar. How that scalar is 268 // inserted into the vector depends on the endian-ness. Example: 269 // Vector Byte Elt Index: 0 1 2 3 4 5 6 7 270 // +--+--+--+--+--+--+--+--+ 271 // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3| 272 // extelt <4 x i16> V', 3: | |S2|S3| 273 // +--+--+--+--+--+--+--+--+ 274 // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value. 275 // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value. 276 // In this example, we must right-shift little-endian. Big-endian is just a 277 // truncate. 278 unsigned Chunk = ExtIndexC % NarrowingRatio; 279 if (IsBigEndian) 280 Chunk = NarrowingRatio - 1 - Chunk; 281 282 // Bail out if this is an FP vector to FP vector sequence. That would take 283 // more instructions than we started with unless there is no shift, and it 284 // may not be handled as well in the backend. 285 bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy(); 286 bool NeedDestBitcast = DestTy->isFloatingPointTy(); 287 if (NeedSrcBitcast && NeedDestBitcast) 288 return nullptr; 289 290 unsigned SrcWidth = SrcTy->getScalarSizeInBits(); 291 unsigned ShAmt = Chunk * DestWidth; 292 293 // TODO: This limitation is more strict than necessary. We could sum the 294 // number of new instructions and subtract the number eliminated to know if 295 // we can proceed. 296 if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse()) 297 if (NeedSrcBitcast || NeedDestBitcast) 298 return nullptr; 299 300 if (NeedSrcBitcast) { 301 Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth); 302 Scalar = Builder.CreateBitCast(Scalar, SrcIntTy); 303 } 304 305 if (ShAmt) { 306 // Bail out if we could end with more instructions than we started with. 307 if (!Ext.getVectorOperand()->hasOneUse()) 308 return nullptr; 309 Scalar = Builder.CreateLShr(Scalar, ShAmt); 310 } 311 312 if (NeedDestBitcast) { 313 Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth); 314 return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy); 315 } 316 return new TruncInst(Scalar, DestTy); 317 } 318 319 return nullptr; 320 } 321 322 /// Find elements of V demanded by UserInstr. 323 static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) { 324 unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements(); 325 326 // Conservatively assume that all elements are needed. 
327 APInt UsedElts(APInt::getAllOnes(VWidth)); 328 329 switch (UserInstr->getOpcode()) { 330 case Instruction::ExtractElement: { 331 ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr); 332 assert(EEI->getVectorOperand() == V); 333 ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand()); 334 if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) { 335 UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue()); 336 } 337 break; 338 } 339 case Instruction::ShuffleVector: { 340 ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr); 341 unsigned MaskNumElts = 342 cast<FixedVectorType>(UserInstr->getType())->getNumElements(); 343 344 UsedElts = APInt(VWidth, 0); 345 for (unsigned i = 0; i < MaskNumElts; i++) { 346 unsigned MaskVal = Shuffle->getMaskValue(i); 347 if (MaskVal == -1u || MaskVal >= 2 * VWidth) 348 continue; 349 if (Shuffle->getOperand(0) == V && (MaskVal < VWidth)) 350 UsedElts.setBit(MaskVal); 351 if (Shuffle->getOperand(1) == V && 352 ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth))) 353 UsedElts.setBit(MaskVal - VWidth); 354 } 355 break; 356 } 357 default: 358 break; 359 } 360 return UsedElts; 361 } 362 363 /// Find union of elements of V demanded by all its users. 364 /// If it is known by querying findDemandedEltsBySingleUser that 365 /// no user demands an element of V, then the corresponding bit 366 /// remains unset in the returned value. 367 static APInt findDemandedEltsByAllUsers(Value *V) { 368 unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements(); 369 370 APInt UnionUsedElts(VWidth, 0); 371 for (const Use &U : V->uses()) { 372 if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { 373 UnionUsedElts |= findDemandedEltsBySingleUser(V, I); 374 } else { 375 UnionUsedElts = APInt::getAllOnes(VWidth); 376 break; 377 } 378 379 if (UnionUsedElts.isAllOnes()) 380 break; 381 } 382 383 return UnionUsedElts; 384 } 385 386 /// Given a constant index for a extractelement or insertelement instruction, 387 /// return it with the canonical type if it isn't already canonical. We 388 /// arbitrarily pick 64 bit as our canonical type. The actual bitwidth doesn't 389 /// matter, we just want a consistent type to simplify CSE. 390 static ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) { 391 const unsigned IndexBW = IndexC->getBitWidth(); 392 if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64) 393 return nullptr; 394 return ConstantInt::get(IndexC->getContext(), 395 IndexC->getValue().zextOrTrunc(64)); 396 } 397 398 Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { 399 Value *SrcVec = EI.getVectorOperand(); 400 Value *Index = EI.getIndexOperand(); 401 if (Value *V = simplifyExtractElementInst(SrcVec, Index, 402 SQ.getWithInstruction(&EI))) 403 return replaceInstUsesWith(EI, V); 404 405 // extractelt (select %x, %vec1, %vec2), %const -> 406 // select %x, %vec1[%const], %vec2[%const] 407 // TODO: Support constant folding of multiple select operands: 408 // extractelt (select %x, %vec1, %vec2), (select %x, %c1, %c2) 409 // If the extractelement will for instance try to do out of bounds accesses 410 // because of the values of %c1 and/or %c2, the sequence could be optimized 411 // early. This is currently not possible because constant folding will reach 412 // an unreachable assertion if it doesn't find a constant operand. 
413 if (SelectInst *SI = dyn_cast<SelectInst>(EI.getVectorOperand())) 414 if (SI->getCondition()->getType()->isIntegerTy() && 415 isa<Constant>(EI.getIndexOperand())) 416 if (Instruction *R = FoldOpIntoSelect(EI, SI)) 417 return R; 418 419 // If extracting a specified index from the vector, see if we can recursively 420 // find a previously computed scalar that was inserted into the vector. 421 auto *IndexC = dyn_cast<ConstantInt>(Index); 422 bool HasKnownValidIndex = false; 423 if (IndexC) { 424 // Canonicalize type of constant indices to i64 to simplify CSE 425 if (auto *NewIdx = getPreferredVectorIndex(IndexC)) 426 return replaceOperand(EI, 1, NewIdx); 427 428 ElementCount EC = EI.getVectorOperandType()->getElementCount(); 429 unsigned NumElts = EC.getKnownMinValue(); 430 HasKnownValidIndex = IndexC->getValue().ult(NumElts); 431 432 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(SrcVec)) { 433 Intrinsic::ID IID = II->getIntrinsicID(); 434 // Index needs to be lower than the minimum size of the vector, because 435 // for scalable vector, the vector size is known at run time. 436 if (IID == Intrinsic::stepvector && IndexC->getValue().ult(NumElts)) { 437 Type *Ty = EI.getType(); 438 unsigned BitWidth = Ty->getIntegerBitWidth(); 439 Value *Idx; 440 // Return index when its value does not exceed the allowed limit 441 // for the element type of the vector, otherwise return undefined. 442 if (IndexC->getValue().getActiveBits() <= BitWidth) 443 Idx = ConstantInt::get(Ty, IndexC->getValue().zextOrTrunc(BitWidth)); 444 else 445 Idx = PoisonValue::get(Ty); 446 return replaceInstUsesWith(EI, Idx); 447 } 448 } 449 450 // InstSimplify should handle cases where the index is invalid. 451 // For fixed-length vector, it's invalid to extract out-of-range element. 452 if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) 453 return nullptr; 454 455 if (Instruction *I = foldBitcastExtElt(EI)) 456 return I; 457 458 // If there's a vector PHI feeding a scalar use through this extractelement 459 // instruction, try to scalarize the PHI. 460 if (auto *Phi = dyn_cast<PHINode>(SrcVec)) 461 if (Instruction *ScalarPHI = scalarizePHI(EI, Phi)) 462 return ScalarPHI; 463 } 464 465 // TODO come up with a n-ary matcher that subsumes both unary and 466 // binary matchers. 467 UnaryOperator *UO; 468 if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, Index)) { 469 // extelt (unop X), Index --> unop (extelt X, Index) 470 Value *X = UO->getOperand(0); 471 Value *E = Builder.CreateExtractElement(X, Index); 472 return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO); 473 } 474 475 // If the binop is not speculatable, we cannot hoist the extractelement if 476 // it may make the operand poison. 
477 BinaryOperator *BO; 478 if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, Index) && 479 (HasKnownValidIndex || isSafeToSpeculativelyExecute(BO))) { 480 // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index) 481 Value *X = BO->getOperand(0), *Y = BO->getOperand(1); 482 Value *E0 = Builder.CreateExtractElement(X, Index); 483 Value *E1 = Builder.CreateExtractElement(Y, Index); 484 return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO); 485 } 486 487 Value *X, *Y; 488 CmpInst::Predicate Pred; 489 if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) && 490 cheapToScalarize(SrcVec, Index)) { 491 // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index) 492 Value *E0 = Builder.CreateExtractElement(X, Index); 493 Value *E1 = Builder.CreateExtractElement(Y, Index); 494 CmpInst *SrcCmpInst = cast<CmpInst>(SrcVec); 495 return CmpInst::CreateWithCopiedFlags(SrcCmpInst->getOpcode(), Pred, E0, E1, 496 SrcCmpInst); 497 } 498 499 if (auto *I = dyn_cast<Instruction>(SrcVec)) { 500 if (auto *IE = dyn_cast<InsertElementInst>(I)) { 501 // instsimplify already handled the case where the indices are constants 502 // and equal by value, if both are constants, they must not be the same 503 // value, extract from the pre-inserted value instead. 504 if (isa<Constant>(IE->getOperand(2)) && IndexC) 505 return replaceOperand(EI, 0, IE->getOperand(0)); 506 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { 507 auto *VecType = cast<VectorType>(GEP->getType()); 508 ElementCount EC = VecType->getElementCount(); 509 uint64_t IdxVal = IndexC ? IndexC->getZExtValue() : 0; 510 if (IndexC && IdxVal < EC.getKnownMinValue() && GEP->hasOneUse()) { 511 // Find out why we have a vector result - these are a few examples: 512 // 1. We have a scalar pointer and a vector of indices, or 513 // 2. We have a vector of pointers and a scalar index, or 514 // 3. We have a vector of pointers and a vector of indices, etc. 515 // Here we only consider combining when there is exactly one vector 516 // operand, since the optimization is less obviously a win due to 517 // needing more than one extractelements. 518 519 unsigned VectorOps = 520 llvm::count_if(GEP->operands(), [](const Value *V) { 521 return isa<VectorType>(V->getType()); 522 }); 523 if (VectorOps == 1) { 524 Value *NewPtr = GEP->getPointerOperand(); 525 if (isa<VectorType>(NewPtr->getType())) 526 NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); 527 528 SmallVector<Value *> NewOps; 529 for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { 530 Value *Op = GEP->getOperand(I); 531 if (isa<VectorType>(Op->getType())) 532 NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); 533 else 534 NewOps.push_back(Op); 535 } 536 537 GetElementPtrInst *NewGEP = GetElementPtrInst::Create( 538 GEP->getSourceElementType(), NewPtr, NewOps); 539 NewGEP->setIsInBounds(GEP->isInBounds()); 540 return NewGEP; 541 } 542 } 543 } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { 544 // If this is extracting an element from a shufflevector, figure out where 545 // it came from and extract from the appropriate input element instead. 546 // Restrict the following transformation to fixed-length vector. 
547 if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) { 548 int SrcIdx = 549 SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue()); 550 Value *Src; 551 unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType()) 552 ->getNumElements(); 553 554 if (SrcIdx < 0) 555 return replaceInstUsesWith(EI, PoisonValue::get(EI.getType())); 556 if (SrcIdx < (int)LHSWidth) 557 Src = SVI->getOperand(0); 558 else { 559 SrcIdx -= LHSWidth; 560 Src = SVI->getOperand(1); 561 } 562 Type *Int64Ty = Type::getInt64Ty(EI.getContext()); 563 return ExtractElementInst::Create( 564 Src, ConstantInt::get(Int64Ty, SrcIdx, false)); 565 } 566 } else if (auto *CI = dyn_cast<CastInst>(I)) { 567 // Canonicalize extractelement(cast) -> cast(extractelement). 568 // Bitcasts can change the number of vector elements, and they cost 569 // nothing. 570 if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { 571 Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index); 572 return CastInst::Create(CI->getOpcode(), EE, EI.getType()); 573 } 574 } 575 } 576 577 // Run demanded elements after other transforms as this can drop flags on 578 // binops. If there's two paths to the same final result, we prefer the 579 // one which doesn't force us to drop flags. 580 if (IndexC) { 581 ElementCount EC = EI.getVectorOperandType()->getElementCount(); 582 unsigned NumElts = EC.getKnownMinValue(); 583 // This instruction only demands the single element from the input vector. 584 // Skip for scalable type, the number of elements is unknown at 585 // compile-time. 586 if (!EC.isScalable() && NumElts != 1) { 587 // If the input vector has a single use, simplify it based on this use 588 // property. 589 if (SrcVec->hasOneUse()) { 590 APInt PoisonElts(NumElts, 0); 591 APInt DemandedElts(NumElts, 0); 592 DemandedElts.setBit(IndexC->getZExtValue()); 593 if (Value *V = 594 SimplifyDemandedVectorElts(SrcVec, DemandedElts, PoisonElts)) 595 return replaceOperand(EI, 0, V); 596 } else { 597 // If the input vector has multiple uses, simplify it based on a union 598 // of all elements used. 599 APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); 600 if (!DemandedElts.isAllOnes()) { 601 APInt PoisonElts(NumElts, 0); 602 if (Value *V = SimplifyDemandedVectorElts( 603 SrcVec, DemandedElts, PoisonElts, 0 /* Depth */, 604 true /* AllowMultipleUsers */)) { 605 if (V != SrcVec) { 606 Worklist.addValue(SrcVec); 607 SrcVec->replaceAllUsesWith(V); 608 return &EI; 609 } 610 } 611 } 612 } 613 } 614 } 615 return nullptr; 616 } 617 618 /// If V is a shuffle of values that ONLY returns elements from either LHS or 619 /// RHS, return the shuffle mask and true. Otherwise, return false. 620 static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, 621 SmallVectorImpl<int> &Mask) { 622 assert(LHS->getType() == RHS->getType() && 623 "Invalid CollectSingleShuffleElements"); 624 unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements(); 625 626 if (match(V, m_Poison())) { 627 Mask.assign(NumElts, -1); 628 return true; 629 } 630 631 if (V == LHS) { 632 for (unsigned i = 0; i != NumElts; ++i) 633 Mask.push_back(i); 634 return true; 635 } 636 637 if (V == RHS) { 638 for (unsigned i = 0; i != NumElts; ++i) 639 Mask.push_back(i + NumElts); 640 return true; 641 } 642 643 if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { 644 // If this is an insert of an extract from some other vector, include it. 
645 Value *VecOp = IEI->getOperand(0); 646 Value *ScalarOp = IEI->getOperand(1); 647 Value *IdxOp = IEI->getOperand(2); 648 649 if (!isa<ConstantInt>(IdxOp)) 650 return false; 651 unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); 652 653 if (isa<PoisonValue>(ScalarOp)) { // inserting poison into vector. 654 // We can handle this if the vector we are inserting into is 655 // transitively ok. 656 if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { 657 // If so, update the mask to reflect the inserted poison. 658 Mask[InsertedIdx] = -1; 659 return true; 660 } 661 } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){ 662 if (isa<ConstantInt>(EI->getOperand(1))) { 663 unsigned ExtractedIdx = 664 cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); 665 unsigned NumLHSElts = 666 cast<FixedVectorType>(LHS->getType())->getNumElements(); 667 668 // This must be extracting from either LHS or RHS. 669 if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { 670 // We can handle this if the vector we are inserting into is 671 // transitively ok. 672 if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { 673 // If so, update the mask to reflect the inserted value. 674 if (EI->getOperand(0) == LHS) { 675 Mask[InsertedIdx % NumElts] = ExtractedIdx; 676 } else { 677 assert(EI->getOperand(0) == RHS); 678 Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts; 679 } 680 return true; 681 } 682 } 683 } 684 } 685 } 686 687 return false; 688 } 689 690 /// If we have insertion into a vector that is wider than the vector that we 691 /// are extracting from, try to widen the source vector to allow a single 692 /// shufflevector to replace one or more insert/extract pairs. 693 static bool replaceExtractElements(InsertElementInst *InsElt, 694 ExtractElementInst *ExtElt, 695 InstCombinerImpl &IC) { 696 auto *InsVecType = cast<FixedVectorType>(InsElt->getType()); 697 auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType()); 698 unsigned NumInsElts = InsVecType->getNumElements(); 699 unsigned NumExtElts = ExtVecType->getNumElements(); 700 701 // The inserted-to vector must be wider than the extracted-from vector. 702 if (InsVecType->getElementType() != ExtVecType->getElementType() || 703 NumExtElts >= NumInsElts) 704 return false; 705 706 // Create a shuffle mask to widen the extended-from vector using poison 707 // values. The mask selects all of the values of the original vector followed 708 // by as many poison values as needed to create a vector of the same length 709 // as the inserted-to vector. 710 SmallVector<int, 16> ExtendMask; 711 for (unsigned i = 0; i < NumExtElts; ++i) 712 ExtendMask.push_back(i); 713 for (unsigned i = NumExtElts; i < NumInsElts; ++i) 714 ExtendMask.push_back(-1); 715 716 Value *ExtVecOp = ExtElt->getVectorOperand(); 717 auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); 718 BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) 719 ? ExtVecOpInst->getParent() 720 : ExtElt->getParent(); 721 722 // TODO: This restriction matches the basic block check below when creating 723 // new extractelement instructions. If that limitation is removed, this one 724 // could also be removed. But for now, we just bail out to ensure that we 725 // will replace the extractelement instruction that is feeding our 726 // insertelement instruction. This allows the insertelement to then be 727 // replaced by a shufflevector. 
If the insertelement is not replaced, we can 728 // induce infinite looping because there's an optimization for extractelement 729 // that will delete our widening shuffle. This would trigger another attempt 730 // here to create that shuffle, and we spin forever. 731 if (InsertionBlock != InsElt->getParent()) 732 return false; 733 734 // TODO: This restriction matches the check in visitInsertElementInst() and 735 // prevents an infinite loop caused by not turning the extract/insert pair 736 // into a shuffle. We really should not need either check, but we're lacking 737 // folds for shufflevectors because we're afraid to generate shuffle masks 738 // that the backend can't handle. 739 if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back())) 740 return false; 741 742 auto *WideVec = new ShuffleVectorInst(ExtVecOp, ExtendMask); 743 744 // Insert the new shuffle after the vector operand of the extract is defined 745 // (as long as it's not a PHI) or at the start of the basic block of the 746 // extract, so any subsequent extracts in the same basic block can use it. 747 // TODO: Insert before the earliest ExtractElementInst that is replaced. 748 if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) 749 WideVec->insertAfter(ExtVecOpInst); 750 else 751 IC.InsertNewInstWith(WideVec, ExtElt->getParent()->getFirstInsertionPt()); 752 753 // Replace extracts from the original narrow vector with extracts from the new 754 // wide vector. 755 for (User *U : ExtVecOp->users()) { 756 ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U); 757 if (!OldExt || OldExt->getParent() != WideVec->getParent()) 758 continue; 759 auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); 760 IC.InsertNewInstWith(NewExt, OldExt->getIterator()); 761 IC.replaceInstUsesWith(*OldExt, NewExt); 762 // Add the old extracts to the worklist for DCE. We can't remove the 763 // extracts directly, because they may still be used by the calling code. 764 IC.addToWorklist(OldExt); 765 } 766 767 return true; 768 } 769 770 /// We are building a shuffle to create V, which is a sequence of insertelement, 771 /// extractelement pairs. If PermittedRHS is set, then we must either use it or 772 /// not rely on the second vector source. Return a std::pair containing the 773 /// left and right vectors of the proposed shuffle (or 0), and set the Mask 774 /// parameter as required. 775 /// 776 /// Note: we intentionally don't try to fold earlier shuffles since they have 777 /// often been chosen carefully to be efficiently implementable on the target. 778 using ShuffleOps = std::pair<Value *, Value *>; 779 780 static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask, 781 Value *PermittedRHS, 782 InstCombinerImpl &IC, bool &Rerun) { 783 assert(V->getType()->isVectorTy() && "Invalid shuffle!"); 784 unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements(); 785 786 if (match(V, m_Poison())) { 787 Mask.assign(NumElts, -1); 788 return std::make_pair( 789 PermittedRHS ? PoisonValue::get(PermittedRHS->getType()) : V, nullptr); 790 } 791 792 if (isa<ConstantAggregateZero>(V)) { 793 Mask.assign(NumElts, 0); 794 return std::make_pair(V, nullptr); 795 } 796 797 if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { 798 // If this is an insert of an extract from some other vector, include it. 
799 Value *VecOp = IEI->getOperand(0); 800 Value *ScalarOp = IEI->getOperand(1); 801 Value *IdxOp = IEI->getOperand(2); 802 803 if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { 804 if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) { 805 unsigned ExtractedIdx = 806 cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); 807 unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); 808 809 // Either the extracted from or inserted into vector must be RHSVec, 810 // otherwise we'd end up with a shuffle of three inputs. 811 if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { 812 Value *RHS = EI->getOperand(0); 813 ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC, Rerun); 814 assert(LR.second == nullptr || LR.second == RHS); 815 816 if (LR.first->getType() != RHS->getType()) { 817 // Although we are giving up for now, see if we can create extracts 818 // that match the inserts for another round of combining. 819 if (replaceExtractElements(IEI, EI, IC)) 820 Rerun = true; 821 822 // We tried our best, but we can't find anything compatible with RHS 823 // further up the chain. Return a trivial shuffle. 824 for (unsigned i = 0; i < NumElts; ++i) 825 Mask[i] = i; 826 return std::make_pair(V, nullptr); 827 } 828 829 unsigned NumLHSElts = 830 cast<FixedVectorType>(RHS->getType())->getNumElements(); 831 Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx; 832 return std::make_pair(LR.first, RHS); 833 } 834 835 if (VecOp == PermittedRHS) { 836 // We've gone as far as we can: anything on the other side of the 837 // extractelement will already have been converted into a shuffle. 838 unsigned NumLHSElts = 839 cast<FixedVectorType>(EI->getOperand(0)->getType()) 840 ->getNumElements(); 841 for (unsigned i = 0; i != NumElts; ++i) 842 Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i); 843 return std::make_pair(EI->getOperand(0), PermittedRHS); 844 } 845 846 // If this insertelement is a chain that comes from exactly these two 847 // vectors, return the vector and the effective shuffle. 848 if (EI->getOperand(0)->getType() == PermittedRHS->getType() && 849 collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, 850 Mask)) 851 return std::make_pair(EI->getOperand(0), PermittedRHS); 852 } 853 } 854 } 855 856 // Otherwise, we can't do anything fancy. Return an identity vector. 857 for (unsigned i = 0; i != NumElts; ++i) 858 Mask.push_back(i); 859 return std::make_pair(V, nullptr); 860 } 861 862 /// Look for chain of insertvalue's that fully define an aggregate, and trace 863 /// back the values inserted, see if they are all were extractvalue'd from 864 /// the same source aggregate from the exact same element indexes. 865 /// If they were, just reuse the source aggregate. 866 /// This potentially deals with PHI indirections. 867 Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse( 868 InsertValueInst &OrigIVI) { 869 Type *AggTy = OrigIVI.getType(); 870 unsigned NumAggElts; 871 switch (AggTy->getTypeID()) { 872 case Type::StructTyID: 873 NumAggElts = AggTy->getStructNumElements(); 874 break; 875 case Type::ArrayTyID: 876 NumAggElts = AggTy->getArrayNumElements(); 877 break; 878 default: 879 llvm_unreachable("Unhandled aggregate type?"); 880 } 881 882 // Arbitrary aggregate size cut-off. Motivation for limit of 2 is to be able 883 // to handle clang C++ exception struct (which is hardcoded as {i8*, i32}), 884 // FIXME: any interesting patterns to be caught with larger limit? 
885 assert(NumAggElts > 0 && "Aggregate should have elements."); 886 if (NumAggElts > 2) 887 return nullptr; 888 889 static constexpr auto NotFound = std::nullopt; 890 static constexpr auto FoundMismatch = nullptr; 891 892 // Try to find a value of each element of an aggregate. 893 // FIXME: deal with more complex, not one-dimensional, aggregate types 894 SmallVector<std::optional<Instruction *>, 2> AggElts(NumAggElts, NotFound); 895 896 // Do we know values for each element of the aggregate? 897 auto KnowAllElts = [&AggElts]() { 898 return !llvm::is_contained(AggElts, NotFound); 899 }; 900 901 int Depth = 0; 902 903 // Arbitrary `insertvalue` visitation depth limit. Let's be okay with 904 // every element being overwritten twice, which should never happen. 905 static const int DepthLimit = 2 * NumAggElts; 906 907 // Recurse up the chain of `insertvalue` aggregate operands until either we've 908 // reconstructed full initializer or can't visit any more `insertvalue`'s. 909 for (InsertValueInst *CurrIVI = &OrigIVI; 910 Depth < DepthLimit && CurrIVI && !KnowAllElts(); 911 CurrIVI = dyn_cast<InsertValueInst>(CurrIVI->getAggregateOperand()), 912 ++Depth) { 913 auto *InsertedValue = 914 dyn_cast<Instruction>(CurrIVI->getInsertedValueOperand()); 915 if (!InsertedValue) 916 return nullptr; // Inserted value must be produced by an instruction. 917 918 ArrayRef<unsigned int> Indices = CurrIVI->getIndices(); 919 920 // Don't bother with more than single-level aggregates. 921 if (Indices.size() != 1) 922 return nullptr; // FIXME: deal with more complex aggregates? 923 924 // Now, we may have already previously recorded the value for this element 925 // of an aggregate. If we did, that means the CurrIVI will later be 926 // overwritten with the already-recorded value. But if not, let's record it! 927 std::optional<Instruction *> &Elt = AggElts[Indices.front()]; 928 Elt = Elt.value_or(InsertedValue); 929 930 // FIXME: should we handle chain-terminating undef base operand? 931 } 932 933 // Was that sufficient to deduce the full initializer for the aggregate? 934 if (!KnowAllElts()) 935 return nullptr; // Give up then. 936 937 // We now want to find the source[s] of the aggregate elements we've found. 938 // And with "source" we mean the original aggregate[s] from which 939 // the inserted elements were extracted. This may require PHI translation. 940 941 enum class AggregateDescription { 942 /// When analyzing the value that was inserted into an aggregate, we did 943 /// not manage to find defining `extractvalue` instruction to analyze. 944 NotFound, 945 /// When analyzing the value that was inserted into an aggregate, we did 946 /// manage to find defining `extractvalue` instruction[s], and everything 947 /// matched perfectly - aggregate type, element insertion/extraction index. 948 Found, 949 /// When analyzing the value that was inserted into an aggregate, we did 950 /// manage to find defining `extractvalue` instruction, but there was 951 /// a mismatch: either the source type from which the extraction was didn't 952 /// match the aggregate type into which the insertion was, 953 /// or the extraction/insertion channels mismatched, 954 /// or different elements had different source aggregates. 
955 FoundMismatch 956 }; 957 auto Describe = [](std::optional<Value *> SourceAggregate) { 958 if (SourceAggregate == NotFound) 959 return AggregateDescription::NotFound; 960 if (*SourceAggregate == FoundMismatch) 961 return AggregateDescription::FoundMismatch; 962 return AggregateDescription::Found; 963 }; 964 965 // Given the value \p Elt that was being inserted into element \p EltIdx of an 966 // aggregate AggTy, see if \p Elt was originally defined by an 967 // appropriate extractvalue (same element index, same aggregate type). 968 // If found, return the source aggregate from which the extraction was. 969 // If \p PredBB is provided, does PHI translation of an \p Elt first. 970 auto FindSourceAggregate = 971 [&](Instruction *Elt, unsigned EltIdx, std::optional<BasicBlock *> UseBB, 972 std::optional<BasicBlock *> PredBB) -> std::optional<Value *> { 973 // For now(?), only deal with, at most, a single level of PHI indirection. 974 if (UseBB && PredBB) 975 Elt = dyn_cast<Instruction>(Elt->DoPHITranslation(*UseBB, *PredBB)); 976 // FIXME: deal with multiple levels of PHI indirection? 977 978 // Did we find an extraction? 979 auto *EVI = dyn_cast_or_null<ExtractValueInst>(Elt); 980 if (!EVI) 981 return NotFound; 982 983 Value *SourceAggregate = EVI->getAggregateOperand(); 984 985 // Is the extraction from the same type into which the insertion was? 986 if (SourceAggregate->getType() != AggTy) 987 return FoundMismatch; 988 // And the element index doesn't change between extraction and insertion? 989 if (EVI->getNumIndices() != 1 || EltIdx != EVI->getIndices().front()) 990 return FoundMismatch; 991 992 return SourceAggregate; // AggregateDescription::Found 993 }; 994 995 // Given elements AggElts that were constructing an aggregate OrigIVI, 996 // see if we can find appropriate source aggregate for each of the elements, 997 // and see it's the same aggregate for each element. If so, return it. 998 auto FindCommonSourceAggregate = 999 [&](std::optional<BasicBlock *> UseBB, 1000 std::optional<BasicBlock *> PredBB) -> std::optional<Value *> { 1001 std::optional<Value *> SourceAggregate; 1002 1003 for (auto I : enumerate(AggElts)) { 1004 assert(Describe(SourceAggregate) != AggregateDescription::FoundMismatch && 1005 "We don't store nullptr in SourceAggregate!"); 1006 assert((Describe(SourceAggregate) == AggregateDescription::Found) == 1007 (I.index() != 0) && 1008 "SourceAggregate should be valid after the first element,"); 1009 1010 // For this element, is there a plausible source aggregate? 1011 // FIXME: we could special-case undef element, IFF we know that in the 1012 // source aggregate said element isn't poison. 1013 std::optional<Value *> SourceAggregateForElement = 1014 FindSourceAggregate(*I.value(), I.index(), UseBB, PredBB); 1015 1016 // Okay, what have we found? Does that correlate with previous findings? 1017 1018 // Regardless of whether or not we have previously found source 1019 // aggregate for previous elements (if any), if we didn't find one for 1020 // this element, passthrough whatever we have just found. 1021 if (Describe(SourceAggregateForElement) != AggregateDescription::Found) 1022 return SourceAggregateForElement; 1023 1024 // Okay, we have found source aggregate for this element. 1025 // Let's see what we already know from previous elements, if any. 1026 switch (Describe(SourceAggregate)) { 1027 case AggregateDescription::NotFound: 1028 // This is apparently the first element that we have examined. 
1029 SourceAggregate = SourceAggregateForElement; // Record the aggregate! 1030 continue; // Great, now look at next element. 1031 case AggregateDescription::Found: 1032 // We have previously already successfully examined other elements. 1033 // Is this the same source aggregate we've found for other elements? 1034 if (*SourceAggregateForElement != *SourceAggregate) 1035 return FoundMismatch; 1036 continue; // Still the same aggregate, look at next element. 1037 case AggregateDescription::FoundMismatch: 1038 llvm_unreachable("Can't happen. We would have early-exited then."); 1039 }; 1040 } 1041 1042 assert(Describe(SourceAggregate) == AggregateDescription::Found && 1043 "Must be a valid Value"); 1044 return *SourceAggregate; 1045 }; 1046 1047 std::optional<Value *> SourceAggregate; 1048 1049 // Can we find the source aggregate without looking at predecessors? 1050 SourceAggregate = FindCommonSourceAggregate(/*UseBB=*/std::nullopt, 1051 /*PredBB=*/std::nullopt); 1052 if (Describe(SourceAggregate) != AggregateDescription::NotFound) { 1053 if (Describe(SourceAggregate) == AggregateDescription::FoundMismatch) 1054 return nullptr; // Conflicting source aggregates! 1055 ++NumAggregateReconstructionsSimplified; 1056 return replaceInstUsesWith(OrigIVI, *SourceAggregate); 1057 } 1058 1059 // Okay, apparently we need to look at predecessors. 1060 1061 // We should be smart about picking the "use" basic block, which will be the 1062 // merge point for aggregate, where we'll insert the final PHI that will be 1063 // used instead of OrigIVI. Basic block of OrigIVI is *not* the right choice. 1064 // We should look in which blocks each of the AggElts is being defined, 1065 // they all should be defined in the same basic block. 1066 BasicBlock *UseBB = nullptr; 1067 1068 for (const std::optional<Instruction *> &I : AggElts) { 1069 BasicBlock *BB = (*I)->getParent(); 1070 // If it's the first instruction we've encountered, record the basic block. 1071 if (!UseBB) { 1072 UseBB = BB; 1073 continue; 1074 } 1075 // Otherwise, this must be the same basic block we've seen previously. 1076 if (UseBB != BB) 1077 return nullptr; 1078 } 1079 1080 // If *all* of the elements are basic-block-independent, meaning they are 1081 // either function arguments, or constant expressions, then if we didn't 1082 // handle them without predecessor-aware handling, we won't handle them now. 1083 if (!UseBB) 1084 return nullptr; 1085 1086 // If we didn't manage to find source aggregate without looking at 1087 // predecessors, and there are no predecessors to look at, then we're done. 1088 if (pred_empty(UseBB)) 1089 return nullptr; 1090 1091 // Arbitrary predecessor count limit. 1092 static const int PredCountLimit = 64; 1093 1094 // Cache the (non-uniqified!) list of predecessors in a vector, 1095 // checking the limit at the same time for efficiency. 1096 SmallVector<BasicBlock *, 4> Preds; // May have duplicates! 1097 for (BasicBlock *Pred : predecessors(UseBB)) { 1098 // Don't bother if there are too many predecessors. 1099 if (Preds.size() >= PredCountLimit) // FIXME: only count duplicates once? 1100 return nullptr; 1101 Preds.emplace_back(Pred); 1102 } 1103 1104 // For each predecessor, what is the source aggregate, 1105 // from which all the elements were originally extracted from? 1106 // Note that we want for the map to have stable iteration order! 
1107 SmallDenseMap<BasicBlock *, Value *, 4> SourceAggregates; 1108 for (BasicBlock *Pred : Preds) { 1109 std::pair<decltype(SourceAggregates)::iterator, bool> IV = 1110 SourceAggregates.insert({Pred, nullptr}); 1111 // Did we already evaluate this predecessor? 1112 if (!IV.second) 1113 continue; 1114 1115 // Let's hope that when coming from predecessor Pred, all elements of the 1116 // aggregate produced by OrigIVI must have been originally extracted from 1117 // the same aggregate. Is that so? Can we find said original aggregate? 1118 SourceAggregate = FindCommonSourceAggregate(UseBB, Pred); 1119 if (Describe(SourceAggregate) != AggregateDescription::Found) 1120 return nullptr; // Give up. 1121 IV.first->second = *SourceAggregate; 1122 } 1123 1124 // All good! Now we just need to thread the source aggregates here. 1125 // Note that we have to insert the new PHI here, ourselves, because we can't 1126 // rely on InstCombinerImpl::run() inserting it into the right basic block. 1127 // Note that the same block can be a predecessor more than once, 1128 // and we need to preserve that invariant for the PHI node. 1129 BuilderTy::InsertPointGuard Guard(Builder); 1130 Builder.SetInsertPoint(UseBB, UseBB->getFirstNonPHIIt()); 1131 auto *PHI = 1132 Builder.CreatePHI(AggTy, Preds.size(), OrigIVI.getName() + ".merged"); 1133 for (BasicBlock *Pred : Preds) 1134 PHI->addIncoming(SourceAggregates[Pred], Pred); 1135 1136 ++NumAggregateReconstructionsSimplified; 1137 return replaceInstUsesWith(OrigIVI, PHI); 1138 } 1139 1140 /// Try to find redundant insertvalue instructions, like the following ones: 1141 /// %0 = insertvalue { i8, i32 } undef, i8 %x, 0 1142 /// %1 = insertvalue { i8, i32 } %0, i8 %y, 0 1143 /// Here the second instruction inserts values at the same indices, as the 1144 /// first one, making the first one redundant. 1145 /// It should be transformed to: 1146 /// %0 = insertvalue { i8, i32 } undef, i8 %y, 0 1147 Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) { 1148 if (Value *V = simplifyInsertValueInst( 1149 I.getAggregateOperand(), I.getInsertedValueOperand(), I.getIndices(), 1150 SQ.getWithInstruction(&I))) 1151 return replaceInstUsesWith(I, V); 1152 1153 bool IsRedundant = false; 1154 ArrayRef<unsigned int> FirstIndices = I.getIndices(); 1155 1156 // If there is a chain of insertvalue instructions (each of them except the 1157 // last one has only one use and it's another insertvalue insn from this 1158 // chain), check if any of the 'children' uses the same indices as the first 1159 // instruction. In this case, the first one is redundant. 1160 Value *V = &I; 1161 unsigned Depth = 0; 1162 while (V->hasOneUse() && Depth < 10) { 1163 User *U = V->user_back(); 1164 auto UserInsInst = dyn_cast<InsertValueInst>(U); 1165 if (!UserInsInst || U->getOperand(0) != V) 1166 break; 1167 if (UserInsInst->getIndices() == FirstIndices) { 1168 IsRedundant = true; 1169 break; 1170 } 1171 V = UserInsInst; 1172 Depth++; 1173 } 1174 1175 if (IsRedundant) 1176 return replaceInstUsesWith(I, I.getOperand(0)); 1177 1178 if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I)) 1179 return NewI; 1180 1181 return nullptr; 1182 } 1183 1184 static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) { 1185 // Can not analyze scalable type, the number of elements is not a compile-time 1186 // constant. 
1187 if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType())) 1188 return false; 1189 1190 int MaskSize = Shuf.getShuffleMask().size(); 1191 int VecSize = 1192 cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements(); 1193 1194 // A vector select does not change the size of the operands. 1195 if (MaskSize != VecSize) 1196 return false; 1197 1198 // Each mask element must be undefined or choose a vector element from one of 1199 // the source operands without crossing vector lanes. 1200 for (int i = 0; i != MaskSize; ++i) { 1201 int Elt = Shuf.getMaskValue(i); 1202 if (Elt != -1 && Elt != i && Elt != i + VecSize) 1203 return false; 1204 } 1205 1206 return true; 1207 } 1208 1209 /// Turn a chain of inserts that splats a value into an insert + shuffle: 1210 /// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... -> 1211 /// shufflevector(insertelt(X, %k, 0), poison, zero) 1212 static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) { 1213 // We are interested in the last insert in a chain. So if this insert has a 1214 // single user and that user is an insert, bail. 1215 if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back())) 1216 return nullptr; 1217 1218 VectorType *VecTy = InsElt.getType(); 1219 // Can not handle scalable type, the number of elements is not a compile-time 1220 // constant. 1221 if (isa<ScalableVectorType>(VecTy)) 1222 return nullptr; 1223 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); 1224 1225 // Do not try to do this for a one-element vector, since that's a nop, 1226 // and will cause an inf-loop. 1227 if (NumElements == 1) 1228 return nullptr; 1229 1230 Value *SplatVal = InsElt.getOperand(1); 1231 InsertElementInst *CurrIE = &InsElt; 1232 SmallBitVector ElementPresent(NumElements, false); 1233 InsertElementInst *FirstIE = nullptr; 1234 1235 // Walk the chain backwards, keeping track of which indices we inserted into, 1236 // until we hit something that isn't an insert of the splatted value. 1237 while (CurrIE) { 1238 auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2)); 1239 if (!Idx || CurrIE->getOperand(1) != SplatVal) 1240 return nullptr; 1241 1242 auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0)); 1243 // Check none of the intermediate steps have any additional uses, except 1244 // for the root insertelement instruction, which can be re-used, if it 1245 // inserts at position 0. 1246 if (CurrIE != &InsElt && 1247 (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero()))) 1248 return nullptr; 1249 1250 ElementPresent[Idx->getZExtValue()] = true; 1251 FirstIE = CurrIE; 1252 CurrIE = NextIE; 1253 } 1254 1255 // If this is just a single insertelement (not a sequence), we are done. 1256 if (FirstIE == &InsElt) 1257 return nullptr; 1258 1259 // If we are not inserting into a poison vector, make sure we've seen an 1260 // insert into every element. 1261 // TODO: If the base vector is not undef, it might be better to create a splat 1262 // and then a select-shuffle (blend) with the base vector. 1263 if (!match(FirstIE->getOperand(0), m_Poison())) 1264 if (!ElementPresent.all()) 1265 return nullptr; 1266 1267 // Create the insert + shuffle. 
1268 Type *Int64Ty = Type::getInt64Ty(InsElt.getContext()); 1269 PoisonValue *PoisonVec = PoisonValue::get(VecTy); 1270 Constant *Zero = ConstantInt::get(Int64Ty, 0); 1271 if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero()) 1272 FirstIE = InsertElementInst::Create(PoisonVec, SplatVal, Zero, "", 1273 InsElt.getIterator()); 1274 1275 // Splat from element 0, but replace absent elements with poison in the mask. 1276 SmallVector<int, 16> Mask(NumElements, 0); 1277 for (unsigned i = 0; i != NumElements; ++i) 1278 if (!ElementPresent[i]) 1279 Mask[i] = -1; 1280 1281 return new ShuffleVectorInst(FirstIE, Mask); 1282 } 1283 1284 /// Try to fold an insert element into an existing splat shuffle by changing 1285 /// the shuffle's mask to include the index of this insert element. 1286 static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) { 1287 // Check if the vector operand of this insert is a canonical splat shuffle. 1288 auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); 1289 if (!Shuf || !Shuf->isZeroEltSplat()) 1290 return nullptr; 1291 1292 // Bail out early if shuffle is scalable type. The number of elements in 1293 // shuffle mask is unknown at compile-time. 1294 if (isa<ScalableVectorType>(Shuf->getType())) 1295 return nullptr; 1296 1297 // Check for a constant insertion index. 1298 uint64_t IdxC; 1299 if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) 1300 return nullptr; 1301 1302 // Check if the splat shuffle's input is the same as this insert's scalar op. 1303 Value *X = InsElt.getOperand(1); 1304 Value *Op0 = Shuf->getOperand(0); 1305 if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt()))) 1306 return nullptr; 1307 1308 // Replace the shuffle mask element at the index of this insert with a zero. 1309 // For example: 1310 // inselt (shuf (inselt undef, X, 0), _, <0,undef,0,undef>), X, 1 1311 // --> shuf (inselt undef, X, 0), poison, <0,0,0,undef> 1312 unsigned NumMaskElts = 1313 cast<FixedVectorType>(Shuf->getType())->getNumElements(); 1314 SmallVector<int, 16> NewMask(NumMaskElts); 1315 for (unsigned i = 0; i != NumMaskElts; ++i) 1316 NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i); 1317 1318 return new ShuffleVectorInst(Op0, NewMask); 1319 } 1320 1321 /// Try to fold an extract+insert element into an existing identity shuffle by 1322 /// changing the shuffle's mask to include the index of this insert element. 1323 static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) { 1324 // Check if the vector operand of this insert is an identity shuffle. 1325 auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); 1326 if (!Shuf || !match(Shuf->getOperand(1), m_Poison()) || 1327 !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding())) 1328 return nullptr; 1329 1330 // Bail out early if shuffle is scalable type. The number of elements in 1331 // shuffle mask is unknown at compile-time. 1332 if (isa<ScalableVectorType>(Shuf->getType())) 1333 return nullptr; 1334 1335 // Check for a constant insertion index. 1336 uint64_t IdxC; 1337 if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) 1338 return nullptr; 1339 1340 // Check if this insert's scalar op is extracted from the identity shuffle's 1341 // input vector. 1342 Value *Scalar = InsElt.getOperand(1); 1343 Value *X = Shuf->getOperand(0); 1344 if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC)))) 1345 return nullptr; 1346 1347 // Replace the shuffle mask element at the index of this extract+insert with 1348 // that same index value. 
1349 // For example: 1350 // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask' 1351 unsigned NumMaskElts = 1352 cast<FixedVectorType>(Shuf->getType())->getNumElements(); 1353 SmallVector<int, 16> NewMask(NumMaskElts); 1354 ArrayRef<int> OldMask = Shuf->getShuffleMask(); 1355 for (unsigned i = 0; i != NumMaskElts; ++i) { 1356 if (i != IdxC) { 1357 // All mask elements besides the inserted element remain the same. 1358 NewMask[i] = OldMask[i]; 1359 } else if (OldMask[i] == (int)IdxC) { 1360 // If the mask element was already set, there's nothing to do 1361 // (demanded elements analysis may unset it later). 1362 return nullptr; 1363 } else { 1364 assert(OldMask[i] == PoisonMaskElem && 1365 "Unexpected shuffle mask element for identity shuffle"); 1366 NewMask[i] = IdxC; 1367 } 1368 } 1369 1370 return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask); 1371 } 1372 1373 /// If we have an insertelement instruction feeding into another insertelement 1374 /// and the 2nd is inserting a constant into the vector, canonicalize that 1375 /// constant insertion before the insertion of a variable: 1376 /// 1377 /// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 --> 1378 /// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1 1379 /// 1380 /// This has the potential of eliminating the 2nd insertelement instruction 1381 /// via constant folding of the scalar constant into a vector constant. 1382 static Instruction *hoistInsEltConst(InsertElementInst &InsElt2, 1383 InstCombiner::BuilderTy &Builder) { 1384 auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0)); 1385 if (!InsElt1 || !InsElt1->hasOneUse()) 1386 return nullptr; 1387 1388 Value *X, *Y; 1389 Constant *ScalarC; 1390 ConstantInt *IdxC1, *IdxC2; 1391 if (match(InsElt1->getOperand(0), m_Value(X)) && 1392 match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) && 1393 match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) && 1394 match(InsElt2.getOperand(1), m_Constant(ScalarC)) && 1395 match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) { 1396 Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2); 1397 return InsertElementInst::Create(NewInsElt1, Y, IdxC1); 1398 } 1399 1400 return nullptr; 1401 } 1402 1403 /// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex 1404 /// --> shufflevector X, CVec', Mask' 1405 static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) { 1406 auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0)); 1407 // Bail out if the parent has more than one use. In that case, we'd be 1408 // replacing the insertelt with a shuffle, and that's not a clear win. 1409 if (!Inst || !Inst->hasOneUse()) 1410 return nullptr; 1411 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) { 1412 // The shuffle must have a constant vector operand. The insertelt must have 1413 // a constant scalar being inserted at a constant position in the vector. 1414 Constant *ShufConstVec, *InsEltScalar; 1415 uint64_t InsEltIndex; 1416 if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) || 1417 !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) || 1418 !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex))) 1419 return nullptr; 1420 1421 // Adding an element to an arbitrary shuffle could be expensive, but a 1422 // shuffle that selects elements from vectors without crossing lanes is 1423 // assumed cheap. 1424 // If we're just adding a constant into that shuffle, it will still be 1425 // cheap. 
1426 if (!isShuffleEquivalentToSelect(*Shuf)) 1427 return nullptr; 1428 1429 // From the above 'select' check, we know that the mask has the same number 1430 // of elements as the vector input operands. We also know that each constant 1431 // input element is used in its lane and can not be used more than once by 1432 // the shuffle. Therefore, replace the constant in the shuffle's constant 1433 // vector with the insertelt constant. Replace the constant in the shuffle's 1434 // mask vector with the insertelt index plus the length of the vector 1435 // (because the constant vector operand of a shuffle is always the 2nd 1436 // operand). 1437 ArrayRef<int> Mask = Shuf->getShuffleMask(); 1438 unsigned NumElts = Mask.size(); 1439 SmallVector<Constant *, 16> NewShufElts(NumElts); 1440 SmallVector<int, 16> NewMaskElts(NumElts); 1441 for (unsigned I = 0; I != NumElts; ++I) { 1442 if (I == InsEltIndex) { 1443 NewShufElts[I] = InsEltScalar; 1444 NewMaskElts[I] = InsEltIndex + NumElts; 1445 } else { 1446 // Copy over the existing values. 1447 NewShufElts[I] = ShufConstVec->getAggregateElement(I); 1448 NewMaskElts[I] = Mask[I]; 1449 } 1450 1451 // Bail if we failed to find an element. 1452 if (!NewShufElts[I]) 1453 return nullptr; 1454 } 1455 1456 // Create new operands for a shuffle that includes the constant of the 1457 // original insertelt. The old shuffle will be dead now. 1458 return new ShuffleVectorInst(Shuf->getOperand(0), 1459 ConstantVector::get(NewShufElts), NewMaskElts); 1460 } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) { 1461 // Transform sequences of insertelements ops with constant data/indexes into 1462 // a single shuffle op. 1463 // Can not handle scalable type, the number of elements needed to create 1464 // shuffle mask is not a compile-time constant. 1465 if (isa<ScalableVectorType>(InsElt.getType())) 1466 return nullptr; 1467 unsigned NumElts = 1468 cast<FixedVectorType>(InsElt.getType())->getNumElements(); 1469 1470 uint64_t InsertIdx[2]; 1471 Constant *Val[2]; 1472 if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) || 1473 !match(InsElt.getOperand(1), m_Constant(Val[0])) || 1474 !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) || 1475 !match(IEI->getOperand(1), m_Constant(Val[1]))) 1476 return nullptr; 1477 SmallVector<Constant *, 16> Values(NumElts); 1478 SmallVector<int, 16> Mask(NumElts); 1479 auto ValI = std::begin(Val); 1480 // Generate new constant vector and mask. 1481 // We have 2 values/masks from the insertelements instructions. Insert them 1482 // into new value/mask vectors. 1483 for (uint64_t I : InsertIdx) { 1484 if (!Values[I]) { 1485 Values[I] = *ValI; 1486 Mask[I] = NumElts + I; 1487 } 1488 ++ValI; 1489 } 1490 // Remaining values are filled with 'poison' values. 1491 for (unsigned I = 0; I < NumElts; ++I) { 1492 if (!Values[I]) { 1493 Values[I] = PoisonValue::get(InsElt.getType()->getElementType()); 1494 Mask[I] = I; 1495 } 1496 } 1497 // Create new operands for a shuffle that includes the constant of the 1498 // original insertelt. 1499 return new ShuffleVectorInst(IEI->getOperand(0), 1500 ConstantVector::get(Values), Mask); 1501 } 1502 return nullptr; 1503 } 1504 1505 /// If both the base vector and the inserted element are extended from the same 1506 /// type, do the insert element in the narrow source type followed by extend. 1507 /// TODO: This can be extended to include other cast opcodes, but particularly 1508 /// if we create a wider insertelement, make sure codegen is not harmed. 
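/// For example (a sketch with illustrative types):
/// inselt (zext <4 x i8> X to <4 x i32>), (zext i8 Y to i32), Index
///   --> zext (inselt <4 x i8> X, i8 Y, Index) to <4 x i32>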
1509 static Instruction *narrowInsElt(InsertElementInst &InsElt,
1510 InstCombiner::BuilderTy &Builder) {
1511 // We are creating a vector extend. If the original vector extend has another
1512 // use, that would mean we end up with 2 vector extends, so avoid that.
1513 // TODO: We could ease the one-use restriction to "if at least one op has one use"
1514 // (assuming that the source types match - see next TODO comment).
1515 Value *Vec = InsElt.getOperand(0);
1516 if (!Vec->hasOneUse())
1517 return nullptr;
1518
1519 Value *Scalar = InsElt.getOperand(1);
1520 Value *X, *Y;
1521 CastInst::CastOps CastOpcode;
1522 if (match(Vec, m_FPExt(m_Value(X))) && match(Scalar, m_FPExt(m_Value(Y))))
1523 CastOpcode = Instruction::FPExt;
1524 else if (match(Vec, m_SExt(m_Value(X))) && match(Scalar, m_SExt(m_Value(Y))))
1525 CastOpcode = Instruction::SExt;
1526 else if (match(Vec, m_ZExt(m_Value(X))) && match(Scalar, m_ZExt(m_Value(Y))))
1527 CastOpcode = Instruction::ZExt;
1528 else
1529 return nullptr;
1530
1531 // TODO: We can allow mismatched types by creating an intermediate cast.
1532 if (X->getType()->getScalarType() != Y->getType())
1533 return nullptr;
1534
1535 // inselt (ext X), (ext Y), Index --> ext (inselt X, Y, Index)
1536 Value *NewInsElt = Builder.CreateInsertElement(X, Y, InsElt.getOperand(2));
1537 return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
1538 }
1539
1540 /// If we are inserting 2 halves of a value into adjacent elements of a vector,
1541 /// try to convert to a single insert with appropriate bitcasts.
1542 static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
1543 bool IsBigEndian,
1544 InstCombiner::BuilderTy &Builder) {
1545 Value *VecOp = InsElt.getOperand(0);
1546 Value *ScalarOp = InsElt.getOperand(1);
1547 Value *IndexOp = InsElt.getOperand(2);
1548
1549 // The pattern depends on endianness: we expect the lower index to be inserted first.
1550 // Big endian:
1551 // inselt (inselt BaseVec, (trunc (lshr X, BW/2)), Index0), (trunc X), Index1
1552 // Little endian:
1553 // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
1554 // Note: It is not safe to do this transform with an arbitrary base vector
1555 // because the bitcast of that vector to fewer/larger elements could
1556 // allow poison to spill into an element that was not poison before.
1557 // TODO: Detect smaller fractions of the scalar.
1558 // TODO: One-use checks are conservative.
1559 auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
1560 Value *Scalar0, *BaseVec;
1561 uint64_t Index0, Index1;
1562 if (!VTy || (VTy->getNumElements() & 1) ||
1563 !match(IndexOp, m_ConstantInt(Index1)) ||
1564 !match(VecOp, m_InsertElt(m_Value(BaseVec), m_Value(Scalar0),
1565 m_ConstantInt(Index0))) ||
1566 !match(BaseVec, m_Undef()))
1567 return nullptr;
1568
1569 // The first insert must be to the index one less than this one, and
1570 // the first insert must be to an even index.
1571 if (Index0 + 1 != Index1 || Index0 & 1)
1572 return nullptr;
1573
1574 // For big endian, the high half of the value should be inserted first.
1575 // For little endian, the low half of the value should be inserted first.
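// For example (little endian, illustrative widths): inserting the two halves of
// an i64 X into a <4 x i32> vector as Scalar0 = trunc X at Index0 = 0 and
// ScalarOp = trunc (lshr X, 32) at Index1 = 1 collapses into a single insert of
// X into a <2 x i64> view of the base vector.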
1576 Value *X; 1577 uint64_t ShAmt; 1578 if (IsBigEndian) { 1579 if (!match(ScalarOp, m_Trunc(m_Value(X))) || 1580 !match(Scalar0, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt))))) 1581 return nullptr; 1582 } else { 1583 if (!match(Scalar0, m_Trunc(m_Value(X))) || 1584 !match(ScalarOp, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt))))) 1585 return nullptr; 1586 } 1587 1588 Type *SrcTy = X->getType(); 1589 unsigned ScalarWidth = SrcTy->getScalarSizeInBits(); 1590 unsigned VecEltWidth = VTy->getScalarSizeInBits(); 1591 if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth) 1592 return nullptr; 1593 1594 // Bitcast the base vector to a vector type with the source element type. 1595 Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2); 1596 Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy); 1597 1598 // Scale the insert index for a vector with half as many elements. 1599 // bitcast (inselt (bitcast BaseVec), X, NewIndex) 1600 uint64_t NewIndex = IsBigEndian ? Index1 / 2 : Index0 / 2; 1601 Value *NewInsert = Builder.CreateInsertElement(CastBaseVec, X, NewIndex); 1602 return new BitCastInst(NewInsert, VTy); 1603 } 1604 1605 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { 1606 Value *VecOp = IE.getOperand(0); 1607 Value *ScalarOp = IE.getOperand(1); 1608 Value *IdxOp = IE.getOperand(2); 1609 1610 if (auto *V = simplifyInsertElementInst( 1611 VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) 1612 return replaceInstUsesWith(IE, V); 1613 1614 // Canonicalize type of constant indices to i64 to simplify CSE 1615 if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) { 1616 if (auto *NewIdx = getPreferredVectorIndex(IndexC)) 1617 return replaceOperand(IE, 2, NewIdx); 1618 1619 Value *BaseVec, *OtherScalar; 1620 uint64_t OtherIndexVal; 1621 if (match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), 1622 m_Value(OtherScalar), 1623 m_ConstantInt(OtherIndexVal)))) && 1624 !isa<Constant>(OtherScalar) && OtherIndexVal > IndexC->getZExtValue()) { 1625 Value *NewIns = Builder.CreateInsertElement(BaseVec, ScalarOp, IdxOp); 1626 return InsertElementInst::Create(NewIns, OtherScalar, 1627 Builder.getInt64(OtherIndexVal)); 1628 } 1629 } 1630 1631 // If the scalar is bitcast and inserted into undef, do the insert in the 1632 // source type followed by bitcast. 1633 // TODO: Generalize for insert into any constant, not just undef? 1634 Value *ScalarSrc; 1635 if (match(VecOp, m_Undef()) && 1636 match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) && 1637 (ScalarSrc->getType()->isIntegerTy() || 1638 ScalarSrc->getType()->isFloatingPointTy())) { 1639 // inselt undef, (bitcast ScalarSrc), IdxOp --> 1640 // bitcast (inselt undef, ScalarSrc, IdxOp) 1641 Type *ScalarTy = ScalarSrc->getType(); 1642 Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount()); 1643 Constant *NewUndef = isa<PoisonValue>(VecOp) ? PoisonValue::get(VecTy) 1644 : UndefValue::get(VecTy); 1645 Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp); 1646 return new BitCastInst(NewInsElt, IE.getType()); 1647 } 1648 1649 // If the vector and scalar are both bitcast from the same element type, do 1650 // the insert in that source type followed by bitcast. 
1651 Value *VecSrc; 1652 if (match(VecOp, m_BitCast(m_Value(VecSrc))) && 1653 match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) && 1654 (VecOp->hasOneUse() || ScalarOp->hasOneUse()) && 1655 VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() && 1656 cast<VectorType>(VecSrc->getType())->getElementType() == 1657 ScalarSrc->getType()) { 1658 // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp --> 1659 // bitcast (inselt VecSrc, ScalarSrc, IdxOp) 1660 Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp); 1661 return new BitCastInst(NewInsElt, IE.getType()); 1662 } 1663 1664 // If the inserted element was extracted from some other fixed-length vector 1665 // and both indexes are valid constants, try to turn this into a shuffle. 1666 // Can not handle scalable vector type, the number of elements needed to 1667 // create shuffle mask is not a compile-time constant. 1668 uint64_t InsertedIdx, ExtractedIdx; 1669 Value *ExtVecOp; 1670 if (isa<FixedVectorType>(IE.getType()) && 1671 match(IdxOp, m_ConstantInt(InsertedIdx)) && 1672 match(ScalarOp, 1673 m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) && 1674 isa<FixedVectorType>(ExtVecOp->getType()) && 1675 ExtractedIdx < 1676 cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) { 1677 // TODO: Looking at the user(s) to determine if this insert is a 1678 // fold-to-shuffle opportunity does not match the usual instcombine 1679 // constraints. We should decide if the transform is worthy based only 1680 // on this instruction and its operands, but that may not work currently. 1681 // 1682 // Here, we are trying to avoid creating shuffles before reaching 1683 // the end of a chain of extract-insert pairs. This is complicated because 1684 // we do not generally form arbitrary shuffle masks in instcombine 1685 // (because those may codegen poorly), but collectShuffleElements() does 1686 // exactly that. 1687 // 1688 // The rules for determining what is an acceptable target-independent 1689 // shuffle mask are fuzzy because they evolve based on the backend's 1690 // capabilities and real-world impact. 1691 auto isShuffleRootCandidate = [](InsertElementInst &Insert) { 1692 if (!Insert.hasOneUse()) 1693 return true; 1694 auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back()); 1695 if (!InsertUser) 1696 return true; 1697 return false; 1698 }; 1699 1700 // Try to form a shuffle from a chain of extract-insert ops. 1701 if (isShuffleRootCandidate(IE)) { 1702 bool Rerun = true; 1703 while (Rerun) { 1704 Rerun = false; 1705 1706 SmallVector<int, 16> Mask; 1707 ShuffleOps LR = 1708 collectShuffleElements(&IE, Mask, nullptr, *this, Rerun); 1709 1710 // The proposed shuffle may be trivial, in which case we shouldn't 1711 // perform the combine. 1712 if (LR.first != &IE && LR.second != &IE) { 1713 // We now have a shuffle of LHS, RHS, Mask. 
1714 if (LR.second == nullptr) 1715 LR.second = PoisonValue::get(LR.first->getType()); 1716 return new ShuffleVectorInst(LR.first, LR.second, Mask); 1717 } 1718 } 1719 } 1720 } 1721 1722 if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) { 1723 unsigned VWidth = VecTy->getNumElements(); 1724 APInt PoisonElts(VWidth, 0); 1725 APInt AllOnesEltMask(APInt::getAllOnes(VWidth)); 1726 if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, 1727 PoisonElts)) { 1728 if (V != &IE) 1729 return replaceInstUsesWith(IE, V); 1730 return &IE; 1731 } 1732 } 1733 1734 if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE)) 1735 return Shuf; 1736 1737 if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder)) 1738 return NewInsElt; 1739 1740 if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE)) 1741 return Broadcast; 1742 1743 if (Instruction *Splat = foldInsEltIntoSplat(IE)) 1744 return Splat; 1745 1746 if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE)) 1747 return IdentityShuf; 1748 1749 if (Instruction *Ext = narrowInsElt(IE, Builder)) 1750 return Ext; 1751 1752 if (Instruction *Ext = foldTruncInsEltPair(IE, DL.isBigEndian(), Builder)) 1753 return Ext; 1754 1755 return nullptr; 1756 } 1757 1758 /// Return true if we can evaluate the specified expression tree if the vector 1759 /// elements were shuffled in a different order. 1760 static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, 1761 unsigned Depth = 5) { 1762 // We can always reorder the elements of a constant. 1763 if (isa<Constant>(V)) 1764 return true; 1765 1766 // We won't reorder vector arguments. No IPO here. 1767 Instruction *I = dyn_cast<Instruction>(V); 1768 if (!I) return false; 1769 1770 // Two users may expect different orders of the elements. Don't try it. 1771 if (!I->hasOneUse()) 1772 return false; 1773 1774 if (Depth == 0) return false; 1775 1776 switch (I->getOpcode()) { 1777 case Instruction::UDiv: 1778 case Instruction::SDiv: 1779 case Instruction::URem: 1780 case Instruction::SRem: 1781 // Propagating an undefined shuffle mask element to integer div/rem is not 1782 // allowed because those opcodes can create immediate undefined behavior 1783 // from an undefined element in an operand. 1784 if (llvm::is_contained(Mask, -1)) 1785 return false; 1786 [[fallthrough]]; 1787 case Instruction::Add: 1788 case Instruction::FAdd: 1789 case Instruction::Sub: 1790 case Instruction::FSub: 1791 case Instruction::Mul: 1792 case Instruction::FMul: 1793 case Instruction::FDiv: 1794 case Instruction::FRem: 1795 case Instruction::Shl: 1796 case Instruction::LShr: 1797 case Instruction::AShr: 1798 case Instruction::And: 1799 case Instruction::Or: 1800 case Instruction::Xor: 1801 case Instruction::ICmp: 1802 case Instruction::FCmp: 1803 case Instruction::Trunc: 1804 case Instruction::ZExt: 1805 case Instruction::SExt: 1806 case Instruction::FPToUI: 1807 case Instruction::FPToSI: 1808 case Instruction::UIToFP: 1809 case Instruction::SIToFP: 1810 case Instruction::FPTrunc: 1811 case Instruction::FPExt: 1812 case Instruction::GetElementPtr: { 1813 // Bail out if we would create longer vector ops. We could allow creating 1814 // longer vector ops, but that may result in more expensive codegen. 
1815 Type *ITy = I->getType(); 1816 if (ITy->isVectorTy() && 1817 Mask.size() > cast<FixedVectorType>(ITy)->getNumElements()) 1818 return false; 1819 for (Value *Operand : I->operands()) { 1820 if (!canEvaluateShuffled(Operand, Mask, Depth - 1)) 1821 return false; 1822 } 1823 return true; 1824 } 1825 case Instruction::InsertElement: { 1826 ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); 1827 if (!CI) return false; 1828 int ElementNumber = CI->getLimitedValue(); 1829 1830 // Verify that 'CI' does not occur twice in Mask. A single 'insertelement' 1831 // can't put an element into multiple indices. 1832 bool SeenOnce = false; 1833 for (int I : Mask) { 1834 if (I == ElementNumber) { 1835 if (SeenOnce) 1836 return false; 1837 SeenOnce = true; 1838 } 1839 } 1840 return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1); 1841 } 1842 } 1843 return false; 1844 } 1845 1846 /// Rebuild a new instruction just like 'I' but with the new operands given. 1847 /// In the event of type mismatch, the type of the operands is correct. 1848 static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps, 1849 IRBuilderBase &Builder) { 1850 Builder.SetInsertPoint(I); 1851 switch (I->getOpcode()) { 1852 case Instruction::Add: 1853 case Instruction::FAdd: 1854 case Instruction::Sub: 1855 case Instruction::FSub: 1856 case Instruction::Mul: 1857 case Instruction::FMul: 1858 case Instruction::UDiv: 1859 case Instruction::SDiv: 1860 case Instruction::FDiv: 1861 case Instruction::URem: 1862 case Instruction::SRem: 1863 case Instruction::FRem: 1864 case Instruction::Shl: 1865 case Instruction::LShr: 1866 case Instruction::AShr: 1867 case Instruction::And: 1868 case Instruction::Or: 1869 case Instruction::Xor: { 1870 BinaryOperator *BO = cast<BinaryOperator>(I); 1871 assert(NewOps.size() == 2 && "binary operator with #ops != 2"); 1872 Value *New = Builder.CreateBinOp(cast<BinaryOperator>(I)->getOpcode(), 1873 NewOps[0], NewOps[1]); 1874 if (auto *NewI = dyn_cast<Instruction>(New)) { 1875 if (isa<OverflowingBinaryOperator>(BO)) { 1876 NewI->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap()); 1877 NewI->setHasNoSignedWrap(BO->hasNoSignedWrap()); 1878 } 1879 if (isa<PossiblyExactOperator>(BO)) { 1880 NewI->setIsExact(BO->isExact()); 1881 } 1882 if (isa<FPMathOperator>(BO)) 1883 NewI->copyFastMathFlags(I); 1884 } 1885 return New; 1886 } 1887 case Instruction::ICmp: 1888 assert(NewOps.size() == 2 && "icmp with #ops != 2"); 1889 return Builder.CreateICmp(cast<ICmpInst>(I)->getPredicate(), NewOps[0], 1890 NewOps[1]); 1891 case Instruction::FCmp: 1892 assert(NewOps.size() == 2 && "fcmp with #ops != 2"); 1893 return Builder.CreateFCmp(cast<FCmpInst>(I)->getPredicate(), NewOps[0], 1894 NewOps[1]); 1895 case Instruction::Trunc: 1896 case Instruction::ZExt: 1897 case Instruction::SExt: 1898 case Instruction::FPToUI: 1899 case Instruction::FPToSI: 1900 case Instruction::UIToFP: 1901 case Instruction::SIToFP: 1902 case Instruction::FPTrunc: 1903 case Instruction::FPExt: { 1904 // It's possible that the mask has a different number of elements from 1905 // the original cast. We recompute the destination type to match the mask. 
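// For example, if the original cast was <4 x i16> -> <4 x i32> but the new
// operand (built to match the mask) has 8 elements, the rebuilt cast produces
// <8 x i32>.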
1906 Type *DestTy = VectorType::get( 1907 I->getType()->getScalarType(), 1908 cast<VectorType>(NewOps[0]->getType())->getElementCount()); 1909 assert(NewOps.size() == 1 && "cast with #ops != 1"); 1910 return Builder.CreateCast(cast<CastInst>(I)->getOpcode(), NewOps[0], 1911 DestTy); 1912 } 1913 case Instruction::GetElementPtr: { 1914 Value *Ptr = NewOps[0]; 1915 ArrayRef<Value*> Idx = NewOps.slice(1); 1916 return Builder.CreateGEP(cast<GEPOperator>(I)->getSourceElementType(), 1917 Ptr, Idx, "", 1918 cast<GEPOperator>(I)->isInBounds()); 1919 } 1920 } 1921 llvm_unreachable("failed to rebuild vector instructions"); 1922 } 1923 1924 static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask, 1925 IRBuilderBase &Builder) { 1926 // Mask.size() does not need to be equal to the number of vector elements. 1927 1928 assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); 1929 Type *EltTy = V->getType()->getScalarType(); 1930 1931 if (isa<PoisonValue>(V)) 1932 return PoisonValue::get(FixedVectorType::get(EltTy, Mask.size())); 1933 1934 if (match(V, m_Undef())) 1935 return UndefValue::get(FixedVectorType::get(EltTy, Mask.size())); 1936 1937 if (isa<ConstantAggregateZero>(V)) 1938 return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size())); 1939 1940 if (Constant *C = dyn_cast<Constant>(V)) 1941 return ConstantExpr::getShuffleVector(C, PoisonValue::get(C->getType()), 1942 Mask); 1943 1944 Instruction *I = cast<Instruction>(V); 1945 switch (I->getOpcode()) { 1946 case Instruction::Add: 1947 case Instruction::FAdd: 1948 case Instruction::Sub: 1949 case Instruction::FSub: 1950 case Instruction::Mul: 1951 case Instruction::FMul: 1952 case Instruction::UDiv: 1953 case Instruction::SDiv: 1954 case Instruction::FDiv: 1955 case Instruction::URem: 1956 case Instruction::SRem: 1957 case Instruction::FRem: 1958 case Instruction::Shl: 1959 case Instruction::LShr: 1960 case Instruction::AShr: 1961 case Instruction::And: 1962 case Instruction::Or: 1963 case Instruction::Xor: 1964 case Instruction::ICmp: 1965 case Instruction::FCmp: 1966 case Instruction::Trunc: 1967 case Instruction::ZExt: 1968 case Instruction::SExt: 1969 case Instruction::FPToUI: 1970 case Instruction::FPToSI: 1971 case Instruction::UIToFP: 1972 case Instruction::SIToFP: 1973 case Instruction::FPTrunc: 1974 case Instruction::FPExt: 1975 case Instruction::Select: 1976 case Instruction::GetElementPtr: { 1977 SmallVector<Value*, 8> NewOps; 1978 bool NeedsRebuild = 1979 (Mask.size() != 1980 cast<FixedVectorType>(I->getType())->getNumElements()); 1981 for (int i = 0, e = I->getNumOperands(); i != e; ++i) { 1982 Value *V; 1983 // Recursively call evaluateInDifferentElementOrder on vector arguments 1984 // as well. E.g. GetElementPtr may have scalar operands even if the 1985 // return value is a vector, so we need to examine the operand type. 1986 if (I->getOperand(i)->getType()->isVectorTy()) 1987 V = evaluateInDifferentElementOrder(I->getOperand(i), Mask, Builder); 1988 else 1989 V = I->getOperand(i); 1990 NewOps.push_back(V); 1991 NeedsRebuild |= (V != I->getOperand(i)); 1992 } 1993 if (NeedsRebuild) 1994 return buildNew(I, NewOps, Builder); 1995 return I; 1996 } 1997 case Instruction::InsertElement: { 1998 int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue(); 1999 2000 // The insertelement was inserting at Element. Figure out which element 2001 // that becomes after shuffling. The answer is guaranteed to be unique 2002 // by CanEvaluateShuffled. 
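// For example, if Element is 2 and Mask is <0, 2, 1>, the inserted value ends
// up at result index 1.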
2003 bool Found = false;
2004 int Index = 0;
2005 for (int e = Mask.size(); Index != e; ++Index) {
2006 if (Mask[Index] == Element) {
2007 Found = true;
2008 break;
2009 }
2010 }
2011
2012 // If element is not in Mask, no need to handle operand 1 (element to
2013 // be inserted). Just evaluate values in operand 0 according to Mask.
2014 if (!Found)
2015 return evaluateInDifferentElementOrder(I->getOperand(0), Mask, Builder);
2016
2017 Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask,
2018 Builder);
2019 Builder.SetInsertPoint(I);
2020 return Builder.CreateInsertElement(V, I->getOperand(1), Index);
2021 }
2022 }
2023 llvm_unreachable("failed to reorder elements of vector instruction!");
2024 }
2025
2026 // Returns true if the shuffle is extracting a contiguous range of values from
2027 // LHS, for example:
2028 //               +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
2029 // Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
2030 // Shuffles to:              |EE|FF|GG|HH|
2031 //                           +--+--+--+--+
2032 static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
2033 ArrayRef<int> Mask) {
2034 unsigned LHSElems =
2035 cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();
2036 unsigned MaskElems = Mask.size();
2037 unsigned BegIdx = Mask.front();
2038 unsigned EndIdx = Mask.back();
2039 if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
2040 return false;
2041 for (unsigned I = 0; I != MaskElems; ++I)
2042 if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
2043 return false;
2044 return true;
2045 }
2046
2047 /// These are the ingredients in an alternate form binary operator as described
2048 /// below.
2049 struct BinopElts {
2050 BinaryOperator::BinaryOps Opcode;
2051 Value *Op0;
2052 Value *Op1;
2053 BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
2054 Value *V0 = nullptr, Value *V1 = nullptr) :
2055 Opcode(Opc), Op0(V0), Op1(V1) {}
2056 operator bool() const { return Opcode != 0; }
2057 };
2058
2059 /// Binops may be transformed into binops with different opcodes and operands.
2060 /// Reverse the usual canonicalization to enable folds with the non-canonical
2061 /// form of the binop. If a transform is possible, return the elements of the
2062 /// new binop. If not, return invalid elements.
2063 static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
2064 Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
2065 Type *Ty = BO->getType();
2066 switch (BO->getOpcode()) {
2067 case Instruction::Shl: {
2068 // shl X, C --> mul X, (1 << C)
2069 Constant *C;
2070 if (match(BO1, m_ImmConstant(C))) {
2071 Constant *ShlOne = ConstantFoldBinaryOpOperands(
2072 Instruction::Shl, ConstantInt::get(Ty, 1), C, DL);
2073 assert(ShlOne && "Constant folding of immediate constants failed");
2074 return {Instruction::Mul, BO0, ShlOne};
2075 }
2076 break;
2077 }
2078 case Instruction::Or: {
2079 // or disjoint X, C --> add X, C
2080 if (cast<PossiblyDisjointInst>(BO)->isDisjoint())
2081 return {Instruction::Add, BO0, BO1};
2082 break;
2083 }
2084 case Instruction::Sub:
2085 // sub 0, X --> mul X, -1
2086 if (match(BO0, m_ZeroInt()))
2087 return {Instruction::Mul, BO1, ConstantInt::getAllOnesValue(Ty)};
2088 break;
2089 default:
2090 break;
2091 }
2092 return {};
2093 }
2094
2095 /// A select shuffle of a select shuffle with a shared operand can be reduced
2096 /// to a single select shuffle. This is an obvious improvement in IR, and the
2097 /// backend is expected to lower select shuffles efficiently.
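/// For example (illustrative masks on 4-element vectors):
/// shuf X, (shuf X, Y, <4, 5, 2, 3>), <0, 5, 2, 7> --> shuf X, Y, <0, 5, 2, 3>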
2098 static Instruction *foldSelectShuffleOfSelectShuffle(ShuffleVectorInst &Shuf) { 2099 assert(Shuf.isSelect() && "Must have select-equivalent shuffle"); 2100 2101 Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); 2102 SmallVector<int, 16> Mask; 2103 Shuf.getShuffleMask(Mask); 2104 unsigned NumElts = Mask.size(); 2105 2106 // Canonicalize a select shuffle with common operand as Op1. 2107 auto *ShufOp = dyn_cast<ShuffleVectorInst>(Op0); 2108 if (ShufOp && ShufOp->isSelect() && 2109 (ShufOp->getOperand(0) == Op1 || ShufOp->getOperand(1) == Op1)) { 2110 std::swap(Op0, Op1); 2111 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); 2112 } 2113 2114 ShufOp = dyn_cast<ShuffleVectorInst>(Op1); 2115 if (!ShufOp || !ShufOp->isSelect() || 2116 (ShufOp->getOperand(0) != Op0 && ShufOp->getOperand(1) != Op0)) 2117 return nullptr; 2118 2119 Value *X = ShufOp->getOperand(0), *Y = ShufOp->getOperand(1); 2120 SmallVector<int, 16> Mask1; 2121 ShufOp->getShuffleMask(Mask1); 2122 assert(Mask1.size() == NumElts && "Vector size changed with select shuffle"); 2123 2124 // Canonicalize common operand (Op0) as X (first operand of first shuffle). 2125 if (Y == Op0) { 2126 std::swap(X, Y); 2127 ShuffleVectorInst::commuteShuffleMask(Mask1, NumElts); 2128 } 2129 2130 // If the mask chooses from X (operand 0), it stays the same. 2131 // If the mask chooses from the earlier shuffle, the other mask value is 2132 // transferred to the combined select shuffle: 2133 // shuf X, (shuf X, Y, M1), M --> shuf X, Y, M' 2134 SmallVector<int, 16> NewMask(NumElts); 2135 for (unsigned i = 0; i != NumElts; ++i) 2136 NewMask[i] = Mask[i] < (signed)NumElts ? Mask[i] : Mask1[i]; 2137 2138 // A select mask with undef elements might look like an identity mask. 2139 assert((ShuffleVectorInst::isSelectMask(NewMask, NumElts) || 2140 ShuffleVectorInst::isIdentityMask(NewMask, NumElts)) && 2141 "Unexpected shuffle mask"); 2142 return new ShuffleVectorInst(X, Y, NewMask); 2143 } 2144 2145 static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf, 2146 const SimplifyQuery &SQ) { 2147 assert(Shuf.isSelect() && "Must have select-equivalent shuffle"); 2148 2149 // Are we shuffling together some value and that same value after it has been 2150 // modified by a binop with a constant? 2151 Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); 2152 Constant *C; 2153 bool Op0IsBinop; 2154 if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C)))) 2155 Op0IsBinop = true; 2156 else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C)))) 2157 Op0IsBinop = false; 2158 else 2159 return nullptr; 2160 2161 // The identity constant for a binop leaves a variable operand unchanged. For 2162 // a vector, this is a splat of something like 0, -1, or 1. 2163 // If there's no identity constant for this binop, we're done. 2164 auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1); 2165 BinaryOperator::BinaryOps BOpcode = BO->getOpcode(); 2166 Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true); 2167 if (!IdC) 2168 return nullptr; 2169 2170 Value *X = Op0IsBinop ? Op1 : Op0; 2171 2172 // Prevent folding in the case the non-binop operand might have NaN values. 2173 // If X can have NaN elements then we have that the floating point math 2174 // operation in the transformed code may not preserve the exact NaN 2175 // bit-pattern -- e.g. `fadd sNaN, 0.0 -> qNaN`. 2176 // This makes the transformation incorrect since the original program would 2177 // have preserved the exact NaN bit-pattern. 
2178 // Avoid the folding if X can have NaN elements. 2179 if (Shuf.getType()->getElementType()->isFloatingPointTy() && 2180 !isKnownNeverNaN(X, 0, SQ)) 2181 return nullptr; 2182 2183 // Shuffle identity constants into the lanes that return the original value. 2184 // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4} 2185 // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4} 2186 // The existing binop constant vector remains in the same operand position. 2187 ArrayRef<int> Mask = Shuf.getShuffleMask(); 2188 Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) : 2189 ConstantExpr::getShuffleVector(IdC, C, Mask); 2190 2191 bool MightCreatePoisonOrUB = 2192 is_contained(Mask, PoisonMaskElem) && 2193 (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode)); 2194 if (MightCreatePoisonOrUB) 2195 NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true); 2196 2197 // shuf (bop X, C), X, M --> bop X, C' 2198 // shuf X, (bop X, C), M --> bop X, C' 2199 Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC); 2200 NewBO->copyIRFlags(BO); 2201 2202 // An undef shuffle mask element may propagate as an undef constant element in 2203 // the new binop. That would produce poison where the original code might not. 2204 // If we already made a safe constant, then there's no danger. 2205 if (is_contained(Mask, PoisonMaskElem) && !MightCreatePoisonOrUB) 2206 NewBO->dropPoisonGeneratingFlags(); 2207 return NewBO; 2208 } 2209 2210 /// If we have an insert of a scalar to a non-zero element of an undefined 2211 /// vector and then shuffle that value, that's the same as inserting to the zero 2212 /// element and shuffling. Splatting from the zero element is recognized as the 2213 /// canonical form of splat. 2214 static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf, 2215 InstCombiner::BuilderTy &Builder) { 2216 Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); 2217 ArrayRef<int> Mask = Shuf.getShuffleMask(); 2218 Value *X; 2219 uint64_t IndexC; 2220 2221 // Match a shuffle that is a splat to a non-zero element. 2222 if (!match(Op0, m_OneUse(m_InsertElt(m_Poison(), m_Value(X), 2223 m_ConstantInt(IndexC)))) || 2224 !match(Op1, m_Poison()) || match(Mask, m_ZeroMask()) || IndexC == 0) 2225 return nullptr; 2226 2227 // Insert into element 0 of a poison vector. 2228 PoisonValue *PoisonVec = PoisonValue::get(Shuf.getType()); 2229 Value *NewIns = Builder.CreateInsertElement(PoisonVec, X, (uint64_t)0); 2230 2231 // Splat from element 0. Any mask element that is poison remains poison. 2232 // For example: 2233 // shuf (inselt poison, X, 2), _, <2,2,undef> 2234 // --> shuf (inselt poison, X, 0), poison, <0,0,undef> 2235 unsigned NumMaskElts = 2236 cast<FixedVectorType>(Shuf.getType())->getNumElements(); 2237 SmallVector<int, 16> NewMask(NumMaskElts, 0); 2238 for (unsigned i = 0; i != NumMaskElts; ++i) 2239 if (Mask[i] == PoisonMaskElem) 2240 NewMask[i] = Mask[i]; 2241 2242 return new ShuffleVectorInst(NewIns, NewMask); 2243 } 2244 2245 /// Try to fold shuffles that are the equivalent of a vector select. 2246 Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { 2247 if (!Shuf.isSelect()) 2248 return nullptr; 2249 2250 // Canonicalize to choose from operand 0 first unless operand 1 is undefined. 2251 // Commuting undef to operand 0 conflicts with another canonicalization. 
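// For example, with 4 elements, mask <4, 5, 2, 3> becomes <0, 1, 6, 7> when the
// operands are swapped.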
2252 unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements(); 2253 if (!match(Shuf.getOperand(1), m_Undef()) && 2254 Shuf.getMaskValue(0) >= (int)NumElts) { 2255 // TODO: Can we assert that both operands of a shuffle-select are not undef 2256 // (otherwise, it would have been folded by instsimplify? 2257 Shuf.commute(); 2258 return &Shuf; 2259 } 2260 2261 if (Instruction *I = foldSelectShuffleOfSelectShuffle(Shuf)) 2262 return I; 2263 2264 if (Instruction *I = foldSelectShuffleWith1Binop( 2265 Shuf, getSimplifyQuery().getWithInstruction(&Shuf))) 2266 return I; 2267 2268 BinaryOperator *B0, *B1; 2269 if (!match(Shuf.getOperand(0), m_BinOp(B0)) || 2270 !match(Shuf.getOperand(1), m_BinOp(B1))) 2271 return nullptr; 2272 2273 // If one operand is "0 - X", allow that to be viewed as "X * -1" 2274 // (ConstantsAreOp1) by getAlternateBinop below. If the neg is not paired 2275 // with a multiply, we will exit because C0/C1 will not be set. 2276 Value *X, *Y; 2277 Constant *C0 = nullptr, *C1 = nullptr; 2278 bool ConstantsAreOp1; 2279 if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) && 2280 match(B1, m_BinOp(m_Constant(C1), m_Value(Y)))) 2281 ConstantsAreOp1 = false; 2282 else if (match(B0, m_CombineOr(m_BinOp(m_Value(X), m_Constant(C0)), 2283 m_Neg(m_Value(X)))) && 2284 match(B1, m_CombineOr(m_BinOp(m_Value(Y), m_Constant(C1)), 2285 m_Neg(m_Value(Y))))) 2286 ConstantsAreOp1 = true; 2287 else 2288 return nullptr; 2289 2290 // We need matching binops to fold the lanes together. 2291 BinaryOperator::BinaryOps Opc0 = B0->getOpcode(); 2292 BinaryOperator::BinaryOps Opc1 = B1->getOpcode(); 2293 bool DropNSW = false; 2294 if (ConstantsAreOp1 && Opc0 != Opc1) { 2295 // TODO: We drop "nsw" if shift is converted into multiply because it may 2296 // not be correct when the shift amount is BitWidth - 1. We could examine 2297 // each vector element to determine if it is safe to keep that flag. 2298 if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl) 2299 DropNSW = true; 2300 if (BinopElts AltB0 = getAlternateBinop(B0, DL)) { 2301 assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop"); 2302 Opc0 = AltB0.Opcode; 2303 C0 = cast<Constant>(AltB0.Op1); 2304 } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) { 2305 assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop"); 2306 Opc1 = AltB1.Opcode; 2307 C1 = cast<Constant>(AltB1.Op1); 2308 } 2309 } 2310 2311 if (Opc0 != Opc1 || !C0 || !C1) 2312 return nullptr; 2313 2314 // The opcodes must be the same. Use a new name to make that clear. 2315 BinaryOperator::BinaryOps BOpc = Opc0; 2316 2317 // Select the constant elements needed for the single binop. 2318 ArrayRef<int> Mask = Shuf.getShuffleMask(); 2319 Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask); 2320 2321 // We are moving a binop after a shuffle. When a shuffle has an undefined 2322 // mask element, the result is undefined, but it is not poison or undefined 2323 // behavior. That is not necessarily true for div/rem/shift. 
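// For example, with sdiv and a mask containing a poison element, the shuffled
// constant C' would have a poison lane; getSafeVectorConstantForBinop replaces
// such lanes with a safe value below.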
2324 bool MightCreatePoisonOrUB = 2325 is_contained(Mask, PoisonMaskElem) && 2326 (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc)); 2327 if (MightCreatePoisonOrUB) 2328 NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC, 2329 ConstantsAreOp1); 2330 2331 Value *V; 2332 if (X == Y) { 2333 // Remove a binop and the shuffle by rearranging the constant: 2334 // shuffle (op V, C0), (op V, C1), M --> op V, C' 2335 // shuffle (op C0, V), (op C1, V), M --> op C', V 2336 V = X; 2337 } else { 2338 // If there are 2 different variable operands, we must create a new shuffle 2339 // (select) first, so check uses to ensure that we don't end up with more 2340 // instructions than we started with. 2341 if (!B0->hasOneUse() && !B1->hasOneUse()) 2342 return nullptr; 2343 2344 // If we use the original shuffle mask and op1 is *variable*, we would be 2345 // putting an undef into operand 1 of div/rem/shift. This is either UB or 2346 // poison. We do not have to guard against UB when *constants* are op1 2347 // because safe constants guarantee that we do not overflow sdiv/srem (and 2348 // there's no danger for other opcodes). 2349 // TODO: To allow this case, create a new shuffle mask with no undefs. 2350 if (MightCreatePoisonOrUB && !ConstantsAreOp1) 2351 return nullptr; 2352 2353 // Note: In general, we do not create new shuffles in InstCombine because we 2354 // do not know if a target can lower an arbitrary shuffle optimally. In this 2355 // case, the shuffle uses the existing mask, so there is no additional risk. 2356 2357 // Select the variable vectors first, then perform the binop: 2358 // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C' 2359 // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M) 2360 V = Builder.CreateShuffleVector(X, Y, Mask); 2361 } 2362 2363 Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) : 2364 Builder.CreateBinOp(BOpc, NewC, V); 2365 2366 // Flags are intersected from the 2 source binops. But there are 2 exceptions: 2367 // 1. If we changed an opcode, poison conditions might have changed. 2368 // 2. If the shuffle had undef mask elements, the new binop might have undefs 2369 // where the original code did not. But if we already made a safe constant, 2370 // then there's no danger. 2371 if (auto *NewI = dyn_cast<Instruction>(NewBO)) { 2372 NewI->copyIRFlags(B0); 2373 NewI->andIRFlags(B1); 2374 if (DropNSW) 2375 NewI->setHasNoSignedWrap(false); 2376 if (is_contained(Mask, PoisonMaskElem) && !MightCreatePoisonOrUB) 2377 NewI->dropPoisonGeneratingFlags(); 2378 } 2379 return replaceInstUsesWith(Shuf, NewBO); 2380 } 2381 2382 /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate. 2383 /// Example (little endian): 2384 /// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8> 2385 static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf, 2386 bool IsBigEndian) { 2387 // This must be a bitcasted shuffle of 1 vector integer operand. 2388 Type *DestType = Shuf.getType(); 2389 Value *X; 2390 if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) || 2391 !match(Shuf.getOperand(1), m_Poison()) || !DestType->isIntOrIntVectorTy()) 2392 return nullptr; 2393 2394 // The source type must have the same number of elements as the shuffle, 2395 // and the source element type must be larger than the shuffle element type. 
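// The big-endian counterpart of the example above picks the high-index bytes:
// shuf (bitcast <4 x i16> X to <8 x i8>), <1, 3, 5, 7> --> trunc X to <4 x i8>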
2396 Type *SrcType = X->getType(); 2397 if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() || 2398 cast<FixedVectorType>(SrcType)->getNumElements() != 2399 cast<FixedVectorType>(DestType)->getNumElements() || 2400 SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0) 2401 return nullptr; 2402 2403 assert(Shuf.changesLength() && !Shuf.increasesLength() && 2404 "Expected a shuffle that decreases length"); 2405 2406 // Last, check that the mask chooses the correct low bits for each narrow 2407 // element in the result. 2408 uint64_t TruncRatio = 2409 SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits(); 2410 ArrayRef<int> Mask = Shuf.getShuffleMask(); 2411 for (unsigned i = 0, e = Mask.size(); i != e; ++i) { 2412 if (Mask[i] == PoisonMaskElem) 2413 continue; 2414 uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio; 2415 assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits"); 2416 if (Mask[i] != (int)LSBIndex) 2417 return nullptr; 2418 } 2419 2420 return new TruncInst(X, DestType); 2421 } 2422 2423 /// Match a shuffle-select-shuffle pattern where the shuffles are widening and 2424 /// narrowing (concatenating with poison and extracting back to the original 2425 /// length). This allows replacing the wide select with a narrow select. 2426 static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, 2427 InstCombiner::BuilderTy &Builder) { 2428 // This must be a narrowing identity shuffle. It extracts the 1st N elements 2429 // of the 1st vector operand of a shuffle. 2430 if (!match(Shuf.getOperand(1), m_Poison()) || !Shuf.isIdentityWithExtract()) 2431 return nullptr; 2432 2433 // The vector being shuffled must be a vector select that we can eliminate. 2434 // TODO: The one-use requirement could be eased if X and/or Y are constants. 2435 Value *Cond, *X, *Y; 2436 if (!match(Shuf.getOperand(0), 2437 m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y))))) 2438 return nullptr; 2439 2440 // We need a narrow condition value. It must be extended with poison elements 2441 // and have the same number of elements as this shuffle. 2442 unsigned NarrowNumElts = 2443 cast<FixedVectorType>(Shuf.getType())->getNumElements(); 2444 Value *NarrowCond; 2445 if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Poison()))) || 2446 cast<FixedVectorType>(NarrowCond->getType())->getNumElements() != 2447 NarrowNumElts || 2448 !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding()) 2449 return nullptr; 2450 2451 // shuf (sel (shuf NarrowCond, poison, WideMask), X, Y), poison, NarrowMask) 2452 // --> 2453 // sel NarrowCond, (shuf X, poison, NarrowMask), (shuf Y, poison, NarrowMask) 2454 Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask()); 2455 Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask()); 2456 return SelectInst::Create(NarrowCond, NarrowX, NarrowY); 2457 } 2458 2459 /// Canonicalize FP negate/abs after shuffle. 2460 static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, 2461 InstCombiner::BuilderTy &Builder) { 2462 auto *S0 = dyn_cast<Instruction>(Shuf.getOperand(0)); 2463 Value *X; 2464 if (!S0 || !match(S0, m_CombineOr(m_FNeg(m_Value(X)), m_FAbs(m_Value(X))))) 2465 return nullptr; 2466 2467 bool IsFNeg = S0->getOpcode() == Instruction::FNeg; 2468 2469 // Match 1-input (unary) shuffle. 
2470 // shuffle (fneg/fabs X), Mask --> fneg/fabs (shuffle X, Mask) 2471 if (S0->hasOneUse() && match(Shuf.getOperand(1), m_Poison())) { 2472 Value *NewShuf = Builder.CreateShuffleVector(X, Shuf.getShuffleMask()); 2473 if (IsFNeg) 2474 return UnaryOperator::CreateFNegFMF(NewShuf, S0); 2475 2476 Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), 2477 Intrinsic::fabs, Shuf.getType()); 2478 CallInst *NewF = CallInst::Create(FAbs, {NewShuf}); 2479 NewF->setFastMathFlags(S0->getFastMathFlags()); 2480 return NewF; 2481 } 2482 2483 // Match 2-input (binary) shuffle. 2484 auto *S1 = dyn_cast<Instruction>(Shuf.getOperand(1)); 2485 Value *Y; 2486 if (!S1 || !match(S1, m_CombineOr(m_FNeg(m_Value(Y)), m_FAbs(m_Value(Y)))) || 2487 S0->getOpcode() != S1->getOpcode() || 2488 (!S0->hasOneUse() && !S1->hasOneUse())) 2489 return nullptr; 2490 2491 // shuf (fneg/fabs X), (fneg/fabs Y), Mask --> fneg/fabs (shuf X, Y, Mask) 2492 Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask()); 2493 Instruction *NewF; 2494 if (IsFNeg) { 2495 NewF = UnaryOperator::CreateFNeg(NewShuf); 2496 } else { 2497 Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), 2498 Intrinsic::fabs, Shuf.getType()); 2499 NewF = CallInst::Create(FAbs, {NewShuf}); 2500 } 2501 NewF->copyIRFlags(S0); 2502 NewF->andIRFlags(S1); 2503 return NewF; 2504 } 2505 2506 /// Canonicalize casts after shuffle. 2507 static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf, 2508 InstCombiner::BuilderTy &Builder) { 2509 // Do we have 2 matching cast operands? 2510 auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0)); 2511 auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1)); 2512 if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() || 2513 Cast0->getSrcTy() != Cast1->getSrcTy()) 2514 return nullptr; 2515 2516 // TODO: Allow other opcodes? That would require easing the type restrictions 2517 // below here. 2518 CastInst::CastOps CastOpcode = Cast0->getOpcode(); 2519 switch (CastOpcode) { 2520 case Instruction::FPToSI: 2521 case Instruction::FPToUI: 2522 case Instruction::SIToFP: 2523 case Instruction::UIToFP: 2524 break; 2525 default: 2526 return nullptr; 2527 } 2528 2529 VectorType *ShufTy = Shuf.getType(); 2530 VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType()); 2531 VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy()); 2532 2533 // TODO: Allow length-increasing shuffles? 2534 if (ShufTy->getElementCount().getKnownMinValue() > 2535 ShufOpTy->getElementCount().getKnownMinValue()) 2536 return nullptr; 2537 2538 // TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)? 2539 assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) && 2540 "Expected fixed vector operands for casts and binary shuffle"); 2541 if (CastSrcTy->getPrimitiveSizeInBits() > ShufOpTy->getPrimitiveSizeInBits()) 2542 return nullptr; 2543 2544 // At least one of the operands must have only one use (the shuffle). 2545 if (!Cast0->hasOneUse() && !Cast1->hasOneUse()) 2546 return nullptr; 2547 2548 // shuffle (cast X), (cast Y), Mask --> cast (shuffle X, Y, Mask) 2549 Value *X = Cast0->getOperand(0); 2550 Value *Y = Cast1->getOperand(0); 2551 Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask()); 2552 return CastInst::Create(CastOpcode, NewShuf, ShufTy); 2553 } 2554 2555 /// Try to fold an extract subvector operation. 
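/// For example (a sketch with illustrative types):
/// shuf (bitcast (inselt poison, i64 X, 0) to <4 x i32>), poison, <0, 1>
///   --> bitcast i64 X to <2 x i32>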
2556 static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
2557 Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
2558 if (!Shuf.isIdentityWithExtract() || !match(Op1, m_Poison()))
2559 return nullptr;
2560
2561 // Check if we are extracting all bits of an inserted scalar:
2562 // extract-subvec (bitcast (inselt ?, X, 0)) --> bitcast X to subvec type
2563 Value *X;
2564 if (match(Op0, m_BitCast(m_InsertElt(m_Value(), m_Value(X), m_Zero()))) &&
2565 X->getType()->getPrimitiveSizeInBits() ==
2566 Shuf.getType()->getPrimitiveSizeInBits())
2567 return new BitCastInst(X, Shuf.getType());
2568
2569 // Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
2570 Value *Y;
2571 ArrayRef<int> Mask;
2572 if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
2573 return nullptr;
2574
2575 // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
2576 // then combining may result in worse codegen.
2577 if (!Op0->hasOneUse())
2578 return nullptr;
2579
2580 // We are extracting a subvector from a shuffle. Remove excess elements from
2581 // the 1st shuffle mask to eliminate the extract.
2582 //
2583 // This transform is conservatively limited to identity extracts because we do
2584 // not allow arbitrary shuffle mask creation as a target-independent transform
2585 // (because we can't guarantee that it will lower efficiently).
2586 //
2587 // If the extracting shuffle has a poison mask element, it transfers to the
2588 // new shuffle mask. Otherwise, copy the original mask element. Example:
2589 // shuf (shuf X, Y, <C0, C1, C2, poison, C4>), poison, <0, poison, 2, 3> -->
2590 // shuf X, Y, <C0, poison, C2, poison>
2591 unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
2592 SmallVector<int, 16> NewMask(NumElts);
2593 assert(NumElts < Mask.size() &&
2594 "Identity with extract must have fewer elements than its inputs");
2595
2596 for (unsigned i = 0; i != NumElts; ++i) {
2597 int ExtractMaskElt = Shuf.getMaskValue(i);
2598 int MaskElt = Mask[i];
2599 NewMask[i] = ExtractMaskElt == PoisonMaskElem ? ExtractMaskElt : MaskElt;
2600 }
2601 return new ShuffleVectorInst(X, Y, NewMask);
2602 }
2603
2604 /// Try to replace a shuffle with an insertelement or try to replace a shuffle
2605 /// operand with the operand of an insertelement.
2606 static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
2607 InstCombinerImpl &IC) {
2608 Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
2609 SmallVector<int, 16> Mask;
2610 Shuf.getShuffleMask(Mask);
2611
2612 int NumElts = Mask.size();
2613 int InpNumElts = cast<FixedVectorType>(V0->getType())->getNumElements();
2614
2615 // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
2616 // not be able to handle it there if the insertelement has >1 use.
2617 // If the shuffle has an insertelement operand but does not choose the
2618 // inserted scalar element from that value, then we can replace that shuffle
2619 // operand with the source vector of the insertelement.
2620 Value *X; 2621 uint64_t IdxC; 2622 if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { 2623 // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask 2624 if (!is_contained(Mask, (int)IdxC)) 2625 return IC.replaceOperand(Shuf, 0, X); 2626 } 2627 if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { 2628 // Offset the index constant by the vector width because we are checking for 2629 // accesses to the 2nd vector input of the shuffle. 2630 IdxC += InpNumElts; 2631 // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask 2632 if (!is_contained(Mask, (int)IdxC)) 2633 return IC.replaceOperand(Shuf, 1, X); 2634 } 2635 // For the rest of the transform, the shuffle must not change vector sizes. 2636 // TODO: This restriction could be removed if the insert has only one use 2637 // (because the transform would require a new length-changing shuffle). 2638 if (NumElts != InpNumElts) 2639 return nullptr; 2640 2641 // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC' 2642 auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) { 2643 // We need an insertelement with a constant index. 2644 if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar), 2645 m_ConstantInt(IndexC)))) 2646 return false; 2647 2648 // Test the shuffle mask to see if it splices the inserted scalar into the 2649 // operand 1 vector of the shuffle. 2650 int NewInsIndex = -1; 2651 for (int i = 0; i != NumElts; ++i) { 2652 // Ignore undef mask elements. 2653 if (Mask[i] == -1) 2654 continue; 2655 2656 // The shuffle takes elements of operand 1 without lane changes. 2657 if (Mask[i] == NumElts + i) 2658 continue; 2659 2660 // The shuffle must choose the inserted scalar exactly once. 2661 if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue()) 2662 return false; 2663 2664 // The shuffle is placing the inserted scalar into element i. 2665 NewInsIndex = i; 2666 } 2667 2668 assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?"); 2669 2670 // Index is updated to the potentially translated insertion lane. 2671 IndexC = ConstantInt::get(IndexC->getIntegerType(), NewInsIndex); 2672 return true; 2673 }; 2674 2675 // If the shuffle is unnecessary, insert the scalar operand directly into 2676 // operand 1 of the shuffle. Example: 2677 // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0 2678 Value *Scalar; 2679 ConstantInt *IndexC; 2680 if (isShufflingScalarIntoOp1(Scalar, IndexC)) 2681 return InsertElementInst::Create(V1, Scalar, IndexC); 2682 2683 // Try again after commuting shuffle. Example: 2684 // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> --> 2685 // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3 2686 std::swap(V0, V1); 2687 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); 2688 if (isShufflingScalarIntoOp1(Scalar, IndexC)) 2689 return InsertElementInst::Create(V1, Scalar, IndexC); 2690 2691 return nullptr; 2692 } 2693 2694 static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) { 2695 // Match the operands as identity with padding (also known as concatenation 2696 // with undef) shuffles of the same source type. The backend is expected to 2697 // recreate these concatenations from a shuffle of narrow operands. 
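// For example, with <2 x i32> sources X and Y widened to <4 x i32>:
// shuf (shuf X, poison, <0, 1, poison, poison>),
//      (shuf Y, poison, <0, 1, poison, poison>), <0, 1, 4, 5>
//   --> shuf X, Y, <0, 1, 2, 3>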
2698 auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0)); 2699 auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1)); 2700 if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() || 2701 !Shuffle1 || !Shuffle1->isIdentityWithPadding()) 2702 return nullptr; 2703 2704 // We limit this transform to power-of-2 types because we expect that the 2705 // backend can convert the simplified IR patterns to identical nodes as the 2706 // original IR. 2707 // TODO: If we can verify the same behavior for arbitrary types, the 2708 // power-of-2 checks can be removed. 2709 Value *X = Shuffle0->getOperand(0); 2710 Value *Y = Shuffle1->getOperand(0); 2711 if (X->getType() != Y->getType() || 2712 !isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) || 2713 !isPowerOf2_32( 2714 cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) || 2715 !isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) || 2716 match(X, m_Undef()) || match(Y, m_Undef())) 2717 return nullptr; 2718 assert(match(Shuffle0->getOperand(1), m_Undef()) && 2719 match(Shuffle1->getOperand(1), m_Undef()) && 2720 "Unexpected operand for identity shuffle"); 2721 2722 // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source 2723 // operands directly by adjusting the shuffle mask to account for the narrower 2724 // types: 2725 // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask' 2726 int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements(); 2727 int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements(); 2728 assert(WideElts > NarrowElts && "Unexpected types for identity with padding"); 2729 2730 ArrayRef<int> Mask = Shuf.getShuffleMask(); 2731 SmallVector<int, 16> NewMask(Mask.size(), -1); 2732 for (int i = 0, e = Mask.size(); i != e; ++i) { 2733 if (Mask[i] == -1) 2734 continue; 2735 2736 // If this shuffle is choosing an undef element from 1 of the sources, that 2737 // element is undef. 2738 if (Mask[i] < WideElts) { 2739 if (Shuffle0->getMaskValue(Mask[i]) == -1) 2740 continue; 2741 } else { 2742 if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1) 2743 continue; 2744 } 2745 2746 // If this shuffle is choosing from the 1st narrow op, the mask element is 2747 // the same. If this shuffle is choosing from the 2nd narrow op, the mask 2748 // element is offset down to adjust for the narrow vector widths. 
2749 if (Mask[i] < WideElts) { 2750 assert(Mask[i] < NarrowElts && "Unexpected shuffle mask"); 2751 NewMask[i] = Mask[i]; 2752 } else { 2753 assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask"); 2754 NewMask[i] = Mask[i] - (WideElts - NarrowElts); 2755 } 2756 } 2757 return new ShuffleVectorInst(X, Y, NewMask); 2758 } 2759 2760 // Splatting the first element of the result of a BinOp, where any of the 2761 // BinOp's operands are the result of a first element splat can be simplified to 2762 // splatting the first element of the result of the BinOp 2763 Instruction *InstCombinerImpl::simplifyBinOpSplats(ShuffleVectorInst &SVI) { 2764 if (!match(SVI.getOperand(1), m_Poison()) || 2765 !match(SVI.getShuffleMask(), m_ZeroMask()) || 2766 !SVI.getOperand(0)->hasOneUse()) 2767 return nullptr; 2768 2769 Value *Op0 = SVI.getOperand(0); 2770 Value *X, *Y; 2771 if (!match(Op0, m_BinOp(m_Shuffle(m_Value(X), m_Poison(), m_ZeroMask()), 2772 m_Value(Y))) && 2773 !match(Op0, m_BinOp(m_Value(X), 2774 m_Shuffle(m_Value(Y), m_Poison(), m_ZeroMask())))) 2775 return nullptr; 2776 if (X->getType() != Y->getType()) 2777 return nullptr; 2778 2779 auto *BinOp = cast<BinaryOperator>(Op0); 2780 if (!isSafeToSpeculativelyExecute(BinOp)) 2781 return nullptr; 2782 2783 Value *NewBO = Builder.CreateBinOp(BinOp->getOpcode(), X, Y); 2784 if (auto NewBOI = dyn_cast<Instruction>(NewBO)) 2785 NewBOI->copyIRFlags(BinOp); 2786 2787 return new ShuffleVectorInst(NewBO, SVI.getShuffleMask()); 2788 } 2789 2790 Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { 2791 Value *LHS = SVI.getOperand(0); 2792 Value *RHS = SVI.getOperand(1); 2793 SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI); 2794 if (auto *V = simplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(), 2795 SVI.getType(), ShufQuery)) 2796 return replaceInstUsesWith(SVI, V); 2797 2798 if (Instruction *I = simplifyBinOpSplats(SVI)) 2799 return I; 2800 2801 // Canonicalize splat shuffle to use poison RHS. Handle this explicitly in 2802 // order to support scalable vectors. 2803 if (match(SVI.getShuffleMask(), m_ZeroMask()) && !isa<PoisonValue>(RHS)) 2804 return replaceOperand(SVI, 1, PoisonValue::get(RHS->getType())); 2805 2806 if (isa<ScalableVectorType>(LHS->getType())) 2807 return nullptr; 2808 2809 unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements(); 2810 unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements(); 2811 2812 // shuffle (bitcast X), (bitcast Y), Mask --> bitcast (shuffle X, Y, Mask) 2813 // 2814 // if X and Y are of the same (vector) type, and the element size is not 2815 // changed by the bitcasts, we can distribute the bitcasts through the 2816 // shuffle, hopefully reducing the number of instructions. We make sure that 2817 // at least one bitcast only has one use, so we don't *increase* the number of 2818 // instructions here. 2819 Value *X, *Y; 2820 if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_BitCast(m_Value(Y))) && 2821 X->getType()->isVectorTy() && X->getType() == Y->getType() && 2822 X->getType()->getScalarSizeInBits() == 2823 SVI.getType()->getScalarSizeInBits() && 2824 (LHS->hasOneUse() || RHS->hasOneUse())) { 2825 Value *V = Builder.CreateShuffleVector(X, Y, SVI.getShuffleMask(), 2826 SVI.getName() + ".uncasted"); 2827 return new BitCastInst(V, SVI.getType()); 2828 } 2829 2830 ArrayRef<int> Mask = SVI.getShuffleMask(); 2831 2832 // Peek through a bitcasted shuffle operand by scaling the mask. 
If the 2833 // simulated shuffle can simplify, then this shuffle is unnecessary: 2834 // shuf (bitcast X), undef, Mask --> bitcast X' 2835 // TODO: This could be extended to allow length-changing shuffles. 2836 // The transform might also be obsoleted if we allowed canonicalization 2837 // of bitcasted shuffles. 2838 if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) && 2839 X->getType()->isVectorTy() && VWidth == LHSWidth) { 2840 // Try to create a scaled mask constant. 2841 auto *XType = cast<FixedVectorType>(X->getType()); 2842 unsigned XNumElts = XType->getNumElements(); 2843 SmallVector<int, 16> ScaledMask; 2844 if (scaleShuffleMaskElts(XNumElts, Mask, ScaledMask)) { 2845 // If the shuffled source vector simplifies, cast that value to this 2846 // shuffle's type. 2847 if (auto *V = simplifyShuffleVectorInst(X, UndefValue::get(XType), 2848 ScaledMask, XType, ShufQuery)) 2849 return BitCastInst::Create(Instruction::BitCast, V, SVI.getType()); 2850 } 2851 } 2852 2853 // shuffle x, x, mask --> shuffle x, undef, mask' 2854 if (LHS == RHS) { 2855 assert(!match(RHS, m_Undef()) && 2856 "Shuffle with 2 undef ops not simplified?"); 2857 return new ShuffleVectorInst(LHS, createUnaryMask(Mask, LHSWidth)); 2858 } 2859 2860 // shuffle undef, x, mask --> shuffle x, undef, mask' 2861 if (match(LHS, m_Undef())) { 2862 SVI.commute(); 2863 return &SVI; 2864 } 2865 2866 if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) 2867 return I; 2868 2869 if (Instruction *I = foldSelectShuffle(SVI)) 2870 return I; 2871 2872 if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian())) 2873 return I; 2874 2875 if (Instruction *I = narrowVectorSelect(SVI, Builder)) 2876 return I; 2877 2878 if (Instruction *I = foldShuffleOfUnaryOps(SVI, Builder)) 2879 return I; 2880 2881 if (Instruction *I = foldCastShuffle(SVI, Builder)) 2882 return I; 2883 2884 APInt PoisonElts(VWidth, 0); 2885 APInt AllOnesEltMask(APInt::getAllOnes(VWidth)); 2886 if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, PoisonElts)) { 2887 if (V != &SVI) 2888 return replaceInstUsesWith(SVI, V); 2889 return &SVI; 2890 } 2891 2892 if (Instruction *I = foldIdentityExtractShuffle(SVI)) 2893 return I; 2894 2895 // These transforms have the potential to lose undef knowledge, so they are 2896 // intentionally placed after SimplifyDemandedVectorElts(). 2897 if (Instruction *I = foldShuffleWithInsert(SVI, *this)) 2898 return I; 2899 if (Instruction *I = foldIdentityPaddedShuffles(SVI)) 2900 return I; 2901 2902 if (match(RHS, m_Poison()) && canEvaluateShuffled(LHS, Mask)) { 2903 Value *V = evaluateInDifferentElementOrder(LHS, Mask, Builder); 2904 return replaceInstUsesWith(SVI, V); 2905 } 2906 2907 // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to 2908 // a non-vector type. We can instead bitcast the original vector followed by 2909 // an extract of the desired element: 2910 // 2911 // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, 2912 // <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2913 // %1 = bitcast <4 x i8> %sroa to i32 2914 // Becomes: 2915 // %bc = bitcast <16 x i8> %in to <4 x i32> 2916 // %ext = extractelement <4 x i32> %bc, i32 0 2917 // 2918 // If the shuffle is extracting a contiguous range of values from the input 2919 // vector then each use which is a bitcast of the extracted size can be 2920 // replaced. This will work if the vector types are compatible, and the begin 2921 // index is aligned to a value in the casted vector type. 
If the begin index 2922 // isn't aligned then we can shuffle the original vector (keeping the same 2923 // vector type) before extracting. 2924 // 2925 // This code will bail out if the target type is fundamentally incompatible 2926 // with vectors of the source type. 2927 // 2928 // Example of <16 x i8>, target type i32: 2929 // Index range [4,8): v-----------v Will work. 2930 // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ 2931 // <16 x i8>: | | | | | | | | | | | | | | | | | 2932 // <4 x i32>: | | | | | 2933 // +-----------+-----------+-----------+-----------+ 2934 // Index range [6,10): ^-----------^ Needs an extra shuffle. 2935 // Target type i40: ^--------------^ Won't work, bail. 2936 bool MadeChange = false; 2937 if (isShuffleExtractingFromLHS(SVI, Mask)) { 2938 Value *V = LHS; 2939 unsigned MaskElems = Mask.size(); 2940 auto *SrcTy = cast<FixedVectorType>(V->getType()); 2941 unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue(); 2942 unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); 2943 assert(SrcElemBitWidth && "vector elements must have a bitwidth"); 2944 unsigned SrcNumElems = SrcTy->getNumElements(); 2945 SmallVector<BitCastInst *, 8> BCs; 2946 DenseMap<Type *, Value *> NewBCs; 2947 for (User *U : SVI.users()) 2948 if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) 2949 if (!BC->use_empty()) 2950 // Only visit bitcasts that weren't previously handled. 2951 BCs.push_back(BC); 2952 for (BitCastInst *BC : BCs) { 2953 unsigned BegIdx = Mask.front(); 2954 Type *TgtTy = BC->getDestTy(); 2955 unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy); 2956 if (!TgtElemBitWidth) 2957 continue; 2958 unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; 2959 bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; 2960 bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth); 2961 if (!VecBitWidthsEqual) 2962 continue; 2963 if (!VectorType::isValidElementType(TgtTy)) 2964 continue; 2965 auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems); 2966 if (!BegIsAligned) { 2967 // Shuffle the input so [0,NumElements) contains the output, and 2968 // [NumElems,SrcNumElems) is undef. 2969 SmallVector<int, 16> ShuffleMask(SrcNumElems, -1); 2970 for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I) 2971 ShuffleMask[I] = Idx; 2972 V = Builder.CreateShuffleVector(V, ShuffleMask, 2973 SVI.getName() + ".extract"); 2974 BegIdx = 0; 2975 } 2976 unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; 2977 assert(SrcElemsPerTgtElem); 2978 BegIdx /= SrcElemsPerTgtElem; 2979 bool BCAlreadyExists = NewBCs.contains(CastSrcTy); 2980 auto *NewBC = 2981 BCAlreadyExists 2982 ? NewBCs[CastSrcTy] 2983 : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); 2984 if (!BCAlreadyExists) 2985 NewBCs[CastSrcTy] = NewBC; 2986 auto *Ext = Builder.CreateExtractElement(NewBC, BegIdx, 2987 SVI.getName() + ".extract"); 2988 // The shufflevector isn't being replaced: the bitcast that used it 2989 // is. InstCombine will visit the newly-created instructions. 2990 replaceInstUsesWith(*BC, Ext); 2991 MadeChange = true; 2992 } 2993 } 2994 2995 // If the LHS is a shufflevector itself, see if we can combine it with this 2996 // one without producing an unusual shuffle. 2997 // Cases that might be simplified: 2998 // 1. 2999 // x1=shuffle(v1,v2,mask1) 3000 // x=shuffle(x1,undef,mask) 3001 // ==> 3002 // x=shuffle(v1,undef,newMask) 3003 // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1 3004 // 2. 
3005 // x1=shuffle(v1,undef,mask1) 3006 // x=shuffle(x1,x2,mask) 3007 // where v1.size() == mask1.size() 3008 // ==> 3009 // x=shuffle(v1,x2,newMask) 3010 // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i] 3011 // 3. 3012 // x2=shuffle(v2,undef,mask2) 3013 // x=shuffle(x1,x2,mask) 3014 // where v2.size() == mask2.size() 3015 // ==> 3016 // x=shuffle(x1,v2,newMask) 3017 // newMask[i] = (mask[i] < x1.size()) 3018 // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size() 3019 // 4. 3020 // x1=shuffle(v1,undef,mask1) 3021 // x2=shuffle(v2,undef,mask2) 3022 // x=shuffle(x1,x2,mask) 3023 // where v1.size() == v2.size() 3024 // ==> 3025 // x=shuffle(v1,v2,newMask) 3026 // newMask[i] = (mask[i] < x1.size()) 3027 // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size() 3028 // 3029 // Here we are really conservative: 3030 // we are absolutely afraid of producing a shuffle mask not in the input 3031 // program, because the code gen may not be smart enough to turn a merged 3032 // shuffle into two specific shuffles: it may produce worse code. As such, 3033 // we only merge two shuffles if the result is either a splat or one of the 3034 // input shuffle masks. In this case, merging the shuffles just removes 3035 // one instruction, which we know is safe. This is good for things like 3036 // turning: (splat(splat)) -> splat, or 3037 // merge(V[0..n], V[n+1..2n]) -> V[0..2n] 3038 ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS); 3039 ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS); 3040 if (LHSShuffle) 3041 if (!match(LHSShuffle->getOperand(1), m_Poison()) && 3042 !match(RHS, m_Poison())) 3043 LHSShuffle = nullptr; 3044 if (RHSShuffle) 3045 if (!match(RHSShuffle->getOperand(1), m_Poison())) 3046 RHSShuffle = nullptr; 3047 if (!LHSShuffle && !RHSShuffle) 3048 return MadeChange ? &SVI : nullptr; 3049 3050 Value* LHSOp0 = nullptr; 3051 Value* LHSOp1 = nullptr; 3052 Value* RHSOp0 = nullptr; 3053 unsigned LHSOp0Width = 0; 3054 unsigned RHSOp0Width = 0; 3055 if (LHSShuffle) { 3056 LHSOp0 = LHSShuffle->getOperand(0); 3057 LHSOp1 = LHSShuffle->getOperand(1); 3058 LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements(); 3059 } 3060 if (RHSShuffle) { 3061 RHSOp0 = RHSShuffle->getOperand(0); 3062 RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements(); 3063 } 3064 Value* newLHS = LHS; 3065 Value* newRHS = RHS; 3066 if (LHSShuffle) { 3067 // case 1 3068 if (match(RHS, m_Poison())) { 3069 newLHS = LHSOp0; 3070 newRHS = LHSOp1; 3071 } 3072 // case 2 or 4 3073 else if (LHSOp0Width == LHSWidth) { 3074 newLHS = LHSOp0; 3075 } 3076 } 3077 // case 3 or 4 3078 if (RHSShuffle && RHSOp0Width == LHSWidth) { 3079 newRHS = RHSOp0; 3080 } 3081 // case 4 3082 if (LHSOp0 == RHSOp0) { 3083 newLHS = LHSOp0; 3084 newRHS = nullptr; 3085 } 3086 3087 if (newLHS == LHS && newRHS == RHS) 3088 return MadeChange ? &SVI : nullptr; 3089 3090 ArrayRef<int> LHSMask; 3091 ArrayRef<int> RHSMask; 3092 if (newLHS != LHS) 3093 LHSMask = LHSShuffle->getShuffleMask(); 3094 if (RHSShuffle && newRHS != RHS) 3095 RHSMask = RHSShuffle->getShuffleMask(); 3096 3097 unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; 3098 SmallVector<int, 16> newMask; 3099 bool isSplat = true; 3100 int SplatElt = -1; 3101 // Create a new mask for the new ShuffleVectorInst so that the new 3102 // ShuffleVectorInst is equivalent to the original one. 3103 for (unsigned i = 0; i < VWidth; ++i) { 3104 int eltMask; 3105 if (Mask[i] < 0) { 3106 // This element is a poison value. 
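// Propagate it as a -1 (poison) lane in the new mask.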
3107 eltMask = -1;
3108 } else if (Mask[i] < (int)LHSWidth) {
3109 // This element is from the left-hand-side vector operand.
3110 //
3111 // If LHS is going to be replaced (case 1, 2, or 4), calculate the
3112 // new mask value for the element.
3113 if (newLHS != LHS) {
3114 eltMask = LHSMask[Mask[i]];
3115 // If the value selected is a poison value, explicitly specify it
3116 // with a -1 mask value.
3117 if (eltMask >= (int)LHSOp0Width && isa<PoisonValue>(LHSOp1))
3118 eltMask = -1;
3119 } else
3120 eltMask = Mask[i];
3121 } else {
3122 // This element is from the right-hand-side vector operand.
3123 //
3124 // If the value selected is a poison value, explicitly specify it
3125 // with a -1 mask value. (case 1)
3126 if (match(RHS, m_Poison()))
3127 eltMask = -1;
3128 // If RHS is going to be replaced (case 3 or 4), calculate the
3129 // new mask value for the element.
3130 else if (newRHS != RHS) {
3131 eltMask = RHSMask[Mask[i]-LHSWidth];
3132 // If the value selected is a poison value, explicitly specify it
3133 // with a -1 mask value.
3134 if (eltMask >= (int)RHSOp0Width) {
3135 assert(match(RHSShuffle->getOperand(1), m_Poison()) &&
3136 "should have been checked above");
3137 eltMask = -1;
3138 }
3139 } else
3140 eltMask = Mask[i]-LHSWidth;
3141
3142 // If LHS's width is changed, shift the mask value accordingly.
3143 // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
3144 // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
3145 // If newRHS == newLHS, we want to remap any references from newRHS to
3146 // newLHS so that we can properly identify splats that may occur due to
3147 // obfuscation across the two vectors.
3148 if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
3149 eltMask += newLHSWidth;
3150 }
3151
3152 // Check if this could still be a splat.
3153 if (eltMask >= 0) {
3154 if (SplatElt >= 0 && SplatElt != eltMask)
3155 isSplat = false;
3156 SplatElt = eltMask;
3157 }
3158
3159 newMask.push_back(eltMask);
3160 }
3161
3162 // If the result mask is equal to one of the original shuffle masks,
3163 // or is a splat, do the replacement.
3164 if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
3165 if (!newRHS)
3166 newRHS = PoisonValue::get(newLHS->getType());
3167 return new ShuffleVectorInst(newLHS, newRHS, newMask);
3168 }
3169
3170 return MadeChange ? &SVI : nullptr;
3171 }
3172
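// Illustrative sketch (hypothetical IR, not taken from a regression test) of
// the kind of rewrite the "case 1" path above performs when the merged mask
// turns out to be a splat:
//
//   %x1 = shufflevector <4 x i32> %v1, <4 x i32> poison,
//                       <4 x i32> <i32 1, i32 3, i32 2, i32 0>
//   %x  = shufflevector <4 x i32> %x1, <4 x i32> poison,
//                       <4 x i32> zeroinitializer
//   -->
//   %x  = shufflevector <4 x i32> %v1, <4 x i32> poison,
//                       <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//
// Lane 0 of %x1 is element 1 of %v1, so splatting it is the same as shuffling
// %v1 directly with the constant mask <1, 1, 1, 1>; the merge is accepted
// because the new mask is a splat.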