1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
71 #define DEBUG_TYPE "x86-isel"
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
78 "alignment set by x86-experimental-pref-loop-alignment."),
82 "x86-br-merging-base-cost", cl::init(2),
88 "will be merged, and above which conditionals will be split. Set to -1 "
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
99 "x86-br-merging-likely-bias", cl::init(0),
100 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
105 "the instruction cost threshold. Set to -1 to never merge likely "
110 "x86-br-merging-unlikely-bias", cl::init(-1),
112 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
117 "the instruction cost threshold. Set to -1 to never merge unlikely "
122 "mul-constant-optimization", cl::init(true),
137 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141 // default expansion to a no-op.
144 // For 64-bit, since we have so many registers, use the ILP scheduler.
145 // For 32-bit, use the register pressure specific scheduling.
154 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
264 // We have an algorithm for SSE2, and we turn this into a 64-bit
268 // We have an algorithm for SSE2->double, and we turn this into a
269 // 64-bit FILD followed by conditional FADD for other targets.
284 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
298 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
350 // Without SSE, i64->f64 goes through memory.
358 // the two-result form to trivial CSE, which is able to combine x/y and x%y
361 // Scalar integer multiply-high is also lowered to use two-result
363 // (low) operations are left as Legal, as there are single-result
364 // instructions for this in x86. Using the two-result multiply instructions
436 // Special handling for half-precision floating point conversions.
505 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
524 // 64-bit shl, sra, srl (iff 32-bit x86)
553 // All CPUs supporting AVX will atomically load/store aligned 128-bit
562 // FIXME - use subtarget debug flags
646 // Disable f32->f64 extload as we can only generate this in one instruction
649 // non-optsize case.
774 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
775 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
785 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
793 // Handle constrained floating-point operations of scalar.
832 // clang-format off
843 // clang-format on
857 // Handle constrained floating-point operations of scalar.
869 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
896 // clang-format off
904 // clang-format on
956 // clang-format off
970 // clang-format on
1021 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1037 // with -msoft-float, disable use of MMX as well.
1073 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1243 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1252 // Add 32-bit vector stores to help vectorization opportunities.
1360 // FIXME: Do we need to handle scalar-to-vector here?
1386 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1505 // These types need custom splitting if their input is a 128-bit vector.
1613 // when we have a 256bit-wide blend with immediate.
1635 // (result) is 128-bit but the source is 256-bit wide.
1641 // Custom lower several nodes for 256-bit types.
1693 // available with AVX512. 512-bit vectors are in a separate block controlled
1719 // There is no byte sized k-register load or store without AVX512DQ.
1732 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1765 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1766 // elements. 512-bits can be disabled based on prefer-vector-width and
1767 // required-vector-width function attributes.
1846 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1847 // to 512-bit rather than use the AVX2 instructions so that we can use
1848 // k-masks.
1871 // Extends from v64i1 masks to 512-bit vectors.
1984 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1996 // (result) is 256-bit but the source is 512-bit wide.
1997 // 128-bit was made Legal under AVX1.
2062 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2065 // These operations are handled on non-VLX by artificially widening in
2151 // Extends from v32i1 masks to 256-bit vectors.
2161 // These operations are handled on non-VLX by artificially widening in
2163 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2380 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2389 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2398 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2403 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2423 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2427 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2428 // than generic legalization for 64-bit multiplication-with-overflow, though.
2470 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2471 // is. We should promote the value to 64-bits to solve this.
2472 // This is what the CRT headers do - `fmodf` is an inline header
2476 // clang-format off
2496 // clang-format on
2498 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2506 // We have target-specific dag combine patterns for the following nodes:
2567 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2569 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2571 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2580 // Default loop alignment, which can be overridden by -align-loops.
2583 // An out-of-order CPU can speculatively execute past a predictable branch,
2591 // Default to having -disable-strictnode-mutation on
2595 // This has so far only been implemented for 64-bit MachO.
2636 //===----------------------------------------------------------------------===//
2638 //===----------------------------------------------------------------------===//
2650 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2653 // TODO: If this is a non-temporal load and the target has an instruction
2666 // We cannot replace a wide volatile load with a broadcast-from-memory,
2669 return !Ld->isVolatile() ||
2670 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2674 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2679 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2754 int ReturnAddrIndex = FuncInfo->getRAIndex();
2758 unsigned SlotSize = RegInfo->getSlotSize();
2760 -(int64_t)SlotSize,
2762 FuncInfo->setRAIndex(ReturnAddrIndex);
2774 // If we don't have a symbolic displacement - we don't have any extra
2780 // 64-bit offsets.
2790 // For other non-large code models we assume that latest small object is 16MB
2819 // clang-format off
2831 // clang-format on
2835 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2843 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2844 // X > -1 -> X == 0, jump !sign.
2848 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2849 // X < 0 -> X == 0, jump on sign.
2852 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2853 // X >= 0 -> X == 0, jump on !sign.
2856 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2857 // X < 1 -> X <= 0
2892 // clang-format off
2893 default: llvm_unreachable("Condcode should be pre-legalized away");
2914 // clang-format on
2956 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2964 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2972 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2980 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2991 unsigned Size = I.getType()->getScalarSizeInBits();
2992 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3003 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3004 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3025 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3026 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3036 switch (IntrData->Type) {
3042 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3044 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3046 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3048 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3061 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3072 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3073 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3102 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3104 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3106 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3109 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3113 // can be store-folded. Therefore, it's probably not worth splitting the load.
3114 EVT VT = Load->getValueType(0);
3115 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3116 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3122 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3123 UI->use_begin()->getOpcode() != ISD::STORE)
3126 // All non-chain uses are extract + store.
3137 assert(Ty->isIntegerTy());
3139 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3147 // a floating-point compare and we have blendv or conditional move, then it is
3148 // cheaper to select instead of doing a cross-register move and creating a
3175 // through type legalization on 32-bit targets so we would need to special
3182 // most implementations, sub-vXi32 vector multiplies are always fast,
3191 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3192 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
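// [Illustrative sketch, not from X86ISelLowering.cpp] The predicate above
// accepts multiplier constants within one of a power of two; such multiplies
// decompose into a single shift plus one add or sub, e.g.:
#include <cstdint>
uint64_t mulBy5(uint64_t X) { return (X << 2) + X; } // 5 = 4 + 1
uint64_t mulBy7(uint64_t X) { return (X << 3) - X; } // 7 = 8 - 1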
3213 // TODO - do we have any exceptions?
3239 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3313 // There are only 32-bit and 64-bit forms for 'andn'.
3317 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3360 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3382 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3402 // at least imm32 mask (or be zext i32 -> i64).
3404 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3407 // We can only benefit if req at least 7-bit for the mask. We
3414 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3416 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3427 // Non-vector type and we have a zext mask with SRL.
3453 return N->getOpcode() != ISD::FP_EXTEND;
3458 assert(((N->getOpcode() == ISD::SHL &&
3459 N->getOperand(0).getOpcode() == ISD::SRL) ||
3460 (N->getOpcode() == ISD::SRL &&
3461 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3462 "Expected shift-shift mask");
3464 EVT VT = N->getValueType(0);
3467 // Only fold if the shift values are equal - so it folds to AND.
3468 // TODO - we should fold if either is a non-uniform vector but we don't do
3469 // the fold for non-splats yet.
3470 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3482 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3518 // TODO: Allow 64-bit type for 32-bit target.
3519 // TODO: 512-bit types should be allowed, but make sure that those
3612 /// Return true if every element in Mask is an in-place blend/select mask or is
3624 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3670 /// shuffle masks. The latter have the special property of a '-2' representing
3671 /// a zeroed lane of a vector.
3686 // a pair of values. If we find such a case, use the non-undef mask's value.
3730 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3784 // Use an UNDEF node if MaskElt == -1.
3785 // Split 64-bit constants in the 32-bit mode.
3872 // available, use a floating-point +0.0 instead.
3926 // This is the index of the first element of the vectorWidth-bit chunk
3928 IdxVal &= ~(ElemsPerChunk - 1);
3933 Vec->ops().slice(IdxVal, ElemsPerChunk));
3945 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3947 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3949 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3958 /// Generate a DAG to grab 256-bits from a 512-bit vector.
3981 // This is the index of the first element of the vectorWidth-bit chunk
3983 IdxVal &= ~(ElemsPerChunk - 1);
3989 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
3991 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3993 /// we want. It need not be aligned to a 128-bit boundary. That makes
4055 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4056 Ops.append(N->op_begin(), N->op_end());
4060 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4061 SDValue Src = N->getOperand(0);
4062 SDValue Sub = N->getOperand(1);
4063 const APInt &Idx = N->getConstantOperandAPInt(2);
4149 // If this is a splat value (with no-undefs) then use the lower subvector,
4187 // Make sure we only try to split 256/512-bit types to avoid creating
4213 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4214 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4259 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4271 // AVX512 broadcasts 32/64-bit operands.
4272 // TODO: Support float once getAVX512Node is used by fp-ops.
4283 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4318 // Perform the 512-bit op then extract the bottom subvector.
4324 /// Insert an i1-subvector into an i1-vector.
4394 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4400 unsigned ShiftLeft = NumElems - SubVecNumElems;
4401 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4427 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4442 unsigned ShiftLeft = NumElems - SubVecNumElems;
4443 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4469 unsigned LowShift = NumElems - IdxVal;
4507 "Expected a 128/256/512-bit vector type");
4521 // For 256-bit vectors, we only need the lower (128-bit) input half.
4522 // For 512-bit vectors, we only need the lower input half or quarter.
4562 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4564 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4565 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
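// [Illustrative sketch, not from X86ISelLowering.cpp] A standalone helper
// (hypothetical name) that builds the lane-free "spread" masks shown above
// for a vector of NumElts elements:
#include <vector>
static std::vector<int> getSpreadMask(unsigned NumElts, bool Lo) {
  std::vector<int> Mask;
  unsigned Offset = Lo ? 0 : NumElts / 2;
  for (unsigned I = 0; I != NumElts / 2; ++I) {
    Mask.push_back(Offset + I); // each source element is duplicated...
    Mask.push_back(Offset + I); // ...into an adjacent pair of result slots.
  }
  return Mask;
}
// For NumElts = 8: Lo -> {0,0,1,1,2,2,3,3}, Hi -> {4,4,5,5,6,6,7,7}.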
4630 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4664 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4706 // TODO: Add support for non-zero offsets.
4709 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4711 return CNode->getConstVal();
4717 return getTargetConstantFromBasePtr(Load->getBasePtr());
4812 Mask = CInt->getValue();
4816 Mask = CFP->getValueAPF().bitcastToAPInt();
4820 Type *Ty = CDS->getType();
4821 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4822 Type *EltTy = CDS->getElementType();
4823 bool IsInteger = EltTy->isIntegerTy();
4825 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4828 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4829 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4831 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4833 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4850 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4855 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4865 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4876 Type *CstTy = Cst->getType();
4877 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4878 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4881 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4889 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4900 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4903 SDValue Ptr = MemIntr->getBasePtr();
4915 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4924 SDValue Ptr = MemIntr->getBasePtr();
4928 Type *CstTy = Cst->getType();
4929 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4930 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4931 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4934 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4941 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4964 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4970 // If bitcasting to larger elements we might lose track of undefs - don't
4995 // TODO - support extract_subvector through bitcasts.
5017 // TODO - support shuffle through bitcasts.
5021 ArrayRef<int> Mask = SVN->getMask();
5050 if (UndefElts1[M - NumElts])
5052 EltBits.push_back(EltBits1[M - NumElts]);
5069 int SplatIndex = -1;
5074 SplatIndex = -1;
5108 // Match not(xor X, -1) -> X.
5109 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5110 // Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5111 // Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5135 // Don't fold min_signed_value -> (min_signed_value - 1)
5139 Elt -= 1;
5162 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5171 unsigned Repetitions = 1u << (NumStages - 1);
5219 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5225 /// It is an error to call this with non-empty Mask/Ops vectors.
5248 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5255 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5262 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5313 "Only 32-bit and 64-bit elements are supported!");
5316 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5326 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5335 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5342 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5349 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5355 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5361 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5371 // We only decode broadcasts of same-sized vectors, peeking through to
5405 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5419 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5426 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5452 unsigned CtrlImm = CtrlOp->getZExtValue();
5514 // inputs that are actually the same node. Re-map the mask to always point
5519 M -= Mask.size();
5521 // If we didn't already add operands in the opcode-specific code, default to
5587 int Scale = Size / V->getNumOperands();
5594 APInt Val = Cst->getAPIntValue();
5599 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5610 int Scale = V->getNumOperands() / Size;
5689 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5690 // TODO: We currently only set UNDEF for integer types - floats use the same
5704 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5827 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5838 // Attempt to decode as a per-byte mask.
5849 // We can't assume an undef src element gives an undef dst - the other src
5906 if (!N->isOnlyUserOf(Sub.getNode()))
5933 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6011 // Check we have an in-range constant insertion index.
6104 // PACKSS then it was likely being used for sign-extension for a
6106 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6111 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6164 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6190 Mask[i + j] = i + j - ByteShift;
6194 Mask[i + j - ByteShift] = i + j;
6206 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6239 // We can only handle all-signbits extensions.
6295 M -= MaskWidth;
6306 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6379 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6380 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6383 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6386 SDValue Ops[] = {Mem->getChain(), Ptr};
6390 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6408 int Elt = SV->getMaskElt(Index);
6413 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6447 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6478 // For insert_vector_elt - either return the index matching scalar or recurse
6525 assert(0 == i && "Expected insertion into zero-index");
6548 // SSE4.1 - use PINSRB to insert each byte directly.
6555 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6556 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6665 assert(Zeroable.size() - Zeroable.count() > 1 &&
6666 "We expect at least two non-zero elements!");
6679 // Make sure that this node is extracting from a 128-bit vector.
6704 Elt = Op->getOperand(EltIdx);
6735 SDValue Current = Op->getOperand(i);
6736 SDValue SrcVector = Current->getOperand(0);
6745 assert(V1.getNode() && "Expected at least two non-zero elements!");
6781 SDValue Ptr = LD->getBasePtr();
6782 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6784 EVT PVT = LD->getValueType(0);
6788 int FI = -1;
6791 FI = FINode->getIndex();
6795 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6802 // FIXME: 256-bit vector instructions don't require a strict alignment,
6805 SDValue Chain = LD->getChain();
6826 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6833 int EltNo = (Offset - StartOffset) >> 2;
6838 LD->getPointerInfo().getWithOffset(StartOffset));
6852 if (!BaseLd->isSimple())
6866 uint64_t Amt = AmtC->getZExtValue();
6880 uint64_t Idx = IdxC->getZExtValue();
6895 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6905 int LastLoadedElt = -1;
6936 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6947 // Handle Special Cases - all undef or undef/zero.
6964 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6966 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6978 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6983 EltIdx - FirstLoadedElt);
7004 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7005 assert(LDBase->isSimple() &&
7008 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7009 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7018 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7021 // LOAD - all consecutive load/undefs (must start/end with a load or be
7032 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7034 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7044 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7050 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7081 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7095 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7097 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7098 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7106 // BROADCAST - match the smallest possible repetition pattern, load that
7116 // Don't attempt a 1:N subvector broadcast - it should be caught by
7177 // are consecutive, non-overlapping, and in the right order.
7200 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7224 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7249 for (auto *U : N->uses()) {
7250 unsigned Opc = U->getOpcode();
7252 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7254 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7260 if (N->hasOneUse()) {
7263 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7283 // TODO: Splats could be generated for non-AVX CPUs using SSE
7284 // instructions, but there's less potential gain for only 128-bit vectors.
7288 MVT VT = BVOp->getSimpleValueType(0);
7298 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7342 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7347 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7365 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7381 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7397 if (!Ld || NumElts - NumUndefElts != 1)
7408 // TODO: Handle broadcasts of non-constant sequences.
7410 // Make sure that all of the users of a non-constant load are from the
7412 // FIXME: Is the use count needed for non-constant, non-load case?
7413 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7431 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7446 C = CI->getConstantIntValue();
7448 C = CF->getConstantFPValue();
7454 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7465 // Handle AVX2 in-register broadcasts.
7474 // Make sure the non-chain result is only used by this build vector.
7475 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7482 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7485 LN->getMemoryVT(), LN->getMemOperand());
7490 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7496 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7499 LN->getMemoryVT(), LN->getMemOperand());
7518 int Idx = ExtIdx->getAsZExtVal();
7522 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7533 SDValue ShuffleVec = SVOp->getOperand(0);
7538 int ShuffleIdx = SVOp->getMaskElt(Idx);
7559 SmallVector<int, 8> Mask(NumElems, -1);
7579 // Quit if non-constant index.
7647 int SplatIdx = -1;
7653 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7664 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7738 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7739 /// may not match the layout of an x86 256-bit horizontal instruction.
7753 /// horizontal operations, but the index-matching logic is incorrect for that.
7755 /// code because it is only used for partial h-op matching now?
7760 EVT VT = N->getValueType(0);
7761 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7769 unsigned NumElts = LastIdx - BaseIdx;
7775 SDValue Op = N->getOperand(i + BaseIdx);
7778 if (Op->isUndef()) {
7786 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7839 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7843 /// This function expects two 256-bit vectors called V0 and V1.
7844 /// At first, each vector is split into two separate 128-bit vectors.
7845 /// Then, the resulting 128-bit vectors are used to implement two
7850 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7853 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7854 /// horizontal binop dag node would take as input the lower 128-bit of V1
7855 /// and the upper 128-bit of V1.
7861 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7862 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7868 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7869 /// the upper 128-bits of the result.
7890 if (!isUndefLO && !V0->isUndef())
7892 if (!isUndefHI && !V1->isUndef())
7896 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7899 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7908 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7909 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7917 MVT VT = BV->getSimpleValueType(0);
7927 // Odd-numbered elements in the input build vector are obtained from
7929 // Even-numbered elements in the input build vector are obtained from
7933 SDValue Op = BV->getOperand(i);
8036 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8043 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8077 MVT VT = BV->getSimpleValueType(0);
8090 // There are no known X86 targets with 512-bit ADDSUB instructions!
8109 MVT VT = BV->getSimpleValueType(0);
8114 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8115 // half of the result is calculated independently from the 128-bit halves of
8116 // the inputs, so that makes the index-checking logic below more complicated.
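// [Illustrative sketch, not from X86ISelLowering.cpp] Scalar model of the
// 256-bit horizontal-add layout described above: each 128-bit lane of the
// result is computed only from the corresponding 128-bit lanes of the inputs.
#include <array>
using V8F = std::array<float, 8>;
static V8F hadd256(const V8F &A, const V8F &B) {
  V8F R{};
  for (int Lane = 0; Lane != 2; ++Lane) { // two independent 128-bit lanes
    int O = Lane * 4;                     // element offset of this lane
    R[O + 0] = A[O + 0] + A[O + 1];
    R[O + 1] = A[O + 2] + A[O + 3];
    R[O + 2] = B[O + 0] + B[O + 1];
    R[O + 3] = B[O + 2] + B[O + 3];
  }
  return R;
}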
8125 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8137 // clang-format off
8143 // clang-format on
8156 // The source vector is chosen based on which 64-bit half of the
8200 // This is free (examples: zmm --> xmm, xmm --> ymm).
8201 MVT VT = BV->getSimpleValueType(0);
8216 if (BV->getOperand(i).isUndef())
8236 // We need at least 2 non-undef elements to make this worthwhile by default.
8238 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8243 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8245 MVT VT = BV->getSimpleValueType(0);
8256 // Try harder to match 256-bit ops by using extract/concat.
8266 if (BV->getOperand(i)->isUndef())
8270 if (BV->getOperand(i)->isUndef())
8306 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8360 MVT VT = Op->getSimpleValueType(0);
8366 unsigned Opcode = Op->getOperand(0).getOpcode();
8368 if (Opcode != Op->getOperand(i).getOpcode())
8384 // Don't do this if the buildvector is a splat - we'd replace one
8386 if (Op->getSplatValue())
8394 for (SDValue Elt : Op->ops()) {
8415 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8443 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8444 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8445 // vpcmpeqd on 256-bit vectors.
8478 // Zero-extend the index elements within the vector.
8515 // e.g. v4i32 -> v16i8 (Scale = 4)
8564 // SSE41 can compare v2i64 - select between indices 0 and 1.
8716 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8717 // recognized as a permutation of a vector by indices in a non-constant vector.
8721 // ->
8726 // construction of vectors with constant-0 elements.
8733 // This is done by checking that the i-th build_vector operand is of the form:
8747 SDValue ExtractedIndex = Op->getOperand(1);
8764 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8808 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8815 NumConstants--;
8836 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8838 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8839 // and blend the FREEZE-UNDEF operands back in.
8840 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
8843 SmallVector<int, 16> BlendMask(NumElems, -1);
8847 BlendMask[i] = -1;
8874 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8875 UpperElems = NumElems - (NumElems / 4);
8878 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8880 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8897 // If we are inserting one variable into a vector of non-zero constants, try
8901 // constants. Insertion into a zero vector is handled as a special-case
8903 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8907 // Create an all-constant vector. The variable element in the old
8918 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8920 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8941 unsigned InsertC = InsIndex->getAsZExtVal();
8946 // There's no good way to insert into the high elements of a >128-bit
8949 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8958 // Special case for a single non-zero, non-undef element.
8963 // If we have a constant or non-constant insertion into the low element of
9009 // is a non-constant being inserted into an element other than the low one,
9028 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9035 // handled, so this is best done with a single constant-pool load.
9044 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9050 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9076 // For AVX-length vectors, build the individual 128-bit pieces and use
9083 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9085 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9091 // Let legalizer expand 2-wide build_vectors.
9159 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9182 // our (non-undef) elements to the full vector width with the element in the
9203 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9211 // 256-bit AVX can use the vinsertf128 instruction
9212 // to create 256-bit vectors from two other 128-bit ones.
9220 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9247 // If we have more than 2 non-zeros, build each half separately.
9250 ArrayRef<SDUse> Ops = Op->ops();
9279 // k-register.
9306 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9310 Log2_64(NonZeros) != NumOperands - 1) {
9322 // If there are zero or one non-zeros we can handle this very simply.
9336 ArrayRef<SDUse> Ops = Op->ops();
9367 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9368 // from two other 128-bit ones.
9370 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9374 //===----------------------------------------------------------------------===//
9383 //===----------------------------------------------------------------------===//
9385 /// Tiny helper function to identify a no-op mask.
9388 /// array input, which is assumed to be a single-input shuffle mask of the kind
9391 /// in-place shuffle are 'no-op's.
9394 assert(Mask[i] >= -1 && "Out of bound mask element!");
9404 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9420 /// Test whether there are elements crossing 128-bit lanes in this
9427 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9428 /// better support 'repeated mask + lane permute' style shuffles.
9440 int SrcLane = -1;
9455 /// Test whether a shuffle mask is equivalent within each sub-lane.
9458 /// lane-relative shuffle in each sub-lane. This trivially implies
9459 /// that it is also not lane-crossing. It may however involve a blend from the
9463 /// non-trivial to compute in the face of undef lanes. The representation is
9464 /// suitable for use with existing 128-bit shuffles as entries from the second
9470 RepeatedMask.assign(LaneSize, -1);
9480 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9485 // This is the first non-undef entry in this slot of a 128-bit lane.
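// [Illustrative sketch, not from X86ISelLowering.cpp] Simplified standalone
// version of the per-lane repeat test described above; the real helper works
// on MVTs/ArrayRef and handles target shuffle sentinels.
#include <vector>
static bool isRepeatedMask(const std::vector<int> &Mask, int LaneSize,
                           std::vector<int> &RepeatedMask) {
  int Size = (int)Mask.size();
  RepeatedMask.assign(LaneSize, -1);
  for (int I = 0; I != Size; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // undef matches any repeated value.
    if ((M % Size) / LaneSize != I / LaneSize)
      return false; // the element crosses lanes.
    // Lane-relative index, biased past LaneSize for the second input.
    int Local = (M % LaneSize) + (M < Size ? 0 : LaneSize);
    int &Slot = RepeatedMask[I % LaneSize];
    if (Slot < 0)
      Slot = Local; // first non-undef entry in this slot.
    else if (Slot != Local)
      return false; // the lanes disagree.
  }
  return true;
}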
9494 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9507 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9514 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9537 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9542 // This is the first non-undef entry in this slot of a 128-bit lane.
9551 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9620 /// each element of the mask is either -1 (signifying undef) or the value given
9630 assert(Mask[i] >= -1 && "Out of bound mask element!");
9636 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9637 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9649 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9666 // Check for out-of-range target shuffle mask indices.
9693 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9702 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9703 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9733 // Create 128-bit vector type based on mask size.
9766 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9768 /// This helper function produces an 8-bit shuffle immediate corresponding to
9775 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9776 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9777 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9778 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9779 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9781 // If the mask only uses one non-undef element, then fully 'splat' it to
9783 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
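// [Illustrative sketch, not from X86ISelLowering.cpp] The immediate packs two
// bits per mask element, lowest element in the lowest bits; undef slots may
// reuse any defined index (here the first one) since their value is ignored.
#include <cassert>
#include <cstdint>
static uint8_t getShuffleImm4(const int Mask[4]) {
  int First = 0;
  for (int I = 0; I != 4; ++I)
    if (Mask[I] >= 0) { First = Mask[I]; break; }
  uint8_t Imm = 0;
  for (int I = 0; I != 4; ++I) {
    int M = Mask[I] < 0 ? First : Mask[I]; // replace undef with a real index.
    assert(M >= 0 && M < 4 && "Out of bound mask element!");
    Imm |= (uint8_t)M << (I * 2);          // element I lives in bits [2I, 2I+1].
  }
  return Imm;
}
// e.g. the identity mask {0,1,2,3} packs to 0xE4.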
9808 // The function looks for a sub-mask in which the nonzero elements are in
9809 // increasing order. If such a sub-mask exists, the function returns true.
9813 int NextElement = -1;
9817 assert(Mask[i] >= -1 && "Out of bound mask element!");
10031 /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10032 /// followed by unpack 256-bit.
10048 // This is a "natural" unpack operation (rather than the 128-bit sectored
10049 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10076 unsigned UpperElts = NumElts - NumSrcElts;
10126 // Non-VLX targets must truncate from a 512-bit type, so we need to
10173 unsigned UpperElts = NumElts - NumSrcElts;
10222 // TODO: Support non-BWI VPMOVWB truncations?
10237 unsigned UpperElts = NumElts - NumSrcElts;
10264 // and truncate from the double-sized src.
10307 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
10318 "We should only be called with masks with a power-of-2 size!");
10320 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10323 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10340 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10374 unsigned NumPackedBits = NumSrcBits - BitSize;
10448 // Don't lower multi-stage packs on AVX512, truncation is better.
10453 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10524 return SDValue(); // No non-zeroable elements!
10579 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10604 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10632 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10657 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10668 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10672 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10679 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10686 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10696 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10712 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10715 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10743 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10744 // allow that load-folding possibility.
10753 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10755 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10766 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10790 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10802 /// a single-input permutation.
10805 /// then reduce the shuffle to a single-input permutation.
10813 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10814 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10841 /// a single-input permutation.
10844 /// then reduce the shuffle to a single-input (wider) permutation.
10870 NormM -= NumElts;
10895 SmallVector<int, 32> PermuteMask(NumElts, -1);
10902 NormM -= NumElts;
10910 assert(PermuteMask[Elt] != -1 &&
10936 // This routine only supports 128-bit integer dual input vectors.
10948 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10949 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11005 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11014 // half-crossings are created.
11017 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11025 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11037 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11073 M -= NumElts;
11084 // TODO - it might be worth doing this for unary shuffles if the permute
11107 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11109 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11166 SmallVector<int, 32> V1Mask(NumElts, -1);
11167 SmallVector<int, 32> V2Mask(NumElts, -1);
11168 SmallVector<int, 32> FinalMask(NumElts, -1);
11176 V2Mask[i] = M - NumElts;
11184 // and change \p InputMask to be a no-op (identity) mask.
11205 // It is possible that the shuffle for one of the inputs is already a no-op.
11206 // See if we can simplify non-no-op shuffles into broadcasts,
11215 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11217 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11218 // pre-shuffle first is a better strategy.
11238 // Unpack/rotate failed - try again with variable blends.
11250 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11253 V1Mask.assign(NumElts, -1);
11254 V2Mask.assign(NumElts, -1);
11255 FinalMask.assign(NumElts, -1);
11263 V2Mask[i + (j / 2)] = M - NumElts;
11277 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11278 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11286 return -1;
11311 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11319 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11344 // [-1, 12, 13, 14, -1, -1, 1, -1]
11345 // [-1, -1, -1, -1, -1, -1, 1, 2]
11347 // [-1, 4, 5, 6, -1, -1, 9, -1]
11348 // [-1, 4, 5, 6, -1, -1, -1, -1]
11359 int StartIdx = i - (M % NumElts);
11362 return -1;
11367 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11373 return -1;
11390 return -1;
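// [Illustrative sketch, not from X86ISelLowering.cpp] The rotation-matching
// idea from the fragments above, with the head/tail input tracking omitted:
// every defined mask element must imply the same rotation amount.
#include <vector>
static int matchRotationAmount(const std::vector<int> &Mask, int NumElts) {
  int Rotation = 0; // 0 means "not determined yet".
  for (int I = 0; I != (int)Mask.size(); ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // undef can match any rotation.
    // Where would a rotated vector containing this element have to start?
    int StartIdx = I - (M % NumElts);
    if (StartIdx == 0)
      return -1; // the identity rotation is not interesting.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // the elements disagree on the rotation amount.
  }
  return Rotation;
}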
11410 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11421 /// specified as a *right shift* because x86 is little-endian, it is a *left
11427 return -1;
11429 // PALIGNR works on 128-bit lanes.
11432 return -1;
11436 return -1;
11449 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11465 "512-bit PALIGNR requires BWI instructions");
11472 "Rotate-based lowering only supports 128-bit lowering!");
11474 "Can shuffle at most 16 bytes in a 128-bit vector!");
11479 int LoByteShift = 16 - ByteRotation;
11500 /// specified as a *right shift* because x86 is little-endian, it is a *left
11508 "Only 32-bit and 64-bit elements are supported!");
11510 // 128/256-bit vectors are only supported with VLX.
11512 && "VLX required for 128/256-bit vectors");
11520 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11522 // TODO: We can probably make this more aggressive and use shift-pairs like
11534 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11537 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11543 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11558 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11559 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11569 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11584 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11585 // 01234567 --> 4567zzzz --> zzzzz456
11586 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11588 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11603 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11620 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11623 /// bit-wise element shifts and the byte shift across an entire 128-bit double
11626 /// PSHL : (little-endian) left bit shift.
11628 /// [ -1, 4, zz, -1 ]
11629 /// PSRL : (little-endian) right bit shift.
11631 /// [ -1, -1, 7, zz]
11632 /// PSLLDQ : (little-endian) left byte shift
11634 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
11635 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
11636 /// PSRLDQ : (little-endian) right byte shift
11638 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
11639 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
11650 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11660 unsigned Len = Scale - Shift;
11662 return -1;
11682 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11699 return -1;
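// [Illustrative sketch, not from X86ISelLowering.cpp] A whole-vector,
// right-shift-only version of the matching described above: result element I
// must be source element I + Shift, and everything shifted in past the end
// must be zeroable.
#include <vector>
static int matchRightShiftMask(const std::vector<int> &Mask,
                               const std::vector<bool> &Zeroable) {
  int NumElts = (int)Mask.size();
  for (int Shift = 1; Shift != NumElts; ++Shift) {
    bool Match = true;
    for (int I = 0; I != NumElts && Match; ++I) {
      if (I + Shift < NumElts)
        Match = Mask[I] < 0 || Mask[I] == I + Shift; // shifted-down element
      else
        Match = Zeroable[I]; // a zero is shifted in at the top.
    }
    if (Match)
      return Shift;
  }
  return -1; // not a simple right shift.
}
// e.g. Mask = {1,2,3,4,5,6,7,-1} with element 7 zeroable matches Shift = 1,
// analogous to the little-endian right-shift patterns shown above.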
11756 for (; Len > 0; --Len)
11757 if (!Zeroable[Len - 1])
11763 int Idx = -1;
11776 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11778 Idx = M - i;
11796 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11826 int Len = Hi - Idx;
11838 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11841 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11844 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11915 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11918 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11926 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11939 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11945 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11946 -1};
11953 int PSHUFDMask[4] = {Offset / 2, -1,
11954 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11958 int PSHUFWMask[4] = {1, -1, -1, -1};
11967 // to 64-bits.
12015 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12017 ShMask[i - AlignToUnpack] = i;
12019 Offset -= AlignToUnpack;
12027 Offset -= (NumElements / 2);
12047 /// match this pattern. It will use all of the micro-architectural details it
12048 /// can to emit an efficient lowering. It handles both blends with all-zero
12049 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12052 /// The reason we have dedicated lowering for zext-style shuffles is that they
12063 "Exceeds 32-bit integer zero extension limit");
12066 // Define a helper function to check a particular ext-scale and lower to it if
12068 auto Lower = [&](int Scale) -> SDValue {
12093 Offset = M - (i / Scale);
12095 return SDValue(); // Flip-flopping inputs.
12097 // Offset must start in the lowest 128-bit lane or at the start of an
12110 return SDValue(); // Non-consecutive strided elements.
12114 // If we fail to find an input, we have a zero-shuffle which should always
12129 // The widest scale possible for extending is to a 64-bit integer.
12143 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12148 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
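// [Illustrative sketch, not from X86ISelLowering.cpp] A simplified form of
// the per-scale check described above: for a given Scale, every Scale-th
// result element must form one consecutive run of source elements and the
// positions in between must be zeroable (or undef).
#include <vector>
static bool matchZExtShuffle(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable, int Scale) {
  int NumElts = (int)Mask.size();
  int Offset = -1; // first source element of the run, once known.
  for (int I = 0; I != NumElts; ++I) {
    int M = Mask[I];
    if (I % Scale != 0) {
      if (M >= 0 && !Zeroable[I])
        return false; // the padding positions only ever hold zeros.
      continue;
    }
    if (M < 0)
      continue; // an undef strided element is fine.
    if (Offset < 0)
      Offset = M - (I / Scale); // anchor the run on the first defined element.
    if (M != Offset + I / Scale)
      return false; // non-consecutive strided elements.
  }
  return true;
}
// e.g. zero-extending the low half of a v16i8 to v8i16 uses
// Mask = {0, zz, 1, zz, ..., 7, zz}, which matches Scale = 2 with Offset = 0.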
12202 return V->hasOneUse() &&
12229 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12239 // Bail if a non-zero V1 isn't used in place.
12242 V1Mask[V2Index] = -1;
12252 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12258 // Using zext to expand a narrow element won't work for non-zero
12264 // Zero-extend directly to i32.
12269 // and OR with the zero-extended scalar.
12290 // this. We can't support integer vectors or non-zero targets cheaply.
12291 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12338 /// Try to lower broadcast of a single - truncated - integer element,
12352 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12353 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12379 // If we're extracting non-least-significant bits, shift so we can truncate.
12396 // This routine only handles 128-bit shufps.
12398 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12399 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12400 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12401 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12413 /// Test whether the specified input (0 or 1) is in-place blended by the
12428 /// If we are extracting two 128-bit halves of a vector and shuffling the
12429 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12430 /// multi-shuffle lowering.
12437 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12470 NewMask.append(NumElts, -1);
12472 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12475 // This is free: ymm -> xmm.
12483 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12547 BitOffset -= BeginOffset;
12557 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12583 cast<LoadSDNode>(V)->isSimple()) {
12584 // We do not check for one-use of the vector load because a broadcast load
12590 SDValue BaseAddr = Ld->getOperand(1);
12593 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12599 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12602 SDValue Ops[] = {Ld->getChain(), NewAddr};
12606 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12611 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12613 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12619 // We can only broadcast from the zero-element of a vector register,
12620 // but it can be advantageous to broadcast from the zero-element of a
12625 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12629 // Only broadcast the zero-element of a 128-bit subvector.
12634 "Unexpected bit-offset");
12660 // We only support broadcasting from 128-bit vectors to minimize the
12662 // 128-bits, removing as many bitcasts as possible.
12693 int VADstIndex = -1;
12694 int VBDstIndex = -1;
12710 // We can only insert a single non-zeroable element.
12723 // Don't bother if we have no (non-zeroable) element for insertion.
12737 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12741 // the zero mask and the V2 insertion - so remove V1 dependency.
12783 /// Handle lowering of 2-lane 64-bit floating point shuffles.
12785 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
12821 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12822 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12837 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12838 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12844 // blend patterns if a zero-blend above didn't work.
12863 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12868 /// Handle lowering of 2-lane 64-bit integer shuffles.
12870 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12892 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12893 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12894 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12895 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12901 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12902 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12941 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12982 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12994 NewMask[V2Index] -= 4;
12999 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13018 NewMask[2] -= 4;
13019 NewMask[3] -= 4;
13024 NewMask[0] -= 4;
13025 NewMask[1] -= 4;
13037 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13038 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13061 /// Lower 4-lane 32-bit floating point shuffles.
13128 // There are special ways we can lower some single-element blends. However, we
13129 // have custom ways we can lower more complex single-element blends below that
13131 // when the V2 input is targeting element 0 of the mask -- that is the fast
13166 /// Lower 4-lane i32 vector shuffles.
13168 /// We try to handle these with integer-domain shuffles where we can, but for
13200 // Try to use broadcast unless the mask only has one non-undef element.
13233 // There are special ways we can lower some single-element blends.
13256 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13295 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13307 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13309 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13310 /// vector, form the analogous 128-bit 8-element Mask.
13330 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13343 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13344 int NumHToL = LoInputs.size() - NumLToL;
13345 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13346 int NumHToH = HiInputs.size() - NumLToH;
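// [Illustrative sketch, not from X86ISelLowering.cpp] The four counts above
// classify every defined element of a single-input v8i16 mask by which half
// of the result it lands in and which half of the source it reads:
#include <vector>
struct HalfCounts { int LToL = 0, HToL = 0, LToH = 0, HToH = 0; };
static HalfCounts countHalfInputs(const std::vector<int> &Mask) {
  HalfCounts C;
  for (int I = 0; I != 8; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // undef contributes to no bucket.
    bool DstHi = I >= 4; // which half of the result.
    bool SrcHi = M >= 4; // which half of the source.
    if (!DstHi)
      SrcHi ? ++C.HToL : ++C.LToL;
    else
      SrcHi ? ++C.HToH : ++C.LToH;
  }
  return C;
}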
13352 // If we are shuffling values from one half - check how many different DWORD
13366 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13398 DWordPairs.resize(2, std::make_pair(-1, -1));
13408 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13413 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13414 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13416 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13417 // and an existing 2-into-2 on the other half. In this case we may have to
13418 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13419 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13420 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13421 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13422 // half than the one we target for fixing) will be fixed when we re-enter this
13426 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13427 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13429 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13431 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13432 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13434 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13435 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13462 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13469 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13471 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13472 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13473 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13477 // to balance this to ensure we don't form a 3-1 shuffle in the other
13545 // Recurse back into this routine to re-compute state now that this isn't
13559 int PSHUFLMask[4] = {-1, -1, -1, -1};
13560 int PSHUFHMask[4] = {-1, -1, -1, -1};
13561 int PSHUFDMask[4] = {-1, -1, -1, -1};
13564 // original halves. This will then dictate the targets of the cross-half
13573 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13574 InPlaceInputs[0] - HalfOffset;
13581 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13588 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13589 InPlaceInputs[0] - HalfOffset;
13593 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13600 // Now gather the cross-half inputs and place them into a free dword of
13603 // look more like the 3-1 fixing operation.
13628 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13629 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13630 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13631 Input - SourceOffset;
13634 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13637 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13639 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13640 Input - SourceOffset &&
13643 // Note that this correctly re-maps both when we do a swap and when
13646 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13650 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13651 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13653 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13658 // And just directly shift any other-half mask elements to be same-half
13663 M = M - SourceOffset + DestOffset;
13673 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13674 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13676 SourceHalfMask[InputFixed - SourceOffset] =
13677 IncomingInputs[0] - SourceOffset;
13684 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13685 // We have two non-adjacent or clobbered inputs we need to extract from
13688 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13689 IncomingInputs[1] - SourceOffset};
13715 // (because there are no off-half inputs to this half) and there is no
13717 // swap an input with a non-input.
13795 M -= 4;
13803 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13827 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13856 /// Generic lowering of 8-lane i16 shuffles.
13858 /// This handles both single-input shuffles and combined shuffle/blends with
13863 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13927 "All single-input shuffles should be canonicalized to be V1-input "
13942 // There are special ways we can lower some single-element blends.
13992 // Check if this is part of a 256-bit vector truncation.
14008 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14038 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14065 // We can always bit-blend if we have to so the fallback strategy is to
14066 // decompose into single-input permutes and blends/unpacks.
14071 /// Lower 8-lane 16-bit floating point shuffles.
14100 // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14101 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14121 M += (Scale - 1) * NumElts;
14144 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14145 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14193 // For single-input shuffles, there are some nicer lowering tricks we can use.
14209 // Notably, this handles splat and partial-splat shuffles more efficiently.
14210 // However, it only makes sense if the pre-duplication shuffle simplifies
14212 // express the pre-duplication shuffle as an i16 shuffle.
14223 auto tryToWidenViaDuplication = [&]() -> SDValue {
14240 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14284 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14287 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14323 // blends but after all of the single-input lowerings. If the single input
14332 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14345 // do so. This avoids using them to handle blends-with-zero which is
14358 // FIXME: It might be worth trying to detect if the unpack-feeding
14365 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14376 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14386 // There are special ways we can lower some single-element blends.
14411 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14443 // Handle multi-input cases by blending/unpacking single-input shuffles.
14448 // The fallback path for single-input shuffles widens this into two v8i16
14453 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14454 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14497 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
14499 /// This routine breaks down the specific type of 128-bit shuffle and
14534 /// Generic routine to split vector shuffle into half-sized shuffles.
14543 "Only for 256-bit or wider vector shuffles!");
14555 // Use splitVector/extractSubVector so that split build-vectors just build two
14568 // Now create two 4-way blends of these half-width vectors.
14589 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14600 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14601 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14602 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14606 V2BlendMask[i] = M - NumElements;
14619 // a minimal number of high-level vector shuffle nodes.
14638 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14663 /// This is provided as a good fallback for many lowerings of non-single-input
14664 /// shuffles with more than one 128-bit lane. In those cases, we want to select
14665 /// between splitting the shuffle into 128-bit components and stitching those
14666 /// back together vs. extracting the single-input shuffles and blending those
14672 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14680 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14684 V2BroadcastIdx = M - Size;
14685 else if (M - Size != V2BroadcastIdx)
14699 // If the inputs all stem from a single 128-bit lane of each input, then we
14715 // requires that the decomposed single-input shuffles don't end up here.
14721 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
14728 int LHSMask[4] = {-1, -1, -1, -1};
14729 int RHSMask[4] = {-1, -1, -1, -1};
14750 /// Lower a vector shuffle crossing multiple 128-bit lanes as
14751 /// a lane permutation followed by a per-lane permutation.
14753 /// This is mainly for cases where we can have non-repeating permutes
14771 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14813 // TODO - isShuffleMaskInputInPlace could be extended to something like
14825 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14848 // Then attempt a solution with 64-bit sublanes (vpermq).
14852 // If that doesn't work and we have fast variable cross-lane shuffle,
14853 // attempt 32-bit sublanes (vpermd).
14874 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14878 /// single-input cross lane shuffle which is lower than any other fully general
14879 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14884 // FIXME: This should probably be generalized for 512-bit vectors as well.
14885 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14896 // If there are only inputs from one 128-bit lane, splitting will in fact be
14914 // TODO - we could support shuffling V2 in the Flipped input.
14922 "In-lane shuffle mask expected");
14930 // Flip the lanes, and shuffle the results which should now be in-lane.
14939 /// Handle lowering 2-lane 128-bit shuffles.
14987 // Blends are faster and handle all the non-lane-crossing cases.
14995 // Check for patterns which can be matched with a single insert of a 128-bit
15001 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15023 // Otherwise form a 128-bit permutation. After accounting for undefs,
15024 // convert the 64-bit shuffle mask selection values into 128-bit
15029 // [1:0] - select 128 bits from sources for low half of destination
15030 // [2] - ignore
15031 // [3] - zero low half of destination
15032 // [5:4] - select 128 bits from sources for high half of destination
15033 // [6] - ignore
15034 // [7] - zero high half of destination
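// (Illustrative helper, not part of this file: packing the fields listed above
// into the 8-bit immediate of the 128-bit permute. The parameter names are mine.)
static unsigned getPerm2X128Immediate(unsigned LoSel, bool ZeroLo,
                                      unsigned HiSel, bool ZeroHi) {
  return (LoSel & 0x3)          // [1:0] source of the low 128-bit half
         | (ZeroLo ? 0x8 : 0)   // [3]   zero the low half instead
         | ((HiSel & 0x3) << 4) // [5:4] source of the high 128-bit half
         | (ZeroHi ? 0x80 : 0); // [7]   zero the high half instead
}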
15053 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
15071 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15072 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15077 int Srcs[2] = {-1, -1};
15078 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15177 SmallVector<int, 16> NewMask(NumElts, -1);
15181 int M = -1;
15192 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15198 int M = -1;
15209 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15214 NewMask[i] = -1;
15246 HalfIdx1 = -1;
15247 HalfIdx2 = -1;
15321 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15329 "Expected 256-bit or 512-bit vector");
15379 // Always extract lowers when setting lower - these are all free subreg ops.
15385 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15400 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15413 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15415 // AVX2 has efficient 64-bit element cross-lane shuffles.
15419 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15431 /// Handle case where shuffle sources are coming from the same 128-bit lane and
15432 /// every lane can be represented as the same repeating mask - allowing us to
15451 // accounting for UNDEFs but only references the lowest 128-bit
15469 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15477 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15492 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15507 // can form a repeating shuffle mask (local to each sub-lane). At the same
15508 // time, determine the source sub-lane for each destination sub-lane.
15509 int TopSrcSubLane = -1;
15510 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15516 // Extract the sub-lane mask, check that it all comes from the same lane
15518 int SrcLane = -1;
15519 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15532 // Whole sub-lane is UNDEF.
15536 // Attempt to match against the candidate repeated sub-lane masks.
15552 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15562 // Track the topmost source sub-lane - by setting the remaining to
15570 // Bail if we failed to find a matching repeated sub-lane mask.
15578 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15591 // Shuffle each source sub-lane to its destination.
15592 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15613 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15614 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15616 // Otherwise we can only permute whole 128-bit lanes.
15693 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15718 if (Zeroable.countl_one() < (Mask.size() - 8))
15746 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15748 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
15749 // pair of 256-bit shuffles and makes sure the masks are consecutive.
15784 for (SDNode *User : V1->uses())
15785 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15786 User->getOperand(1) == V2)
15791 // Find out which half of the 512-bit shuffles is each smaller shuffle
15796 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15797 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15800 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15801 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15823 /// Handle lowering of 4-lane 64-bit floating point shuffles.
15825 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15850 // Non-half-crossing single input shuffles can be lowered with an
15863 // Try to create an in-lane repeating shuffle mask and then shuffle the
15869 // Try to permute the lanes and then use a per-lane permute.
15911 // Try to create an in-lane repeating shuffle mask and then shuffle the
15917 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15943 /// Handle lowering of 4-lane 64-bit integer shuffles.
15977 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16031 // Try to create an in-lane repeating shuffle mask and then shuffle the
16042 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16056 /// Handle lowering of 8-lane 32-bit floating point shuffles.
16058 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16090 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16116 // Try to create an in-lane repeating shuffle mask and then shuffle the
16123 // two 128-bit lanes use the variable mask to VPERMILPS.
16138 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16160 // For non-AVX512, if the Mask is made of 16-bit elements in each lane, then try to split
16178 /// Handle lowering of 8-lane 32-bit integer shuffles.
16207 // For non-AVX512, if the Mask is made of 16-bit elements in each lane, then try to split
16236 // If the shuffle mask is repeated in each 128-bit lane we can use more
16237 // efficient instructions that mirror the shuffles across the two 128-bit
16280 // Try to create an in-lane repeating shuffle mask and then shuffle the
16287 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16293 // generate a cross-lane VPERMD instruction.
16309 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16320 /// Handle lowering of 16-lane 16-bit integer shuffles.
16374 // Try to create an in-lane repeating shuffle mask and then shuffle the
16386 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16391 // There are no generalized cross-lane shuffle operations available on i16
16404 // As this is a single-input shuffle, the repeated mask should be
16416 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16420 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16426 // Try to permute the lanes and then use a per-lane permute.
16443 /// Handle lowering of 32-lane 8-bit integer shuffles.
16503 // Try to create an in-lane repeating shuffle mask and then shuffle the
16509 // There are no generalized cross-lane shuffle operations available on i8
16512 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16529 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16533 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16539 // Try to permute the lanes and then use a per-lane permute.
16564 /// High-level routine to lower various 256-bit x86 vector shuffles.
16566 /// This routine either breaks down the specific type of a 256-bit x86 vector
16567 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
16588 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16590 // querying in the per-vector-type lowering routines. With AVX1 we have
16591 // essentially *zero* ability to manipulate a 256-bit vector with integer
16598 // for masking/blending then decompose into 128-bit vectors.
16636 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16640 /// Try to lower a vector shuffle as a 128-bit shuffles.
16652 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16670 // Check for patterns which can be matched with a single insert of a 256-bit
16683 // See if this is an insertion of the lower 128-bits of V2 into V1.
16685 int V2Index = -1;
16687 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16698 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16713 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
16725 int PermMask[4] = {-1, -1, -1, -1};
16728 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16746 /// Handle lowering of 8-lane 64-bit floating point shuffles.
16761 // Non-half-crossing single input shuffles can be lowered with an
16800 /// Handle lowering of 16-lane 32-bit floating point shuffles.
16809 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16845 // Try to create an in-lane repeating shuffle mask and then shuffle the
16852 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16867 /// Handle lowering of 8-lane 64-bit integer shuffles.
16884 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16886 // 128-bit lanes.
16940 /// Handle lowering of 16-lane 32-bit integer shuffles.
16970 // If the shuffle mask is repeated in each 128-bit lane we can use more
16971 // efficient instructions that mirror the shuffles across the four 128-bit
17019 // Try to create an in-lane repeating shuffle mask and then shuffle the
17037 /// Handle lowering of 32-lane 16-bit integer shuffles.
17045 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17082 // As this is a single-input shuffle, the repeated mask should be
17101 /// Handle lowering of 64-lane 8-bit integer shuffles.
17109 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17153 // Try to create an in-lane repeating shuffle mask and then shuffle the
17168 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17181 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17195 /// High-level routine to lower various 512-bit x86 vector shuffles.
17197 /// This routine either breaks down the specific type of a 512-bit x86 vector
17198 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
17206 "Cannot lower 512-bit vectors w/ basic ISA!");
17270 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17282 int ShiftAmt = -1;
17291 // The first non-undef element determines our shift amount.
17293 ShiftAmt = M - i;
17298 // All non-undef elements must shift by the same amount.
17299 if (ShiftAmt != M - i)
17313 // Returns the shift amount if possible or -1 if not. This is a simplified
17321 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17330 unsigned Len = Size - Shift;
17341 return -1;
17346 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17347 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
17355 "Cannot lower 512-bit vectors w/o basic ISA!");
17362 int Src = -1;
17382 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17411 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17413 ShiftAmt += WideElts - NumElts;
17427 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17430 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17449 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17454 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17455 // 256-bit operation available.
17459 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17460 // 256-bit operation available.
17564 // are preferable to blendw/blendvb/masked-mov.
17572 switch (V->getOpcode()) {
17591 if (!V->hasOneUse())
17609 /// Top-level lowering for x86 vector shuffles.
17619 ArrayRef<int> OrigMask = SVOp->getMask();
17641 // Check for non-undef masks pointing at an undef vector and make the masks
17649 M = -1;
17657 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17660 // We actually see shuffles that are entirely re-arrangements of a set of
17675 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17683 // TODO: Avoid lowering directly from this top-level function: make this
17684 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17698 // Modify the new Mask to take all zeros from the all-zero vector.
17699 // Choose indices that are blend-friendly.
17702 "V2's non-undef elements are used?!");
17708 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17766 // Only non-legal VSELECTs reach this lowering, convert those into generic
17767 // shuffles and re-use the shuffle lowering path for blends.
17798 // Try to lower this to a blend-style vector shuffle. This can handle all
17804 // with patterns on the mask registers on AVX-512.
17821 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17822 // into an i1 condition so that we can use the mask-based 512-bit blend
17863 // VSELECT-matching blend, return Op, but if we need to expand, return
17909 unsigned IdxVal = Idx->getAsZExtVal();
17923 SDNode *User = *Op.getNode()->use_begin();
17924 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17925 (User->getOpcode() != ISD::BITCAST ||
17926 User->getValueType(0) != MVT::i32))
17940 /// AVX-512 feature.
17957 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17971 unsigned IdxVal = IdxC->getZExtValue();
17988 MVT VT = N->getSimpleValueType(0);
17991 for (SDNode *User : N->uses()) {
17992 switch (User->getOpcode()) {
17996 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18000 DemandedElts.setBit(User->getConstantOperandVal(1));
18003 if (!User->getValueType(0).isSimple() ||
18004 !User->getValueType(0).isVector()) {
18036 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18044 // | Uops | 0 - DV | 5 | 6 | 7 | |
18045 // ---------------------------------------------
18056 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18057 // ---------------------------------------------------------
18058 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18059 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18066 unsigned IdxVal = IdxC->getZExtValue();
18068 // If this is a 256-bit vector result, first extract the 128-bit vector and
18069 // then extract the element from the 128-bit vector.
18071 // Get the 128-bit vector.
18080 IdxVal &= ElemsPerChunk - 1;
18111 // Only extract a single element from a v16i8 source - determine the common
18112 // DWORD/WORD that all extractions share, and extract the sub-byte.
18118 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18149 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18166 int Mask[2] = { 1, -1 };
18176 /// AVX-512 feature.
18197 // Copy into a k-register, extract to v1i1 and insert_subvector.
18229 // possible vector indices, and FP insertion has less gpr->simd traffic.
18249 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
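// (Scalar per-lane model of the compare+blend rewrite above; illustration only,
// the names are mine.)
static void insertEltViaSelect(float *Vec, unsigned NumElts, float Val,
                               unsigned Idx) {
  for (unsigned i = 0; i != NumElts; ++i) // SplatN2 == {0,1,2,...}
    Vec[i] = (i == Idx) ? Val : Vec[i];   // ? SplatN1 : N0
}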
18254 if (N2C->getAPIntValue().uge(NumElts))
18256 uint64_t IdxVal = N2C->getZExtValue();
18262 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18287 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18290 // With a 256-bit vector, we can insert into the zero element efficiently
18295 // doing anyway after extracting to a 128-bit vector.
18306 "Vectors will always have power-of-two number of elements.");
18308 // If we are not inserting into the low 128-bit vector chunk,
18322 // Get the desired 128-bit vector chunk.
18327 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18335 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18388 // If this is an insertion of 32-bits into the low 32-bits of
18393 // generate insertps because blendps does not have a 32-bit memory
18423 // If this is a 256-bit vector result, first insert into a 128-bit
18424 // vector and then insert into the 256-bit vector.
18426 // Insert into a 128-bit vector.
18433 // Insert the 128-bit vector.
18485 // References to absolute symbols are never PC-relative.
18486 if (GV && GV->isAbsoluteSymbolRef())
18489 // The following OpFlags under RIP-rel PIC use RIP.
18518 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18540 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18564 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18565 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18591 GV = G->getGlobal();
18592 Offset = G->getOffset();
18595 ExternalSym = ES->getSymbol();
18615 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18648 // If there was a non-zero offset that we didn't fold, create an explicit
18673 auto UI = TGA->use_begin();
18675 if (UI != TGA->use_end())
18676 return SDValue(*UI->use_begin()->use_begin(), 0);
18678 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18679 GA->getOffset(), OperandFlags);
18752 MFI->incNumLocalDynamicTLSAccesses();
18774 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18775 GA->getValueType(0),
18776 GA->getOffset(), OperandFlags);
18789 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18798 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18816 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18818 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18819 GA->getOffset(), OperandFlags);
18846 const GlobalValue *GV = GA->getGlobal();
18887 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18888 GA->getValueType(0),
18889 GA->getOffset(), OpFlag);
18932 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18933 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18949 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18971 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18972 GA->getValueType(0),
18973 GA->getOffset(), X86II::MO_SECREL);
19000 // offset and returning `true` for TLS-desc currently duplicates both
19001 // which is detrimental :-/
19017 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19027 bool IsStrict = Op->isStrictFPOpcode();
19039 // Using 256-bit to ensure result is 128-bits for f32 case.
19060 // Try to use a packed vector operation to handle i64 on 32-bit targets.
19068 bool IsStrict = Op->isStrictFPOpcode();
19120 /// round-trip between XMM and GPR.
19132 // See if we have a 128-bit vector cast op for this type of cast.
19141 // If we are extracting from a non-zero element, first shuffle the source
19144 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19148 // If the source vector is wider than 128-bits, extract the low part. Do not
19153 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19154 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19161 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19178 // See if we have 128-bit vector cast instructions for this type of cast.
19191 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19197 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19213 bool IsStrict = Op->isStrictFPOpcode();
19214 MVT VT = Op->getSimpleValueType(0);
19215 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19224 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19238 {Op->getOperand(0), Src});
19252 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19253 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19301 bool IsStrict = Op->isStrictFPOpcode();
19303 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19340 bool IsStrict = Op->isStrictFPOpcode();
19343 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19413 // Bitcasting to f64 here allows us to do a single 64-bit store from
19415 // with two 32-bit stores.
19478 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19487 /// 64-bit unsigned integer to double expansion.
19491 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19493 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19494 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19526 // Load the 64-bit value into an XMM register.
19539 // TODO: Are there any fast-math-flags to propagate here?
19547 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
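// (A scalar model of the bias trick this expansion builds with PUNPCKLDQ/SUBPD
// and the shuffle+add above; my own illustration, assuming IEEE-754 binary64.)
#include <cstdint>
#include <cstring>
static double U64ToF64Model(uint64_t V) {
  uint64_t LoBits = 0x4330000000000000ULL | (V & 0xffffffffULL); // 2^52 + lo32
  uint64_t HiBits = 0x4530000000000000ULL | (V >> 32);           // 2^84 + hi32*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // Both subtractions are exact; only the final add rounds. For V == 0 under
  // round-toward-negative this chain yields -0.0, the strict-fp caveat above.
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52);
}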
19555 /// 32-bit unsigned integer to float expansion.
19559 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19564 // Load the 32-bit value into an XMM register.
19581 if (Op.getNode()->isStrictFPOpcode()) {
19583 // TODO: Are there any fast-math-flags to propagate here?
19599 // TODO: Are there any fast-math-flags to propagate here?
19612 bool IsStrict = Op->isStrictFPOpcode();
19625 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19644 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
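// (Scalar model of the 2^52 trick described above; illustration only, assuming
// IEEE-754 binary64.)
#include <cstdint>
#include <cstring>
static double U32ToF64Model(uint32_t V) {
  uint64_t Bits = 0x4330000000000000ULL | V; // bit pattern of 2^52 + V, exact
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                       // exact subtraction -> (double)V
}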
19662 bool IsStrict = Op->isStrictFPOpcode();
19663 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19669 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19671 MVT VT = Op->getSimpleValueType(0);
19673 // v8i32->v8f64 is legal with AVX512 so just return it.
19690 {Op->getOperand(0), V});
19705 Op->getSimpleValueType(0) == MVT::v4f64) {
19738 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19745 if (VecFloatVT != Op->getSimpleValueType(0))
19749 // - The vector of constants:
19750 // -- 0x4b000000
19751 // -- 0x53000000
19752 // - A shift:
19753 // -- v >> 16
19796 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19798 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19801 // TODO: Are there any fast-math-flags to propagate here?
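// (Scalar model of the two-half split above; my own illustration, assuming
// IEEE-754 binary32, not the vector DAG built here.)
#include <cstdint>
#include <cstring>
static float U32ToF32Model(uint32_t V) {
  uint32_t LoBits = 0x4b000000u | (V & 0xffffu); // 2^23 + low 16 bits, exact
  uint32_t HiBits = 0x53000000u | (V >> 16);     // 2^39 + (high 16 bits << 16), exact
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // folds both biases, still exact
  return FHi + Lo;                          // single final rounding == (float)V
}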
19819 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19839 bool IsStrict = Op->isStrictFPOpcode();
19845 MVT DstVT = Op->getSimpleValueType(0);
19869 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19873 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19887 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19888 // infinity. It produces -0.0, so disable under strictfp.
19892 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19901 // Make a 64-bit buffer, and use it to build an FILD.
19903 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19924 // Bitcasting to f64 here allows us to do a single 64-bit store from
19926 // with two 32-bit stores.
19950 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19965 // TODO: Are there any fast-math-flags to propagate here?
19999 bool IsStrict = Op->isStrictFPOpcode();
20015 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20021 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20031 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20052 // FistSrc = (Value - FltOfs);
20053 // Fist-to-mem64 FistSrc
20054 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20116 // FIXME: This causes a redundant load/store if the SSE-class value is already
20183 // v8i16 -> v8i32
20184 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20185 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20188 // v4i32 -> v4i64
20189 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20190 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
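// (Minimal intrinsics sketch of the v8i16 -> v8i32 split described above;
// illustration only, assumes SSE4.1 for the pmovzx form.)
#include <immintrin.h>
static void ZExtV8I16ToV8I32(__m128i In, __m128i *Lo, __m128i *Hi) {
  *Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd: low 4 elements
  *Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd w/ zero: high 4
}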
20196 // Short-circuit if we can determine that each 128-bit half is the same value.
20199 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20228 MVT VT = Op->getSimpleValueType(0);
20229 SDValue In = Op->getOperand(0);
20239 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20252 // Widen to 512-bits if VLX is not supported.
20274 // Extract back to 128/256-bit if we widened.
20298 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20299 /// within each 128-bit lane.
20331 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20339 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20340 // On pre-AVX512, pack the src in both halves to help value tracking.
20369 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20377 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20378 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20384 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20395 // If 512bit -> 128bit truncate another stage.
20401 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20404 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20419 /// e.g. trunc <8 x i32> X to <8 x i16> -->
20464 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20465 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20472 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20489 // Pre-SSE41 we can only use PACKUSWB.
20491 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20496 // Truncate with PACKSS if we are truncating a vector with sign-bits
20501 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20509 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20518 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20522 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20529 /// This function lowers a vector truncation of 'extended sign-bits' or
20530 /// 'extended zero-bits' values.
20598 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20599 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20607 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20626 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20650 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20662 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20675 // We either have 8 elements or we're allowed to use 512-bit vectors.
20682 ShiftInx = InVT.getScalarSizeInBits() - 1;
20712 // truncate the remainder. We'd rather produce two 64-bit results and
20725 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20732 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20759 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20770 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20772 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20789 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20793 -1, -1, -1, -1, -1, -1, -1, -1,
20795 -1, -1, -1, -1, -1, -1, -1, -1 };
20800 static const int ShufMask2[] = {0, 2, -1, -1};
20815 llvm_unreachable("All 256->128 cases should have been handled above!");
20825 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20828 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20842 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20851 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20857 bool IsStrict = Op->isStrictFPOpcode();
20860 MVT VT = Op->getSimpleValueType(0);
20862 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20891 // Widen to 512-bits.
20897 // TODO: Should we just do this for non-strict as well?
20960 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20984 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20991 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21001 // TODO: Should we just do this for non-strict as well?
21023 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21031 // TODO: Should we just do this for non-strict as well?
21055 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21077 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21115 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21116 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21132 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
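// (Branching scalar model of the Small/Big split above; the lowering itself uses
// the sign-splat select built around this line rather than a branch. Illustration
// only, valid for in-range inputs.)
#include <cstdint>
static uint32_t F64ToU32Model(double X) {
  if (X < 2147483648.0)                        // "Small": [0, 2^31)
    return (uint32_t)(int32_t)X;               // plain signed conversion
  return (uint32_t)(int32_t)(X - 2147483648.0) // "Big": bias down by 2^31,
         ^ 0x80000000u;                        // then flip the top bit back in
}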
21143 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21237 EVT DstVT = N->getValueType(0);
21238 SDValue Src = N->getOperand(0);
21256 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21285 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21288 SDValue Src = Node->getOperand(0);
21295 EVT DstVT = Node->getValueType(0);
21303 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21316 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21329 // floating-point values.
21431 bool IsStrict = Op->isStrictFPOpcode();
21439 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21470 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21532 {Op->getOperand(0), Res});
21544 {Op->getOperand(0), Res});
21549 bool IsStrict = Op->isStrictFPOpcode();
21622 // FIXME: Should we use zeros for upper elements for non-strict?
21641 bool IsStrict = Op->isStrictFPOpcode();
21670 bool IsStrict = Op->isStrictFPOpcode();
21686 // FIXME: Should we use zeros for upper elements for non-strict?
21754 // clang-format off
21761 // clang-format on
21780 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21781 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21788 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21789 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21790 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21791 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
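// (Intrinsics sketch of the first pattern above; illustration only, assumes SSE3.)
#include <immintrin.h>
static float SumLanes01(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X); // lanes: {x0+x1, x2+x3, x0+x1, x2+x3}
  return _mm_cvtss_f32(H);      // extractelt (hadd X, X), 0  ==  x0 + x1
}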
21840 for (SDNode *User : Op->uses())
21841 if (User->getOpcode() == ISD::FNEG)
21853 // decide if we should generate a 16-byte constant mask when we only need 4 or
21857 // generate a 16-byte vector constant and logic op even for the scalar case.
21858 // Using a 16-byte mask allows folding the load of the mask with
21884 // For the scalar case extend to a 128-bit vector, perform the logic op,
21916 // Perform all scalar logic operations as 16-byte vectors because there are no
21945 APFloat APF = Op0CN->getValueAPF();
21982 // instruction. Since the shift amount is in-range-or-undefined, we know
21993 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22007 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22066 /// Try to map a 128-bit or larger integer comparison to vector instructions
22083 // logically-combined vector-sized operands compared to zero. This pattern may
22100 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22101 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22142 auto ScalarToVector = [&](SDValue X) -> SDValue {
22170 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22203 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22204 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22206 "Non 128-bit vector on pre-SSE41 target");
22216 /// style scalarized (associative) reduction patterns. Partial reductions
22217 /// are supported when the pointer SrcMask is non-null.
22218 /// TODO - move this to SelectionDAG?
22236 if (I->getOpcode() == unsigned(BinOp)) {
22237 Opnds.push_back(I->getOperand(0));
22238 Opnds.push_back(I->getOperand(1));
22239 // Re-evaluate the number of nodes to be traversed.
22244 // Quit if we find a non-EXTRACT_VECTOR_ELT operand.
22245 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22249 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22253 SDValue Src = I->getOperand(0);
22258 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22267 unsigned CIdx = Idx->getZExtValue();
22268 if (M->second[CIdx])
22270 M->second.setBit(CIdx);
22276 SrcMask->push_back(SrcOpMap[SrcOp]);
22299 // Quit if not convertible to legal scalar or 128/256-bit vector.
22303 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22320 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22343 // Without PTEST, a masked v2i64 or-reduction is not faster than
22350 // Split down to 128/256/512-bit vector.
22367 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22376 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22423 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22434 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
22449 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22452 // Check whether we're masking/truncating an OR-reduction result, in which
22467 Mask = Cst->getAPIntValue();
22478 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22486 // Quit if not splittable to scalar/128/256/512-bit vector.
22492 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22508 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22528 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22533 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22542 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22564 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22568 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22570 UOpNo = User->use_begin().getOperandNo();
22571 User = *User->use_begin();
22574 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22575 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22581 // Transform to an x86-specific ALU node with flags if there is a chance of
22585 for (SDNode *U : Op->uses())
22586 if (U->getOpcode() != ISD::CopyToReg &&
22587 U->getOpcode() != ISD::SETCC &&
22588 U->getOpcode() != ISD::STORE)
22614 switch (Op->getOpcode()) {
22619 if (Op.getNode()->getFlags().hasNoSignedWrap())
22644 // non-casted variable when we check for possible users.
22660 // Otherwise use a regular EFLAGS-setting instruction.
22662 // clang-format off
22669 // clang-format on
22684 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22685 Op->getOperand(1)).getValue(1);
22697 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22726 // Don't do this if the immediate can fit in 8-bits.
22727 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22728 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22750 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
22760 // 0-x == y --> x+y == 0
22761 // 0-x != y --> x+y != 0
22769 // x == 0-y --> x+y == 0
22770 // x != 0-y --> x+y != 0
22791 if (N->getOpcode() == ISD::FDIV)
22794 EVT FPVT = N->getValueType(0);
22797 // This indicates a non-free bitcast.
22799 // integer vector anyways for the int->fp cast.
22824 /// The minimum architected relative accuracy is 2^-12. We need one
22825 /// Newton-Raphson step to have a good float result (24 bits of precision).
22834 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22835 // It is likely not profitable to do this for f64 because a double-precision
22851 // There is no FSQRT for 512-bits, but there is RSQRT14.
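// (One Newton-Raphson refinement step for the rsqrt estimate, as the comment
// above describes; illustration only, assumes SSE1 intrinsics.)
#include <xmmintrin.h>
static __m128 RSqrtOneNRStep(__m128 A) {
  __m128 Est = _mm_rsqrt_ps(A); // ~12-bit estimate
  // est' = 0.5 * est * (3 - a * est * est)
  __m128 T = _mm_mul_ps(_mm_mul_ps(A, Est), Est);
  return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), Est),
                    _mm_sub_ps(_mm_set1_ps(3.0f), T));
}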
22878 /// The minimum architected relative accuracy is 2^-12. We need one
22879 /// Newton-Raphson step to have a good float result (24 bits of precision).
22886 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22887 // It is likely not profitable to do this for f64 because a double-precision
22899 // real-world code. These defaults are intended to match GCC behavior.
22906 // There is no FSQRT for 512-bits, but there is RCP14.
22944 if (isIntDivCheap(N->getValueType(0), Attr))
22956 EVT VT = N->getValueType(0);
22962 // If the divisor is 2 or -2, the default expansion is better.
22964 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22993 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23001 uint64_t AndRHSVal = AndRHS->getZExtValue();
23039 // Check if pre-AVX condcode can be performed by a single FCMP op.
23044 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23052 // 0 - EQ
23053 // 1 - LT
23054 // 2 - LE
23055 // 3 - UNORD
23056 // 4 - NEQ
23057 // 5 - NLT
23058 // 6 - NLE
23059 // 7 - ORD
23061 // clang-format off
23083 // clang-format on
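// (Reader-facing sketch, not code from this file: the table above is the same
// 3-bit predicate encoding used by the legacy CMPPS/CMPPD immediate.)
enum SSEFPCmpPredicate : unsigned {
  CMP_EQ = 0, CMP_LT = 1, CMP_LE = 2, CMP_UNORD = 3,
  CMP_NEQ = 4, CMP_NLT = 5, CMP_NLE = 6, CMP_ORD = 7
};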
23107 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23142 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23169 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23170 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23174 const APInt &EltC = Elt->getAPIntValue();
23181 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23210 // Only do this pre-AVX since vpcmp* is no longer destructive.
23224 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23233 // Psubus is better than flip-sign because it requires no inversion.
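// (Scalar u8 model of why usubsat can implement an unsigned greater-than, as in
// the rewrite above; illustration only, and it assumes CmpC+1 does not wrap,
// i.e. CmpC is not all-ones.)
#include <cstdint>
static bool UGTViaUSubSat(uint8_t X, uint8_t CmpC) {
  uint8_t Sat = (uint8_t)(CmpC + 1) > X ? (uint8_t)(CmpC + 1 - X) : 0; // usubsat
  return Sat == 0; // == 0  <=>  X >= CmpC+1  <=>  X >u CmpC
}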
23253 MVT VT = Op->getSimpleValueType(0);
23254 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23269 // compare like we do for non-strict, we might trigger spurious exceptions
23283 // floating-point vector result that matches the operand type. This allows
23311 SignalCmp->setFlags(Op->getFlags());
23402 // The non-AVX512 code below works under the assumption that source and
23409 // In AVX-512 architecture setcc returns mask with i1 elements,
23421 // clang-format off
23433 // clang-format on
23444 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23462 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23466 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23468 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23474 DAG.getConstant(BitWidth - 1, dl, VT));
23479 // Break 256-bit integer vector compare into smaller ones.
23483 // Break 512-bit integer vector compare into smaller ones.
23489 // not-of-PCMPEQ:
23490 // X != INT_MIN --> X >s INT_MIN
23491 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23492 // +X != 0 --> +X >s 0
23504 // If both operands are known non-negative, then an unsigned compare is the
23519 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23527 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23537 // clang-format off
23543 // clang-format on
23549 // If the logical-not of the result is required, perform that now.
23596 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23605 // If the i64 elements are sign-extended enough to be representable as i32
23665 // Make sure the lower and upper halves are both all-ones.
23689 // If the logical-not of the result is required, perform that now.
23771 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23805 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
23816 // (seteq (add X, -1), -1). Similar for setne.
23845 MVT VT = Op->getSimpleValueType(0);
23849 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23855 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23877 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23880 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23883 // encoding size - so it must either already be an i8 or i32 immediate, or it
23888 const APInt &Op1Val = Op1C->getAPIntValue();
23939 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24012 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24013 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24038 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24061 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24065 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24133 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24134 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24135 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24136 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24137 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24138 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24139 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24140 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24148 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24149 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24166 // 'X - 1' sets the carry flag if X == 0.
24167 // '0 - X' sets the carry flag if X != 0.
24168 // Convert the carry flag to a -1/0 mask with sbb:
24169 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24170 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24171 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24172 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
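// (Scalar model of the carry/SBB trick above: NEG sets CF iff X != 0, and SBB of
// a register with itself then materializes CF as an all-ones/zero mask.
// Illustration only.)
#include <cstdint>
static uint32_t SelectNonZeroAllOnes(uint32_t X, uint32_t Y) {
  uint32_t Mask = X != 0 ? 0xffffffffu : 0u; // what 'neg; sbb reg,reg' computes
  return Mask | Y;                           // select (X != 0), -1, Y
}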
24216 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24221 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24224 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24229 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24230 unsigned ShCt = VT.getSizeInBits() - 1;
24256 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24296 // a < b ? -1 : 0 -> RES = ~setcc_carry
24297 // a < b ? 0 : -1 -> RES = setcc_carry
24298 // a >= b ? -1 : 0 -> RES = setcc_carry
24299 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24301 unsigned CondCode = CC->getAsZExtVal();
24349 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24355 MVT VT = Op->getSimpleValueType(0);
24356 SDValue In = Op->getOperand(0);
24372 // Widen to 512-bits if VLX is not supported.
24388 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24399 // Extract back to 128/256-bit if we widened.
24409 SDValue In = Op->getOperand(0);
24422 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24427 SDValue In = Op->getOperand(0);
24428 MVT VT = Op->getSimpleValueType(0);
24448 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24449 // For 512-bit vectors, we need 128-bits or 256-bits.
24452 // at least 128-bits.
24458 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24460 // need to be handled here for 256/512-bit results.
24462 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24475 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24477 assert(VT.is256BitVector() && "256-bit vector expected");
24497 // If the source elements are already all-signbits, we don't need to extend,
24509 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24526 Mask[i * Scale + (Scale - 1)] = i;
24531 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24549 MVT VT = Op->getSimpleValueType(0);
24550 SDValue In = Op->getOperand(0);
24582 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24583 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24589 SmallVector<int,8> ShufMask(NumElems, -1);
24599 /// Change a vector store into a pair of half-size vector stores.
24601 SDValue StoredVal = Store->getValue();
24604 "Expecting 256/512-bit op");
24611 if (!Store->isSimple())
24618 SDValue Ptr0 = Store->getBasePtr();
24622 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24623 Store->getOriginalAlign(),
24624 Store->getMemOperand()->getFlags());
24625 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24626 Store->getPointerInfo().getWithOffset(HalfOffset),
24627 Store->getOriginalAlign(),
24628 Store->getMemOperand()->getFlags());
24636 SDValue StoredVal = Store->getValue();
24638 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24644 if (!Store->isSimple())
24655 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24659 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24660 Store->getPointerInfo().getWithOffset(Offset),
24661 Store->getOriginalAlign(),
24662 Store->getMemOperand()->getFlags());
24672 SDValue StoredVal = St->getValue();
24679 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24694 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24695 St->getPointerInfo(), St->getOriginalAlign(),
24696 St->getMemOperand()->getFlags());
24699 if (St->isTruncatingStore())
24702 // If this is a 256-bit store of concatenated ops, we are better off splitting
24703 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24729 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24737 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24738 St->getPointerInfo(), St->getOriginalAlign(),
24739 St->getMemOperand()->getFlags());
24743 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24745 St->getMemOperand());
24767 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24772 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24773 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24774 Ld->getMemOperand()->getFlags());
24777 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24813 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24844 // have a fall-through edge, because this requires an explicit
24846 if (Op.getNode()->hasOneUse()) {
24847 SDNode *User = *Op.getNode()->use_begin();
24851 if (User->getOpcode() == ISD::BR) {
24852 SDValue FalseBB = User->getOperand(1);
24854 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24940 EVT VT = Node->getValueType(0);
24974 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24998 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25001 Register SPReg = RegInfo->getStackRegister();
25007 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25025 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25032 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25038 // gp_offset (0 - 6 * 8)
25039 // fp_offset (48 - 48 + 8 * 16)
25047 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25055 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25061 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25069 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25079 "LowerVAARG only handles 64-bit va_arg!");
25089 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25093 EVT ArgVT = Op.getNode()->getValueType(0);
25137 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25139 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25148 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25149 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25198 ShiftAmt = ElementType.getSizeInBits() - 1;
25204 && "Unknown target vector shift-by-constant node");
25244 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25249 // Peek through any zext node if we can get back to a 128-bit source.
25260 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25266 // If the shift amount has come from a scalar, then zero-extend the scalar
25275 // then we can zero-extend it by setting all the other mask elements to
25290 // Extract if the shift amount vector is larger than 128-bits.
25296 // Zero-extend bottom element to v2i64 vector type, either by extension or
25307 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25316 // Change opcode to non-immediate version.
25319 // The return type has to be a 128-bit type with the same element
25396 if (MaskConst->getZExtValue() & 0x1)
25417 if (!Fn->hasPersonalityFn())
25420 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25422 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25428 "can only recover FP for 32-bit MSVC EH personality functions");
25435 /// RegNodeBase = EntryEBP - RegNodeSize
25436 /// ParentFP = RegNodeBase - ParentFrameOffset
25450 if (!Fn->hasPersonalityFn())
25456 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25468 // RegNodeBase = EntryEBP - RegNodeSize
25469 // ParentFP = RegNodeBase - ParentFrameOffset
25480 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25486 unsigned RC = C->getZExtValue();
25500 RC = C->getZExtValue();
25520 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25523 switch(IntrData->Type) {
25526 // First, we check if the intrinsic may have non-default rounding mode,
25527 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25528 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25539 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25547 Opc = IntrData->Opc0;
25549 Opc = IntrData->Opc1;
25559 // First, we check if the intrinsic may have non-default rounding mode,
25560 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25561 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25573 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25581 Opc = IntrData->Opc0;
25583 Opc = IntrData->Opc1;
25596 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25598 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25602 // First, we check if the intrinsic may have non-default rounding mode,
25603 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25604 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25616 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25620 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25623 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25626 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25635 // - RC Opcode is specified and
25636 // - RC is not "current direction".
25637 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25650 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25661 Opc = IntrData->Opc0;
25663 Opc = IntrData->Opc1;
25675 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25677 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25678 // (2) With rounding mode and sae - 7 operands.
25692 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25700 unsigned Opc = IntrData->Opc0;
25722 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25724 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25739 Opc = IntrData->Opc0;
25741 Opc = IntrData->Opc1;
25754 if (IntrData->Opc1 != 0) {
25758 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25764 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25773 unsigned Opc = IntrData->Opc0;
25774 if (IntrData->Opc1 != 0) {
25777 Opc = IntrData->Opc1;
25794 Opc = IntrData->Opc0;
25796 Opc = IntrData->Opc1;
25810 unsigned Opc = IntrData->Opc0;
25811 if (IntrData->Opc1 != 0) {
25814 Opc = IntrData->Opc1;
25830 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25837 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
25848 if (IntrData->Type == CFMA_OP_MASKZ)
25852 // - RC Opcode is specified and
25853 // - RC is not "current direction".
25855 if (IntrData->Opc1 != 0) {
25859 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25865 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25871 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25877 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25893 // First, we check if the intrinsic may have non-default rounding mode,
25894 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25895 if (IntrData->Opc1 != 0) {
25898 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25904 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25914 if (IntrData->Opc1 != 0) {
25917 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25923 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25935 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25942 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26000 // Catch shift-by-constant.
26002 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26004 CShAmt->getZExtValue(), DAG);
26007 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26021 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26031 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26035 unsigned Opc = IntrData->Opc0;
26036 if (IntrData->Opc1 != 0) {
26039 Opc = IntrData->Opc1;
26052 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26057 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26061 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26066 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26070 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26075 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26080 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26087 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26091 DAG.getConstant(-1, dl, MVT::i8));
26092 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26108 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26113 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26123 unsigned Opc = IntrData->Opc0;
26137 Opc = IntrData->Opc1;
26149 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26155 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26311 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26326 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26339 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26359 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26361 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26364 // supported on 32-bit Windows, which isn't PIC.
26373 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26386 if (RegInfo->hasBasePointer(MF))
26387 Reg = RegInfo->getBaseRegister();
26389 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26391 Reg = RegInfo->getPtrSizedStackRegister(MF);
26393 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26410 Op->getOperand(1), Op->getOperand(2));
26431 // to 8-bits which may make it no longer out of bounds.
26432 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26483 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26505 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26523 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26538 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26561 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26575 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26592 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26606 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26623 /// Returns a Glue value which can be used to add extra copy-from-reg if the
26632 SDValue Chain = N->getOperand(0);
26636 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26637 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26662 // Merge the two 32-bit values into a 64-bit one.
26670 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26685 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26686 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26687 // and the EAX register is loaded with the low-order 32 bits.
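// Illustrative only (hypothetical helper, not the DAG code): the EDX:EAX
// pair produced by RDTSC combines into a single 64-bit value like this.
static inline uint64_t combineTSC(uint32_t EAX, uint32_t EDX) {
  return (uint64_t(EDX) << 32) | EAX;
}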
26723 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26741 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26774 // 64-bit targets support extended Swift async frame setup,
26776 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26792 X86FI->setHasSwiftAsyncContext(true);
26793 SDValue Chain = Op->getOperand(0);
26800 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26805 if (!X86FI->getSwiftAsyncContextFrameIdx())
26806 X86FI->setSwiftAsyncContextFrameIdx(
26810 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26813 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26814 Op->getOperand(0));
26861 SDValue Chain = Op->getOperand(0);
26880 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26881 Op->getOperand(3), Op->getOperand(4));
26883 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26904 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26933 MachineMemOperand *MMO = MemIntr->getMemOperand();
26934 EVT MemVT = MemIntr->getMemoryVT();
26940 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26971 MachineMemOperand *MMO = MemIntr->getMemOperand();
26972 EVT MemVT = MemIntr->getMemoryVT();
26981 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26994 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27008 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27014 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27028 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27034 unsigned Imm = Op2->getAsZExtVal();
27038 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27048 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27050 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27088 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27089 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27123 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27135 switch(IntrData->Type) {
27140 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27141 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27145 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27146 DAG.getConstant(1, dl, Op->getValueType(1)),
27149 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27152 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27162 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27184 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27191 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27203 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27219 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27225 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27226 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27229 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27230 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27244 EVT MemVT = MemIntr->getMemoryVT();
27246 uint16_t TruncationOp = IntrData->Opc0;
27251 MemIntr->getMemOperand());
27258 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27266 MemIntr->getMemOperand(), DAG);
27272 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27296 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27323 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27327 int FrameAddrIndex = FuncInfo->getFAIndex();
27330 unsigned SlotSize = RegInfo->getSlotSize();
27333 FuncInfo->setFAIndex(FrameAddrIndex);
27339 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27346 while (Depth--)
27374 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27390 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27421 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27429 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27450 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27483 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27489 // Large code-model.
27490 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27493 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27494 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27539 CallingConv::ID CC = Func->getCallingConv();
27552 FunctionType *FTy = Func->getFunctionType();
27553 const AttributeList &Attrs = Func->getAttributes();
27555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27559 for (FunctionType::param_iterator I = FTy->param_begin(),
27560 E = FTy->param_end(); I != E; ++I, ++Idx)
27568 report_fatal_error("Nest register in use - reduce number of inreg"
27594 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27626 01 Round to -inf
27631 -1 Undefined
27635 3 Round to -inf
27637 To perform the conversion, we use a packed lookup table of the four 2-bit
27639 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
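  As a scalar illustration only (the code below builds the equivalent DAG
  nodes rather than executing this), with RC being the two rounding-control
  bits extracted from the control word:

    FLT_ROUNDS value = (0x2d >> (2 * RC)) & 3    (gives 1,3,2,0 for RC=0..3)

  e.g. RC = 1 (round toward -inf) selects bits 3:2 of 0x2d, i.e. 0b11, the
  ISO C "toward -inf" value 3.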
27688 SDValue Chain = Op.getNode()->getOperand(0);
27711 SDValue NewRM = Op.getNode()->getOperand(1);
27714 uint64_t RM = CVal->getZExtValue();
27717 // clang-format off
27724 // clang-format on
27729 // 0 Round to 0 -> 11
27730 // 1 Round to nearest -> 00
27731 // 2 Round to +inf -> 10
27732 // 3 Round to -inf -> 01
27733 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
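// One scalar way to realize that mapping (illustrative sketch with a
// hypothetical helper name; not necessarily how the nodes below compute it):
// pack the four 2-bit x87 encodings into one immediate, index it by the
// incoming rounding-mode value, then shift the field into bits 11:10.
static inline unsigned rmToX87ControlBits(unsigned RM) {
  assert(RM < 4 && "rounding mode is a 2-bit value");
  unsigned RC = (0x63u >> (2 * RM)) & 3; // RM 0,1,2,3 -> RC 3,0,2,1
  return RC << 10;                       // occupies control-word bits 11:10
}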
27806 SDValue Chain = Op->getOperand(0);
27807 SDValue Ptr = Op->getOperand(1);
27809 EVT MemVT = Node->getMemoryVT();
27811 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27822 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27872 SDValue Chain = Op->getOperand(0);
27873 SDValue Ptr = Op->getOperand(1);
27875 EVT MemVT = Node->getMemoryVT();
27877 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27885 SDValue Chain = Op.getNode()->getOperand(0);
27891 // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
27900 // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
27944 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27958 // Per-nibble leading zero PSHUFB lookup table.
28038 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28042 // Decompose 256-bit ops into smaller 128-bit ops.
28046 // Decompose 512-bit ops into smaller 256-bit ops.
28078 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28084 // Finally xor with NumBits-1.
28086 DAG.getConstant(NumBits - 1, dl, OpVT));
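// Scalar model of the identity used above (hypothetical helper, not part of
// the lowering): for a power-of-two bit width and a non-zero input,
// NumBits-1 is an all-ones mask over the valid bit indices, so subtracting
// the BSR result from it equals XORing with it:
//   ctlz(X) == (NumBits - 1) - bsr(X) == (NumBits - 1) ^ bsr(X)
static inline unsigned ctlzViaBSR32(uint32_t X) {
  assert(X != 0 && "BSR is undefined for zero");
  unsigned MSBIndex = 31 - llvm::countl_zero(X); // what BSR would return
  return 31 ^ MSBIndex;                          // == 31 - MSBIndex == ctlz(X)
}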
28131 "Only handle AVX 256-bit vector integer operation");
28157 // Handle a special-case with a bit-hack instead of cmp+select:
28158 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28163 if (C && C->getAPIntValue().isSignMask()) {
28164 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28165 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28172 // usubsat X, Y --> (X >u Y) ? X - Y : 0
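// Scalar model of the two usubsat forms above (hypothetical helpers, 8-bit
// case only): when Y is the sign-mask constant, the XOR clears the high bit
// whenever X >= 0x80 and the arithmetic-shift mask zeroes the result
// otherwise, matching the generic compare+select form.
static inline uint8_t usubsat8(uint8_t X, uint8_t Y) {
  return X > Y ? uint8_t(X - Y) : uint8_t(0);
}
static inline uint8_t usubsat8SignMask(uint8_t X) {
  uint8_t Mask = uint8_t(int8_t(X) >> 7); // all-ones iff X >= 0x80
  return uint8_t(X ^ 0x80) & Mask;        // == usubsat8(X, 0x80)
}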
28211 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28212 // 8-bit integer abs to NEG and CMOV.
28221 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28230 "Only handle AVX 256-bit vector integer operation");
28301 // Num xNaN +0 -0
28302 // --------------- ---------------
28304 // X --------------- X ---------------
28305 // xNaN | X | X/Y | -0 | +0 | -0 |
28306 // --------------- ---------------
28317 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28319 return CstOp->getAPIntValue() == Zero;
28320 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28321 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28322 for (const SDValue &OpVal : Op->op_values()) {
28328 if (!CstOp->getValueAPF().isZero())
28330 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28341 Op->getFlags().hasNoSignedZeros() ||
28354 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28373 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28404 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28412 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28441 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28442 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
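// Scalar model of the absolute-difference expansion above (hypothetical
// helpers, i8 case): widen so the subtraction cannot wrap, take the
// absolute value, then truncate back to the element type.
static inline uint8_t abdu8(uint8_t L, uint8_t R) {
  int Wide = int(L) - int(R);              // zext + sub
  return uint8_t(Wide < 0 ? -Wide : Wide); // abs + trunc
}
static inline uint8_t abds8(int8_t L, int8_t R) {
  int Wide = int(L) - int(R);              // sext + sub
  return uint8_t(Wide < 0 ? -Wide : Wide); // abs + trunc
}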
28461 // Decompose 256-bit ops into 128-bit ops.
28471 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28497 for (auto [Idx, Val] : enumerate(B->ops())) {
28519 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28557 static const int UnpackMask[] = { 1, -1, 3, -1 };
28641 // and use pmullw to calculate the full 16-bit product.
28644 // pmulhw to calculate the full 16-bit product. This trick means we don't
28711 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28716 // Decompose 256-bit ops into 128-bit ops.
28740 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28741 9, -1, 11, -1, 13, -1, 15, -1};
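// Illustrative scalar model of the widen-multiply trick used by the vXi8
// MUL/MULH lowerings above (hypothetical helper, unsigned case; the signed
// variants sign-extend instead): the full product of two 8-bit values fits
// in 16 bits, so PMULLW/PMULHW on widened elements recover the low and high
// result bytes exactly.
static inline void mul8Widened(uint8_t A, uint8_t B, uint8_t &LoByte,
                               uint8_t &HiByte) {
  uint16_t Full = uint16_t(A) * uint16_t(B); // zext to 16 bits, multiply
  LoByte = uint8_t(Full);                    // ISD::MUL result
  HiByte = uint8_t(Full >> 8);               // ISD::MULHU result
}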
28796 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28823 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28826 EVT OvfVT = Op->getValueType(1);
28935 // UMULO overflows if the high bits are non-zero.
28951 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28959 switch (Op->getOpcode()) {
28960 // clang-format off
28966 // clang-format on
28974 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28975 EVT ArgVT = Op->getOperand(i).getValueType();
28979 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28984 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29015 bool IsStrict = Op->isStrictFPOpcode();
29024 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29025 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29048 bool IsStrict = Op->isStrictFPOpcode();
29057 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29058 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29070 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29095 (0x8080808080808080ULL >> (64 - (8 * Amt))));
29097 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29099 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29118 // Return true if the required (according to Opcode) shift-imm form is natively
29147 // These instructions are defined together with shift-immediate.
29154 // Return true if the required (according to Opcode) variable-shift form is
29170 // vXi16 supported only on AVX-512, BWI
29209 ShiftAmt - 32, DAG);
29245 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29247 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29265 // If we're logical shifting an all-signbits value then we can just perform as
29281 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29317 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29326 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29353 int BaseShAmtIdx = -1;
29359 // vXi8 shifts - shift as v8i16 + mask result.
29370 // Create the mask using vXi16 shifts. For shift-rights we need to move
29372 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29390 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29502 // XOP has 128-bit variable logical/arithmetic shifts.
29503 // +ve/-ve Amt = shift left/right.
29514 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
29515 // shifts per-lane and then shuffle the partial results back together.
29541 SDValue A = Amt->getOperand(i);
29567 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29568 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29570 Cst1->getZExtValue(), DAG);
29572 Cst2->getZExtValue(), DAG);
29587 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29601 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29603 // of these cases in pre-SSE41/XOP/AVX512 but not both.
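// Scalar model of the MULHU rewrite mentioned above (hypothetical helper,
// unsigned case with 1 <= Amt < 16 only; the MULHS form for arithmetic
// shifts is analogous but not shown here): multiplying by 2^(16 - Amt) and
// keeping the high half divides by 2^Amt.
static inline uint16_t lshrViaMulhu16(uint16_t X, unsigned Amt) {
  assert(Amt >= 1 && Amt < 16 && "scale factor must fit the element");
  uint16_t Scale = uint16_t(1u << (16 - Amt));
  return uint16_t((uint32_t(X) * Scale) >> 16); // == X >> Amt
}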
29626 // immediate shifts, else we need to zero-extend each lane to the lower i64
29639 // just zero-extending, but for SSE just duplicating the top 16-bits is
29643 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29644 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29645 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29646 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29650 {4, 5, 6, 7, -1, -1, -1, -1});
29667 // TODO - ideally shuffle combining would handle this.
29669 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29670 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29673 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29674 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29681 // NOTE: We honor the preferred vector width before promoting to 512-bits.
29773 // On pre-SSE41 targets we test for the sign bit by comparing to
29774 // zero - a negative value will set all bits of the lanes to true
29875 // If we have a constant shift amount, the non-SSE41 path is best as
29891 // On pre-SSE41 targets we splat the sign bit - a negative value will
29938 // Decompose 256-bit shifts into 128-bit shifts.
29984 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
29985 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
29991 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29992 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30001 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30003 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30004 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
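// Scalar model of the constant funnel shifts above (hypothetical helpers,
// i8 elements): concatenate the two inputs into a double-width value,
// shift, and take the byte that corresponds to FSHL vs. FSHR.
static inline uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Amt) {
  Amt &= 7;
  unsigned Pair = (unsigned(X) << 8) | Y; // "unpack(y,x)"
  return uint8_t((Pair << Amt) >> 8);     // high byte of the shifted pair
}
static inline uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Amt) {
  Amt &= 7;
  unsigned Pair = (unsigned(X) << 8) | Y;
  return uint8_t(Pair >> Amt);            // low byte of the shifted pair
}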
30025 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30037 // Split 256-bit integers on XOP/pre-AVX2 targets.
30038 // Split 512-bit integers on non 512-bit BWI targets.
30043 // Pre-mask the amount modulo using the wider vector.
30050 int ScalarAmtIdx = -1;
30070 // If per-element shifts are legal, fallback to generic expansion.
30075 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30076 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30092 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30116 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30117 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30120 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30183 // Else, fall-back on VPROLV/VPRORV.
30187 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30216 // Split 256-bit integers on XOP/pre-AVX2 targets.
30220 // XOP has 128-bit vector variable + immediate rotates.
30221 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30225 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30234 // Use general rotate by variable (per-element).
30238 // Rotate by a uniform constant - expand back to shifts.
30243 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30244 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
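// Scalar form of the expansion above (illustrative only): a rotate by a
// uniform constant is simply the OR of the two complementary shifts.
static inline uint8_t rotl8ByConst(uint8_t X, unsigned RotAmt) {
  assert(RotAmt > 0 && RotAmt < 8 && "expected a non-zero in-range amount");
  return uint8_t((X << RotAmt) | (X >> (8 - RotAmt)));
}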
30252 // Split 512-bit integers on non 512-bit BWI targets.
30266 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30270 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30271 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30273 int BaseRotAmtIdx = -1;
30294 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30295 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30296 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30317 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30318 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30347 // On pre-SSE41 targets we test for the sign bit by comparing to
30348 // zero - a negative value will set all bits of the lanes to true
30405 // Fallback for non-constant AVX2 vXi16 as well.
30440 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30441 // that can then be OR'd with the lower 32-bits.
30443 static const int OddMask[] = {1, -1, 3, -1};
30466 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30478 Type *MemType = SI->getValueOperand()->getType();
30480 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30482 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30486 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30498 Type *MemType = LI->getType();
30500 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30502 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30503 // can use movq to do the load. If we have X87 we can load into an 80-bit
30505 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30509 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
30510 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30533 if (isPowerOf2_64(C->getZExtValue()))
30535 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30540 // Check if V is some power of 2 pattern known to be non-zero
30559 if (I->getOpcode() == Instruction::Shl) {
30561 // -X` and some other provable power of 2 patterns that we can use CTZ on
30564 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30565 // be provably a non-zero power of 2.
30568 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30571 if (ShiftVal->equalsInt(1))
30577 Value *BitV = I->getOperand(1);
30583 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30597 if (AI->use_empty())
30600 if (AI->getOperation() == AtomicRMWInst::Xor) {
30601 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30603 if (match(AI->getOperand(1), m_SignMask()))
30609 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30613 Instruction *I = AI->user_back();
30614 auto BitChange = FindSingleBitChange(AI->getValOperand());
30615 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30616 I->getOpcode() != Instruction::And ||
30617 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30618 AI->getParent() != I->getParent())
30621 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30624 if (AI == I->getOperand(OtherIdx))
30629 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30630 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30631 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30634 if (AI->getOperation() == AtomicRMWInst::And) {
30635 return ~C1->getValue() == C2->getValue()
30645 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30657 if (AI->getOperation() == AtomicRMWInst::And)
30673 switch (AI->getOperation()) {
30689 Instruction *I = AI->user_back();
30690 LLVMContext &Ctx = AI->getContext();
30691 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30695 auto BitTested = FindSingleBitChange(AI->getValOperand());
30699 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30701 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30703 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30706 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30715 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30717 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30725 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30727 // If the result is only used for zero/non-zero status then we don't need to
30729 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30731 if (ICmp->isEquality()) {
30732 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30733 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30736 if ((C0 ? C0 : C1)->isZero())
30746 I->replaceAllUsesWith(Result);
30747 I->eraseFromParent();
30748 AI->eraseFromParent();
30753 if (!AI->hasOneUse())
30756 Value *Op = AI->getOperand(1);
30758 Instruction *I = AI->user_back();
30759 AtomicRMWInst::BinOp Opc = AI->getOperation();
30764 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30766 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30775 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30777 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30786 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30789 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30797 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30799 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30813 LLVMContext &Ctx = AI->getContext();
30814 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30816 TempI = AI->user_back();
30817 assert(TempI->hasOneUse() && "Must have one use");
30818 ICI = cast<ICmpInst>(TempI->user_back());
30821 ICmpInst::Predicate Pred = ICI->getPredicate();
30839 switch (AI->getOperation()) {
30859 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30860 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30863 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30865 ICI->replaceAllUsesWith(Result);
30866 ICI->eraseFromParent();
30868 TempI->eraseFromParent();
30869 AI->eraseFromParent();
30875 Type *MemType = AI->getType();
30879 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30884 AtomicRMWInst::BinOp Op = AI->getOperation();
30900 case AtomicRMWInst::Nand:
30912 // These always require a non-trivial set of data operations on x86. We must
30921 Type *MemType = AI->getType();
30925 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30931 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30932 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30933 AI->use_empty())
30938 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30939 auto SSID = AI->getSyncScopeID();
30942 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30945 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30967 // different cache-line to prevent cache-line bouncing. In practice it
30978 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30979 Loaded->setAtomic(Order, SSID);
30980 AI->replaceAllUsesWith(Loaded);
30981 AI->eraseFromParent();
30996 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31007 // c) To minimize concerns about cross thread stack usage - in particular,
31009 // captures state in the TOS frame and accesses it from many threads -
31014 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31018 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31058 // The only fence that needs an instruction is a sequentially-consistent
31059 // cross-thread fence.
31069 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31097 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31107 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31149 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31161 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31282 // http://wm.ite.pl/articles/sse-popcount.html
31285 // index into an in-register pre-computed pop count table. We then split up the
31286 // input vector into two new ones: (1) a vector with only the shifted-right
31289 // to index the in-register table. Next, both are added and the result is a
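// Scalar model of the PSHUFB-based algorithm referenced above (hypothetical
// helper, one byte): look up each nibble in a 16-entry pop count table and
// add the two results.
static inline uint8_t popcnt8ViaNibbleLUT(uint8_t V) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[V & 0xF] + LUT[V >> 4]; // low nibble + shifted-right high nibble
}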
31340 // Decompose 256-bit ops into smaller 128-bit ops.
31344 // Decompose 512-bit ops into smaller 256-bit ops.
31377 unsigned ActiveBits = Known.getBitWidth() - LZ;
31378 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31380 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
31392 // i3 CTPOP - perform LUT into i32 integer.
31407 // i4 CTPOP - perform LUT into i64 integer.
31424 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
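// Scalar models of the narrow CTPOP tricks above (hypothetical helpers):
// for i2 the high bit is exactly (x >> 1), so ctpop(x) == x - (x >> 1); for
// i4, sixteen 4-bit counts packed into one 64-bit immediate act as an
// in-register lookup table.
static inline unsigned ctpop2(unsigned X) {
  assert(X < 4 && "2-bit input expected");
  return X - (X >> 1);
}
static inline unsigned ctpop4(unsigned X) {
  assert(X < 16 && "4-bit input expected");
  return unsigned((0x4332322132212110ULL >> (4 * X)) & 0xF);
}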
31468 // Decompose 256-bit ops into smaller 128-bit ops.
31473 "Only 128-bit vector bitreverse lowering supported.");
31481 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31507 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31511 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31553 // 0-15 value (moved to the other nibble).
31588 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31605 // Xor the high and low 16-bits together using a 32-bit operation.
31614 // Xor the high and low 16-bits together using a 32-bit operation.
31619 // If the input is 16-bits, we need to extend to use an i32 shift below.
31623 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31624 // This should allow an h-reg to be used to save a shift.
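// Scalar model of the folding sequence above (hypothetical helper): XORing
// a value with its upper half preserves parity, so an i32 parity reduces to
// a final test on the low byte.
static inline unsigned parity32(uint32_t X) {
  X ^= X >> 16; // fold the halves; parity is unchanged
  X ^= X >> 8;  // parity of the original now lives in the low byte
  X ^= X >> 4;
  X ^= X >> 2;
  X ^= X >> 1;  // the lowering instead lets an 8-bit op set PF
  return X & 1;
}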
31641 switch (N->getOpcode()) {
31661 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31665 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31666 /*MemVT=*/N->getSimpleValueType(0), MMO);
31669 /// Lower atomic_load_ops into LOCK-prefixed operations.
31673 SDValue Chain = N->getOperand(0);
31674 SDValue LHS = N->getOperand(1);
31675 SDValue RHS = N->getOperand(2);
31676 unsigned Opc = N->getOpcode();
31677 MVT VT = N->getSimpleValueType(0);
31683 if (N->hasAnyUseOfValue(0)) {
31684 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31691 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31705 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31711 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31712 AN->getSyncScopeID() == SyncScope::System) {
31717 assert(!N->hasAnyUseOfValue(0));
31719 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31722 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31724 assert(!N->hasAnyUseOfValue(0));
31726 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31732 assert(!N->hasAnyUseOfValue(0));
31734 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31742 EVT VT = Node->getMemoryVT();
31745 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31760 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
31761 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
31762 Node->getMemOperand());
31770 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31774 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31776 MVT::i64, Node->getMemOperand());
31778 // First load this into an 80-bit X87 register using a stack temporary.
31781 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31784 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31794 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31797 StoreOps, MVT::i64, Node->getMemOperand());
31811 // Convert seq_cst store -> xchg
31812 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31813 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31814 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31815 Node->getOperand(0), Node->getOperand(2),
31816 Node->getOperand(1), Node->getMemOperand());
31822 MVT VT = N->getSimpleValueType(0);
31846 if (N->getValueType(1) == MVT::i1)
31849 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31947 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31960 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31963 SDValue Src = N->getValue();
31968 SDValue Scale = N->getScale();
31969 SDValue Index = N->getIndex();
31970 SDValue Mask = N->getMask();
31971 SDValue Chain = N->getChain();
31972 SDValue BasePtr = N->getBasePtr();
31984 N->getMemoryVT(), N->getMemOperand());
31996 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
32000 // Determine how much we need to widen by to get a 512-bit type.
32017 N->getMemoryVT(), N->getMemOperand());
32026 SDValue Mask = N->getMask();
32028 SDValue PassThru = N->getPassThru();
32038 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32039 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32040 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32041 N->isExpandingLoad());
32047 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32048 "Expanding masked load is supported on AVX-512 target only!");
32050 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32051 "Expanding masked load is supported for 32 and 64-bit types only!");
32075 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32076 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32077 N->getExtensionType(), N->isExpandingLoad());
32089 SDValue DataToStore = N->getValue();
32092 SDValue Mask = N->getMask();
32095 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32096 "Expanding masked load is supported on AVX-512 target only!");
32098 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32099 "Expanding masked load is supported for 32 and 64-bit types only!");
32122 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32123 N->getOffset(), Mask, N->getMemoryVT(),
32124 N->getMemOperand(), N->getAddressingMode(),
32125 N->isTruncatingStore(), N->isCompressingStore());
32131 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32136 SDValue Index = N->getIndex();
32137 SDValue Mask = N->getMask();
32138 SDValue PassThru = N->getPassThru();
32147 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
32152 // Determine how much we need to widen by to get a 512-bit type.
32171 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32172 N->getScale() };
32174 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32175 N->getMemOperand());
32187 unsigned SrcAS = N->getSrcAddressSpace();
32189 assert(SrcAS != N->getDestAddressSpace() &&
32209 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32214 if (Op->getGluedNode())
32215 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32239 // We don't support non-data prefetch without PREFETCHI.
32259 // sub-string, e.g. "$12" contains "$1"
32261 I = AsmStr.size() - OpNoStr1.size();
32316 // ->
32337 // ->
32352 // clang-format off
32504 // clang-format on
32514 switch (N->getOpcode()) {
32518 N->dump(&DAG);
32522 EVT VT = N->getValueType(0);
32534 EVT VT = N->getValueType(0);
32540 {N->getOperand(0), Lo});
32542 {N->getOperand(0), Hi});
32554 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32557 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32561 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32575 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32577 // Bit count should fit in 32-bits, extract it as that and then zero
32588 EVT VT = N->getValueType(0);
32591 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32594 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32595 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32607 EVT VT = N->getValueType(0);
32610 bool IsSigned = N->getOpcode() == ISD::SMULO;
32612 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32613 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32618 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32630 // UMULO overflows if the high bits are non-zero.
32633 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32646 EVT VT = N->getValueType(0);
32647 EVT InVT = N->getOperand(0).getValueType();
32662 Ops[0] = N->getOperand(0);
32664 Ops[0] = N->getOperand(1);
32667 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32676 EVT VT = N->getValueType(0);
32680 N->getOperand(0), UNDEF);
32682 N->getOperand(1), UNDEF);
32683 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32690 EVT VT = N->getValueType(0);
32696 // TODO: Can we do something for non-splat?
32698 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32701 Ops0[0] = N->getOperand(0);
32705 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32716 MVT VT = N->getSimpleValueType(0);
32724 SDValue In = N->getOperand(0);
32749 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32791 -1, -1, -1, -1, -1, -1, -1, -1 });
32814 assert(N->getValueType(0) == MVT::v8i8 &&
32819 EVT VT = N->getValueType(0);
32820 SDValue In = N->getOperand(0);
32826 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32827 // Custom split this so we can extend i8/i16->i32 invec. This is better
32828 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32863 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32869 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32872 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32882 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32893 bool IsStrict = N->isStrictFPOpcode();
32894 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32895 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32896 EVT VT = N->getValueType(0);
32897 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32898 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32906 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32911 Res = DAG.getNode(N->getOpcode(), dl, VT,
32938 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32977 {N->getOperand(0), Src});
33037 // legalization to v8i32<-v8f64.
33044 Opc = N->getOpcode();
33050 {N->getOperand(0), Src});
33061 // Custom widen strict v2f32->v2i32 by padding with zeros.
33066 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33067 {N->getOperand(0), Src});
33085 // If we use a 128-bit result we might need to use a target specific node.
33090 unsigned Opc = N->getOpcode();
33105 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33143 bool IsStrict = N->isStrictFPOpcode();
33144 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33145 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33146 EVT VT = N->getValueType(0);
33147 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33161 {N->getOperand(0), Src});
33178 {N->getOperand(0), Src});
33203 {N->getOperand(0), Elt});
33220 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33235 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33239 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33240 {N->getOperand(0), Src});
33255 {N->getOperand(0), Or, VBias});
33262 // TODO: Are there any fast-math-flags to propagate here?
33270 bool IsStrict = N->isStrictFPOpcode();
33271 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33272 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33273 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33275 EVT VT = N->getValueType(0);
33315 assert(N->getValueType(0) == MVT::v2f32 &&
33319 bool IsStrict = N->isStrictFPOpcode();
33320 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33328 {N->getOperand(0), V});
33337 unsigned IntNo = N->getConstantOperandVal(1);
33365 EVT T = N->getValueType(0);
33369 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33373 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33374 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33381 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33386 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33391 // live-range.
33394 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33396 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33403 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33420 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33429 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
33437 if (N->getValueType(0) == MVT::i128) {
33439 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
33440 Node->getBasePtr(), Node->getMemOperand());
33445 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
33454 // Then extract the lower 64-bits.
33457 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33459 MVT::i64, Node->getMemOperand());
33468 // then casts to i64. This avoids a 128-bit stack temporary being
33469 // created by type legalization if we were to cast v4f32->v2i64.
33478 // First load this into an 80-bit X87 register. This will put the whole
33481 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33484 Node->getMemOperand());
33492 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33531 EVT DstVT = N->getValueType(0);
33532 EVT SrcVT = N->getOperand(0).getValueType();
33534 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33535 // we can split using the k-register rather than memory.
33537 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33554 N->getOperand(0));
33563 EVT VT = N->getValueType(0);
33567 SDValue Index = Gather->getIndex();
33573 SDValue Mask = Gather->getMask();
33576 Gather->getPassThru(),
33585 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33586 Gather->getBasePtr(), Index, Gather->getScale() };
33589 Gather->getMemoryVT(), Gather->getMemOperand());
33598 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
33600 MVT VT = N->getSimpleValueType(0);
33609 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33610 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33611 Ld->getMemOperand()->getFlags());
33623 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33625 MVT::i64, Ld->getMemOperand());
33636 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33645 assert(N->getSimpleValueType(0) == MVT::f16 &&
33648 SDValue VecOp = N->getOperand(0);
33650 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33652 N->getOperand(1));
34081 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34097 // If lower 4G is not available, then we must use rip-relative addressing.
34127 unsigned Bits = Ty->getScalarSizeInBits();
34151 // These are non-commutative binops.
34187 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34189 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34190 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34195 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34201 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34230 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34231 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34235 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34256 // X86 has 8, 16, and 32-bit zero-extending loads.
34267 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34271 if (I->getOpcode() == Instruction::Mul &&
34272 VTy->getElementType()->isIntegerTy(64)) {
34273 for (auto &Op : I->operands()) {
34275 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34283 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34298 int ShiftAmountOpNum = -1;
34299 if (I->isShift())
34302 if (II->getIntrinsicID() == Intrinsic::fshl ||
34303 II->getIntrinsicID() == Intrinsic::fshr)
34307 if (ShiftAmountOpNum == -1)
34310 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34311 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34312 isVectorShiftByScalarCheap(I->getType())) {
34313 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34369 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34393 // Very little shuffling can be done for 64-bit vectors right now.
34405 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34426 // zero-extensions.
34433 //===----------------------------------------------------------------------===//
34435 //===----------------------------------------------------------------------===//
34442 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34452 for (MachineBasicBlock *Succ : BB->successors())
34453 if (Succ->isLiveIn(X86::EFLAGS))
34464 const BasicBlock *BB = MBB->getBasicBlock();
34465 MachineFunction::iterator I = ++MBB->getIterator();
34473 // s0 = -1
34483 MachineFunction *MF = MBB->getParent();
34484 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34485 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34486 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34487 MF->insert(I, mainMBB);
34488 MF->insert(I, fallMBB);
34489 MF->insert(I, sinkMBB);
34492 mainMBB->addLiveIn(X86::EFLAGS);
34493 fallMBB->addLiveIn(X86::EFLAGS);
34494 sinkMBB->addLiveIn(X86::EFLAGS);
34498 sinkMBB->splice(sinkMBB->begin(), MBB,
34499 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34500 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34502 MachineRegisterInfo &MRI = MF->getRegInfo();
34512 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34513 thisMBB->addSuccessor(mainMBB);
34514 thisMBB->addSuccessor(fallMBB);
34517 // mainDstReg := -1
34518 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34519 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34520 mainMBB->addSuccessor(sinkMBB);
34526 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34527 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34529 fallMBB->addSuccessor(sinkMBB);
34533 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34544 // Emit va_arg instruction on X86-64.
34546 // Operands to this pseudo-instruction:
34548 // 1-5) Input : va_list address (addr, i64mem)
34552 // 9 ) EFLAGS (implicit-def)
34567 MachineFunction *MF = MBB->getParent();
34575 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34576 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34577 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34578 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34582 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
34584 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
34643 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34644 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34645 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34646 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34648 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34651 MF->insert(MBBIter, offsetMBB);
34652 MF->insert(MBBIter, overflowMBB);
34653 MF->insert(MBBIter, endMBB);
34656 endMBB->splice(endMBB->begin(), thisMBB,
34657 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34658 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34661 thisMBB->addSuccessor(offsetMBB);
34662 thisMBB->addSuccessor(overflowMBB);
34665 offsetMBB->addSuccessor(endMBB);
34666 overflowMBB->addSuccessor(endMBB);
34670 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34679 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34681 .addImm(MaxOffset + 8 - ArgSizeA8);
34685 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34697 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34707 // Zero-extend the offset
34709 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34715 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34720 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34727 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34732 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34742 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34753 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34768 // aligned_addr = (addr + (align-1)) & ~(align-1)
34771 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34774 .addImm(Alignment.value() - 1);
34778 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34781 .addImm(~(uint64_t)(Alignment.value() - 1));
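// Scalar form of the add/and pair emitted above (hypothetical helper):
// round an address up to a power-of-two alignment.
static inline uint64_t alignUp(uint64_t Addr, uint64_t Alignment) {
  assert(llvm::isPowerOf2_64(Alignment) && "expected power-of-two alignment");
  return (Addr + Alignment - 1) & ~(Alignment - 1);
}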
34783 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34788 // (the overflow address should be kept 8-byte aligned)
34792 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34799 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34810 BuildMI(*endMBB, endMBB->begin(), MIMD,
34811 TII->get(X86::PHI), DestReg)
34835 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34839 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34840 // together with other CMOV pseudo-opcodes into a single basic-block with
34885 MachineFunction *MF = TrueMBB->getParent();
34886 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34889 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34892 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34904 Register DestReg = MIIt->getOperand(0).getReg();
34905 Register Op1Reg = MIIt->getOperand(1).getReg();
34906 Register Op2Reg = MIIt->getOperand(2).getReg();
34911 if (MIIt->getOperand(3).getImm() == OppCC)
34921 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34966 // because this custom-inserter would have generated:
35013 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35014 MachineFunction *F = ThisMBB->getParent();
35015 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35016 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35017 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35019 MachineFunction::iterator It = ++ThisMBB->getIterator();
35020 F->insert(It, FirstInsertedMBB);
35021 F->insert(It, SecondInsertedMBB);
35022 F->insert(It, SinkMBB);
35027 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35034 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35035 SinkMBB->addLiveIn(X86::EFLAGS);
35039 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35041 ThisMBB->end());
35042 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35045 ThisMBB->addSuccessor(FirstInsertedMBB);
35047 ThisMBB->addSuccessor(SinkMBB);
35049 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35051 FirstInsertedMBB->addSuccessor(SinkMBB);
35053 SecondInsertedMBB->addSuccessor(SinkMBB);
35057 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35061 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35071 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35095 // diamond control-flow pattern. The incoming instruction knows the
35104 // fallthrough --> FalseMBB
35106 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35139 // function - EmitLoweredCascadedSelect.
35153 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35154 (NextMIIt->getOperand(3).getImm() == CC ||
35155 NextMIIt->getOperand(3).getImm() == OppCC)) {
35157 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35163 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35164 NextMIIt->getOpcode() == MI.getOpcode() &&
35165 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35166 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35167 NextMIIt->getOperand(1).isKill()) {
35171 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35172 MachineFunction *F = ThisMBB->getParent();
35173 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35174 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35176 MachineFunction::iterator It = ++ThisMBB->getIterator();
35177 F->insert(It, FalseMBB);
35178 F->insert(It, SinkMBB);
35181 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
35182 FalseMBB->setCallFrameSize(CallFrameSize);
35183 SinkMBB->setCallFrameSize(CallFrameSize);
35188 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35190 FalseMBB->addLiveIn(X86::EFLAGS);
35191 SinkMBB->addLiveIn(X86::EFLAGS);
35199 SinkMBB->push_back(MI.removeFromParent());
35202 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35204 ThisMBB->end());
35205 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35208 ThisMBB->addSuccessor(FalseMBB);
35210 ThisMBB->addSuccessor(SinkMBB);
35212 FalseMBB->addSuccessor(SinkMBB);
35215 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35226 ThisMBB->erase(MIItBegin, MIItEnd);
35241 MachineFunction *MF = MBB->getParent();
35245 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35249 MachineRegisterInfo &MRI = MF->getRegInfo();
35250 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35251 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35252 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35254 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35255 MF->insert(MBBIter, testMBB);
35256 MF->insert(MBBIter, blockMBB);
35257 MF->insert(MBBIter, tailMBB);
35268 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35272 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35280 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35284 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35287 testMBB->addSuccessor(blockMBB);
35288 testMBB->addSuccessor(tailMBB);
35294 // + ---- <- ------------ <- ------------- <- ------------ +
35296 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35298 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35304 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35307 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35312 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35313 blockMBB->addSuccessor(testMBB);
35316 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35320 tailMBB->splice(tailMBB->end(), MBB,
35321 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35322 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35323 MBB->addSuccessor(testMBB);
35335 MachineFunction *MF = BB->getParent();
35338 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35340 assert(MF->shouldSplitStack());
35364 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35365 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35366 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35368 MachineRegisterInfo &MRI = MF->getRegInfo();
35370 getRegClassFor(getPointerTy(MF->getDataLayout()));
35380 MachineFunction::iterator MBBIter = ++BB->getIterator();
35382 MF->insert(MBBIter, bumpMBB);
35383 MF->insert(MBBIter, mallocMBB);
35384 MF->insert(MBBIter, continueMBB);
35386 continueMBB->splice(continueMBB->begin(), BB,
35387 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35388 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35392 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35393 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35395 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35398 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35402 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35404 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35406 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35410 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35412 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35414 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35420 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35422 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35428 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35430 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35431 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35438 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35441 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35443 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35446 BB->addSuccessor(bumpMBB);
35447 BB->addSuccessor(mallocMBB);
35448 mallocMBB->addSuccessor(continueMBB);
35449 bumpMBB->addSuccessor(continueMBB);
35452 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35469 MachineFunction *MF = BB->getParent();
35475 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
35478 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35485 MF->CreateMachineBasicBlock(BB->getBasicBlock());
35486 assert(BB->succ_size() == 1);
35487 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35488 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35489 BB->addSuccessor(RestoreMBB);
35494 RestoreMBB->setIsEHPad(true);
35496 auto RestoreMBBI = RestoreMBB->begin();
35505 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35507 // inside MC, therefore without the two markers shrink-wrapping
35511 MachineFunction &MF = *BB->getParent();
35518 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35526 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35535 // our load from the relocation, sticking it in either RDI (x86-64)
35538 MachineFunction *F = BB->getParent();
35546 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35550 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35551 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35554 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35561 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35566 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35573 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35578 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35579 .addReg(TII->getGlobalBaseReg(F))
35585 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35614 // aliases and are doing non-trivial configuration of the thunk's body. For
35615 // example, the Linux kernel will do boot-time hot patching of the thunk
35621 // LLVM will generate calls to specific thunks, we merely make a best-effort
35626 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35629 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35632 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35635 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35638 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35646 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35649 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35652 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35655 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35658 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35661 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35668 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35684 // Find an available scratch register to hold the callee. On 64-bit, we can
35686 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35704 // Choose the first remaining non-zero available register.
35718 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35721 MI.setDesc(TII->get(Opc));
35722 MachineInstrBuilder(*BB->getParent(), &MI)
35742 MachineFunction *MF = MBB->getParent();
35744 MachineRegisterInfo &MRI = MF->getRegInfo();
35752 MVT PVT = getPointerTy(MF->getDataLayout());
35756 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35764 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35768 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35785 MachineFunction *MF = MBB->getParent();
35788 MachineRegisterInfo &MRI = MF->getRegInfo();
35790 const BasicBlock *BB = MBB->getBasicBlock();
35791 MachineFunction::iterator I = ++MBB->getIterator();
35804 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35811 MVT PVT = getPointerTy(MF->getDataLayout());
35818 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35832 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35833 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35834 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35835 MF->insert(I, mainMBB);
35836 MF->insert(I, sinkMBB);
35837 MF->push_back(restoreMBB);
35838 restoreMBB->setMachineBlockAddressTaken();
35843 sinkMBB->splice(sinkMBB->begin(), MBB,
35844 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35845 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35851 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35860 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35868 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35869 .addReg(XII->getGlobalBaseReg(MF))
35878 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35891 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
35896 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35900 MIB.addRegMask(RegInfo->getNoPreservedMask());
35901 thisMBB->addSuccessor(mainMBB);
35902 thisMBB->addSuccessor(restoreMBB);
35906 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35907 mainMBB->addSuccessor(sinkMBB);
35910 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35917 if (RegInfo->hasBasePointer(*MF)) {
35920 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35921 X86FI->setRestoreBasePointer(MF);
35922 Register FramePtr = RegInfo->getFrameRegister(*MF);
35923 Register BasePtr = RegInfo->getBaseRegister();
35925 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35926 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35929 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35930 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35931 restoreMBB->addSuccessor(sinkMBB);
35946 MachineFunction *MF = MBB->getParent();
35948 MachineRegisterInfo &MRI = MF->getRegInfo();
35954 MVT PVT = getPointerTy(MF->getDataLayout());
35981 MachineFunction::iterator I = ++MBB->getIterator();
35982 const BasicBlock *BB = MBB->getBasicBlock();
35984 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35985 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35986 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35987 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35988 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35989 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35990 MF->insert(I, checkSspMBB);
35991 MF->insert(I, fallMBB);
35992 MF->insert(I, fixShadowMBB);
35993 MF->insert(I, fixShadowLoopPrepareMBB);
35994 MF->insert(I, fixShadowLoopMBB);
35995 MF->insert(I, sinkMBB);
35998 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35999 MBB->end());
36000 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36002 MBB->addSuccessor(checkSspMBB);
36006 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36010 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36020 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36025 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36028 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36031 checkSspMBB->addSuccessor(sinkMBB);
36032 checkSspMBB->addSuccessor(fallMBB);
36039 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36055 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36060 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36063 fallMBB->addSuccessor(sinkMBB);
36064 fallMBB->addSuccessor(fixShadowMBB);
36070 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36076 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36080 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36085 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36088 fixShadowMBB->addSuccessor(sinkMBB);
36089 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36094 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36101 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36103 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36109 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36116 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36120 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36123 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36126 fixShadowLoopMBB->addSuccessor(sinkMBB);
36127 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36136 MachineFunction *MF = MBB->getParent();
36138 MachineRegisterInfo &MRI = MF->getRegInfo();
36144 MVT PVT = getPointerTy(MF->getDataLayout());
36154 Register SP = RegInfo->getStackRegister();
36167 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36172 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36184 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
36198 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36209 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36220 MachineFunction *MF = MBB->getParent();
36221 MachineRegisterInfo *MRI = &MF->getRegInfo();
36224 MVT PVT = getPointerTy(MF->getDataLayout());
36230 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36238 VR = MRI->createVirtualRegister(TRC);
36242 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36249 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36250 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36257 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36269 MachineFunction *MF = BB->getParent();
36270 MachineRegisterInfo *MRI = &MF->getRegInfo();
36272 int FI = MF->getFrameInfo().getFunctionContextIndex();
36292 if (!MF->hasCallSiteLandingPad(Sym))
36295 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36309 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36319 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36320 DispatchBB->setIsEHPad(true);
36322 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36323 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36324 DispatchBB->addSuccessor(TrapBB);
36326 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36327 DispatchBB->addSuccessor(DispContBB);
36330 MF->push_back(DispatchBB);
36331 MF->push_back(DispContBB);
36332 MF->push_back(TrapBB);
36340 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36341 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36343 const X86RegisterInfo &RI = TII->getRegisterInfo();
36349 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36350 MFI->setRestoreBasePointer(MF);
36355 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36356 MFI->getRestoreBasePointerOffset())
36359 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36364 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36365 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36367 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36370 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36375 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36376 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36379 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36386 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36394 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36402 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36403 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36404 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36407 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36414 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36417 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36421 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36429 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36441 DispContBB->addSuccessor(LP);
36445 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36450 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
36451 MBB->succ_rend());
36454 if (MBBS->isEHPad()) {
36455 MBB->removeSuccessor(MBBS);
36460 MBB->addSuccessor(DispatchBB);
36462 // Find the invoke call and mark all of the callee-saved registers as
36486 // Mark all former landing pads as non-landing pads. The dispatch is the only
36489 LP->setIsEHPad(false);
36503 MachineFunction &MF = *BB->getParent();
36510 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36516 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36524 MachineFunction *MF = BB->getParent();
36591 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36592 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36596 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36597 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36602 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36603 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36609 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36610 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36615 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36616 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36621 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36626 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36631 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36642 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36661 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36662 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36666 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36667 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36671 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36672 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36677 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36678 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36683 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36684 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36690 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36695 // clang-format off
36706 // clang-format on
36710 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36714 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36758 // - which is ESI for i686 - register allocator would not be able to
36760 // - there never would be enough unreserved registers during regalloc
36765 // If it is not i686 or there is no base pointer - nothing to do here.
36766 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36773 assert(TRI->getBaseRegister() == X86::ESI &&
36777 MachineRegisterInfo &MRI = MF->getRegInfo();
36778 MVT SPTy = getPointerTy(MF->getDataLayout());
36792 while (RMBBI != BB->rend() &&
36793 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36794 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36795 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36796 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36801 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36809 Register BasePtr = TRI->getBaseRegister();
36810 if (TRI->hasBasePointer(*MF) &&
36812 if (!BB->isLiveIn(BasePtr))
36813 BB->addLiveIn(BasePtr);
36816 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36817 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36819 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36821 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36828 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36831 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36840 Register BasePtr = TRI->getBaseRegister();
36844 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36845 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36847 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36849 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36851 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36854 if (!BB->isLiveIn(BasePtr)) {
36855 BB->addLiveIn(BasePtr);
36858 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36860 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36862 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36865 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36866 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36869 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36870 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36879 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36880 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36881 MFI->setHasPreallocatedCall(true);
36883 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36887 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36894 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36897 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36898 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36902 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36916 // clang-format off
36924 // clang-format on
36927 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36938 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36940 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36941 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
36945 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36946 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
36968 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36976 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36992 // clang-format off
36996 // clang-format on
36998 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37009 //===----------------------------------------------------------------------===//
37011 //===----------------------------------------------------------------------===//
37039 // For vectors - if we have a constant, then try to sign extend.
37068 const APInt &Mask = C->getAPIntValue();
37070 // Clear all non-demanded bits initially.
37094 // and non-demanded bits.
37254 ShAmt = VT.getScalarSizeInBits() - 1;
37303 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
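For context on the LUT remark above: this is the classic PSHUFB nibble-lookup trick, e.g. for population count - split each byte into two 4-bit nibbles and look each one up in a 16-entry table held in the shuffle source register. A minimal scalar sketch of that technique, in plain C++ with illustrative names (no intrinsics, not code from this file):

  #include <cstdint>

  // 16-entry popcount table - the value the PSHUFB source register would hold.
  static const uint8_t NibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                           1, 2, 2, 3, 2, 3, 3, 4};

  // Per-byte popcount via two 4-bit lookups, mirroring
  // pshufb(lut, lo_nibbles) + pshufb(lut, hi_nibbles).
  inline uint8_t PopcntByteViaLUT(uint8_t B) {
    return NibblePopcnt[B & 0xF] + NibblePopcnt[B >> 4];
  }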
37419 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37420 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
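The two extractBitsAsZExtValue calls above decode a BEXTR-style control operand: bits [7:0] hold the start position and bits [15:8] the field length. A hedged scalar model of that semantics (illustrative helper, not an LLVM API):

  #include <cstdint>

  // Scalar model of BEXTR: extract 'Length' bits of Src starting at bit 'Shift'.
  // Control layout: Shift in bits [7:0], Length in bits [15:8].
  inline uint64_t BextrModel(uint64_t Src, uint32_t Control) {
    unsigned Shift = Control & 0xFF;
    unsigned Length = (Control >> 8) & 0xFF;
    if (Shift >= 64)
      return 0;
    uint64_t Field = Src >> Shift;
    if (Length >= 64)
      return Field;
    return Field & ((uint64_t(1) << Length) - 1);
  }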
37442 // The result will have at least as many trailing zeros as the non-mask
37474 // Truncations/Conversions - upper elements are known zero.
37489 // Strict Conversions - upper elements are known zero.
37538 switch (Op->getConstantOperandVal(0)) {
37581 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37611 // TODO - handle target shuffle ops with different value types.
37648 if (Tmp > (NumSrcBits - VTBits))
37649 return Tmp - (NumSrcBits - VTBits);
37661 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37684 if (Tmp > (SrcBits - VTBits))
37685 return Tmp - (SrcBits - VTBits);
37700 return VTBits; // Shifted all bits out --> zero.
37703 return 1; // Shifted all sign bits out --> unknown.
37704 return Tmp - ShiftVal.getZExtValue();
37710 if (ShiftVal.uge(VTBits - 1))
37718 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37729 // Vector compares return zero/all-bits result values.
37750 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37777 // TODO - handle target shuffle ops with different value types.
37800 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37801 return N->getOperand(0);
37810 if (!LN->isSimple())
37814 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37816 LN->getPointerInfo(), LN->getOriginalAlign(),
37817 LN->getMemOperand()->getFlags());
37831 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37834 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37836 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37847 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37865 unsigned Len = Scale - 1;
37891 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
37895 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37926 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37948 "AVX512 required for 512-bit vector shuffles");
37994 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
38011 // VPERMILPD can permute with a non-repeating shuffle.
38032 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
38037 // Narrow the repeated mask to create 32-bit element permutes.
38075 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38189 // Use (SSE41) PACKUSDW if the leading zero bits go down to the lowest 16-bits.
38198 // Use PACKUSWB if the leading zero bits go down to the lowest 8-bits.
38205 // Use PACKSSDW if the sign bits extend down to the lowest 16-bits.
38231 // non-blended source element is zero in each case.
38441 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38442 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38445 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38446 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38449 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38450 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38457 int ShufMask[4] = {-1, -1, -1, -1};
38494 /// chain of single-use x86 shuffle instructions and accumulated the combined
38551 // is different from the root element size - this would prevent writemasks
38555 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38556 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38573 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38588 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38594 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38615 int PermMask[4] = {-1, -1, -1, -1};
38619 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38660 // Handle 128-bit lane shuffles of 256-bit vectors.
38675 // If we're inserting the low subvector, an insert-subvector 'concat'
38708 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38727 // For masks that have been widened to 128-bit elements or more,
38728 // narrow back down to 64-bit elements.
38739 // TODO - variable shuffles might need this to be widened again.
38779 // Attempt to match against broadcast-from-vector.
38965 // Don't try to re-form single instruction chains under any circumstances now
38987 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38996 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
39011 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
39012 // vector as the second source (non-VLX will pad to 512-bit shuffles).
39023 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39041 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39042 // (non-VLX will pad to 512-bit shuffles).
39061 // See if we can combine a single input shuffle with zeros to a bit-mask,
39089 // the 128-bit lanes use the variable mask to VPERMILPS.
39106 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39112 // Bits[3] - Match Bit.
39113 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39114 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
39121 VPerm2Idx.push_back(-1);
39174 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39175 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39180 // Bits[4:0] - Byte Index (0 - 31)
39181 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
39214 // (non-VLX will pad to 512-bit shuffles)
39244 // -->
39286 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
39325 // elements, and shrink them to the half-width mask. It does this in a loop
39413 // the HOP args are pre-shuffled.
39425 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39439 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39459 int PostMask[4] = {-1, -1, -1, -1};
39481 SDValue BC1 = BC[BC.size() - 1];
39505 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39519 M -= NumHalfEltsPerLane;
39522 M -= NumHalfEltsPerLane;
39552 // If we are post-shuffling a 256-bit hop and not requiring the upper
39553 // elements, then try to narrow to a 128-bit hop directly.
39602 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39674 /// of single-use shuffle instructions, build a generic model of the cumulative
39681 /// special-purpose shuffle.
39697 /// combine-ordering. To fix this, we should do the redundant instruction
39724 return SDValue(); // Bail if we hit a non-simple non-vector.
39738 OpDemandedElts.setBit(M - BaseIdx);
39741 // Op is smaller than Root - extract the demanded elts for the subvector.
39746 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39798 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39834 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39840 // Match failed - should we replace an existing Op?
39847 return Ops.size() - 1;
39853 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39862 // This function can be performance-critical, so we rely on the power-of-2
39864 // bit-masks and shifts.
39866 "Non-power-of-2 shuffle mask sizes");
39868 "Non-power-of-2 shuffle mask sizes");
39879 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39880 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39881 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39902 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39912 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39921 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39924 (RootMaskedIdx & (OpRatio - 1));
39926 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39987 if (Ops.size() < (MaxDepth - Depth)) {
39996 if (Ops[i].getNode()->hasOneUse() ||
40014 // If constant fold failed and we only have constants - then we have
40015 // multiple uses by a single non-variable shuffle - just bail.
40047 int OpEltIdx = MaskElt - Lo;
40059 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40086 // Reresolve - we might have repeated subvector sources.
40094 // elements, and shrink them to the half-width mask. It does this in a loop
40146 /// Get the PSHUF-style mask from PSHUF node.
40149 /// PSHUF-style masks that can be reused with such instructions.
40158 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40165 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40166 "Mask doesn't repeat in high 128-bit lanes!");
40180 M -= 4;
40197 "Called with something other than an x86 128-bit half shuffle!");
40199 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40220 // dword shuffle, and the high words are self-contained.
40230 // dword shuffle, and the low words are self-contained.
40240 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40246 // Search for a half-shuffle which we can combine with.
40250 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40321 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40328 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40330 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40374 // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
40444 // the blend mask is the same in the 128-bit subvectors (or can widen to
40454 // Don't introduce lane-crossing permutes without AVX2, unless it can be
40469 // TODO - move this to TLI like isBinOp?
40480 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40481 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40499 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40500 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40501 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40530 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40575 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40576 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40639 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40708 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40730 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40747 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40748 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40759 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40760 // If we're re-broadcasting a smaller type then broadcast with that type and
40774 // Reduce broadcast source vector to lowest 128-bits.
40779 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40784 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40794 for (SDNode *User : Src->uses())
40795 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40796 Src == User->getOperand(0) &&
40797 User->getValueSizeInBits(0).getFixedValue() >
40803 // vbroadcast(scalarload X) -> vbroadcast_load X
40809 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40812 LN->getMemoryVT(), LN->getMemOperand());
40839 if (LN->isSimple()) {
40841 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40844 LN->getPointerInfo(), LN->getOriginalAlign(),
40845 LN->getMemOperand()->getFlags());
40857 if (LN->getMemoryVT().getSizeInBits() == 16) {
40859 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40862 LN->getMemoryVT(), LN->getMemOperand());
40881 LN->isSimple()) {
40885 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
40886 SDValue Ops[] = { LN->getChain(), Ptr };
40889 LN->getPointerInfo().getWithOffset(Offset),
40890 LN->getOriginalAlign(),
40891 LN->getMemOperand()->getFlags());
40900 // vbroadcast(vzload X) -> vbroadcast_load X
40903 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40905 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40908 LN->getMemoryVT(), LN->getMemOperand());
40916 // vbroadcast(vector load X) -> vbroadcast_load
40922 if (LN->isSimple()) {
40924 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40927 LN->getPointerInfo(), LN->getOriginalAlign(),
40928 LN->getMemOperand()->getFlags());
40959 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40961 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40964 LN->getMemoryVT(), LN->getMemOperand());
40991 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40994 // Create a vector constant - scalar constant followed by zeros.
41000 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41007 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41016 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41042 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41061 // --> m3 = blend(m1,m2)
41097 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41131 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41146 // If we're permuting the upper 256-bits subvectors of a concatenation, then
41149 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
41151 SDValue LHS = N->getOperand(0);
41152 SDValue RHS = N->getOperand(1);
41153 uint64_t Mask = N->getConstantOperandVal(2);
41174 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41175 SDValue LHS = N->getOperand(0);
41176 SDValue RHS = N->getOperand(1);
41184 N->getOperand(2)));
41188 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41193 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
41225 if (N0->hasOneUse()) {
41260 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41309 // Zero/UNDEF insertion - zero out element and remove dependency.
41375 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41376 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41377 MemIntr->getBasePtr(),
41378 MemIntr->getMemOperand());
41420 // Nuke no-op shuffles that show up after combining.
41435 // dwords as otherwise it would have been removed as a no-op.
41450 // only works when we have a PSHUFD followed by two half-shuffles.
41499 int ParitySrc[2] = {-1, -1};
41538 EVT VT = N->getValueType(0);
41544 // We only handle target-independent shuffles.
41547 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41550 SDValue V1 = N->getOperand(0);
41551 SDValue V2 = N->getOperand(1);
41560 if (!V1->hasOneUse() || !V2->hasOneUse())
41567 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41568 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41569 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41573 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41574 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41575 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41579 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41585 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41586 : V2->getOpcode() == ISD::FADD;
41597 // We only handle target-independent shuffles.
41600 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41603 MVT VT = N->getSimpleValueType(0);
41609 SDValue Op0 = N->getOperand(0);
41610 SDValue Op1 = N->getOperand(1);
41622 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41634 /// Try to combine a shuffle into a target-specific add-sub or
41635 /// mul-add-sub node.
41647 MVT VT = N->getSimpleValueType(0);
41659 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41661 // X86 targets with 512-bit ADDSUB instructions!
41676 // if we can express this as a single-source shuffle, that's preferable.
41683 EVT VT = N->getValueType(0);
41685 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41695 SDValue N0 = N->getOperand(0);
41696 SDValue N1 = N->getOperand(1);
41711 for (int Elt : SVOp->getMask())
41712 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41723 EVT VT = Shuf->getValueType(0);
41724 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41730 ArrayRef<int> Mask = Shuf->getMask();
41735 // (half-index output is 0 or 2).
41742 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41744 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41747 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41748 Shuf->getOperand(1), HalfMask, HalfIdx1,
41762 EVT VT = N->getValueType(0);
41783 if (isTargetShuffle(N->getOpcode())) {
41789 // instructions into higher-order shuffles. We do this after combining
41797 // TODO - merge this into combineX86ShufflesRecursively.
41802 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41803 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41838 if (!Load || !Load->getBasePtr().hasOneUse())
41845 Type *CTy = C->getType();
41846 if (!CTy->isVectorTy() ||
41847 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41850 // Handle scaling for i64 elements on 32-bit targets.
41851 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41860 Constant *Elt = C->getAggregateElement(i);
41862 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41878 Load->getAlign());
41963 // We only need the bottom 64-bits of the (128-bit) shift amount.
41969 // only the bottom 64-bits are ever used.
41970 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41971 unsigned UseOpc = Use->getOpcode();
41974 Use->getOperand(0) != Amt;
41994 // Fold shift(0,x) -> 0
42020 // Fold shift(0,x) -> 0
42049 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42050 unsigned ShiftAmt = Amt->getZExtValue();
42062 int Diff = ShiftAmt - C1;
42064 Diff = -Diff;
42088 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42089 unsigned ShiftAmt = Amt->getZExtValue();
42101 int Diff = ShiftAmt - C1;
42103 Diff = -Diff;
42144 // We can't assume an undef src element gives an undef dst - the
42215 // TODO - pass on known zero/undef.
42218 // TODO - we should do this for all target/faux shuffles ops.
42252 // TODO - pass on known zero/undef.
42330 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
42331 Mem->getMemOperand());
42355 // TODO - we should do this for all target/faux shuffles ops.
42381 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42382 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42383 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42389 // See if 512-bit ops only use the bottom 128-bits.
42412 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42414 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42415 MemIntr->getMemOperand());
42424 EVT MemVT = MemIntr->getMemoryVT();
42428 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42429 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42506 // (Non-Lane Crossing) Target Shuffles.
42610 int M = OpMask[i] - Lo;
42615 // TODO - Propagate input undef/zero elts.
42625 // can handle - so pretend its Depth == 0 again, and reduce the max depth
42638 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42676 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42695 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42756 int Diff = ShAmt - Shift2Amt;
42771 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42772 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42808 unsigned ShAmt = Op1->getAsZExtVal();
42818 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42842 if (Known.Zero[BitWidth - ShAmt - 1] ||
42848 if (Known.One[BitWidth - ShAmt - 1])
42881 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42882 unsigned Idx = CIdx->getZExtValue();
42886 // bits from the implicit zext - simplify to zero.
42919 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42920 unsigned Idx = CIdx->getZExtValue();
42946 // TODO - add known bits handling.
42960 // Attempt to avoid multi-use ops if we don't need anything from them.
42971 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42986 Src->hasOneUse()) {
42998 // icmp sgt(0, R) == ashr(R, BitWidth-1).
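The identity above - a signed "is negative" compare yielding a 0/all-ones value is the same as an arithmetic shift right by BitWidth-1 - can be spot-checked in scalar form (plain C++ sketch; assumes arithmetic >> on signed values, guaranteed since C++20):

  #include <cassert>
  #include <cstdint>

  inline int32_t IsNegMaskCmp(int32_t R) { return 0 > R ? -1 : 0; }
  inline int32_t IsNegMaskAshr(int32_t R) { return R >> 31; } // BitWidth-1 for i32

  inline void CheckIsNegIdentity(int32_t R) {
    assert(IsNegMaskCmp(R) == IsNegMaskAshr(R));
  }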
43014 // See if we only demand bits from the lower 128-bit vector.
43029 Known.Zero.setHighBits(BitWidth - NumElts);
43038 if (KnownSrc.One[SrcBits - 1])
43040 else if (KnownSrc.Zero[SrcBits - 1])
43043 // Attempt to avoid multi-use ops if we don't need anything from it.
43060 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
43084 // Only bottom 16-bits of the control bits are required.
43087 uint64_t Val1 = Cst1->getZExtValue();
43096 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43097 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43134 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43152 // The result will have at least as many trailing zeros as the non-mask
43177 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43178 !DemandedElts[CIdx->getZExtValue()])
43189 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43190 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43201 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43394 // clang-format off
43399 // clang-format on
43404 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43415 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43466 // ->
43499 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43508 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43532 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43538 // avoid sign-extending to this type entirely.
43552 // sign-extend to a 256-bit operation to avoid truncation.
43562 // sign-extend to a 256-bit operation to match the compare.
43563 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43564 // 256-bit because the shuffle is cheaper than sign extending the result of
43575 // it is not profitable to sign-extend to 256-bit because this will
43576 // require an extra cross-lane shuffle which is more expensive than
43577 // truncating the result of the compare to 128-bits.
43629 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43639 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43644 // Only do this if we have k-registers.
43648 EVT DstVT = N->getValueType(0);
43649 SDValue Op = N->getOperand(0);
43695 unsigned NumElts = BV->getNumOperands();
43696 SDValue Splat = BV->getSplatValue();
43720 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43728 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43736 // Use PSHUFW to repeat 16-bit elements.
43747 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43792 if (C->isZero())
43794 if (C->isAllOnes())
43848 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43864 SDValue N0 = N->getOperand(0);
43865 EVT VT = N->getValueType(0);
43871 // ->
43902 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43906 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43951 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43961 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43964 MemVT, BCast->getMemOperand());
43972 // avoiding store-load conversions.
43981 // Handle zero-extension of i32 with MOVD.
43986 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44010 // Detect bitcasts of 64-bit build vectors and convert to a
44049 if (C->isAllOnes())
44051 if (C->isZero())
44057 // Turn it into a sign bit compare that produces a k-register. This avoids
44098 // remove GPR<->K-register crossings.
44103 // floating-point operand into a floating-point logic operation. This may
44109 // clang-format off
44114 // clang-format on
44129 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44138 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44161 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44168 return (BV && BV->isConstant());
44186 SDValue AbsOp1 = Abs->getOperand(0);
44193 // Check if the operands of the sub are zero-extended from vectors of i8.
44223 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44256 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44285 EVT ExtractVT = Extract->getValueType(0);
44304 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44331 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44352 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44359 EVT ExtractVT = Extract->getValueType(0);
44387 // Special case for (pre-legalization) vXi1 reductions.
44391 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
44394 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
44395 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
44425 // FIXME: Better handling of k-registers or 512-bit vectors?
44466 // parity -> (PARITY(MOVMSK X))
44474 // any_of -> MOVMSK != 0
44478 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44485 // negate to get the final 0/-1 mask value.
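To make the MOVMSK mapping above concrete: MOVMSK packs the sign bit of every element into a scalar mask, after which any_of is "mask != 0", all_of is "mask == (1 << NumElts) - 1", and parity is the XOR of the mask bits. A scalar sketch of the idea (hand-rolled mask over an array, assumes NumElts <= 31; not the intrinsic):

  #include <bit>
  #include <cstddef>
  #include <cstdint>

  // Collect the sign bit of each element into one scalar, as MOVMSKPS/PMOVMSKB do.
  inline uint32_t MoveMask(const int32_t *Elts, size_t NumElts) {
    uint32_t Mask = 0;
    for (size_t I = 0; I != NumElts; ++I)
      Mask |= uint32_t(Elts[I] < 0) << I;
    return Mask;
  }

  inline bool AnyOfNeg(const int32_t *E, size_t N) { return MoveMask(E, N) != 0; }
  inline bool AllOfNeg(const int32_t *E, size_t N) {
    return MoveMask(E, N) == ((uint32_t(1) << N) - 1);
  }
  inline bool ParityOfNeg(const int32_t *E, size_t N) {
    return (std::popcount(MoveMask(E, N)) & 1) != 0;
  }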
44497 EVT ExtractVT = Extract->getValueType(0);
44503 EVT VT = Extract->getOperand(0).getValueType();
44512 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
44543 for (unsigned i = Stages - StageBias; i > 0; --i) {
44544 SmallVector<int, 16> Mask(DpElems, -1);
44545 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44560 Extract->getOperand(1));
44569 EVT ExtractVT = Extract->getValueType(0);
44575 EVT VT = Extract->getOperand(0).getValueType();
44597 // abs-diff pattern.
44601 // Check whether we have an abs-diff pattern feeding into the select.
44617 for (unsigned i = Stages - 3; i > 0; --i) {
44618 SmallVector<int, 16> Mask(SadElems, -1);
44619 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44634 Extract->getOperand(1));
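The loop above emits log2(N) shuffle+add stages, each stage folding the upper half of the vector onto the lower half. The same reduction shape in scalar form, as a sketch over a plain array rather than DAG nodes (assumes a non-empty, power-of-2 element count):

  #include <cstdint>
  #include <vector>

  // Tree reduction: add the upper half onto the lower half and halve the
  // active width each stage - the scalar analogue of the shuffle-and-add loop.
  inline uint32_t HalvingReduce(std::vector<uint32_t> V) {
    for (size_t Width = V.size() / 2; Width != 0; Width /= 2)
      for (size_t I = 0; I != Width; ++I)
        V[I] += V[I + Width];
    return V[0];
  }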
44638 // integer, that requires a potentially expensive XMM -> GPR transfer.
44643 // to a single-use of the loaded vector. For the reasons above, we
44649 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44653 EVT VT = N->getValueType(0);
44655 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44656 return Use->getOpcode() == ISD::STORE ||
44657 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44658 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44665 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44667 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44669 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44670 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44672 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44673 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44690 SDValue Src = N->getOperand(0);
44691 SDValue Idx = N->getOperand(1);
44693 EVT VT = N->getValueType(0);
44703 const APInt &IdxC = N->getConstantOperandAPInt(1);
44717 // TODO support non-zero offsets.
44731 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44733 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44734 MemIntr->getBasePtr(),
44735 MemIntr->getPointerInfo(),
44736 MemIntr->getOriginalAlign(),
44737 MemIntr->getMemOperand()->getFlags());
44772 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44776 // We can only legally extract other elements from 128-bit vectors and in
44777 // certain circumstances, depending on SSE-level.
44787 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44791 Idx &= (NumEltsPerLane - 1);
44844 // If narrowing/widening failed, see if we can extract+zero-extend.
44855 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44877 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44889 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44890 SDValue Vec = ExtElt->getOperand(0);
44891 SDValue Index = ExtElt->getOperand(1);
44892 EVT VT = ExtElt->getValueType(0);
44896 // non-zero element because the shuffle+scalar op will be cheaper?
44901 // extract, the condition code), so deal with those as a special-case.
44907 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44930 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44942 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44973 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44976 for (SDValue Op : Vec->ops())
44990 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45002 SDValue Index = ExtElt->getOperand(1);
45006 EVT VT = ExtElt->getValueType(0);
45015 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45033 // vXi8 mul reduction - promote to vXi16 mul reduction.
45056 {4, 5, 6, 7, -1, -1, -1, -1}));
45059 {2, 3, -1, -1, -1, -1, -1, -1}));
45062 {1, -1, -1, -1, -1, -1, -1, -1}));
45067 // vXi8 add reduction - sub-128-bit vector.
45076 // Must be a >=128-bit vector with pow2 elements.
45080 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45092 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45101 // If the source vector values are 0-255, then we can use PSADBW to
45138 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
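As background for the 0-255 remark: PSADBW against an all-zero vector sums each group of eight unsigned bytes (as absolute differences from zero) into a 64-bit lane, which is why it serves as a byte add-reduction here. A scalar sketch of one lane, with an illustrative helper name:

  #include <cstdint>
  #include <cstdlib>

  // One 64-bit lane of PSADBW(x, 0): sum of |byte - 0| over eight bytes,
  // i.e. just the sum of the bytes.
  inline uint64_t PsadbwLaneVsZero(const uint8_t Bytes[8]) {
    uint64_t Sum = 0;
    for (int I = 0; I != 8; ++I)
      Sum += uint64_t(std::abs(int(Bytes[I]) - 0));
    return Sum;
  }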
45153 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45156 // TODO: We could extend this to handle 512-bit or even longer vectors.
45169 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45180 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
45187 SDValue InputVector = N->getOperand(0);
45188 SDValue EltIdx = N->getOperand(1);
45192 EVT VT = N->getValueType(0);
45194 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45199 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45210 uint64_t Idx = CIdx->getZExtValue();
45216 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45224 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45235 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45246 // TODO - Remove this once we can handle the implicit zero-extension of
45269 // pre-legalization,
45293 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
45312 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45313 Use->getOperand(0).getResNo() == ResNo &&
45314 Use->getValueType(0) == MVT::i1) {
45316 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45322 if (all_of(InputVector->uses(), IsBoolExtract) &&
45328 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45330 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45342 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
45373 // Input type must be extending a bool vector (bit-casted from a scalar
45395 // must split it down into sub-sections for broadcasting. For example:
45396 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45397 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45422 // For smaller scalar integers, we can simply any-extend it to the vector
45447 // zero-extension.
45451 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45454 /// If a vector select has an operand that is -1 or 0, try to simplify the
45461 SDValue Cond = N->getOperand(0);
45462 SDValue LHS = N->getOperand(1);
45463 SDValue RHS = N->getOperand(2);
45468 if (N->getOpcode() != ISD::VSELECT)
45491 // vector floating-point selects.
45509 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45522 // vselect Cond, 111..., 000... -> Cond
45529 // vselect Cond, 111..., X -> or Cond, X
45536 // vselect Cond, X, 000... -> and Cond, X
45543 // vselect Cond, 000..., X -> andn Cond, X
45547 // The canonical form differs for i1 vectors - x86andnp is not used
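The four rewrites above are the standard bitmask-select identities: once the condition is a per-lane 0/all-ones mask, the select collapses into plain logic ops. A scalar sketch of the same identities on a single 32-bit "lane":

  #include <cassert>
  #include <cstdint>

  // Cond is assumed to be 0 or all-ones (a mask, not a bool).
  inline uint32_t MaskSelect(uint32_t Cond, uint32_t X, uint32_t Y) {
    return (Cond & X) | (~Cond & Y);
  }

  inline void CheckMaskSelectIdentities(uint32_t Cond, uint32_t X) {
    assert(Cond == 0 || Cond == ~0u);
    assert(MaskSelect(Cond, ~0u, 0u) == Cond);       // select C, 1s, 0s -> C
    assert(MaskSelect(Cond, ~0u, X) == (Cond | X));  // select C, 1s, X  -> or
    assert(MaskSelect(Cond, X, 0u) == (Cond & X));   // select C, X, 0s  -> and
    assert(MaskSelect(Cond, 0u, X) == (~Cond & X));  // select C, 0s, X  -> andn
  }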
45560 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45561 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45565 unsigned Opcode = N->getOpcode();
45569 // TODO: Split 512-bit vectors too?
45570 EVT VT = N->getValueType(0);
45575 SDValue Cond = N->getOperand(0);
45576 SDValue TVal = N->getOperand(1);
45577 SDValue FVal = N->getOperand(2);
45593 SDValue Cond = N->getOperand(0);
45594 SDValue LHS = N->getOperand(1);
45595 SDValue RHS = N->getOperand(2);
45603 EVT VT = N->getValueType(0);
45608 // this with a wider condition value (post-legalization it becomes an i8),
45613 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45615 // TODO: For constants that overflow or do not differ by power-of-2 or small
45617 const APInt &TrueVal = TrueC->getAPIntValue();
45618 const APInt &FalseVal = FalseC->getAPIntValue();
45620 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45623 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45646 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
45649 // Multiply condition by the difference if non-one.
45653 // Add the base if non-zero.
45654 if (!FalseC->isZero())
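The lowering sketched in the comments above replaces a select between two constants with branchless arithmetic: zero-extend the i1 condition, multiply by the difference of the constants, and add the false value. A scalar check of that identity (uint32_t, so the subtraction may wrap, which is fine in modular arithmetic):

  #include <cassert>
  #include <cstdint>

  // select Cond, TC, FC  -->  zext(Cond) * (TC - FC) + FC
  inline uint32_t SelectViaMulAdd(bool Cond, uint32_t TC, uint32_t FC) {
    return uint32_t(Cond) * (TC - FC) + FC;
  }

  inline void CheckSelectViaMulAdd(bool Cond, uint32_t TC, uint32_t FC) {
    assert(SelectViaMulAdd(Cond, TC, FC) == (Cond ? TC : FC));
  }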
45663 /// If this is a *dynamic* select (non-constant condition) and we can match
45672 SDValue Cond = N->getOperand(0);
45673 if ((N->getOpcode() != ISD::VSELECT &&
45674 N->getOpcode() != X86ISD::BLENDV) ||
45680 EVT VT = N->getValueType(0);
45686 // cases where a *dynamic* blend will fail even though a constant-condition
45689 // Potentially, we should combine constant-condition vselect nodes
45690 // pre-legalization into shuffles and not mark as many types as custom
45694 // FIXME: We don't support i16-element blends currently. We could and
45696 // rather than just the high bit and using an i8-element blend.
45705 // There are no 512-bit blend instructions that use sign bits.
45710 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45715 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45717 if ((UI->getOpcode() != ISD::VSELECT &&
45718 UI->getOpcode() != X86ISD::BLENDV) ||
45739 for (SDNode *U : Cond->uses()) {
45740 if (U->getOpcode() == X86ISD::BLENDV)
45743 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45744 Cond, U->getOperand(1), U->getOperand(2));
45754 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
45755 N->getOperand(1), N->getOperand(2));
45767 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45770 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45771 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
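The comment lines above state the branchless conditional-negate identity: with M restricted to 0 or all-ones, (M ? -X : X) equals ((X ^ M) + (M & 1)). A scalar sketch verifying it with two's-complement uint32_t arithmetic:

  #include <cassert>
  #include <cstdint>

  // If M is all-ones: X ^ M = ~X and ~X + 1 = -X. If M is 0: X ^ 0 + 0 = X.
  inline uint32_t CondNegate(uint32_t X, uint32_t M) { return (X ^ M) + (M & 1); }

  inline void CheckCondNegate(uint32_t X, uint32_t M) {
    assert(M == 0 || M == ~0u);
    assert(CondNegate(X, M) == (M ? uint32_t(0) - X : X));
  }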
45782 "Mask must be zero/all-bits");
45790 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45791 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45808 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45811 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45813 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45826 if (N->getOpcode() != ISD::VSELECT)
45829 SDValue Cond = N->getOperand(0);
45830 SDValue LHS = N->getOperand(1);
45831 SDValue RHS = N->getOperand(2);
45843 // (vselect M, L, R) -> (vselect ~M, R, L)
45845 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45852 /// Do target-specific dag combines on SELECT and VSELECT nodes.
45857 SDValue Cond = N->getOperand(0);
45858 SDValue LHS = N->getOperand(1);
45859 SDValue RHS = N->getOperand(2);
45878 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45879 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45891 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45894 N->getOpcode() == X86ISD::BLENDV))
45898 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45901 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45933 // ignored in unsafe-math mode).
45940 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46012 // Check for x CC y ? y : x -- a min/max with reversed arms.
46080 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46087 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46088 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46090 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46114 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46117 // AVX512 - Extend select with zero to merge with target shuffle.
46118 // select(mask, extract_subvector(shuffle(x)), zero) -->
46120 // TODO - support non target shuffles as well.
46155 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46160 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46163 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46164 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46167 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46178 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46179 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46198 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46200 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46206 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46212 // clang-format off
46218 // clang-format on
46231 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46243 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46255 // with out-of-bounds clamping.
46259 // to bitwidth-1 for unsigned shifts, effectively performing a maximum left
46260 // shift of bitwidth-1 positions, and returns zero for unsigned right shifts
46261 // exceeding bitwidth-1.
46262 if (N->getOpcode() == ISD::VSELECT) {
46264 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
46265 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
46276 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
46277 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
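A scalar sketch (standalone, not LLVM code) of why both select forms above describe a variable shift whose out-of-range amounts produce zero, which is the behaviour the psllv/psrlv folds rely on; srl is analogous.

#include <cassert>
#include <cstdint>

// Hardware-style variable left shift: amounts >= bitwidth yield 0.
static uint32_t shlZeroOnOverflow(uint32_t X, uint32_t Amt) {
  return Amt < 32 ? (X << Amt) : 0;
}

int main() {
  const uint32_t X = 0x12345678u;
  for (uint32_t Amt : {0u, 1u, 31u, 32u, 100u}) {
    uint32_t SelLt = Amt < 32 ? (X << Amt) : 0;  // select(icmp_ult(amt,BW), shl, 0)
    uint32_t SelGe = Amt >= 32 ? 0 : (X << Amt); // select(icmp_uge(amt,BW), 0, shl)
    assert(SelLt == SelGe && SelLt == shlZeroOnOverflow(X, Amt));
  }
  return 0;
}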
46303 // select(~Cond, X, Y) -> select(Cond, Y, X)
46306 return DAG.getNode(N->getOpcode(), DL, VT,
46309 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46316 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46325 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46355 // This can lower using a vector shift bit-hack rather than mask and compare.
46357 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46361 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46363 // The 'and' mask must be composed of power-of-2 constants.
46366 if (C && C->getAPIntValue().isPowerOf2()) {
46367 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46373 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46374 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46375 // 16-bit lacks a proper blendv.
46383 return C->getAPIntValue().isPowerOf2();
46385 // Create a left-shift constant to get the mask bits over to the sign-bit.
46390 ShlVals.push_back(EltBitWidth - 1 -
46391 MaskVal->getAPIntValue().exactLogBase2());
46393 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46414 // This combine only operates on CMP-like nodes.
46416 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46426 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46427 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46428 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46429 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46433 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46434 // - LHS != 1
46453 APInt Addend = OpRHSC->getAPIntValue();
46455 Addend = -Addend;
46461 APInt Comparison = CmpRHSC->getAPIntValue();
46462 APInt NegAddend = -Addend;
46477 APInt DecComparison = Comparison - 1;
46499 AN->getMemOperand());
46515 else if (CC == X86::COND_G && Addend == -1)
46517 else if (CC == X86::COND_LE && Addend == -1)
46540 // CMP(X,0) -> signbit test
46545 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
46551 // OR(X,Y) -> see if only one operand contributes to the signbit.
46552 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
46600 // This combine only operates on CMP-like nodes.
46602 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46626 if (C->getZExtValue() == 1) {
46629 } else if (C->getZExtValue() != 0)
46639 int OpIdx = -1;
46691 if (FVal && FVal->getZExtValue() != 0) {
46692 if (FVal->getZExtValue() != 1)
46699 if (FValIsFalse && TVal->getZExtValue() != 1)
46701 if (!FValIsFalse && TVal->getZExtValue() != 0)
46720 if (Cond->getOpcode() == X86ISD::CMP) {
46721 if (!isNullConstant(Cond->getOperand(1)))
46724 Cond = Cond->getOperand(0);
46730 switch (Cond->getOpcode()) {
46738 SetCC0 = Cond->getOperand(0);
46739 SetCC1 = Cond->getOperand(1);
46746 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46749 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46750 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46751 Flags = SetCC0->getOperand(1);
46755 // When legalizing carry, we create carries via add X, -1
46785 CarryOp1.getNode()->hasOneUse() &&
46789 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46840 // testc -> testz.
46844 // !testc -> !testz.
46848 // testz -> testc.
46852 // !testz -> !testc.
46857 // testnzc -> testnzc (no change).
46873 // TESTC(X,~X) == TESTC(X,-1)
46912 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46949 // TESTZ(-1,X) == TESTZ(X,X)
46953 // TESTZ(X,-1) == TESTZ(X,X)
46957 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46985 // Handle eq/ne against -1 (all_of).
46996 const APInt &CmpVal = CmpConstant->getAPIntValue();
47027 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47030 // signbits extend down to all the sub-elements as well.
47044 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47053 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47054 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47055 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47056 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47074 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47075 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47076 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47077 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
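A 4-lane scalar model (assumed lane count, standalone code) of the first pair of rewrites above: an all-ones MOVMSK of PCMPEQ(X,0) says every lane of X is zero, which is exactly the PTESTZ(X,X) condition that (X & X) has no set bits.

#include <array>
#include <cassert>
#include <cstdint>

static bool allLanesZero_movmsk(const std::array<uint32_t, 4> &X) {
  unsigned Mask = 0;
  for (unsigned I = 0; I < 4; ++I)
    Mask |= (X[I] == 0 ? 1u : 0u) << I; // PCMPEQ lane -> sign bit -> MOVMSK bit
  return Mask == 0xF;                   // "== -1" for a 4-bit mask
}

static bool allLanesZero_ptestz(const std::array<uint32_t, 4> &X) {
  for (uint32_t Lane : X)
    if (Lane & Lane)                    // PTESTZ(X,X): ZF iff (X & X) is zero
      return false;
  return true;
}

int main() {
  std::array<uint32_t, 4> A = {0, 0, 0, 0}, B = {0, 5, 0, 0};
  assert(allLanesZero_movmsk(A) == allLanesZero_ptestz(A));
  assert(allLanesZero_movmsk(B) == allLanesZero_ptestz(B));
  return 0;
}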
47089 // Check for 256-bit split vector cases.
47117 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47131 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47159 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47190 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47191 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47192 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47193 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47244 SDValue FalseOp = N->getOperand(0);
47245 SDValue TrueOp = N->getOperand(1);
47246 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47247 SDValue Cond = N->getOperand(3);
47249 // cmov X, X, ?, ? --> X
47262 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47273 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47279 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47282 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47286 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47288 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47294 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
47296 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47301 FalseC->getValueType(0), Cond);
47309 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47310 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47311 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47333 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47340 // Add the base if non-zero.
47341 if (FalseC->getAPIntValue() != 0)
47351 // (select (x != c), e, c) -> (select (x != c), e, x),
47352 // (select (x == c), c, e) -> (select (x == c), x, e)
47356 // The rationale for this change is that the conditional-move from a constant
47357 // needs two instructions, whereas a conditional-move from a register needs
47361 // some instruction-combining opportunities. This opt needs to be
47383 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47396 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47401 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47402 EVT CondVT = Cond->getValueType(0);
47403 EVT OuterVT = N->getValueType(0);
47406 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47415 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47416 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47444 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47447 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47452 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47453 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47454 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47455 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
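The fold above simply hoists the +C2 out of both CMOV arms, using C1 == (C1 - C2) + C2; a small standalone check with made-up constants (cttz32 is a hypothetical helper, never called with zero):

#include <cassert>
#include <cstdint>

static int cttz32(uint32_t X) { // portable count-trailing-zeros, X != 0 only
  int N = 0;
  while (!(X & 1)) { X >>= 1; ++N; }
  return N;
}

int main() {
  const int C1 = 32, C2 = 5;
  for (uint32_t X : {1u, 8u, 0x80000000u, 0u}) {
    int Before = (X != 0) ? cttz32(X) + C2 : C1;         // original CMOV form
    int After = ((X != 0) ? cttz32(X) : (C1 - C2)) + C2; // rewritten form
    assert(Before == After);
  }
  return 0;
}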
47475 EVT VT = N->getValueType(0);
47492 EVT VT = N->getOperand(0).getValueType();
47496 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47500 SDValue Opd = N->getOperand(i);
47508 // When ranges are from -128 ~ 127, use MULS8 mode.
47514 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47542 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47548 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47572 SDValue N0 = N->getOperand(0);
47573 SDValue N1 = N->getOperand(1);
47574 EVT VT = N->getOperand(0).getValueType();
47626 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47631 N->getOperand(0));
47636 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47641 N->getOperand(0));
47659 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47684 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47693 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47696 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47697 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47699 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47720 EVT VT = N->getValueType(0);
47736 SDValue N0 = N->getOperand(0);
47737 SDValue N1 = N->getOperand(1);
47772 // Mask off upper 16-bits of sign-extended constants.
47775 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47780 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47790 N->isOnlyUserOf(Op.getNode())) {
47797 N->isOnlyUserOf(Op.getNode())) {
47827 EVT VT = N->getValueType(0);
47835 SDValue N0 = N->getOperand(0);
47836 SDValue N1 = N->getOperand(1);
47838 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47839 // 32-bits. We can lower with this if the sign bits stretch that far.
47867 EVT VT = N->getValueType(0);
47896 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47900 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47901 if (auto *SplatC = RawC->getSplatValue())
47903 C = &(SplatCI->getValue());
47905 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47908 C = &(CNode->getAPIntValue());
47911 if (isPowerOf2_64(C->getZExtValue()))
47914 int64_t SignMulAmt = C->getSExtValue();
47916 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47921 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47947 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47948 N->use_begin()->getOpcode() == ISD::ADD))
47956 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47959 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47973 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47977 assert(C->getZExtValue() != 0 &&
47978 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47981 if (isPowerOf2_64(AbsMulAmt - 1)) {
47984 ISD::ADD, DL, VT, N->getOperand(0),
47985 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47986 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47990 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47992 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47996 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47998 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47999 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
48003 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48004 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
48007 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48010 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48012 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48016 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
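A standalone arithmetic check (not LLVM code) of the shift/add decompositions the surrounding comments describe, using concrete multipliers of the 2^N +/- 1 and 2^N +/- 2 forms:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 7u, 123456u}) {
    assert(X * 9 == (X << 3) + X);        // 2^N + 1: add (shl x, N), x
    assert(X * 7 == (X << 3) - X);        // 2^N - 1: sub (shl x, N), x
    assert(X * 10 == (X << 3) + (X + X)); // 2^N + 2: add (shl x, N), (add x, x)
    assert(X * 6 == (X << 3) - (X + X));  // 2^N - 2: sub (shl x, N), (add x, x)
  }
  return 0;
}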
48019 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
48022 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
48023 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
48032 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48035 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48055 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48062 SDValue ShiftOperand = N->getOperand(0);
48067 EVT VT = N->getValueType(0);
48073 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48097 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48104 SDValue N0 = N->getOperand(0);
48105 SDValue N1 = N->getOperand(1);
48112 // with out-of-bounds clamping.
48118 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
48124 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
48132 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48139 Mask <<= N1C->getAPIntValue();
48141 // We can handle cases concerning bit-widening nodes containing setcc_c if
48147 // zext(setcc_c) -> i32 0x0000FFFF
48148 // c1 -> i32 0x0000FFFF
48149 // c2 -> i32 0x00000001
48150 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48151 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
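The hexadecimal walkthrough above can be reproduced directly; a quick standalone check of why the fold is blocked in that widened-setcc case, using the same values:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t SetccC = 0x0000FFFFu; // zext(setcc_c): only the low 16 bits set
  uint32_t C1 = 0x0000FFFFu, C2 = 1;
  assert(((SetccC & C1) << C2) == 0x0001FFFEu); // original expression
  assert((SetccC & (C1 << C2)) == 0x0000FFFEu); // "folded" expression differs
  return 0;
}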
48172 SDValue N0 = N->getOperand(0);
48173 SDValue N1 = N->getOperand(1);
48181 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
48185 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
48190 // into (SHL (sext_in_reg X), ShlConst - SraConst)
48192 // or (SRA (sext_in_reg X), SraConst - ShlConst)
48194 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
48210 APInt ShlConst = N01->getAsAPIntVal();
48211 APInt SraConst = N1->getAsAPIntVal();
48221 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
48222 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48230 DAG.getConstant(ShlConst - SraConst, DL, CVT));
48232 DAG.getConstant(SraConst - ShlConst, DL, CVT));
48241 SDValue N0 = N->getOperand(0);
48242 SDValue N1 = N->getOperand(1);
48251 // with out-of-bounds clamping.
48257 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
48263 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
48277 // TODO: This is a generic DAG combine that became an x86-only combine to
48278 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48279 // and-not ('andn').
48288 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48290 // from improved known-bits analysis or instruction selection.
48291 APInt MaskVal = AndC->getAPIntValue();
48300 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48305 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48315 unsigned Opcode = N->getOpcode();
48319 EVT VT = N->getValueType(0);
48320 SDValue N0 = N->getOperand(0);
48321 SDValue N1 = N->getOperand(1);
48325 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48327 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48332 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48342 // shuffle to a v4X64 width - we can probably relax this in the future.
48360 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48385 int PostShuffle[4] = {-1, -1, -1, -1};
48417 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48460 unsigned Opcode = N->getOpcode();
48464 EVT VT = N->getValueType(0);
48465 SDValue N0 = N->getOperand(0);
48466 SDValue N1 = N->getOperand(1);
48479 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48480 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48530 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48534 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
48570 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48611 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48612 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48616 MVT VT = N->getSimpleValueType(0);
48617 SDValue LHS = N->getOperand(0);
48618 SDValue RHS = N->getOperand(1);
48620 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
48621 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48624 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48643 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48649 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48659 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48660 X86ISD::VSRL == N->getOpcode()) &&
48662 EVT VT = N->getValueType(0);
48663 SDValue N0 = N->getOperand(0);
48664 SDValue N1 = N->getOperand(1);
48666 // Shift zero -> zero.
48676 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48692 unsigned Opcode = N->getOpcode();
48697 EVT VT = N->getValueType(0);
48698 SDValue N0 = N->getOperand(0);
48699 SDValue N1 = N->getOperand(1);
48705 // (shift undef, X) -> 0
48711 unsigned ShiftVal = N->getConstantOperandVal(1);
48715 ShiftVal = NumBitsPerElt - 1;
48718 // (shift X, 0) -> X
48722 // (shift 0, C) -> 0
48728 // (VSRAI -1, C) -> -1
48732 return DAG.getConstant(-1, SDLoc(N), VT);
48741 NewShiftVal = NumBitsPerElt - 1;
48747 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48751 // (shl (add X, X), C) -> (shl X, (C + 1))
48765 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48770 N0->hasOneUse()) {
48815 if (N->isOnlyUserOf(N0.getNode())) {
48819 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48823 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48845 EVT VT = N->getValueType(0);
48846 unsigned Opcode = N->getOpcode();
48852 SDValue Vec = N->getOperand(0);
48853 SDValue Scl = N->getOperand(1);
48854 SDValue Idx = N->getOperand(2);
48856 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48880 /// OR -> CMPNEQSS.
48889 SDValue N0 = N->getOperand(0);
48890 SDValue N1 = N->getOperand(1);
48899 SDValue CMP00 = CMP0->getOperand(0);
48900 SDValue CMP01 = CMP0->getOperand(1);
48907 for (const SDNode *U : N->uses()) {
48911 switch (U->getOpcode()) {
48951 N->getSimpleValueType(0));
48961 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48962 // 64-bit integer, since that's not a legal type. Since
48987 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48989 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48991 MVT VT = N->getSimpleValueType(0);
48996 SDValue N0 = N->getOperand(0);
48997 SDValue N1 = N->getOperand(1);
49015 /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49016 /// ->
49021 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49023 EVT VT = N->getValueType(0);
49032 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49033 // end-users are ISD::AND including cases
49035 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49036 !SVN->getOperand(1).isUndef()) {
49039 SDValue IVEN = SVN->getOperand(0);
49044 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49052 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49053 SVN->getOperand(1), SVN->getMask());
49059 SDValue N0 = N->getOperand(0);
49060 SDValue N1 = N->getOperand(1);
49154 // register. In most cases we actually compare or select YMM-sized registers
49157 // Even with AVX-512 this is still useful for removing casts around logical
49190 // clang-format off
49195 // clang-format on
49200 /// If both input operands of a logic op are being cast from floating-point
49201 /// types or FP compares, try to convert this into a floating-point logic node
49206 EVT VT = N->getValueType(0);
49207 SDValue N0 = N->getOperand(0);
49208 SDValue N1 = N->getOperand(1);
49227 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49236 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49237 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49247 // logic (setcc N00, N01), (setcc N10, N11) -->
49261 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49265 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49266 // to reduce XMM->GPR traffic.
49268 unsigned Opc = N->getOpcode();
49272 SDValue N0 = N->getOperand(0);
49273 SDValue N1 = N->getOperand(1);
49299 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49303 unsigned Opc = N->getOpcode();
49307 SDValue N0 = N->getOperand(0);
49308 SDValue N1 = N->getOperand(1);
49309 EVT VT = N->getValueType(0);
49321 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49343 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
49346 unsigned Opc = N->getOpcode();
49350 SDValue N0 = N->getOperand(0);
49351 SDValue N1 = N->getOperand(1);
49352 EVT VT = N->getValueType(0);
49385 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
49387 /// with a shift-right to eliminate loading the vector constant mask value.
49390 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49391 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49397 // shift and "andn". This saves a materialization of a -1 vector constant.
49400 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49405 if (N->getValueType(0) == VT &&
49421 VT.getScalarSizeInBits() - 1, DAG);
49443 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49445 return DAG.getBitcast(N->getValueType(0), Shift);
49451 if (Ld->isIndexed())
49454 SDValue Base = Ld->getBasePtr();
49474 // 'and-load' sequence.
49478 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49485 MVT VT = Node->getSimpleValueType(0);
49494 SDValue N = Node->getOperand(i);
49501 const Value *MemOp = Ld->getMemOperand()->getValue();
49507 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49508 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49510 Constant *Init = GV->getInitializer();
49511 Type *Ty = Init->getType();
49513 !Ty->getArrayElementType()->isIntegerTy() ||
49514 Ty->getArrayElementType()->getScalarSizeInBits() !=
49516 Ty->getArrayNumElements() >
49517 Ty->getArrayElementType()->getScalarSizeInBits())
49521 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49524 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49525 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49533 // Do the transformation (For 32-bit type):
49534 // -> (and (load arr[idx]), inp)
49535 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
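A standalone sketch of the table/shift equivalence behind this transformation, assuming the 32-bit element case from the comments; the index 0 entry (an empty mask) is left out here since a plain shift by 32 needs separate handling:

#include <cassert>
#include <cstdint>

int main() {
  // arr[idx] == (1 << idx) - 1 == 0xFFFFFFFF >> (32 - idx) for idx in [1, 31].
  for (uint32_t Idx = 1; Idx < 32; ++Idx) {
    uint64_t TableEntry = (uint64_t(1) << Idx) - 1;
    uint64_t ShiftedMask = uint64_t(0xFFFFFFFFu) >> (32 - Idx);
    assert(TableEntry == ShiftedMask);
  }
  return 0;
}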
49537 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49562 // where the setcc will freely 0 upper bits of k-register. We can replace the
49567 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49569 EVT VT = N->getValueType(0);
49573 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49580 SDValue Src = N->getOperand(0);
49612 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49651 // Only do this re-ordering if op has one use.
49666 Op.getOperand(1 - OpIdx));
49682 // BLSR: (and x, (add x, -1))
49683 // BLSMSK: (xor x, (add x, -1))
49692 EVT VT = N->getValueType(0);
49698 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49703 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49704 N->getOperand(1 - OpIdx), 0))
49715 // ->
49721 // ->
49727 SDValue SetCC = N->getOperand(0);
49733 SDNode *BrCond = *Flag->uses().begin();
49734 if (BrCond->getOpcode() != X86ISD::BRCOND)
49737 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
49744 if (N->getOpcode() == X86ISD::SUB)
49745 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
49749 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
49755 SmallVector<SDValue> Ops(BrCond->op_values());
49756 if (isNullConstant(N->getOperand(1)))
49758 else if (isOneConstant(N->getOperand(1)))
49764 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
49765 // Avoid self-assign error b/c CC1 can be `e/ne`.
49775 // ->
49779 // ->
49787 SDValue SetCC0 = N->getOperand(0);
49788 SDValue SetCC1 = N->getOperand(1);
49793 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
49819 bool IsOR = N->getOpcode() == ISD::OR;
49830 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
49854 SDValue N0 = N->getOperand(0);
49855 SDValue N1 = N->getOperand(1);
49856 EVT VT = N->getValueType(0);
49868 // Use a 32-bit and+zext if upper bits known zero.
49880 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49903 // `(-x << C0) & C1`
49905 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49918 const APInt &MulC = N01C->getAPIntValue();
49919 const APInt &AndC = N1C->getAPIntValue();
49920 APInt MulCLowBit = MulC & (-MulC);
49925 assert(MulCLowBitLog != -1 &&
49970 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49971 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49978 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49984 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49986 if (isOneConstant(N1) && N0->hasOneUse()) {
49990 Src.getOperand(0)->hasOneUse())
50038 // We can't assume an undef src element gives an undef dst - the
50059 if (N->getOpcode() != ISD::DELETED_NODE)
50074 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
50082 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50116 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50119 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50121 MVT VT = N->getSimpleValueType(0);
50126 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50127 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50150 // TODO - add UNDEF elts support.
50160 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50161 // VPTERNLOG is only available as vXi32/64-bit types.
50174 SDValue X = N->getOperand(0);
50183 if (N->getOpcode() != ISD::OR)
50186 SDValue N0 = N->getOperand(0);
50187 SDValue N1 = N->getOperand(1);
50208 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
50223 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50225 EVT VT = N->getValueType(0);
50281 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50282 // The result of the shift is true or false, and on X86, the 32-bit
50300 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50304 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50307 // Check the zero extend is extending to 32-bit or more. The code generated by
50308 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50310 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50311 !isORCandidate(N->getOperand(0)))
50316 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50317 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50318 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50319 isNullConstant(N->getOperand(1).getOperand(1)) &&
50320 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50323 SDNode *OR = N->getOperand(0).getNode();
50324 SDValue LHS = OR->getOperand(0);
50325 SDValue RHS = OR->getOperand(1);
50332 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50333 LHS = OR->getOperand(0);
50334 RHS = OR->getOperand(1);
50358 LHS = OR->getOperand(0);
50359 RHS = OR->getOperand(1);
50361 if (RHS->getOpcode() == ISD::OR)
50369 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50375 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50377 SDValue NotOp = And0_L->getOperand(0);
50384 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50385 EVT VT = And1_L->getValueType(0);
50396 /// "and-not" operation. This function is intended to be called from a
50399 // Note that masked-merge variants using XOR or ADD expressions are
50401 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50402 SDValue N0 = Node->getOperand(0);
50403 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50405 SDValue N1 = Node->getOperand(1);
50406 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50410 SDValue N00 = N0->getOperand(0);
50411 SDValue N01 = N0->getOperand(1);
50412 SDValue N10 = N1->getOperand(0);
50413 SDValue N11 = N1->getOperand(1);
50436 // Look through a one-use zext.
50453 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50457 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50458 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50459 // This is a complicated way to get -1 or 0 from the carry flag:
50460 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50461 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50467 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50468 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50473 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50474 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50476 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50487 // X + SETB Z --> adc X, 0
50488 // X - SETB Z --> sbb X, 0
50504 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50508 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50518 // X + SETAE --> sbb X, -1
50519 // X - SETAE --> adc X, -1
50522 DAG.getConstant(-1, DL, VT), EFLAGS);
50526 // X + SETBE --> sbb X, -1
50527 // X - SETBE --> adc X, -1
50534 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50538 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50543 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50558 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50561 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50563 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50564 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50565 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50566 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50575 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50577 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50578 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50579 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50580 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50598 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50599 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50602 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50604 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50605 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
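A scalar sketch (standalone, made-up values) of two of the carry-flag identities above: the sbb-with-itself 0/-1 mask, and rewriting X - (Z != 0) as an add of -1 plus the carry produced by cmp Z, 1:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Z : {0u, 1u, 77u}) {
    // 0 - (Z != 0) is the 0 / -1 mask an sbb-with-itself materializes.
    int32_t Mask = 0 - int32_t(Z != 0);
    assert(Mask == (Z != 0 ? -1 : 0));

    // X - (Z != 0) == X + (-1) + carry, with carry = (Z == 0) from cmp Z, 1.
    int32_t X = 1000;
    int32_t Adc = X + (-1) + int32_t(Z == 0);
    assert(X - int32_t(Z != 0) == Adc);
  }
  return 0;
}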
50615 bool IsSub = N->getOpcode() == ISD::SUB;
50616 SDValue X = N->getOperand(0);
50617 SDValue Y = N->getOperand(1);
50618 EVT VT = N->getValueType(0);
50635 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50647 bool IsSub = N->getOpcode() == ISD::XOR;
50648 bool N1COdd = N1C->getZExtValue() & 1;
50651 EVT VT = N->getValueType(0);
50658 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
50659 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
50663 MVT VT = N->getSimpleValueType(0);
50685 SDValue N0 = N->getOperand(0);
50686 SDValue N1 = N->getOperand(1);
50687 EVT VT = N->getValueType(0);
50699 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50749 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50759 uint64_t Val = CN->getZExtValue();
50774 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50775 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50776 // iff the upper elements of the non-shifted arg are zero.
50824 if (N->getOpcode() != ISD::DELETED_NODE)
50842 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50844 /// SETGT(X, -1)
50847 EVT ResultType = N->getValueType(0);
50851 SDValue N0 = N->getOperand(0);
50852 SDValue N1 = N->getOperand(1);
50874 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50877 // Create a greater-than comparison against -1.
50887 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50894 /// xor (sra X, elt_size(X)-1), -1
50896 /// pcmpgt X, -1
50902 EVT VT = N->getValueType(0);
50907 // clang-format off
50917 // clang-format on
50922 SDValue Shift = N->getOperand(0);
50923 SDValue Ones = N->getOperand(1);
50932 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50935 // Create a greater-than comparison against -1. We don't use the more obvious
50936 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
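A scalar sketch (standalone code) of the equivalence the comments above use: inverting the sign-splat produced by an arithmetic shift gives the same lane mask as comparing greater-than -1. The >> on a negative value is assumed to be the arithmetic shift the comment refers to, as it is on the compilers used for these targets.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -1, 12345, -12345}) {
    int32_t SignSplat = X >> 31;     // sra X, elt_size(X)-1: 0 or -1
    int32_t NotSplat = ~SignSplat;   // xor with -1
    int32_t Cmp = (X > -1) ? -1 : 0; // pcmpgt(X, -1) lane result
    assert(NotSplat == Cmp);
  }
  return 0;
}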
50965 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51009 const APInt &Limit) -> SDValue {
51047 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51048 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51050 // clip to 0-255.
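A standalone arithmetic sketch of why the two-stage clamp described above is safe: saturating to 0-65535 first and then to 0-255 gives the same value as clamping straight to 0-255 (satU is an illustrative helper, not an LLVM API).

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t satU(int64_t V, uint32_t Max) {
  return uint32_t(std::min<int64_t>(std::max<int64_t>(V, 0), Max));
}

int main() {
  for (int64_t V : {int64_t(-7), int64_t(0), int64_t(200), int64_t(300),
                    int64_t(70000)}) {
    assert(satU(satU(V, 65535), 255) == satU(V, 255));
  }
  return 0;
}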
51064 // For 256-bit or smaller vectors, we require VLX.
51066 // If the result type is 256-bits or larger and we have disabled 512-bit
51079 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51142 EVT RegVT = Ld->getValueType(0);
51143 SDValue Ptr = Ld->getBasePtr();
51144 SDValue Chain = Ld->getChain();
51145 ISD::LoadExtType Ext = Ld->getExtensionType();
51147 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
51170 for (SDNode *User : Chain->uses()) {
51173 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
51174 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51176 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
51177 User->getValueSizeInBits(0).getFixedValue() >
51179 EVT UserVT = User->getValueType(0);
51180 SDValue UserPtr = UserLd->getBasePtr();
51186 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
51187 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
51216 EVT RegVT = Ld->getValueType(0);
51217 EVT MemVT = Ld->getMemoryVT();
51221 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51222 // into two 16-byte operations. Also split non-temporal aligned loads on
51223 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51224 ISD::LoadExtType Ext = Ld->getExtensionType();
51228 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51229 Ld->getAlign() >= Align(16)) ||
51231 *Ld->getMemOperand(), &Fast) &&
51238 SDValue Ptr1 = Ld->getBasePtr();
51244 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51245 Ld->getOriginalAlign(),
51246 Ld->getMemOperand()->getFlags());
51247 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51248 Ld->getPointerInfo().getWithOffset(HalfOffset),
51249 Ld->getOriginalAlign(),
51250 Ld->getMemOperand()->getFlags());
51258 // Bool vector load - attempt to cast to an integer, as we have good
51265 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51266 Ld->getPointerInfo(),
51267 Ld->getOriginalAlign(),
51268 Ld->getMemOperand()->getFlags());
51276 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51278 SDValue Ptr = Ld->getBasePtr();
51279 SDValue Chain = Ld->getChain();
51280 for (SDNode *User : Chain->uses()) {
51283 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51284 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
51285 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
51286 !User->hasAnyUseOfValue(1) &&
51287 User->getValueSizeInBits(0).getFixedValue() >
51301 unsigned AddrSpace = Ld->getAddressSpace();
51305 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51307 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51308 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
51309 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
51310 Ld->getMemOperand()->getFlags());
51319 /// Otherwise, return -1.
51329 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51330 return -1;
51332 int TrueIndex = -1;
51333 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51335 const SDValue &Op = BV->getOperand(i);
51340 return -1;
51341 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51344 return -1;
51359 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51365 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51367 Addr = MaskedOp->getBasePtr();
51375 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51380 /// If exactly one element of the mask is set for a non-extending masked load,
51382 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51388 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51389 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51402 EVT VT = ML->getValueType(0);
51412 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51413 ML->getPointerInfo().getWithOffset(Offset),
51414 Alignment, ML->getMemOperand()->getFlags());
51416 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51428 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51429 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51433 EVT VT = ML->getValueType(0);
51439 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51440 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51441 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51443 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51444 ML->getMemOperand());
51445 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51446 ML->getPassThru());
51452 // (for example, vblendvps -> vblendps).
51454 // Don't try this if the pass-through operand is already undefined. That would
51456 if (ML->getPassThru().isUndef())
51459 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51462 // The new masked load has an undef pass-through operand. The select uses the
51463 // original pass-through operand.
51465 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51466 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51467 ML->getAddressingMode(), ML->getExtensionType());
51468 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51469 ML->getPassThru());
51480 if (Mld->isExpandingLoad())
51483 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51494 // If the mask value has been legalized to a non-boolean vector, try to
51496 SDValue Mask = Mld->getMask();
51498 EVT VT = Mld->getValueType(0);
51502 if (N->getOpcode() != ISD::DELETED_NODE)
51509 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51510 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51511 Mld->getAddressingMode(), Mld->getExtensionType());
51517 /// If exactly one element of the mask is set for a non-truncating masked store,
51519 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51524 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51536 SDValue Value = MS->getValue();
51548 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51549 MS->getPointerInfo().getWithOffset(Offset),
51550 Alignment, MS->getMemOperand()->getFlags());
51557 if (Mst->isCompressingStore())
51560 EVT VT = Mst->getValue().getValueType();
51564 if (Mst->isTruncatingStore())
51570 // If the mask value has been legalized to a non-boolean vector, try to
51572 SDValue Mask = Mst->getMask();
51576 if (N->getOpcode() != ISD::DELETED_NODE)
51582 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51583 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51584 Mst->getMemoryVT(), Mst->getMemOperand(),
51585 Mst->getAddressingMode());
51588 SDValue Value = Mst->getValue();
51589 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51591 Mst->getMemoryVT())) {
51592 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51593 Mst->getBasePtr(), Mst->getOffset(), Mask,
51594 Mst->getMemoryVT(), Mst->getMemOperand(),
51595 Mst->getAddressingMode(), true);
51605 EVT StVT = St->getMemoryVT();
51607 SDValue StoredVal = St->getValue();
51618 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51619 St->getPointerInfo(), St->getOriginalAlign(),
51620 St->getMemOperand()->getFlags());
51624 // This will avoid a copy to k-register.
51631 return DAG.getStore(St->getChain(), dl, Val,
51632 St->getBasePtr(), St->getPointerInfo(),
51633 St->getOriginalAlign(),
51634 St->getMemOperand()->getFlags());
51645 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51646 St->getPointerInfo(), St->getOriginalAlign(),
51647 St->getMemOperand()->getFlags());
51654 // If it's a v64i1 store without 64-bit support, we need two stores.
51657 StoredVal->ops().slice(0, 32));
51660 StoredVal->ops().slice(32, 32));
51663 SDValue Ptr0 = St->getBasePtr();
51667 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51668 St->getOriginalAlign(),
51669 St->getMemOperand()->getFlags());
51671 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51672 St->getPointerInfo().getWithOffset(4),
51673 St->getOriginalAlign(),
51674 St->getMemOperand()->getFlags());
51679 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51680 St->getPointerInfo(), St->getOriginalAlign(),
51681 St->getMemOperand()->getFlags());
51684 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51685 // Sandy Bridge, perform two 16-byte stores.
51689 *St->getMemOperand(), &Fast) &&
51698 // Split under-aligned vector non-temporal stores.
51699 if (St->isNonTemporal() && StVT == VT &&
51700 St->getAlign().value() < VT.getStoreSize()) {
51701 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51710 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51720 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51722 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51723 St->getValue().getOpcode() == ISD::TRUNCATE &&
51724 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51726 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51728 St->getValue().getOperand(0));
51729 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51730 MVT::v16i8, St->getMemOperand());
51734 if (!St->isTruncatingStore() &&
51740 return EmitTruncSStore(IsSigned, St->getChain(),
51741 dl, StoredVal.getOperand(0), St->getBasePtr(),
51742 VT, St->getMemOperand(), DAG);
51746 if (!St->isTruncatingStore()) {
51768 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51769 TruncVT, St->getMemOperand());
51778 if (St->isTruncatingStore() && VT.isVector()) {
51780 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51781 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51782 dl, Val, St->getBasePtr(),
51783 St->getMemoryVT(), St->getMemOperand(), DAG);
51784 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51786 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51787 dl, Val, St->getBasePtr(),
51788 St->getMemoryVT(), St->getMemOperand(), DAG);
51795 unsigned AddrSpace = St->getAddressSpace();
51799 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51801 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51803 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51804 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51805 St->getAAInfo());
51809 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51814 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51826 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51827 cast<LoadSDNode>(St->getValue())->isSimple() &&
51828 St->getChain().hasOneUse() && St->isSimple()) {
51829 auto *Ld = cast<LoadSDNode>(St->getValue());
51835 if (!Ld->hasNUsesOfValue(1, 0))
51841 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51842 Ld->getBasePtr(), Ld->getMemOperand());
51846 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51847 St->getMemOperand());
51850 // This is similar to the above case, but here we handle a scalar 64-bit
51851 // integer store that is extracted from a vector on a 32-bit target.
51852 // If we have SSE2, then we can treat it like a floating-point double
51857 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51858 SDValue OldExtract = St->getOperand(1);
51865 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51866 St->getPointerInfo(), St->getOriginalAlign(),
51867 St->getMemOperand()->getFlags());
51878 SDValue StoredVal = N->getOperand(1);
51880 EVT MemVT = St->getMemoryVT();
51888 if (N->getOpcode() != ISD::DELETED_NODE)
51905 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51907 /// A horizontal-op B, for some already available A and B, and if so then LHS is
51925 // which is A horizontal-op B.
51967 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52024 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52025 // so we just repeat the inner loop if this is a 256-bit op.
52046 // Compute the post-shuffle mask index based on where the element
52050 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52052 // The low half of the 128-bit result must choose from A.
52053 // The high half of the 128-bit result must choose from B,
52069 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52077 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52080 ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) &&
52081 llvm::any_of(NewRHS->uses(), FoundHorizUser));
52099 EVT VT = N->getValueType(0);
52100 unsigned Opcode = N->getOpcode();
52105 return N->hasOneUse() &&
52106 N->use_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
52107 (N->use_begin()->getOperand(0).getOpcode() == HorizOpcode ||
52108 N->use_begin()->getOperand(1).getOpcode() == HorizOpcode);
52116 SDValue LHS = N->getOperand(0);
52117 SDValue RHS = N->getOperand(1);
52133 SDValue LHS = N->getOperand(0);
52134 SDValue RHS = N->getOperand(1);
52158 // <i32 -2147483648[float -0.000000e+00]> 0
52160 // <(load 4 from constant-pool)> t0, t29
52171 EVT VT = N->getValueType(0);
52172 SDValue LHS = N->getOperand(0);
52173 SDValue RHS = N->getOperand(1);
52175 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52177 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52179 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52228 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52229 !AllowContract(N->getFlags()))
52232 EVT VT = N->getValueType(0);
52236 SDValue LHS = N->getOperand(0);
52237 SDValue RHS = N->getOperand(1);
52242 &HasNoSignedZero](SDValue N) -> bool {
52247 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52255 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52256 HasNoSignedZero(Op0->getFlags())) ||
52257 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
52280 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52284 /// Do target-specific dag combines on floating-point adds/subs.
52298 EVT VT = N->getValueType(0);
52299 SDValue Src = N->getOperand(0);
52312 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52314 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52320 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52321 SDValue Src = N->getOperand(0);
52325 EVT VT = N->getValueType(0);
52362 // In most cases it's only worth pre-truncating if we're only facing the cost
52367 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52408 // Only handle vXi16 types that are at least 128-bits unless they will be
52427 // Count leading sign/zero bits on both inputs - if there are enough then
52428 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52476 // adjacent pairs of 16-bit products, and saturates the result before
52477 // truncating to 16-bits.
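A scalar model (standalone code, assuming the unsigned-by-signed byte operands of x86 pmaddubsw, which this helper targets) of the pairwise multiply, add, and signed 16-bit saturation the comment describes; satS16 is a made-up name:

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

static int16_t satS16(int32_t V) {
  return int16_t(std::min(std::max(V, -32768), 32767));
}

int main() {
  // Each output lane sums one adjacent pair of 16-bit products, then saturates.
  std::array<uint8_t, 4> U = {255, 255, 3, 4};
  std::array<int8_t, 4> S = {127, 127, -2, 5};
  std::array<int16_t, 2> Out;
  for (unsigned I = 0; I < 2; ++I)
    Out[I] = satS16(int32_t(U[2 * I]) * S[2 * I] +
                    int32_t(U[2 * I + 1]) * S[2 * I + 1]);
  assert(Out[0] == 32767); // 255*127 + 255*127 = 64770 saturates to 32767
  assert(Out[1] == 14);    // 3*-2 + 4*5
  return 0;
}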
52570 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52571 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52572 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52573 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52626 EVT VT = N->getValueType(0);
52627 SDValue Src = N->getOperand(0);
52630 // Attempt to pre-truncate inputs to arithmetic ops instead.
52664 EVT VT = N->getValueType(0);
52665 SDValue In = N->getOperand(0);
52683 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52691 if (N->getOpcode() == ISD::FNEG)
52692 return N->getOperand(0);
52698 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52701 EVT VT = Op->getValueType(0);
52711 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52717 cast<ShuffleVectorSDNode>(Op)->getMask());
52722 // -V, INDEX).
52757 // Only allow bitcast from correctly-sized constant.
52773 // clang-format off
52787 // clang-format on
52793 // clang-format off
52811 // clang-format on
52818 // clang-format off
52828 // clang-format on
52835 /// Do target-specific dag combines on floating point negations.
52839 EVT OrigVT = N->getValueType(0);
52854 // use of a constant by performing (-0 - A*B) instead.
52857 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52887 SDNodeFlags Flags = Op.getNode()->getFlags();
52922 // Fill in the non-negated ops with the original values.
52942 MVT VT = N->getSimpleValueType(0);
52953 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52954 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52956 switch (N->getOpcode()) {
52957 // clang-format off
52963 // clang-format on
52970 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52972 if (N->getOpcode() != ISD::XOR)
52975 SDValue LHS = N->getOperand(0);
52976 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52980 X86::CondCode(LHS->getConstantOperandVal(0)));
52982 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52987 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52992 EVT VT = N->getValueType(0);
52997 SDValue N0 = N->getOperand(0);
52998 SDValue N1 = N->getOperand(1);
53010 } else if (N->getOpcode() == ISD::SUB) {
53023 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53044 SDValue N0 = N->getOperand(0);
53045 SDValue N1 = N->getOperand(1);
53046 EVT VT = N->getValueType(0);
53087 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53098 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53109 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53110 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53113 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53117 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53134 SDValue N0 = N->getOperand(0);
53135 EVT VT = N->getValueType(0);
53137 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
53148 ReverseMask[I] = (NumElts - 1) - I;
53162 unsigned Opcode = N->getOpcode();
53163 SDValue N0 = N->getOperand(0);
53164 SDValue N1 = N->getOperand(1);
53165 EVT VT = N->getValueType(0);
53169 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
53186 EVT VT = N->getValueType(0);
53189 // TODO - Constant Folding.
53208 /// to be used as a replacement operand with operations (eg, bitwise-and) where
53223 SDValue N0 = N->getOperand(0);
53224 SDValue N1 = N->getOperand(1);
53225 EVT VT = N->getValueType(0);
53238 return C && C->getConstantFPValue()->isAllOnesValue();
53241 // fand (fxor X, -1), Y --> fandn X, Y
53245 // fand X, (fxor Y, -1) --> fandn Y, X
53252 /// Do target-specific dag combines on X86ISD::FAND nodes.
53255 // FAND(0.0, x) -> 0.0
53256 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53259 // FAND(x, 0.0) -> 0.0
53260 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53269 /// Do target-specific dag combines on X86ISD::FANDN nodes.
53272 // FANDN(0.0, x) -> x
53273 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53274 return N->getOperand(1);
53276 // FANDN(x, 0.0) -> 0.0
53277 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53283 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53287 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53289 // F[X]OR(0.0, x) -> x
53290 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53291 return N->getOperand(1);
53293 // F[X]OR(x, 0.0) -> x
53294 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53295 return N->getOperand(0);
53303 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53305 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53312 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53315 switch (N->getOpcode()) {
53321 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53322 N->getOperand(0), N->getOperand(1));
53327 EVT VT = N->getValueType(0);
53339 SDValue Op0 = N->getOperand(0);
53340 SDValue Op1 = N->getOperand(1);
53342 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53346 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53347 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53349 // If one of the operands is known non-NaN use the native min/max instructions
53350 // with the non-NaN input as second operand.
53352 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53354 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53368 // ----------------
53370 // Op0 ----------------
53372 // ----------------
53393 EVT VT = N->getValueType(0);
53401 SDValue In = N->getOperand(0);
53405 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53406 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53412 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53426 bool IsStrict = N->isTargetStrictFPOpcode();
53427 EVT VT = N->getValueType(0);
53430 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53434 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53443 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53444 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53448 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53460 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
53464 SDValue N0 = N->getOperand(0);
53465 SDValue N1 = N->getOperand(1);
53466 MVT VT = N->getSimpleValueType(0);
53471 // ANDNP(undef, x) -> 0
53472 // ANDNP(x, undef) -> 0
53476 // ANDNP(0, x) -> x
53480 // ANDNP(x, 0) -> 0
53484 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
53493 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(x,y)).
53494 if (N1->hasOneUse())
53517 if (N0->hasOneUse()) {
53547 // We can't assume an undef src element gives an undef dst - the
53570 if (N->getOpcode() != ISD::DELETED_NODE)
53581 SDValue N1 = N->getOperand(1);
53587 if (N->getOpcode() != ISD::DELETED_NODE)
53597 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53598 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53600 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53604 if (N->getOpcode() != ISD::DELETED_NODE)
53611 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53616 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53617 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53620 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53637 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53639 EVT DstVT = N->getValueType(0);
53641 SDValue N0 = N->getOperand(0);
53642 SDValue N1 = N->getOperand(1);
53643 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53699 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53704 EVT VT = N->getValueType(0);
53705 SDValue N0 = N->getOperand(0);
53706 SDValue N1 = N->getOperand(1);
53707 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53711 // both SSE and AVX2 since there is no sign-extended shift right
53712 // operation on a vector with 64-bit elements.
53713 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53739 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53740 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
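A minimal standalone check of the sext variant above: when the narrow add cannot wrap, sign extension distributes over it, i.e. sext(x + C) == sext(x) + sext(C). The values are chosen so the i8 add does not overflow, mirroring the nsw requirement.

#include <cassert>
#include <cstdint>

int main() {
  const int8_t C = 20;
  for (int8_t X : {int8_t(-100), int8_t(0), int8_t(100)}) {
    int8_t Narrow = int8_t(X + C); // no overflow for these values (nsw holds)
    assert(int32_t(Narrow) == int32_t(X) + int32_t(C));
  }
  return 0;
}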
53746 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53747 Ext->getOpcode() != ISD::ZERO_EXTEND)
53751 EVT VT = Ext->getValueType(0);
53755 SDValue Add = Ext->getOperand(0);
53761 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53762 bool NSW = Add->getFlags().hasNoSignedWrap();
53763 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53785 for (auto *User : Ext->uses()) {
53786 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53795 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53796 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53800 // sign-extended.
53808 // operands and the result of CMOV is not used anywhere else - promote CMOV
53811 // (or more) pseudo-CMOVs only when they go one-after-another and
53815 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
53816 // promotion is also good in terms of code-size.
53817 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
53820 SDValue CMovN = Extend->getOperand(0);
53824 EVT TargetVT = Extend->getValueType(0);
53825 unsigned ExtendOpcode = Extend->getOpcode();
53868 SDValue N0 = N->getOperand(0);
53869 EVT VT = N->getValueType(0);
53892 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53904 if (N->getOpcode() == ISD::ZERO_EXTEND)
53913 SDValue N0 = N->getOperand(0);
53914 EVT VT = N->getValueType(0);
53917 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53920 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53921 N0->getOperand(1));
53943 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53974 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53976 if (llvm::any_of(V->uses(), IsNotFMA))
53982 for (const SDValue &Op : V->op_values()) {
53984 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53997 if (llvm::any_of(NV->uses(), IsNotFMA))
54004 for (const SDValue &Op : V->op_values()) {
54006 if (Cst->isNegative())
54018 EVT VT = N->getValueType(0);
54019 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54026 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54027 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54028 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54030 // If the operation allows fast-math and the target does not support FMA,
54032 SDNodeFlags Flags = N->getFlags();
54085 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54087 // Propagate fast-math-flags to new FMA node.
54090 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54092 {N->getOperand(0), A, B, C});
54094 if (N->getNumOperands() == 4)
54095 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54100 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54101 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
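// One-lane sketch, not part of this file (names invented): per lane,
// FMADDSUB/FMSUBADD compute A*B + C in some lanes and A*B - C in the others,
// and the two opcodes differ only in which lanes add and which subtract, so
// negating C turns one into the other.  The model below uses a separate
// multiply and add; a real FMA rounds once, but that does not affect the sign
// identity shown here.
constexpr double fmaddsubLane(double A, double B, double C, bool AddLane) {
  return AddLane ? A * B + C : A * B - C;
}
static_assert(fmaddsubLane(2.0, 3.0, -1.5, true) ==
                  fmaddsubLane(2.0, 3.0, 1.5, false),
              "A*B + (-C) == A*B - C");
static_assert(fmaddsubLane(2.0, 3.0, -1.5, false) ==
                  fmaddsubLane(2.0, 3.0, 1.5, true),
              "A*B - (-C) == A*B + C");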
54105 EVT VT = N->getValueType(0);
54110 SDValue N2 = N->getOperand(2);
54116 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54118 if (N->getNumOperands() == 4)
54119 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54120 NegN2, N->getOperand(3));
54121 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54129 SDValue N0 = N->getOperand(0);
54130 EVT VT = N->getValueType(0);
54132 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54134 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54136 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54137 N0->getOperand(1));
54157 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54188 /// pre-promote its result type since vXi1 vectors don't get promoted
54207 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54208 const SDValue LHS = N->getOperand(0);
54209 const SDValue RHS = N->getOperand(1);
54210 EVT VT = N->getValueType(0);
54227 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54228 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54230 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54245 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54246 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
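// Compile-time sketch, not part of this file (helper names invented): both
// rewrites encode "every bit set in Y is also set in X".  (X|Y) == X and
// (X&Y) == Y each state exactly that, and so does (~X & Y) == 0, the form the
// rewrites produce.
constexpr bool subsetViaOr(unsigned X, unsigned Y)   { return (X | Y) == X; }
constexpr bool subsetViaAnd(unsigned X, unsigned Y)  { return (X & Y) == Y; }
constexpr bool subsetViaAndn(unsigned X, unsigned Y) { return (~X & Y) == 0; }
static_assert(subsetViaOr(0xFF00u, 0x0F00u) == subsetViaAndn(0xFF00u, 0x0F00u),
              "cmpeq(or(X,Y),X) == cmpeq(and(~X,Y),0)");
static_assert(subsetViaAnd(0xFF00u, 0x00F0u) == subsetViaAndn(0xFF00u, 0x00F0u),
              "cmpeq(and(X,Y),Y) == cmpeq(and(~X,Y),0)");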
54248 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54263 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54264 // cmpne(trunc(x),C) --> cmpne(x,C)
54280 // icmp eq Abs(X) C ->
54281 // (icmp eq A, C) | (icmp eq A, -C)
54282 // icmp ne Abs(X) C ->
54283 // (icmp ne A, C) & (icmp ne A, -C)
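// Scalar sketch, not part of this file (name invented): for a non-negative
// constant C, |X| == C holds exactly when X == C or X == -C, so the compare
// against an abs splits into two plain compares (the INT_MIN corner case is
// ignored in this sketch).
constexpr bool absEqConst(int X, int C) { return (X < 0 ? -X : X) == C; }
static_assert(absEqConst(-7, 7) == ((-7 == 7) || (-7 == -7)),
              "abs(-7) == 7 because -7 matches -C");
static_assert(absEqConst(5, 7) == ((5 == 7) || (5 == -7)),
              "abs(5) != 7 because 5 matches neither C nor -C");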
54289 const APInt &CInt = C->getAPIntValue();
54295 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54413 // -> `(icmp ult (add x, -C), 2)`
54417 // in worse codegen. So, undo the middle-end transform and go back to `(or
54440 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
54457 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
54458 (CC == ISD::SETUGE && (-CmpC) == 2)) {
54474 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54480 // X pred 0.0 --> X pred -X
54496 SDValue Src = N->getOperand(0);
54498 MVT VT = N->getSimpleValueType(0);
54518 // Look through int->fp bitcasts that don't change the element width.
54524 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
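// Lane-level sketch, not part of this file (name invented): MOVMSK packs the
// sign bit of each lane into a scalar mask.  NOT flips every lane's sign bit,
// so movmsk(not(x)) is the bitwise NOT of movmsk(x) restricted to the live
// lanes.  Model for 4 x i32:
#include <array>
#include <cstdint>
constexpr unsigned movmskModel(std::array<int32_t, 4> V) {
  unsigned Mask = 0;
  for (unsigned I = 0; I != 4; ++I)
    Mask |= (static_cast<uint32_t>(V[I]) >> 31) << I;
  return Mask;
}
static_assert(movmskModel({{-1, 2, -3, 4}}) == 0x5,
              "sign bits of lanes 0 and 2 are set");
static_assert(movmskModel({{~-1, ~2, ~-3, ~4}}) == (~0x5u & 0xFu),
              "movmsk(not(x)) == not(movmsk(x)) over the 4 lanes");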
54535 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54546 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54547 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54549 // Use KnownBits to determine if only a single bit is non-zero
54563 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54579 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54580 if (N->isOnlyUserOf(Src.getNode())) {
54613 MVT VT = N->getSimpleValueType(0);
54628 SDValue Mask = MemOp->getMask();
54635 if (N->getOpcode() != ISD::DELETED_NODE)
54650 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54651 Gather->getMask(), Base, Index, Scale } ;
54652 return DAG.getMaskedGather(Gather->getVTList(),
54653 Gather->getMemoryVT(), DL, Ops,
54654 Gather->getMemOperand(),
54655 Gather->getIndexType(),
54656 Gather->getExtensionType());
54659 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54660 Scatter->getMask(), Base, Index, Scale };
54661 return DAG.getMaskedScatter(Scatter->getVTList(),
54662 Scatter->getMemoryVT(), DL,
54663 Ops, Scatter->getMemOperand(),
54664 Scatter->getIndexType(),
54665 Scatter->isTruncatingStore());
54672 SDValue Index = GorS->getIndex();
54673 SDValue Base = GorS->getBasePtr();
54674 SDValue Scale = GorS->getScale();
54680 // Shrink constant indices if they are larger than 32-bits.
54688 if (BV->isConstant() && IndexWidth > 32 &&
54689 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54703 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54718 uint64_t ScaleAmt = Scale->getAsZExtVal();
54721 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54722 // FIXME: Allow non-constant?
54725 APInt Adder = C->getAPIntValue() * ScaleAmt;
54736 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54764 SDValue Mask = GorS->getMask();
54768 if (N->getOpcode() != ISD::DELETED_NODE)
54781 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54782 SDValue EFLAGS = N->getOperand(1);
54795 SDValue EFLAGS = N->getOperand(3);
54796 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54803 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54804 N->getOperand(1), Cond, Flags);
54813 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54817 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54824 EVT VT = N->getValueType(0);
54825 bool IsStrict = N->isStrictFPOpcode();
54827 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54834 // make the transformation for non-constant splats as well, but it's unclear
54839 if (!BV->isConstant())
54844 EVT IntVT = BV->getValueType(0);
54849 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54850 {N->getOperand(0), SDValue(BV, 0)});
54852 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54855 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54866 /// If we are converting a value to floating-point, try to replace scalar
54873 SDValue Trunc = N->getOperand(0);
54889 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54898 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54903 bool IsStrict = N->isStrictFPOpcode();
54904 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54905 EVT VT = N->getValueType(0);
54911 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54912 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54914 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54915 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
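// Scalar sketch, not part of this file (name invented): once the unsigned
// value has been zero-extended into a wider element it is provably
// non-negative, so a *signed* int->fp conversion of the widened value yields
// exactly the unsigned result.  Modelled on the vXi16 -> vXi32 case:
#include <cstdint>
constexpr float uintToFpViaSigned(uint16_t X) {
  return static_cast<float>(static_cast<int32_t>(X)); // SINT_TO_FP(ZEXT(x))
}
static_assert(uintToFpViaSigned(0xFFFFu) == 65535.0f,
              "matches UINT_TO_FP on the widest i16 value");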
54931 {N->getOperand(0), P});
54935 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54936 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54937 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54947 {N->getOperand(0), P});
54954 SDNodeFlags Flags = N->getFlags();
54958 {N->getOperand(0), Op0});
54970 bool IsStrict = N->isStrictFPOpcode();
54975 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54976 EVT VT = N->getValueType(0);
54982 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54983 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54985 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54986 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55002 {N->getOperand(0), P});
55006 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55007 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55008 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55016 {N->getOperand(0), P});
55026 if (NumSignBits >= (BitWidth - 31)) {
55035 {N->getOperand(0), Trunc});
55043 { 0, 2, -1, -1 });
55046 {N->getOperand(0), Shuf});
55052 // a 32-bit target where SSE doesn't support i64->FP operations.
55066 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55069 Subtarget.getTargetLowering()->BuildFILD(
55070 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55071 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55089 for (const SDNode *User : Flags->uses()) {
55091 switch (User->getOpcode()) {
55097 CC = (X86::CondCode)User->getConstantOperandVal(0);
55101 CC = (X86::CondCode)User->getConstantOperandVal(2);
55106 // clang-format off
55114 // clang-format on
55124 for (const SDNode *User : Flags->uses()) {
55126 switch (User->getOpcode()) {
55140 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55152 if (!isNullConstant(N->getOperand(1)))
55160 SDValue Op = N->getOperand(0);
55177 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55192 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
55220 // Peek through any zero-extend if we're only testing for a zero result.
55238 // i32 truncated op to prevent partial-reg compares of promoted ops.
55298 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55302 SDValue LHS = N->getOperand(0);
55303 SDValue RHS = N->getOperand(1);
55305 bool IsSub = X86ISD::SUB == N->getOpcode();
55308 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
55313 if (!N->hasAnyUseOfValue(1)) {
55321 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55330 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55339 SDValue LHS = N->getOperand(0);
55340 SDValue RHS = N->getOperand(1);
55341 SDValue BorrowIn = N->getOperand(2);
55344 MVT VT = N->getSimpleValueType(0);
55349 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55352 !N->hasAnyUseOfValue(1))
55353 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55362 SDValue LHS = N->getOperand(0);
55363 SDValue RHS = N->getOperand(1);
55364 SDValue CarryIn = N->getOperand(2);
55370 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55376 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55381 EVT VT = N->getValueType(0);
55382 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55391 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55394 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55396 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55397 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55403 MVT VT = N->getSimpleValueType(0);
55408 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55410 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55411 !N->hasAnyUseOfValue(1))
55412 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55457 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55458 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55465 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55466 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55467 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55468 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55471 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55472 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55490 Mul = Op0L->getOperand(0);
55491 if (Mul->getOpcode() != ISD::MUL ||
55496 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55497 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55596 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55597 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55598 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55599 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55662 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55671 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
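// Element-level sketch, not part of this file (name invented): VPMADDWD
// multiplies pairs of adjacent signed i16 elements and sums each pair of
// 32-bit products into one i32 result element; that is the per-element
// operation the shuffled-operand combine above has to preserve.
#include <cstdint>
constexpr int32_t pmaddwdElt(int16_t X0, int16_t X1, int16_t Y0, int16_t Y1) {
  return static_cast<int32_t>(X0) * Y0 + static_cast<int32_t>(X1) * Y1;
}
static_assert(pmaddwdElt(3, -4, 100, 7) == 3 * 100 + (-4) * 7,
              "one i32 element of VPMADDWD");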
55709 /// earlier folds that may be used to turn select-of-constants into logic hacks.
55713 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55714 // better because we eliminate 1-2 instructions. This transform is still
55717 // immediate asm operands (fit in 32-bits).
55730 SDValue Cmov = N->getOperand(0);
55731 SDValue OtherOp = N->getOperand(1);
55742 EVT VT = N->getValueType(0);
55749 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55753 all_of(N->uses(), [&](SDNode *Use) {
55755 return MemNode && MemNode->getBasePtr().getNode() == N;
55757 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55768 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
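// Compile-time sketch, not part of this file (names invented): both rewrites
// rest on addition distributing over a select,
//   OtherOp + (Cond ? C1 : C2) == Cond ? (OtherOp + C1) : (OtherOp + C2),
// which is why pushing the add into both arms of the CMOV is value-preserving.
constexpr int addOfCmov(bool Cond, int C1, int C2, int OtherOp) {
  return OtherOp + (Cond ? C1 : C2);
}
constexpr int cmovOfAdds(bool Cond, int C1, int C2, int OtherOp) {
  return Cond ? (OtherOp + C1) : (OtherOp + C2);
}
static_assert(addOfCmov(true, 10, 20, 7) == cmovOfAdds(true, 10, 20, 7),
              "taken arm");
static_assert(addOfCmov(false, 10, 20, 7) == cmovOfAdds(false, 10, 20, 7),
              "fallthrough arm");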
55778 EVT VT = N->getValueType(0);
55779 SDValue Op0 = N->getOperand(0);
55780 SDValue Op1 = N->getOperand(1);
55797 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55813 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55833 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55834 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55836 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55837 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55844 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55845 // condition comes from the subtract node that produced -X. This matches the
55849 SDValue N0 = N->getOperand(0);
55850 SDValue N1 = N->getOperand(1);
55865 // Get the X and -X from the negate.
55878 MVT VT = N->getSimpleValueType(0);
55886 SDValue Op0 = N->getOperand(0);
55887 SDValue Op1 = N->getOperand(1);
55891 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
55893 EVT VT = N->getValueType(0);
55896 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55901 APInt NewImm = Op0C->getAPIntValue() - 1;
55915 // ->
55917 if (N->getConstantOperandVal(3) != X86::COND_NE)
55920 SDValue Sub = N->getOperand(4);
55929 SmallVector<SDValue, 5> Ops(N->op_values());
55933 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
55934 cast<MemSDNode>(N)->getMemoryVT(),
55935 cast<MemSDNode>(N)->getMemOperand());
55941 SDValue Op0 = N->getOperand(0);
55942 SDValue Op1 = N->getOperand(1);
55949 return !Cst->isOpaque();
55959 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
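// Compile-time sketch, not part of this file (names invented): in two's
// complement, C1 - v == C1 + ~v + 1 and ~(X ^ C2) == X ^ ~C2; combining the
// two identities gives the rewrite above.
constexpr unsigned subOfXor(unsigned C1, unsigned X, unsigned C2) {
  return C1 - (X ^ C2);
}
constexpr unsigned addOfXorNot(unsigned C1, unsigned X, unsigned C2) {
  return (X ^ ~C2) + (C1 + 1);
}
static_assert(subOfXor(0x1234u, 0xCAFEBABEu, 0xFF00FF00u) ==
                  addOfXorNot(0x1234u, 0xCAFEBABEu, 0xFF00FF00u),
              "sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1)");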
55962 Op1->hasOneUse()) {
55978 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55979 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55981 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55982 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55986 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55988 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55990 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55991 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56008 unsigned Opcode = N->getOpcode();
56012 SDValue LHS = N->getOperand(0);
56013 SDValue RHS = N->getOperand(1);
56014 MVT VT = N->getSimpleValueType(0);
56024 // PCMPEQ(X,UNDEF) -> UNDEF
56025 // PCMPGT(X,UNDEF) -> 0
56026 // PCMPGT(UNDEF,X) -> 0
56103 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56113 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56122 // extract_subvector(broadcast(x))) -> broadcast(x)
56124 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
56132 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
56136 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
56145 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56169 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56419 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
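// One-lane sketch, not part of this file (name invented): shifting a 64-bit
// lane by exactly 32 bits just moves one 32-bit half into the other and
// zeroes the rest, i.e. it is a shuffle of the lane's two i32 halves with a
// zero element, which is what the shuffle lowering mentioned above exploits.
#include <cstdint>
constexpr uint64_t asLane(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // lane built from i32 halves
}
static_assert(asLane(0x55667788u, 0x11223344u) << 32 ==
                  asLane(0u, 0x55667788u),
              "SHL of an i64 lane by 32 == shuffle of its i32 halves with zero");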
56668 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
56669 *FirstLd->getMemOperand(), &Fast) &&
56717 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56726 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
56743 EVT VT = N->getValueType(0);
56744 EVT SrcVT = N->getOperand(0).getValueType();
56746 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56755 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56756 if (I == (E - 1)) {
56782 MVT OpVT = N->getSimpleValueType(0);
56787 SDValue Vec = N->getOperand(0);
56788 SDValue SubVec = N->getOperand(1);
56790 uint64_t IdxVal = N->getConstantOperandVal(2);
56827 Ins.getOperand(1), N->getOperand(2));
56836 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56843 SubVec.getOperand(1), N->getOperand(2));
56867 // Match concat_vector style patterns.
56905 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56908 MemIntr->getMemoryVT(),
56909 MemIntr->getMemOperand());
56932 /// is a common pattern for AVX1 integer code because 256-bit selects may be
56933 /// legal, but there is almost no integer math/logic available for 256-bit.
56938 SDValue Sel = Ext->getOperand(0);
56945 // TODO: This can be extended to handle extraction to 256-bits.
56946 MVT VT = Ext->getSimpleValueType(0);
56954 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56961 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56990 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56992 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
57000 if (!N->getValueType(0).isSimple())
57003 MVT VT = N->getSimpleValueType(0);
57004 SDValue InVec = N->getOperand(0);
57005 unsigned IdxVal = N->getConstantOperandVal(1);
57021 SDValue NotOp = V->getOperand(0);
57026 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57029 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57049 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
57060 InVec.getOperand(0), N->getOperand(1));
57061 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57077 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57188 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
57202 EVT VT = N->getValueType(0);
57203 SDValue Src = N->getOperand(0);
57234 if (Ld->getExtensionType() == Ext &&
57235 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57266 for (SDNode *User : Src->uses())
57267 if (User->getOpcode() == X86ISD::VBROADCAST &&
57268 Src == User->getOperand(0)) {
57271 User->getValueSizeInBits(0).getFixedValue();
57287 SDValue LHS = N->getOperand(0);
57288 SDValue RHS = N->getOperand(1);
57293 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57298 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57312 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57318 LHS.getOperand(0), { 0, -1, 1, -1 });
57320 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57322 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57328 RHS.getOperand(0), { 0, -1, 1, -1 });
57330 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57339 MVT VT = N->getSimpleValueType(0);
57340 SDValue LHS = N->getOperand(0);
57341 SDValue RHS = N->getOperand(1);
57342 unsigned Opc = N->getOpcode();
57385 EVT VT = N->getValueType(0);
57386 SDValue In = N->getOperand(0);
57387 unsigned Opcode = N->getOpcode();
57396 if (Ld->isSimple()) {
57404 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57405 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57412 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57417 // -> EXTEND_VECTOR_INREG(X).
57418 // TODO: Handle non-zero subvector indices.
57425 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57452 EVT VT = N->getValueType(0);
57454 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57473 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57476 if (N->getValueType(0) != MVT::f32 ||
57477 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57482 N->getOperand(0).getOperand(0));
57493 EVT VT = N->getValueType(0);
57494 bool IsStrict = N->isStrictFPOpcode();
57495 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57558 {N->getOperand(0), Src});
57589 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57590 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57594 if (N->hasAnyUseOfValue(1))
57599 SDValue Ptr = MemIntrin->getBasePtr();
57600 SDValue Chain = MemIntrin->getChain();
57601 EVT VT = N->getSimpleValueType(0);
57602 EVT MemVT = MemIntrin->getMemoryVT();
57606 for (SDNode *User : Ptr->uses())
57607 if (User != N && User->getOpcode() == N->getOpcode() &&
57608 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57609 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57610 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57612 !User->hasAnyUseOfValue(1) &&
57613 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57628 bool IsStrict = N->isStrictFPOpcode();
57629 EVT VT = N->getValueType(0);
57630 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57651 bool IsOp0Strict = Op0->isStrictFPOpcode();
57693 {N->getOperand(0), Src, Rnd});
57715 SDValue Src = N->getOperand(0);
57721 if (LN->isSimple()) {
57722 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57723 LN->getBasePtr(),
57724 LN->getPointerInfo(),
57725 LN->getOriginalAlign(),
57726 LN->getMemOperand()->getFlags());
57737 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57748 switch (N->getOpcode()) {
57749 // clang-format off
57935 // clang-format on
57945 // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57959 // TODO: Almost no 8-bit ops are desirable because they have no actual
57960 // size/speed advantages vs. 32-bit ops, but they do have a major
57963 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57964 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57965 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
58006 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
58008 // In case control-flow branch protection is enabled, we need to add
58023 EVT VT = LogicOp->getValueType(0);
58024 EVT OpVT = SETCC0->getOperand(0).getValueType();
58037 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
58049 // 8-bit multiply-by-constant can usually be expanded to something cheaper
58057 SDNode *User = *Op->use_begin();
58062 return Ld->getBasePtr() == St->getBasePtr();
58070 SDNode *User = *Op->use_begin();
58071 if (User->getOpcode() != ISD::ATOMIC_STORE)
58075 return Ld->getBasePtr() == St->getBasePtr();
58123 //===----------------------------------------------------------------------===//
58125 //===----------------------------------------------------------------------===//
58163 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58165 const std::string &AsmStr = IA->getAsmString();
58167 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58168 if (!Ty || Ty->getBitWidth() % 16 != 0)
58171 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58194 // rorw $$8, ${0:w} --> llvm.bswap.i16
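// Scalar sketch, not part of this file (name invented): rotating a 16-bit
// value by 8 bits exchanges its two bytes, which is exactly a 16-bit byte
// swap -- hence the mapping of this inline-asm idiom onto llvm.bswap.i16.
#include <cstdint>
constexpr uint16_t rorw8(uint16_t X) {
  return static_cast<uint16_t>((X >> 8) | (X << 8));
}
static_assert(rorw8(0x1234u) == 0x3412u, "rorw $8 behaves as bswap i16");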
58195 if (CI->getType()->isIntegerTy(16) &&
58196 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58200 StringRef ConstraintsStr = IA->getConstraintString();
58208 if (CI->getType()->isIntegerTy(32) &&
58209 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58214 StringRef ConstraintsStr = IA->getConstraintString();
58221 if (CI->getType()->isIntegerTy(64)) {
58222 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58226 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
58361 Type *Ty = CallOperandVal->getType();
58377 if (CallOperandVal->getType()->isIntegerTy())
58383 if (Ty->isFloatingPointTy())
58387 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58398 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58399 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58400 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58405 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58410 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58430 if (CallOperandVal->getType()->isIntegerTy())
58436 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58440 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58441 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58446 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58451 if (C->getZExtValue() <= 31)
58456 if (C->getZExtValue() <= 63)
58461 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58466 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58471 if (C->getZExtValue() <= 3)
58476 if (C->getZExtValue() <= 0xff)
58486 if ((C->getSExtValue() >= -0x80000000LL) &&
58487 (C->getSExtValue() <= 0x7fffffffLL))
58492 if (C->getZExtValue() <= 0xffffffff)
58534 // Extend to 32-bits
58552 if (C->getZExtValue() <= 31) {
58553 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58561 if (C->getZExtValue() <= 63) {
58562 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58570 if (isInt<8>(C->getSExtValue())) {
58571 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58579 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58580 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58581 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58589 if (C->getZExtValue() <= 3) {
58590 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58598 if (C->getZExtValue() <= 255) {
58599 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58607 if (C->getZExtValue() <= 127) {
58608 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58615 // 32-bit signed value
58618 C->getSExtValue())) {
58620 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58633 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
58634 BA->getValueType(0)));
58637 if (Op->getOpcode() == ISD::ADD &&
58638 isa<ConstantSDNode>(Op->getOperand(1))) {
58639 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
58640 Op = Op->getOperand(0);
58643 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
58644 GA->getValueType(0), Offset));
58649 // 32-bit unsigned value
58652 C->getZExtValue())) {
58653 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58665 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58669 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58670 : CST->getSExtValue();
58682 // If we are in non-pic codegen mode, we allow the address of a global (with
58688 Subtarget.classifyGlobalReference(GA->getGlobal())))
58775 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58796 // 32-bit fallthrough
59090 // Map the st(0) .. st(7) constraints onto the FP0 .. FP7 registers.
59100 return std::make_pair(X86::FP0 + Constraint[4] - '0',
59109 // flags -> EFLAGS
59113 // dirflag -> DF
59119 // fpsr -> FPSW
59127 // Make sure it isn't a register that requires 64-bit mode.
59130 TRI->getEncodingValue(Res.first) >= 8) {
59131 // Register requires REX prefix, but we're in 32-bit mode.
59137 TRI->getEncodingValue(Res.first) & 0x10) {
59143 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
59146 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
59171 // Model GCC's behavior here and select a fixed pair of 32-bit
59192 if (RC && RC->contains(DestReg))
59211 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59213 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59215 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59248 // integer division, leaving the division as-is is a loss even in terms of
59261 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59262 AFI->setIsSplitCSR(true);
59269 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59274 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59275 MachineBasicBlock::iterator MBBI = Entry->begin();
59283 Register NewVR = MRI->createVirtualRegister(RC);
59285 // FIXME: this currently does not emit CFI pseudo-instructions; it works
59286 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59288 // CFI pseudo-instructions.
59290 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59292 Entry->addLiveIn(*I);
59293 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
59296 // Insert the copy-back instructions right before the terminator.
59298 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
59299 TII->get(TargetOpcode::COPY), *I)
59312 assert(MBBI->isCall() && MBBI->getCFIType() &&
59318 switch (MBBI->getOpcode()) {
59325 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59330 assert(MBBI->isCall() &&
59332 if (OrigCall->shouldUpdateCallSiteInfo())
59334 MBBI->setCFIType(MF, OrigCall->getCFIType());
59335 OrigCall->eraseFromParent();
59342 MachineOperand &Target = MBBI->getOperand(0);
59344 switch (MBBI->getOpcode()) {
59357 // 64-bit indirect thunk calls.
59367 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
59369 .addImm(MBBI->getCFIType())
59383 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59387 if (MF.getFunction().hasFnAttribute("probe-stack"))
59388 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59389 "inline-asm";
59403 if (MF.getFunction().hasFnAttribute("probe-stack"))
59404 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59409 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59423 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59428 if (ML && ML->isInnermost() &&