1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
68 #define DEBUG_TYPE "x86-isel"
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "alignment set by x86-experimental-pref-loop-alignment."),
79 "x86-br-merging-base-cost", cl::init(2),
85 "will be merged, and above which conditionals will be split. Set to -1 "
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
96 WidenShift("x86-widen-shift", cl::init(true),
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
107 "the instruction cost threshold. Set to -1 to never merge likely "
112 "x86-br-merging-unlikely-bias", cl::init(-1),
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
124 "mul-constant-optimization", cl::init(true),
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143 // default expansion to a no-op.
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
156 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
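A minimal standalone sketch of the lowering this comment describes, using a hypothetical helper name (not part of this file): the value is converted as if signed, which is what FILD does, and 2^64 is added back when the sign bit was set.

  #include <cstdint>

  // Illustration only: uint64 -> double as a signed conversion (FILD)
  // followed by a conditional FADD of 2^64.
  static double u64_to_f64(uint64_t X) {
    double D = static_cast<double>(static_cast<int64_t>(X)); // FILD is signed
    if (static_cast<int64_t>(X) < 0)                         // sign bit was set
      D += 18446744073709551616.0;                           // compensate: +2^64
    return D;
  }
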
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
373 // Without SSE, i64->f64 goes through memory.
381 // the two-result form to trivial CSE, which is able to combine x/y and x%y
384 // Scalar integer multiply-high is also lowered to use two-result
386 // (low) operations are left as Legal, as there are single-result
387 // instructions for this in x86. Using the two-result multiply instructions
459 // Special handling for half-precision floating point conversions.
528 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
547 // 64-bit shl, sra, srl (iff 32-bit x86)
576 // All CPUs supporting AVX will atomically load/store aligned 128-bit
585 // FIXME - use subtarget debug flags
672 // Disable f32->f64 extload as we can only generate this in one instruction
675 // non-optsize case.
801 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
811 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
812 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
820 // Handle constrained floating-point operations of scalar.
859 // clang-format off
871 // clang-format on
885 // Handle constrained floating-point operations of scalar.
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
925 // clang-format off
933 // clang-format on
986 // clang-format off
1000 // clang-format on
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1067 // with -msoft-float, disable use of MMX as well.
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1280 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1289 // Add 32-bit vector stores to help vectorization opportunities.
1397 // FIXME: Do we need to handle scalar-to-vector here?
1423 // We need to scalarize v4i64->v4i32 uint_to_fp using cvtsi2ss, but we can
1545 // These types need custom splitting if their input is a 128-bit vector.
1653 // when we have a 256-bit wide blend with immediate.
1675 // (result) is 128-bit but the source is 256-bit wide.
1681 // Custom lower several nodes for 256-bit types.
1735 // available with AVX512. 512-bit vectors are in a separate block controlled
1764 // There is no byte sized k-register load or store without AVX512DQ.
1777 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1810 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1811 // elements. 512-bits can be disabled based on prefer-vector-width and
1812 // required-vector-width function attributes.
1894 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1895 // to 512-bit rather than use the AVX2 instructions so that we can use
1896 // k-masks.
1919 // Extends from v64i1 masks to 512-bit vectors.
2032 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2044 // (result) is 256-bit but the source is 512-bit wide.
2045 // 128-bit was made Legal under AVX1.
2111 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2114 // These operations are handled on non-VLX by artificially widening in
2229 // Extends from v32i1 masks to 256-bit vectors.
2239 // These operations are handled on non-VLX by artificially widening in
2241 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2514 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2523 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2532 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2537 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2557 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2561 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2562 // than generic legalization for 64-bit multiplication-with-overflow, though.
2604 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2605 // is. We should promote the value to 64-bits to solve this.
2606 // This is what the CRT headers do - `fmodf` is an inline header
2610 // clang-format off
2631 // clang-format on
2633 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2641 // We have target-specific dag combine patterns for the following nodes:
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2720 // Default loop alignment, which can be overridden by -align-loops.
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2731 // Default to having -disable-strictnode-mutation on
2735 // This has so far only been implemented for 64-bit MachO.
2776 //===----------------------------------------------------------------------===//
2778 //===----------------------------------------------------------------------===//
2790 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2793 // TODO: If this is a non-temporal load and the target has an instruction
2806 // We cannot replace a wide volatile load with a broadcast-from-memory,
2809 return !Ld->isVolatile() ||
2810 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2817 SDNode *User = *Op->user_begin();
2818 while (User->getOpcode() == ISD::BITCAST) {
2819 if (!User->hasOneUse())
2821 User = *User->user_begin();
2828 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2903 int ReturnAddrIndex = FuncInfo->getRAIndex();
2907 unsigned SlotSize = RegInfo->getSlotSize();
2909 -(int64_t)SlotSize,
2911 FuncInfo->setRAIndex(ReturnAddrIndex);
2923 // If we don't have a symbolic displacement - we don't have any extra
2929 // 64-bit offsets.
2939 // For other non-large code models we assume that the latest small object is 16MB
2968 // clang-format off
2980 // clang-format on
2984 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2992 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2993 // X > -1 -> X == 0, jump !sign.
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2998 // X < 0 -> X == 0, jump on sign.
3001 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3002 // X >= 0 -> X == 0, jump on !sign.
3005 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3006 // X < 1 -> X <= 0
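The rewrites sketched in the comments above can be restated as plain boolean identities (illustration only, hypothetical helper names); each right-hand side can be decided from the sign/zero flags of a single CMP or TEST against zero.

  // X > -1  ==  !(X < 0)   -> sign flag clear
  // X < 0                  -> sign flag set
  // X >= 0  ==  !(X < 0)   -> sign flag clear
  // X < 1   ==  (X <= 0)
  static bool gtMinusOne(int X) { return X > -1; }
  static bool ltZero(int X)     { return X < 0;  }
  static bool geZero(int X)     { return X >= 0; }
  static bool ltOne(int X)      { return X < 1;  }
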
3041 // clang-format off
3042 default: llvm_unreachable("Condcode should be pre-legalized away");
3063 // clang-format on
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3129 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3140 unsigned Size = I.getType()->getScalarSizeInBits();
3141 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3175 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3185 switch (IntrData->Type) {
3191 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3193 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3210 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3222 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3251 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3253 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3255 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3258 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3262 // can be store-folded. Therefore, it's probably not worth splitting the load.
3263 EVT VT = Load->getValueType(0);
3264 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3265 for (SDUse &Use : Load->uses()) {
3273 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3274 User->user_begin()->getOpcode() != ISD::STORE)
3277 // All non-chain uses are extract + store.
3288 assert(Ty->isIntegerTy());
3290 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3298 // a floating-point compare and we have blendv or conditional move, then it is
3299 // cheaper to select instead of doing a cross-register move and creating a
3326 // through type legalization on 32-bit targets so we would need to special
3333 // most implementations, sub-vXi32 vector multiplies are always fast,
3342 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3343 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
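As an illustration of why the check above accepts these constants: a multiplier that is one away from a power of two (or the negation of one) needs only a single shift plus one add/sub. The helper names below are hypothetical.

  #include <cstdint>

  // Illustration only: constants of the form 2^k +/- 1 and their negations.
  static uint32_t mul7(uint32_t X)  { return (X << 3) - X; }        // MulC + 1 == 8
  static uint32_t mul9(uint32_t X)  { return (X << 3) + X; }        // MulC - 1 == 8
  static uint32_t mulM7(uint32_t X) { return X - (X << 3); }        // 1 - MulC == 8
  static uint32_t mulM9(uint32_t X) { return 0 - ((X << 3) + X); }  // -(MulC + 1) == 8
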
3364 // TODO - do we have any exceptions?
3392 (!Ty->isVectorTy() &&
3393 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3469 // There are only 32-bit and 64-bit forms for 'andn'.
3473 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3516 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3538 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3558 // at least imm32 mask (or be zext i32 -> i64).
3560 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3563 // We can only benefit if the mask requires at least 7 bits. We
3570 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3572 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3583 // Non-vector type and we have a zext mask with SRL.
3606 return N->getOpcode() != ISD::FP_EXTEND;
3611 assert(((N->getOpcode() == ISD::SHL &&
3612 N->getOperand(0).getOpcode() == ISD::SRL) ||
3613 (N->getOpcode() == ISD::SRL &&
3614 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3615 "Expected shift-shift mask");
3617 EVT VT = N->getValueType(0);
3620 // Only fold if the shift values are equal - so it folds to AND.
3621 // TODO - we should fold if either is a non-uniform vector but we don't do
3622 // the fold for non-splats yet.
3623 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3635 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3671 // TODO: Allow 64-bit type for 32-bit target.
3672 // TODO: 512-bit types should be allowed, but make sure that those
3765 /// Return true if every element in Mask is an in-place blend/select mask or is
3777 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3823 /// shuffle masks. The latter have the special property of a '-2' representing
3824 /// a zero-ed lane of a vector.
3839 // a pair of values. If we find such a case, use the non-undef mask's value.
3883 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3937 // Use an UNDEF node if MaskElt == -1.
3938 // Split 64-bit constants in the 32-bit mode.
4025 // available, use a floating-point +0.0 instead.
4079 // This is the index of the first element of the vectorWidth-bit chunk
4081 IdxVal &= ~(ElemsPerChunk - 1);
4086 Vec->ops().slice(IdxVal, ElemsPerChunk));
4098 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4100 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4102 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4111 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4134 // This is the index of the first element of the vectorWidth-bit chunk
4136 IdxVal &= ~(ElemsPerChunk - 1);
4142 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4144 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4146 /// we want. It need not be aligned to a 128-bit boundary. That makes
4163 // If the upper 128-bits of a build vector are already undef/zero, then try to
4164 // widen from the lower 128-bits.
4167 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4219 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4220 Ops.append(N->op_begin(), N->op_end());
4224 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4225 SDValue Src = N->getOperand(0);
4226 SDValue Sub = N->getOperand(1);
4227 const APInt &Idx = N->getConstantOperandAPInt(2);
4313 // If this is a splat value (with no-undefs) then use the lower subvector,
4351 // Make sure we only try to split 256/512-bit types to avoid creating
4377 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4378 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4423 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4435 // AVX512 broadcasts 32/64-bit operands.
4436 // TODO: Support float once getAVX512Node is used by fp-ops.
4447 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4482 // Perform the 512-bit op then extract the bottom subvector.
4488 /// Insert an i1-subvector into an i1-vector.
4558 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4564 unsigned ShiftLeft = NumElems - SubVecNumElems;
4565 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4591 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4606 unsigned ShiftLeft = NumElems - SubVecNumElems;
4607 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4633 unsigned LowShift = NumElems - IdxVal;
4671 "Expected a 128/256/512-bit vector type");
4685 // For 256-bit vectors, we only need the lower (128-bit) input half.
4686 // For 512-bit vectors, we only need the lower input half or quarter.
4726 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4728 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4729 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
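A standalone sketch (not the DAG helper itself) of how such a mask can be built; for NumElts == 8 it yields exactly the Lo/Hi patterns shown above.

  #include <vector>

  // Duplicate each element of the chosen half, ignoring 128-bit lane limits.
  static std::vector<int> makeUnpackMask(int NumElts, bool Lo) {
    std::vector<int> Mask;
    int Base = Lo ? 0 : NumElts / 2;
    for (int I = 0; I != NumElts / 2; ++I) {
      Mask.push_back(Base + I);
      Mask.push_back(Base + I);
    }
    return Mask; // e.g. NumElts == 8, Lo: {0,0,1,1,2,2,3,3}
  }
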
4794 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4828 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4870 // TODO: Add support for non-zero offsets.
4873 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4875 return CNode->getConstVal();
4881 return getTargetConstantFromBasePtr(Load->getBasePtr());
4976 Mask = CInt->getValue();
4980 Mask = CFP->getValueAPF().bitcastToAPInt();
4984 Type *Ty = CDS->getType();
4985 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4986 Type *EltTy = CDS->getElementType();
4987 bool IsInteger = EltTy->isIntegerTy();
4989 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4992 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4993 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4995 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4997 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5014 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5019 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5029 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5040 Type *CstTy = Cst->getType();
5041 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5042 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5045 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5053 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5064 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5067 SDValue Ptr = MemIntr->getBasePtr();
5079 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5088 SDValue Ptr = MemIntr->getBasePtr();
5092 Type *CstTy = Cst->getType();
5093 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5094 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5095 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5098 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5105 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5128 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5134 // If bitcasts to larger elements we might lose track of undefs - don't
5159 // TODO - support extract_subvector through bitcasts.
5181 // TODO - support shuffle through bitcasts.
5185 ArrayRef<int> Mask = SVN->getMask();
5214 if (UndefElts1[M - NumElts])
5216 EltBits.push_back(EltBits1[M - NumElts]);
5233 int SplatIndex = -1;
5238 SplatIndex = -1;
5287 // Helper to attempt to return a cheaper, bit-inverted version of \p V.
5293 // Match not(xor X, -1) -> X.
5299 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5309 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5320 // Don't fold min_signed_value -> (min_signed_value - 1)
5324 Elt -= 1;
5335 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5347 // Match not(or(not(X),not(Y))) -> and(X, Y).
5350 // TODO: Handle cases with single NOT operand -> ANDNP
5361 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5370 unsigned Repetitions = 1u << (NumStages - 1);
5418 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5424 /// It is an error to call this with non-empty Mask/Ops vectors.
5447 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5454 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5461 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5512 "Only 32-bit and 64-bit elements are supported!");
5515 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5525 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5534 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5541 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5554 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5570 // We only decode broadcasts of same-sized vectors, peeking through to
5604 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5618 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5625 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5651 unsigned CtrlImm = CtrlOp->getZExtValue();
5713 // inputs that are actually the same node. Re-map the mask to always point
5718 M -= Mask.size();
5720 // If we didn't already add operands in the opcode-specific code, default to
5786 int Scale = Size / V->getNumOperands();
5793 APInt Val = Cst->getAPIntValue();
5798 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5809 int Scale = V->getNumOperands() / Size;
5888 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5889 // TODO: We currently only set UNDEF for integer types - floats use the same
5903 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5997 const SelectionDAG &DAG, unsigned Depth,
6007 const SelectionDAG &DAG, unsigned Depth,
6026 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6037 // Attempt to decode as a per-byte mask.
6048 // We can't assume an undef src element gives an undef dst - the other src
6073 Depth + 1, true) ||
6075 Depth + 1, true))
6105 if (!N->isOnlyUserOf(Sub.getNode()))
6132 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6134 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6158 Depth + 1, ResolveKnownElts))
6210 // Check we have an in-range constant insertion index.
6298 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6300 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6303 // PACKSS then it was likely being used for sign-extension for a
6305 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6310 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6318 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6320 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6363 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6383 Mask[i + j] = i + j - ByteShift;
6387 Mask[i + j - ByteShift] = i + j;
6413 Mask[i + j] = i + j - ByteShift;
6417 Mask[i + j - ByteShift] = i + j;
6429 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6462 // We can only handle all-signbits extensions.
6518 M -= MaskWidth;
6529 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6548 const SelectionDAG &DAG, unsigned Depth,
6550 if (Depth >= SelectionDAG::MaxRecursionDepth)
6551 return false; // Limit search depth.
6562 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6573 const SelectionDAG &DAG, unsigned Depth,
6577 KnownZero, DAG, Depth, ResolveKnownElts);
6582 const SelectionDAG &DAG, unsigned Depth = 0,
6590 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6602 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6603 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6606 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6609 SDValue Ops[] = {Mem->getChain(), Ptr};
6613 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6621 SelectionDAG &DAG, unsigned Depth) {
6622 if (Depth >= SelectionDAG::MaxRecursionDepth)
6623 return SDValue(); // Limit search depth.
6631 int Elt = SV->getMaskElt(Index);
6636 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6637 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6659 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6670 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6671 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6680 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6687 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6695 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6701 // For insert_vector_elt - either return the index matching scalar or recurse
6707 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6748 assert(0 == i && "Expected insertion into zero-index");
6771 // SSE4.1 - use PINSRB to insert each byte directly.
6778 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6779 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6888 assert(Zeroable.size() - Zeroable.count() > 1 &&
6889 "We expect at least two non-zero elements!");
6902 // Make sure that this node is extracting from a 128-bit vector.
6927 Elt = Op->getOperand(EltIdx);
6958 SDValue Current = Op->getOperand(i);
6959 SDValue SrcVector = Current->getOperand(0);
6968 assert(V1.getNode() && "Expected at least two non-zero elements!");
7005 SDValue Ptr = LD->getBasePtr();
7006 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7008 EVT PVT = LD->getValueType(0);
7012 int FI = -1;
7015 FI = FINode->getIndex();
7019 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7026 // FIXME: 256-bit vector instructions don't require a strict alignment,
7029 SDValue Chain = LD->getChain();
7050 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7057 int EltNo = (Offset - StartOffset) >> 2;
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7076 if (!BaseLd->isSimple())
7090 uint64_t Amt = AmtC->getZExtValue();
7104 uint64_t Idx = IdxC->getZExtValue();
7119 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7129 int LastLoadedElt = -1;
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7171 // Handle Special Cases - all undef or undef/zero.
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7190 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7207 EltIdx - FirstLoadedElt);
7228 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7229 assert(LDBase->isSimple() &&
7232 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7233 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7242 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7245 // LOAD - all consecutive load/undefs (must start/end with a load or be
7256 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7258 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7268 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7274 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7305 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7319 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7321 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7322 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7330 // BROADCAST - match the smallest possible repetition pattern, load that
7340 // Don't attempt a 1:N subvector broadcast - it should be caught by
7401 // are consecutive, non-overlapping, and in the right order.
7424 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7448 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7473 for (auto *U : N->users()) {
7474 unsigned Opc = U->getOpcode();
7476 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7478 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7484 if (N->hasOneUse()) {
7487 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7507 // TODO: Splats could be generated for non-AVX CPUs using SSE
7508 // instructions, but there's less potential gain for only 128-bit vectors.
7512 MVT VT = BVOp->getSimpleValueType(0);
7522 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7566 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7571 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7605 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7621 if (!Ld || NumElts - NumUndefElts != 1)
7632 // TODO: Handle broadcasts of non-constant sequences.
7634 // Make sure that all of the users of a non-constant load are from the
7636 // FIXME: Is the use count needed for non-constant, non-load case?
7637 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7655 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7670 C = CI->getConstantIntValue();
7672 C = CF->getConstantFPValue();
7678 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7689 // Handle AVX2 in-register broadcasts.
7698 // Make sure the non-chain result is only used by this build vector.
7699 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7706 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7709 LN->getMemoryVT(), LN->getMemOperand());
7714 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7720 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7723 LN->getMemoryVT(), LN->getMemOperand());
7742 int Idx = ExtIdx->getAsZExtVal();
7746 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7757 SDValue ShuffleVec = SVOp->getOperand(0);
7762 int ShuffleIdx = SVOp->getMaskElt(Idx);
7783 SmallVector<int, 8> Mask(NumElems, -1);
7803 // Quit if non-constant index.
7871 int SplatIdx = -1;
7877 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7888 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7962 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7963 /// may not match the layout of an x86 256-bit horizontal instruction.
7977 /// horizontal operations, but the index-matching logic is incorrect for that.
7979 /// code because it is only used for partial h-op matching now?
7984 EVT VT = N->getValueType(0);
7985 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7993 unsigned NumElts = LastIdx - BaseIdx;
7999 SDValue Op = N->getOperand(i + BaseIdx);
8002 if (Op->isUndef()) {
8010 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8063 /// Emit a sequence of two 128-bit horizontal add/sub followed by
8067 /// This function expects two 256-bit vectors called V0 and V1.
8068 /// At first, each vector is split into two separate 128-bit vectors.
8069 /// Then, the resulting 128-bit vectors are used to implement two
8074 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8077 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8078 /// horizontal binop dag node would take as input the lower 128-bit of V1
8079 /// and the upper 128-bit of V1.
8085 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8086 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8092 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8093 /// the upper 128-bits of the result.
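To make the Lo/Hi wiring described above concrete, here is a scalar model (illustration only, not LLVM code) of what one 128-bit horizontal add computes: it pairs adjacent elements of each input.

  #include <array>

  using V4F = std::array<float, 4>;

  // One 128-bit HADD of two v4f32 inputs: {A0+A1, A2+A3, B0+B1, B2+B3}.
  static V4F hadd128(const V4F &A, const V4F &B) {
    return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
  }
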
8114 if (!isUndefLO && !V0->isUndef())
8116 if (!isUndefHI && !V1->isUndef())
8120 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8123 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8132 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8133 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8141 MVT VT = BV->getSimpleValueType(0);
8151 // Odd-numbered elements in the input build vector are obtained from
8153 // Even-numbered elements in the input build vector are obtained from
8157 SDValue Op = BV->getOperand(i);
8260 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8267 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8301 MVT VT = BV->getSimpleValueType(0);
8314 // There are no known X86 targets with 512-bit ADDSUB instructions!
8333 MVT VT = BV->getSimpleValueType(0);
8338 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8339 // half of the result is calculated independently from the 128-bit halves of
8340 // the inputs, so that makes the index-checking logic below more complicated.
8349 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8361 // clang-format off
8367 // clang-format on
8380 // The source vector is chosen based on which 64-bit half of the
8424 // This is free (examples: zmm --> xmm, xmm --> ymm).
8425 MVT VT = BV->getSimpleValueType(0);
8440 if (BV->getOperand(i).isUndef())
8460 // We need at least 2 non-undef elements to make this worthwhile by default.
8462 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8467 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8469 MVT VT = BV->getSimpleValueType(0);
8480 // Try harder to match 256-bit ops by using extract/concat.
8490 if (BV->getOperand(i)->isUndef())
8494 if (BV->getOperand(i)->isUndef())
8530 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8584 MVT VT = Op->getSimpleValueType(0);
8590 unsigned Opcode = Op->getOperand(0).getOpcode();
8592 if (Opcode != Op->getOperand(i).getOpcode())
8608 // Don't do this if the buildvector is a splat - we'd replace one
8610 if (Op->getSplatValue())
8618 for (SDValue Elt : Op->ops()) {
8639 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8667 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8668 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8669 // vpcmpeqd on 256-bit vectors.
8702 // Zero-extend the index elements within the vector.
8739 // e.g. v4i32 -> v16i8 (Scale = 4)
8788 // SSE41 can compare v2i64 - select between indices 0 and 1.
8940 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8941 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8945 // ->
8950 // construction of vectors with constant-0 elements.
8957 // This is done by checking that the i-th build_vector operand is of the form:
8971 SDValue ExtractedIndex = Op->getOperand(1);
8988 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9032 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9039 NumConstants--;
9060 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9062 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9063 // and blend the FREEZE-UNDEF operands back in.
9064 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9067 SmallVector<int, 16> BlendMask(NumElems, -1);
9071 BlendMask[i] = -1;
9098 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099 UpperElems = NumElems - (NumElems / 4);
9102 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9104 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9121 // If we are inserting one variable into a vector of non-zero constants, try
9125 // constants. Insertion into a zero vector is handled as a special-case
9127 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9131 // Create an all-constant vector. The variable element in the old
9142 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9144 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9165 unsigned InsertC = InsIndex->getAsZExtVal();
9170 // There's no good way to insert into the high elements of a >128-bit
9173 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9182 // Special case for single non-zero, non-undef, element.
9187 // If we have a constant or non-constant insertion into the low element of
9233 // is a non-constant being inserted into an element other than the low one,
9252 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9259 // handled, so this is best done with a single constant-pool load.
9268 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9274 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9300 // For AVX-length vectors, build the individual 128-bit pieces and use
9307 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9309 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9315 // Let legalizer expand 2-wide build_vectors.
9383 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9406 // our (non-undef) elements to the full vector width with the element in the
9427 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9435 // 256-bit AVX can use the vinsertf128 instruction
9436 // to create 256-bit vectors from two other 128-bit ones.
9444 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9471 // If we have more than 2 non-zeros, build each half separately.
9474 ArrayRef<SDUse> Ops = Op->ops();
9502 // k-register.
9529 // If we are inserting a non-zero vector and there are zeros in LSBs and undef
9533 Log2_64(NonZeros) != NumOperands - 1) {
9545 // If there are zero or one non-zeros we can handle this very simply.
9559 ArrayRef<SDUse> Ops = Op->ops();
9590 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9591 // from two other 128-bit ones.
9593 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9597 //===----------------------------------------------------------------------===//
9606 //===----------------------------------------------------------------------===//
9608 /// Tiny helper function to identify a no-op mask.
9611 /// array input, which is assumed to be a single-input shuffle mask of the kind
9614 /// in-place shuffle are 'no-op's.
9617 assert(Mask[i] >= -1 && "Out of bound mask element!");
9627 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9643 /// Test whether there are elements crossing 128-bit lanes in this
9650 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9663 int SrcLane = -1;
9678 /// Test whether a shuffle mask is equivalent within each sub-lane.
9681 /// lane-relative shuffle in each sub-lane. This trivially implies
9682 /// that it is also not lane-crossing. It may however involve a blend from the
9686 /// non-trivial to compute in the face of undef lanes. The representation is
9687 /// suitable for use with existing 128-bit shuffles as entries from the second
9693 RepeatedMask.assign(LaneSize, -1);
9703 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9708 // This is the first non-undef entry in this slot of a 128-bit lane.
9717 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9730 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9737 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9760 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9765 // This is the first non-undef entry in this slot of a 128-bit lane.
9774 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9843 /// each element of the mask is either -1 (signifying undef) or the value given
9853 assert(Mask[i] >= -1 && "Out of bound mask element!");
9859 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9860 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9872 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9889 // Check for out-of-range target shuffle mask indices.
9916 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9925 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9926 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9956 // Create 128-bit vector type based on mask size.
9989 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9991 /// This helper function produces an 8-bit shuffle immediate corresponding to
9998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10004 // If the mask only uses one non-undef element, then fully 'splat' it to
10006 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
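A hedged sketch of the encoding this helper produces: two bits per lane, so lane i of the result selects source element (Imm >> (2*i)) & 3. Filling undef lanes with the first defined index is a simplification for illustration.

  #include <array>

  static unsigned getShuffleImm4(std::array<int, 4> Mask) {
    int Fill = 0;
    for (int M : Mask)
      if (M >= 0) { Fill = M; break; }
    unsigned Imm = 0;
    for (int I = 0; I != 4; ++I) {
      int M = Mask[I] >= 0 ? Mask[I] : Fill;
      Imm |= static_cast<unsigned>(M) << (2 * I);
    }
    return Imm; // e.g. {3,2,1,0} -> 0x1B
  }
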
10027 // Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10031 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10034 // If the mask only uses one non-undef element, then fully 'splat' it to
10036 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10068 // The function looks for a sub-mask in which the nonzero elements are in
10069 // increasing order. If such a sub-mask exists, the function returns true.
10073 int NextElement = -1;
10077 assert(Mask[i] >= -1 && "Out of bound mask element!");
10291 /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10292 /// followed by unpack 256-bit.
10308 // This is a "natural" unpack operation (rather than the 128-bit sectored
10309 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10336 unsigned UpperElts = NumElts - NumSrcElts;
10386 // Non-VLX targets must truncate from a 512-bit type, so we need to
10433 unsigned UpperElts = NumElts - NumSrcElts;
10482 // TODO: Support non-BWI VPMOVWB truncations?
10497 unsigned UpperElts = NumElts - NumSrcElts;
10524 // and truncate from the double-sized src.
10567 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10578 "We should only be called with masks with a power-of-2 size!");
10580 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10583 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10600 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
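A simplified standalone sketch of this stride test (the real code additionally tracks an Offset into the sources and a larger modulus): result element I must read input element (I << N) modulo the mask size, i.e. the shuffle is equivalent to repeatedly dropping every other element.

  #include <cstdint>
  #include <vector>

  static bool isPow2StrideMask(const std::vector<int> &Mask, unsigned N) {
    uint64_t ModMask = (uint64_t)Mask.size() - 1; // assumes power-of-2 size
    for (uint64_t I = 0; I != (uint64_t)Mask.size(); ++I) {
      if (Mask[I] < 0)
        continue; // undef lanes are compatible with any stride
      if ((uint64_t)Mask[I] != ((I << N) & ModMask))
        return false;
    }
    return true;
  }
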
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10709 // Don't lower multi-stage packs on AVX512, truncation is better.
10714 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10784 return SDValue(); // No non-zeroable elements!
10839 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10864 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10892 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10917 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10928 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10932 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10939 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10946 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10956 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10972 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10975 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11003 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11004 // allow that load-folding possibility.
11013 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11015 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11027 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11050 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11062 /// a single-input permutation.
11065 /// then reduce the shuffle to a single-input permutation.
11073 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11074 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11101 /// a single-input permutation.
11104 /// then reduce the shuffle to a single-input (wider) permutation.
11130 NormM -= NumElts;
11155 SmallVector<int, 32> PermuteMask(NumElts, -1);
11162 NormM -= NumElts;
11170 assert(PermuteMask[Elt] != -1 &&
11196 // This routine only supports 128-bit integer dual input vectors.
11208 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11209 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11265 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11274 // half-crossings are created.
11277 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11285 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11297 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11333 M -= NumElts;
11344 // TODO - it might be worth doing this for unary shuffles if the permute
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11427 SmallVector<int, 32> V1Mask(NumElts, -1);
11428 SmallVector<int, 32> V2Mask(NumElts, -1);
11429 SmallVector<int, 32> FinalMask(NumElts, -1);
11438 V2Mask[i] = M - NumElts;
11447 // and change \p InputMask to be a no-op (identity) mask.
11468 // It is possible that the shuffle for one of the inputs is already a no-op.
11469 // See if we can simplify non-no-op shuffles into broadcasts,
11478 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11480 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11481 // pre-shuffle first is a better strategy.
11501 // Unpack/rotate failed - try again with variable blends.
11513 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11516 V1Mask.assign(NumElts, -1);
11517 V2Mask.assign(NumElts, -1);
11518 FinalMask.assign(NumElts, -1);
11526 V2Mask[i + (j / 2)] = M - NumElts;
11540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11549 return -1;
11574 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11582 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11607 // [-1, 12, 13, 14, -1, -1, 1, -1]
11608 // [-1, -1, -1, -1, -1, -1, 1, 2]
11610 // [-1, 4, 5, 6, -1, -1, 9, -1]
11611 // [-1, 4, 5, 6, -1, -1, -1, -1]
11622 int StartIdx = i - (M % NumElts);
11625 return -1;
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11636 return -1;
11653 return -1;
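A simplified, single-input sketch of the matching logic above: every defined mask element must agree on one rotation amount R such that result[i] == input[(i + R) % NumElts]; -1 means no consistent rotation exists.

  #include <vector>

  static int matchRotation(const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    int Rotation = -1;
    for (int I = 0; I != NumElts; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue; // undef elements place no constraint
      int StartIdx = I - (M % NumElts);
      int Candidate = StartIdx == 0 ? 0
                      : StartIdx < 0 ? -StartIdx
                                     : NumElts - StartIdx;
      if (Rotation < 0)
        Rotation = Candidate;
      else if (Rotation != Candidate)
        return -1; // elements disagree on the rotation amount
    }
    return Rotation < 0 ? 0 : Rotation;
  }
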
11673 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11684 /// specified as a *right shift* because x86 is little-endian, it is a *left
11690 return -1;
11692 // PALIGNR works on 128-bit lanes.
11695 return -1;
11699 return -1;
11712 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11728 "512-bit PALIGNR requires BWI instructions");
11735 "Rotate-based lowering only supports 128-bit lowering!");
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11742 int LoByteShift = 16 - ByteRotation;
11763 /// specified as a *right shift* because x86 is little-endian, it is a *left
11771 "Only 32-bit and 64-bit elements are supported!");
11773 // 128/256-bit vectors are only supported with VLX.
11775 && "VLX required for 128/256-bit vectors");
11783 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11785 // TODO: We can probably make this more aggressive and use shift-pairs like
11797 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11800 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11806 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11821 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11822 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11847 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11848 // 01234567 --> 4567zzzz --> zzzzz456
11849 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11883 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11886 /// bit-wise element shifts and the byte shift across an entire 128-bit double
11889 /// PSHL : (little-endian) left bit shift.
11891 /// [ -1, 4, zz, -1 ]
11892 /// PSRL : (little-endian) right bit shift.
11894 /// [ -1, -1, 7, zz]
11895 /// PSLLDQ : (little-endian) left byte shift
11897 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
11898 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
11899 /// PSRLDQ : (little-endian) right byte shift
11901 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
11902 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
11913 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11923 unsigned Len = Scale - Shift;
11925 return -1;
11945 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11962 return -1;
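A simplified standalone model of the shift matching sketched above: a zero-fill left shift by Shift elements must have zeroable low lanes and a plain slide for the rest. The real matcher also handles right shifts and works per 128-bit lane.

  #include <vector>

  static bool isZeroFillLeftShift(const std::vector<int> &Mask,
                                  const std::vector<bool> &Zeroable, int Shift) {
    int N = (int)Mask.size();
    for (int I = 0; I != N; ++I) {
      if (I < Shift) {
        if (!Zeroable[I] && Mask[I] >= 0)
          return false; // low lanes must be zero/undef
      } else if (Mask[I] >= 0 && Mask[I] != I - Shift) {
        return false; // remaining lanes must be a simple slide
      }
    }
    return true;
  }
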
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12026 int Idx = -1;
12039 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12041 Idx = M - i;
12059 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12089 int Len = Hi - Idx;
12101 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12104 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12107 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12178 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12181 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12189 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12202 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12208 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12209 -1};
12216 int PSHUFDMask[4] = {Offset / 2, -1,
12217 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12230 // to 64-bits.
12278 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12280 ShMask[i - AlignToUnpack] = i;
12282 Offset -= AlignToUnpack;
12290 Offset -= (NumElements / 2);
12310 /// match this pattern. It will use all of the micro-architectural details it
12311 /// can to emit an efficient lowering. It handles both blends with all-zero
12312 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12315 /// The reason we have dedicated lowering for zext-style shuffles is that they
12326 "Exceeds 32-bit integer zero extension limit");
12329 // Define a helper function to check a particular ext-scale and lower to it if
12331 auto Lower = [&](int Scale) -> SDValue {
12356 Offset = M - (i / Scale);
12358 return SDValue(); // Flip-flopping inputs.
12360 // Offset must start in the lowest 128-bit lane or at the start of an
12373 return SDValue(); // Non-consecutive strided elements.
12377 // If we fail to find an input, we have a zero-shuffle which should always
12392 // The widest scale possible for extending is to a 64-bit integer.
12406 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12411 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12465 return V->hasOneUse() &&
12493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12503 // Bail if a non-zero V1 isn't used in place.
12506 V1Mask[V2Index] = -1;
12516 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12522 // Using zext to expand a narrow element won't work for non-zero
12528 // Zero-extend directly to i32.
12533 // and OR with the zero-extended scalar.
12554 // this. We can't support integer vectors or non-zero targets cheaply.
12555 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12602 /// Try to lower broadcast of a single - truncated - integer element,
12616 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12617 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12643 // If we're extracting non-least-significant bits, shift so we can truncate.
12660 // This routine only handles 128-bit shufps.
12662 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12677 /// Test whether the specified input (0 or 1) is in-place blended by the
12692 /// If we are extracting two 128-bit halves of a vector and shuffling the
12693 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12694 /// multi-shuffle lowering.
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12734 NewMask.append(NumElts, -1);
12736 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12739 // This is free: ymm -> xmm.
12747 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12812 BitOffset -= BeginOffset;
12822 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12848 cast<LoadSDNode>(V)->isSimple()) {
12849 // We do not check for one-use of the vector load because a broadcast load
12855 SDValue BaseAddr = Ld->getOperand(1);
12858 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12864 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12867 SDValue Ops[] = {Ld->getChain(), NewAddr};
12871 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12876 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12878 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12884 // We can only broadcast from the zero-element of a vector register,
12885 // but it can be advantageous to broadcast from the zero-element of a
12890 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12894 // If we are broadcasting an element from the lowest 128-bit subvector, try
12899 "Unexpected bit-offset");
12905 // Only broadcast the zero-element of a 128-bit subvector.
12910 "Unexpected bit-offset");
12937 // We only support broadcasting from 128-bit vectors to minimize the
12939 // 128-bits, removing as many bitcasts as possible.
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12987 // We can only insert a single non-zeroable element.
13000 // Don't bother if we have no (non-zeroable) element for insertion.
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13018 // the zero mask and the V2 insertion - so remove V1 dependency.
13060 /// Handle lowering of 2-lane 64-bit floating point shuffles.
13062 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
13098 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13121 // blend patterns if a zero-blend above didn't work.
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13145 /// Handle lowering of 2-lane 64-bit integer shuffles.
13147 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13178 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13218 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13259 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13271 NewMask[V2Index] -= 4;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13295 NewMask[2] -= 4;
13296 NewMask[3] -= 4;
13301 NewMask[0] -= 4;
13302 NewMask[1] -= 4;
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13338 /// Lower 4-lane 32-bit floating point shuffles.
13405 // There are special ways we can lower some single-element blends. However, we
13406 // have custom ways we can lower more complex single-element blends below that
13408 // when the V2 input is targeting element 0 of the mask -- that is the fast
13443 /// Lower 4-lane i32 vector shuffles.
13445 /// We try to handle these with integer-domain shuffles where we can, but for
13477 // Try to use broadcast unless the mask only has one non-undef element.
13510 // There are special ways we can lower some single-element blends.
13533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13572 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13584 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13586 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13587 /// vector, form the analogous 128-bit 8-element Mask.
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13621 int NumHToL = LoInputs.size() - NumLToL;
13622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13623 int NumHToH = HiInputs.size() - NumLToH;
13629 // If we are shuffling values from one half - check how many different DWORD
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13694 // and an existing 2-into-2 on the other half. In this case we may have to
13695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13699 // half than the one we target for fixing) will be fixed when we re-enter this
13703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
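As a standalone illustration of the remapping worked through above (the helper names here are hypothetical, not part of this file), applying a dword permutation to the shuffle input forces every word-level mask entry to be rewritten to that word's new position:

  #include <array>

  // New position of word W after the input's dwords are permuted by DMask.
  static int newWordIndex(int W, const std::array<int, 4> &DMask) {
    for (int D = 0; D != 4; ++D)
      if (DMask[D] == W / 2)          // dword DMask[D] now occupies slot D
        return 2 * D + (W & 1);
    return -1;
  }

  // Rewrite a v8i16 shuffle mask after pre-shuffling the input with PSHUFD.
  static std::array<int, 8> remapWordMask(const std::array<int, 8> &Mask,
                                          const std::array<int, 4> &DMask) {
    std::array<int, 8> Out;
    for (int I = 0; I != 8; ++I)
      Out[I] = Mask[I] < 0 ? -1 : newWordIndex(Mask[I], DMask);
    return Out;
  }

With DMask = {0, 2, 1, 3}, the mask [0, 1, 2, 7, 4, 5, 6, 3] becomes [0, 1, 4, 7, 2, 3, 6, 5] and [3, 7, 1, 0, 2, 7, 3, 5] becomes [5, 7, 1, 0, 4, 7, 5, 3], matching the two examples above.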
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13754 // to balance this to ensure we don't form a 3-1 shuffle in the other
13822 // Recurse back into this routine to re-compute state now that this isn't
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
13841 // original halves. This will then dictate the targets of the cross-half
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13877 // Now gather the cross-half inputs and place them into a free dword of
13880 // look more like the 3-1 fixing operation.
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13920 // Note that this correctly re-maps both when we do a swap and when
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13935 // And just directly shift any other-half mask elements to be same-half
13940 M = M - SourceOffset + DestOffset;
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13962 // We have two non-adjacent or clobbered inputs we need to extract from
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13992 // (because there are no off-half inputs to this half) and there is no
13994 // swap an input with a non-input.
14072 M -= 4;
14080 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14133 /// Generic lowering of 8-lane i16 shuffles.
14135 /// This handles both single-input shuffles and combined shuffle/blends with
14140 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14204 "All single-input shuffles should be canonicalized to be V1-input "
14219 // There are special ways we can lower some single-element blends.
14269 // Check if this is part of a 256-bit vector truncation.
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14315 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14342 // We can always bit-blend if we have to, so the fallback strategy is to
14343 // decompose into single-input permutes and blends/unpacks.
14348 /// Lower 8-lane 16-bit floating point shuffles.
14377 // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14378 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14406 M += (Scale - 1) * NumElts;
14429 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14430 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14478 // For single-input shuffles, there are some nicer lowering tricks we can use.
14494 // Notably, this handles splat and partial-splat shuffles more efficiently.
14495 // However, it only makes sense if the pre-duplication shuffle simplifies
14497 // express the pre-duplication shuffle as an i16 shuffle.
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14608 // blends but after all of the single-input lowerings. If the single input
14617 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14630 // do so. This avoids using them to handle blends-with-zero which is
14643 // FIXME: It might be worth trying to detect if the unpack-feeding
14650 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14661 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14671 // There are special ways we can lower some single-element blends.
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14728 // Handle multi-input cases by blending/unpacking single-input shuffles.
14733 // The fallback path for single-input shuffles widens this into two v8i16
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14782 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
14784 /// This routine breaks down the specific type of 128-bit shuffle and
14819 /// Generic routine to split vector shuffle into half-sized shuffles.
14828 "Only for 256-bit or wider vector shuffles!");
14840 // Use splitVector/extractSubVector so that split build-vectors just build two
14853 // Now create two 4-way blends of these half-width vectors.
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14885 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14886 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14887 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14891 V2BlendMask[i] = M - NumElements;
14904 // a minimal number of high-level vector shuffle nodes.
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14948 /// This is provided as a good fallback for many lowerings of non-single-input
14949 /// shuffles with more than one 128-bit lane. In those cases, we want to select
14950 /// between splitting the shuffle into 128-bit components and stitching those
14951 /// back together vs. extracting the single-input shuffles and blending those
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14985 // If the inputs all stem from a single 128-bit lane of each input, then we
15001 // requires that the decomposed single-input shuffles don't end up here.
15007 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15036 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15037 /// a lane permutation followed by a per-lane permutation.
15039 /// This is mainly for cases where we can have non-repeating permutes
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15101 // TODO - isShuffleMaskInputInPlace could be extended to something like
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15142 // Then attempt a solution with 64-bit sublanes (vpermq).
15146 // If that doesn't work and we have fast variable cross-lane shuffle,
15147 // attempt 32-bit sublanes (vpermd).
15168 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15172 /// single-input cross lane shuffle which is lower than any other fully general
15173 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15178 // FIXME: This should probably be generalized for 512-bit vectors as well.
15179 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15190 // If there are only inputs from one 128-bit lane, splitting will in fact be
15208 // TODO - we could support shuffling V2 in the Flipped input.
15216 "In-lane shuffle mask expected");
15224 // Flip the lanes, and shuffle the results which should now be in-lane.
15233 /// Handle lowering 2-lane 128-bit shuffles.
15281 // Blends are faster and handle all the non-lane-crossing cases.
15289 // Check for patterns which can be matched with a single insert of a 128-bit
15295 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15317 // Otherwise form a 128-bit permutation. After accounting for undefs,
15318 // convert the 64-bit shuffle mask selection values into 128-bit
15323 // [1:0] - select 128 bits from sources for low half of destination
15324 // [2] - ignore
15325 // [3] - zero low half of destination
15326 // [5:4] - select 128 bits from sources for high half of destination
15327 // [6] - ignore
15328 // [7] - zero high half of destination
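A minimal sketch of how such an immediate can be assembled from the bit layout listed above (a hypothetical helper, not the routine in this file):

  // Build a VPERM2X128-style immediate: LoLane/HiLane select one of the four
  // 128-bit source lanes (0-3 across the two sources); the zero flags map to
  // bits 3 and 7 as described in the comment above.
  static unsigned buildPerm2X128Imm(int LoLane, int HiLane,
                                    bool ZeroLo = false, bool ZeroHi = false) {
    unsigned Imm = 0;
    Imm |= unsigned(LoLane & 0x3);        // [1:0] source lane for low half
    Imm |= (ZeroLo ? 1u : 0u) << 3;       // [3]   zero low half
    Imm |= unsigned(HiLane & 0x3) << 4;   // [5:4] source lane for high half
    Imm |= (ZeroHi ? 1u : 0u) << 7;       // [7]   zero high half
    return Imm;
  }

For example, selecting V1's high lane for the low half and V2's low lane for the high half yields 0x21, the familiar cross-lane permute immediate.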
15347 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
15365 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15366 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15371 int Srcs[2] = {-1, -1};
15372 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15471 SmallVector<int, 16> NewMask(NumElts, -1);
15475 int M = -1;
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15492 int M = -1;
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15508 NewMask[i] = -1;
15540 HalfIdx1 = -1;
15541 HalfIdx2 = -1;
15615 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15623 "Expected 256-bit or 512-bit vector");
15673 // Always extract lowers when setting lower - these are all free subreg ops.
15679 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15698 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15711 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15713 // AVX2 has efficient 64-bit element cross-lane shuffles.
15717 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15729 /// Handle case where shuffle sources are coming from the same 128-bit lane and
15730 /// every lane can be represented as the same repeating mask - allowing us to
15749 // accounting for UNDEFs but only references the lowest 128-bit
15767 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15775 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15790 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15805 // can form a repeating shuffle mask (local to each sub-lane). At the same
15806 // time, determine the source sub-lane for each destination sub-lane.
15807 int TopSrcSubLane = -1;
15808 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15814 // Extract the sub-lane mask, check that it all comes from the same lane
15816 int SrcLane = -1;
15817 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15830 // Whole sub-lane is UNDEF.
15834 // Attempt to match against the candidate repeated sub-lane masks.
15850 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15860 // Track the topmost source sub-lane - by setting the remaining to
15868 // Bail if we failed to find a matching repeated sub-lane mask.
15876 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15889 // Shuffle each source sub-lane to its destination.
15890 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15911 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15912 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15914 // Otherwise we can only permute whole 128-bit lanes.
15951 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15992 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16045 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16047 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
16048 // pair of 256-bit shuffles and makes sure the masks are consecutive.
16083 for (SDNode *User : V1->users())
16084 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16085 User->getOperand(1) == V2)
16090 // Find out which half of the 512-bit shuffles is each smaller shuffle
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16122 /// Handle lowering of 4-lane 64-bit floating point shuffles.
16124 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16149 // Non-half-crossing single input shuffles can be lowered with an
16162 // Try to create an in-lane repeating shuffle mask and then shuffle the
16168 // Try to permute the lanes and then use a per-lane permute.
16210 // Try to create an in-lane repeating shuffle mask and then shuffle the
16216 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16242 /// Handle lowering of 4-lane 64-bit integer shuffles.
16276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16330 // Try to create an in-lane repeating shuffle mask and then shuffle the
16341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16355 /// Handle lowering of 8-lane 32-bit floating point shuffles.
16357 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16389 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16415 // Try to create an in-lane repeating shuffle mask and then shuffle the
16422 // two 128-bit lanes use the variable mask to VPERMILPS.
16437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16459 // For non-AVX512, if the Mask is of 16-bit elements in each lane then try to split
16477 /// Handle lowering of 8-lane 32-bit integer shuffles.
16506 // For non-AVX512, if the Mask is of 16-bit elements in each lane then try to split
16535 // If the shuffle mask is repeated in each 128-bit lane we can use more
16536 // efficient instructions that mirror the shuffles across the two 128-bit
16579 // Try to create an in-lane repeating shuffle mask and then shuffle the
16586 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16592 // generate a cross-lane VPERMD instruction.
16608 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16619 /// Handle lowering of 16-lane 16-bit integer shuffles.
16673 // Try to create an in-lane repeating shuffle mask and then shuffle the
16685 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16690 // There are no generalized cross-lane shuffle operations available on i16
16703 // As this is a single-input shuffle, the repeated mask should be
16715 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16725 // Try to permute the lanes and then use a per-lane permute.
16742 /// Handle lowering of 32-lane 8-bit integer shuffles.
16802 // Try to create an in-lane repeating shuffle mask and then shuffle the
16808 // There are no generalized cross-lane shuffle operations available on i8
16811 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16832 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16838 // Try to permute the lanes and then use a per-lane permute.
16863 /// High-level routine to lower various 256-bit x86 vector shuffles.
16865 /// This routine either breaks down the specific type of a 256-bit x86 vector
16866 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
16887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16889 // querying in the per-vector-type lowering routines. With AVX1 we have
16890 // essentially *zero* ability to manipulate a 256-bit vector with integer
16897 // for masking/blending then decompose into 128-bit vectors.
16935 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16939 /// Try to lower a vector shuffle as a 128-bit shuffles.
16951 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16969 // Check for patterns which can be matched with a single insert of a 256-bit
16982 // See if this is an insertion of the lower 128-bits of V2 into V1.
16984 int V2Index = -1;
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16997 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17012 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17024 int PermMask[4] = {-1, -1, -1, -1};
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17045 /// Handle lowering of 8-lane 64-bit floating point shuffles.
17060 // Non-half-crossing single input shuffles can be lowered with an
17099 /// Handle lowering of 16-lane 32-bit floating point shuffles.
17108 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17144 // Try to create an in-lane repeating shuffle mask and then shuffle the
17151 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17166 /// Handle lowering of 8-lane 64-bit integer shuffles.
17183 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17185 // 128-bit lanes.
17239 /// Handle lowering of 16-lane 32-bit integer shuffles.
17269 // If the shuffle mask is repeated in each 128-bit lane we can use more
17270 // efficient instructions that mirror the shuffles across the four 128-bit
17318 // Try to create an in-lane repeating shuffle mask and then shuffle the
17336 /// Handle lowering of 32-lane 16-bit integer shuffles.
17344 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17381 // As this is a single-input shuffle, the repeated mask should be
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17407 /// Handle lowering of 64-lane 8-bit integer shuffles.
17415 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17459 // Try to create an in-lane repeating shuffle mask and then shuffle the
17474 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17501 /// High-level routine to lower various 512-bit x86 vector shuffles.
17503 /// This routine either breaks down the specific type of a 512-bit x86 vector
17504 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17576 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17588 int ShiftAmt = -1;
17597 // The first non-undef element determines our shift amount.
17599 ShiftAmt = M - i;
17604 // All non-undef elements must shift by the same amount.
17605 if (ShiftAmt != M - i)
17619 // Returns the shift amount if possible or -1 if not. This is a simplified
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17636 unsigned Len = Size - Shift;
17647 return -1;
17652 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17653 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17668 int Src = -1;
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17718 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17720 ShiftAmt += WideElts - NumElts;
17734 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17737 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17756 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17761 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17762 // 256-bit operation available.
17766 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17767 // 256-bit operation available.
17871 // are preferable to blendw/blendvb/masked-mov.
17879 switch (V->getOpcode()) {
17898 if (!V->hasOneUse())
17916 /// Top-level lowering for x86 vector shuffles.
17926 ArrayRef<int> OrigMask = SVOp->getMask();
17948 // Check for non-undef masks pointing at an undef vector and make the masks
17956 M = -1;
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17967 // We actually see shuffles that are entirely re-arrangements of a set of
17982 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17990 // TODO: Avoid lowering directly from this top-level function: make this
17991 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18005 // Modify the new Mask to take all zeros from the all-zero vector.
18006 // Choose indices that are blend-friendly.
18009 "V2's non-undef elements are used?!");
18015 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18082 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18083 // compressed as 512-bit vectors in AVX512F.
18135 // Only non-legal VSELECTs reach this lowering, convert those into generic
18136 // shuffles and re-use the shuffle lowering path for blends.
18167 // Try to lower this to a blend-style vector shuffle. This can handle all
18173 // with patterns on the mask registers on AVX-512.
18190 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18191 // into an i1 condition so that we can use the mask-based 512-bit blend
18232 // VSELECT-matching blend, return Op, but if we need to expand, return
18278 unsigned IdxVal = Idx->getAsZExtVal();
18292 SDNode *User = *Op.getNode()->user_begin();
18293 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18294 (User->getOpcode() != ISD::BITCAST ||
18295 User->getValueType(0) != MVT::i32))
18309 /// AVX-512 feature.
18326 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18340 unsigned IdxVal = IdxC->getZExtValue();
18357 MVT VT = N->getSimpleValueType(0);
18360 for (SDNode *User : N->users()) {
18361 switch (User->getOpcode()) {
18365 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18405 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18413 // | Uops | 0 - DV | 5 | 6 | 7 | |
18414 // ---------------------------------------------
18425 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18426 // ---------------------------------------------------------
18427 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18428 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18435 unsigned IdxVal = IdxC->getZExtValue();
18437 // If this is a 256-bit vector result, first extract the 128-bit vector and
18438 // then extract the element from the 128-bit vector.
18440 // Get the 128-bit vector.
18449 IdxVal &= ElemsPerChunk - 1;
18480 // Only extract a single element from a v16i8 source - determine the common
18481 // DWORD/WORD that all extractions share, and extract the sub-byte.
18487 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18518 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18535 int Mask[2] = { 1, -1 };
18545 /// AVX-512 feature.
18566 // Copy into a k-register, extract to v1i1 and insert_subvector.
18598 // possible vector indices, and FP insertion has less gpr->simd traffic.
18618 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18623 if (N2C->getAPIntValue().uge(NumElts))
18625 uint64_t IdxVal = N2C->getZExtValue();
18631 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18656 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18659 // With a 256-bit vector, we can insert into the zero element efficiently
18664 // doing anyway after extracting to a 128-bit vector.
18675 "Vectors will always have power-of-two number of elements.");
18677 // If we are not inserting into the low 128-bit vector chunk,
18691 // Get the desired 128-bit vector chunk.
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18704 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18757 // If this is an insertion of 32-bits into the low 32-bits of
18762 // generate insertps because blendps does not have a 32-bit memory
18792 // If this is a 256-bit vector result, first insert into a 128-bit
18793 // vector and then insert into the 256-bit vector.
18795 // Insert into a 128-bit vector.
18802 // Insert the 128-bit vector.
18854 // References to absolute symbols are never PC-relative.
18855 if (GV && GV->isAbsoluteSymbolRef())
18858 // The following OpFlags under RIP-rel PIC use RIP.
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18909 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18960 GV = G->getGlobal();
18961 Offset = G->getOffset();
18964 ExternalSym = ES->getSymbol();
18984 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19017 // If there was a non-zero offset that we didn't fold, create an explicit
19046 if (TGA->hasOneUse()) {
19048 SDNode *TLSDescOp = *TGA->user_begin();
19049 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19052 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19053 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19057 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19062 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19063 GA->getOffset(), OperandFlags);
19134 MFI->incNumLocalDynamicTLSAccesses();
19154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19155 GA->getValueType(0),
19156 GA->getOffset(), OperandFlags);
19169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19178 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19199 GA->getOffset(), OperandFlags);
19226 const GlobalValue *GV = GA->getGlobal();
19267 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19268 GA->getValueType(0),
19269 GA->getOffset(), OpFlag);
19312 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19313 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19352 GA->getValueType(0),
19353 GA->getOffset(), X86II::MO_SECREL);
19380 // offset and returning `true` for TLS-desc currently duplicates both
19381 // which is detrimental :-/
19397 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19407 bool IsStrict = Op->isStrictFPOpcode();
19419 // Using 256-bit to ensure result is 128-bits for f32 case.
19440 // Try to use a packed vector operation to handle i64 on 32-bit targets.
19448 bool IsStrict = Op->isStrictFPOpcode();
19500 /// round-trip between XMM and GPR.
19512 // See if we have a 128-bit vector cast op for this type of cast.
19521 // If we are extracting from a non-zero element, first shuffle the source
19524 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19528 // If the source vector is wider than 128-bits, extract the low part. Do not
19533 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19534 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19541 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19558 // See if we have 128-bit vector cast instructions for this type of cast.
19571 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19577 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19593 bool IsStrict = Op->isStrictFPOpcode();
19594 MVT VT = Op->getSimpleValueType(0);
19595 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19604 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19618 {Op->getOperand(0), Src});
19632 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19633 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19681 bool IsStrict = Op->isStrictFPOpcode();
19683 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19720 bool IsStrict = Op->isStrictFPOpcode();
19723 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19793 // Bitcasting to f64 here allows us to do a single 64-bit store from
19795 // with two 32-bit stores.
19858 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19867 /// 64-bit unsigned integer to double expansion.
19871 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19873 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19874 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19906 // Load the 64-bit value into an XMM register.
19919 // TODO: Are there any fast-math-flags to propagate here?
19927 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
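For reference, a scalar C++ sketch of the classic two-constant expansion this routine refers to (illustrative only; assumes IEEE-754 doubles and hex float literals, and is not the DAG code itself):

  #include <cstdint>
  #include <cstring>

  static double u64ToF64Sketch(uint64_t V) {
    // Plant each 32-bit half in the mantissa of a double whose exponent
    // encodes 2^52 (low half) or 2^84 (high half).
    uint64_t LoBits = 0x4330000000000000ULL | (V & 0xffffffffu);
    uint64_t HiBits = 0x4530000000000000ULL | (V >> 32);
    double Lo, Hi;
    std::memcpy(&Lo, &LoBits, sizeof Lo);
    std::memcpy(&Hi, &HiBits, sizeof Hi);
    Lo -= 0x1.0p52;   // exact: leaves the low 32 bits as a double
    Hi -= 0x1.0p84;   // exact: leaves (V >> 32) * 2^32 as a double
    return Hi + Lo;   // one rounding, mirroring the final horizontal add
  }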
19935 /// 32-bit unsigned integer to float expansion.
19939 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19944 // Load the 32-bit value into an XMM register.
19961 if (Op.getNode()->isStrictFPOpcode()) {
19963 // TODO: Are there any fast-math-flags to propagate here?
19979 // TODO: Are there any fast-math-flags to propagate here?
19992 bool IsStrict = Op->isStrictFPOpcode();
20005 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20024 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20042 bool IsStrict = Op->isStrictFPOpcode();
20043 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20049 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20051 MVT VT = Op->getSimpleValueType(0);
20053 // v8i32->v8f64 is legal with AVX512 so just return it.
20070 {Op->getOperand(0), V});
20085 Op->getSimpleValueType(0) == MVT::v4f64) {
20118 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20125 if (VecFloatVT != Op->getSimpleValueType(0))
20129 // - The vector of constants:
20130 // -- 0x4b000000
20131 // -- 0x53000000
20132 // - A shift:
20133 // -- v >> 16
20176 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20178 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20181 // TODO: Are there any fast-math-flags to propagate here?
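A scalar sketch of the algorithm spelled out above (the 0x4b000000/0x53000000 constants and the v >> 16 split); illustrative only, assuming IEEE-754 floats:

  #include <cstdint>
  #include <cstring>

  static float u32ToF32Sketch(uint32_t V) {
    uint32_t Lo = (V & 0xffffu) | 0x4b000000u;   // low 16 bits + 2^23 exponent
    uint32_t Hi = (V >> 16)     | 0x53000000u;   // high 16 bits + 2^39 exponent
    float FLo, FHi;
    std::memcpy(&FLo, &Lo, sizeof FLo);
    std::memcpy(&FHi, &Hi, sizeof FHi);
    FHi -= 0x1.0p39f + 0x1.0p23f;                // strip both bias terms
    return FHi + FLo;                            // single rounding of the sum
  }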
20199 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20219 bool IsStrict = Op->isStrictFPOpcode();
20225 MVT DstVT = Op->getSimpleValueType(0);
20249 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20253 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20267 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20268 // infinity. It produces -0.0, so disable under strictfp.
20272 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20281 // Make a 64-bit buffer, and use it to build an FILD.
20283 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20304 // Bitcasting to f64 here allows us to do a single 64-bit store from
20306 // with two 32-bit stores.
20330 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20345 // TODO: Are there any fast-math-flags to propagate here?
20380 bool IsStrict = Op->isStrictFPOpcode();
20396 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20402 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20412 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20433 // FistSrc = (Value - FltOfs);
20434 // Fist-to-mem64 FistSrc
20435 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20497 // FIXME This causes a redundant load/store if the SSE-class value is already
20564 // v8i16 -> v8i32
20565 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20566 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20569 // v4i32 -> v4i64
20570 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20571 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20577 // Short-circuit if we can determine that each 128-bit half is the same value.
20580 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20609 MVT VT = Op->getSimpleValueType(0);
20610 SDValue In = Op->getOperand(0);
20620 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20633 // Widen to 512-bits if VLX is not supported.
20654 // Extract back to 128/256-bit if we widened.
20678 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20679 /// within each 128-bit lane.
20711 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20719 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20720 // On pre-AVX512, pack the src in both halves to help value tracking.
20749 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20757 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20758 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20764 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20775 // If 512bit -> 128bit truncate another stage.
20781 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20784 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20799 /// e.g. trunc <8 x i32> X to <8 x i16> -->
20845 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20846 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20853 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20870 // Pre-SSE41 we can only use PACKUSWB.
20873 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20878 // Truncate with PACKSS if we are truncating a vector with sign-bits
20883 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20891 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20900 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20904 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20911 /// This function lowers a vector truncation of 'extended sign-bits' or
20912 /// 'extended zero-bits' values.
20979 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20980 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20988 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21007 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21031 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21043 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21056 // We either have 8 elements or we're allowed to use 512-bit vectors.
21063 ShiftInx = InVT.getScalarSizeInBits() - 1;
21093 // truncate the remainder. We'd rather produce two 64-bit results and
21106 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21110 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21113 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21128 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21140 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21151 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21153 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21170 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21174 -1, -1, -1, -1, -1, -1, -1, -1,
21176 -1, -1, -1, -1, -1, -1, -1, -1 };
21181 static const int ShufMask2[] = {0, 2, -1, -1};
21196 llvm_unreachable("All 256->128 cases should have been handled above!");
21206 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21209 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
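A branchy scalar sketch of that Small/Big split (illustrative only; as the surrounding lines suggest, the vector lowering computes both halves and blends on the sign bit rather than branching):

  #include <cstdint>

  static uint32_t fpToU32Sketch(double X) {
    if (X < 2147483648.0)                      // "Small": fits the signed range
      return static_cast<uint32_t>(static_cast<int32_t>(X));
    // "Big": bias into the signed range, convert, then add the bias back.
    return static_cast<uint32_t>(static_cast<int32_t>(X - 2147483648.0)) +
           0x80000000u;
  }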
21223 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21232 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21238 bool IsStrict = Op->isStrictFPOpcode();
21241 MVT VT = Op->getSimpleValueType(0);
21243 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21272 // Widen to 512-bits.
21278 // TODO: Should we just do this for non-strict as well?
21341 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21365 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21372 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21382 // TODO: Should we just do this for non-strict as well?
21404 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21412 // TODO: Should we just do this for non-strict as well?
21436 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21458 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21496 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21497 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21513 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21524 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21618 EVT DstVT = N->getValueType(0);
21619 SDValue Src = N->getOperand(0);
21637 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21666 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21669 SDValue Src = Node->getOperand(0);
21676 EVT DstVT = Node->getValueType(0);
21684 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21697 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21710 // floating-point values.
21813 bool IsStrict = Op->isStrictFPOpcode();
21821 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21852 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21914 {Op->getOperand(0), Res});
21926 {Op->getOperand(0), Res});
21931 bool IsStrict = Op->isStrictFPOpcode();
22004 // FIXME: Should we use zeros for upper elements for non-strict?
22023 bool IsStrict = Op->isStrictFPOpcode();
22052 bool IsStrict = Op->isStrictFPOpcode();
22068 // FIXME: Should we use zeros for upper elements for non-strict?
22136 // clang-format off
22143 // clang-format on
22162 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22163 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22170 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22171 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22172 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22173 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22222 for (SDNode *User : Op->users())
22223 if (User->getOpcode() == ISD::FNEG)
22235 // decide if we should generate a 16-byte constant mask when we only need 4 or
22239 // generate a 16-byte vector constant and logic op even for the scalar case.
22240 // Using a 16-byte mask allows folding the load of the mask with
22266 // For the scalar case extend to a 128-bit vector, perform the logic op,
22298 // Perform all scalar logic operations as 16-byte vectors because there are no
22327 APFloat APF = Op0CN->getValueAPF();
22365 // instruction. Since the shift amount is in-range-or-undefined, we know
22376 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22390 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22449 /// Try to map a 128-bit or larger integer comparison to vector instructions
22466 // logically-combined vector-sized operands compared to zero. This pattern may
22483 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22484 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22525 auto ScalarToVector = [&](SDValue X) -> SDValue {
22553 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22586 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22587 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22589 "Non 128-bit vector on pre-SSE41 target");
22600 /// are supported when the pointer SrcMask is non-null.
22601 /// TODO - move this to SelectionDAG?
22619 if (I->getOpcode() == unsigned(BinOp)) {
22620 Opnds.push_back(I->getOperand(0));
22621 Opnds.push_back(I->getOperand(1));
22622 // Re-evaluate the number of nodes to be traversed.
22627 // Quit if we encounter a non-EXTRACT_VECTOR_ELT node.
22628 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22632 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22636 SDValue Src = I->getOperand(0);
22641 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22650 unsigned CIdx = Idx->getZExtValue();
22651 if (M->second[CIdx])
22653 M->second.setBit(CIdx);
22659 SrcMask->push_back(SrcOpMap[SrcOp]);
22682 // Quit if not convertible to legal scalar or 128/256-bit vector.
22686 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22703 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22726 // Without PTEST, a masked v2i64 or-reduction is not faster than
22733 // Split down to 128/256/512-bit vector.
22750 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22759 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22806 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22817 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22832 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22835 // Check whether we're masking/truncating an OR-reduction result, in which
22850 Mask = Cst->getAPIntValue();
22861 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22869 // Quit if not splittable to scalar/128/256/512-bit vector.
22875 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22891 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22911 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22916 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22925 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22947 for (SDUse &Use : Op->uses()) {
22950 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22952 UOpNo = User->use_begin()->getOperandNo();
22953 User = User->use_begin()->getUser();
22956 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22957 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22963 // Transform to an x86-specific ALU node with flags if there is a chance of
22967 for (SDNode *U : Op->users())
22968 if (U->getOpcode() != ISD::CopyToReg &&
22969 U->getOpcode() != ISD::SETCC &&
22970 U->getOpcode() != ISD::STORE)
22996 switch (Op->getOpcode()) {
23001 if (Op.getNode()->getFlags().hasNoSignedWrap())
23026 // non-casted variable when we check for possible users.
23042 // Otherwise use a regular EFLAGS-setting instruction.
23044 // clang-format off
23051 // clang-format on
23066 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23067 Op->getOperand(1)).getValue(1);
23079 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23108 // Don't do this if the immediate can fit in 8-bits.
23109 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23110 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23132 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
23142 // 0-x == y --> x+y == 0
23143 // 0-x != y --> x+y != 0
23151 // x == 0-y --> x+y == 0
23152 // x != 0-y --> x+y != 0
23173 if (N->getOpcode() == ISD::FDIV)
23176 EVT FPVT = N->getValueType(0);
23179 // This indicates a non-free bitcast.
23181 // integer vector anyways for the int->fp cast.
23206 /// The minimum architected relative accuracy is 2^-12. We need one
23207 /// Newton-Raphson step to have a good float result (24 bits of precision).
23216 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23217 // It is likely not profitable to do this for f64 because a double-precision
23233 // There is no FSQRT for 512-bits, but there is RSQRT14.
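The single refinement step mentioned above is the standard Newton-Raphson iteration for 1/sqrt(a); a scalar sketch (illustrative only):

  static float refineRsqrt(float A, float Est /* ~12-bit rsqrt estimate */) {
    return Est * (1.5f - 0.5f * A * Est * Est);  // roughly doubles precision
  }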
23260 /// The minimum architected relative accuracy is 2^-12. We need one
23261 /// Newton-Raphson step to have a good float result (24 bits of precision).
23268 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23269 // It is likely not profitable to do this for f64 because a double-precision
23281 // real-world code. These defaults are intended to match GCC behavior.
23288 // There is no FSQRT for 512-bits, but there is RCP14.
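Likewise for the reciprocal estimate, one Newton-Raphson step (scalar sketch, illustrative only):

  static float refineRcp(float A, float Est /* ~12-bit rcp estimate */) {
    return Est * (2.0f - A * Est);               // roughly doubles precision
  }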
23326 if (isIntDivCheap(N->getValueType(0), Attr))
23338 EVT VT = N->getValueType(0);
23344 // If the divisor is 2 or -2, the default expansion is better.
23346 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23375 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23383 uint64_t AndRHSVal = AndRHS->getZExtValue();
23421 // Check if pre-AVX condcode can be performed by a single FCMP op.
23426 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23434 // 0 - EQ
23435 // 1 - LT
23436 // 2 - LE
23437 // 3 - UNORD
23438 // 4 - NEQ
23439 // 5 - NLT
23440 // 6 - NLE
23441 // 7 - ORD
23443 // clang-format off
23465 // clang-format on
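For orientation, the 3-bit predicate space enumerated above written out as a hypothetical enum (these names are illustrative, not the backend's):

  enum SSEFPCmpImm : unsigned {
    CMP_EQ    = 0, // equal, ordered
    CMP_LT    = 1, // less-than, ordered
    CMP_LE    = 2, // less-or-equal, ordered
    CMP_UNORD = 3, // unordered (at least one operand is NaN)
    CMP_NEQ   = 4, // not-equal, unordered
    CMP_NLT   = 5, // not-less-than, unordered
    CMP_NLE   = 6, // not-less-or-equal, unordered
    CMP_ORD   = 7  // ordered (neither operand is NaN)
  };

Greater-than style predicates have no direct encoding here, so they are typically obtained by swapping the operands and using CMP_LT/CMP_LE.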
23489 /// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23522 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23549 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23550 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23554 const APInt &EltC = Elt->getAPIntValue();
23561 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23590 // Only do this pre-AVX since vpcmp* is no longer destructive.
23604 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23613 // Psubus is better than flip-sign because it requires no inversion.
23633 MVT VT = Op->getSimpleValueType(0);
23634 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23648 // Break 256-bit FP vector compare into smaller ones.
23652 // Break 512-bit FP vector compare into smaller ones.
23678 // compare like we do for non-strict, we might trigger spurious exceptions
23693 // floating-point vector result that matches the operand type. This allows
23721 SignalCmp->setFlags(Op->getFlags());
23812 // The non-AVX512 code below works under the assumption that source and
23819 // In AVX-512 architecture setcc returns mask with i1 elements,
23831 // clang-format off
23843 // clang-format on
23854 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23866 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23870 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23872 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23878 DAG.getConstant(BitWidth - 1, dl, VT));
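A scalar sanity check of this fold, using the same shift amounts as the code above (32-bit case; illustrative only, and relies on arithmetic right shift of signed values, which x86 targets provide):

  #include <cstdint>

  // For a power-of-two C = 1 << Log2C:  (X & C) == C  ?  -1 : 0
  static int32_t testBitSplat(uint32_t X, unsigned Log2C) {
    unsigned ShiftAmt = 32 - Log2C - 1;                // BitWidth - log2(C) - 1
    return static_cast<int32_t>(X << ShiftAmt) >> 31;  // SHL, then SRA(BW - 1)
  }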
23883 // Break 256-bit integer vector compare into smaller ones.
23887 // Break 512-bit integer vector compare into smaller ones.
23893 // not-of-PCMPEQ:
23894 // X != INT_MIN --> X >s INT_MIN
23895 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23896 // +X != 0 --> +X >s 0
23908 // If both operands are known non-negative, then an unsigned compare is the
23923 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23931 // X < C --> X <= (C-1) --> X == umin(X, C-1)
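As an intrinsics-level illustration of the umax rewrite above for unsigned bytes (SSE2 has no unsigned pcmpgt; assumes the caller already formed the C+1 splat):

  #include <emmintrin.h>

  // X >u C  <=>  umax(X, C + 1) == X, element-wise for unsigned bytes.
  static __m128i cmpGtEpu8(__m128i X, __m128i CPlus1) {
    return _mm_cmpeq_epi8(_mm_max_epu8(X, CPlus1), X);
  }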
23941 // clang-format off
23947 // clang-format on
23953 // If the logical-not of the result is required, perform that now.
24009 // If the i64 elements are sign-extended enough to be representable as i32
24069 // Make sure the lower and upper halves are both all-ones.
24093 // If the logical-not of the result is required, perform that now.
24175 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24209 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24220 // (seteq (add X, -1), -1). Similar for setne.
24249 MVT VT = Op->getSimpleValueType(0);
24253 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24259 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24281 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24284 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24287 // encoding size - so it must either already be a i8 or i32 immediate, or it
24292 const APInt &Op1Val = Op1C->getAPIntValue();
24352 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24425 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24426 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24451 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24477 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24480 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24484 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2)))
24537 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24538 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24539 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24540 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24541 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24542 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24543 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24550 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24563 // 'X - 1' sets the carry flag if X == 0.
24564 // '0 - X' sets the carry flag if X != 0.
24565 // Convert the carry flag to a -1/0 mask with sbb:
24566 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24567 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24568 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24569 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
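A scalar sketch of the first of these rewrites (illustrative only; the borrow that sbb consumes is modelled explicitly here):

  #include <cstdint>

  // select (X != 0), -1, Y  -->  0 - X sets CF iff X != 0; sbb builds the mask.
  static uint64_t selectAllOnesOrY(uint64_t X, uint64_t Y) {
    uint64_t Borrow = (X != 0) ? 1 : 0;  // carry flag of the 0 - X subtraction
    uint64_t Mask = 0 - Borrow;          // sbb reg, reg  ->  0 or ~0
    return Mask | Y;
  }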
24607 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24611 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24679 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24680 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24681 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24682 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24683 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24684 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24685 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24686 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
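The last two entries can be checked with a short scalar sketch (32-bit case; relies on arithmetic right shift of negative values, which x86 provides):

  #include <cstdint>

  static int32_t selNegOrZero(int32_t X) { return (X >> 31) & X; }   // x < 0 ? x : 0
  static int32_t selPosOrZero(int32_t X) { return ~(X >> 31) & X; }  // x > 0 ? x : 0

The x > 0 form also happens to be correct for x == 0, since both sides are zero there.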
24694 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24695 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24711 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24714 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24719 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24720 unsigned ShCt = VT.getSizeInBits() - 1;
24736 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
24753 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24793 // a < b ? -1 : 0 -> RES = ~setcc_carry
24794 // a < b ? 0 : -1 -> RES = setcc_carry
24795 // a >= b ? -1 : 0 -> RES = setcc_carry
24796 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24798 unsigned CondCode = CC->getAsZExtVal();
24846 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24852 MVT VT = Op->getSimpleValueType(0);
24853 SDValue In = Op->getOperand(0);
24869 // Widen to 512-bits if VLX is not supported.
24896 // Extract back to 128/256-bit if we widened.
24906 SDValue In = Op->getOperand(0);
24919 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24924 SDValue In = Op->getOperand(0);
24925 MVT VT = Op->getSimpleValueType(0);
24945 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24946 // For 512-bit vectors, we need 128-bits or 256-bits.
24949 // at least 128-bits.
24955 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24957 // need to be handled here for 256/512-bit results.
24959 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24972 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24974 assert(VT.is256BitVector() && "256-bit vector expected");
24994 // If the source elements are already all-signbits, we don't need to extend,
25006 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25023 Mask[i * Scale + (Scale - 1)] = i;
25028 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25046 MVT VT = Op->getSimpleValueType(0);
25047 SDValue In = Op->getOperand(0);
25079 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25080 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25086 SmallVector<int,8> ShufMask(NumElems, -1);
25096 /// Change a vector store into a pair of half-size vector stores.
25098 SDValue StoredVal = Store->getValue();
25101 "Expecting 256/512-bit op");
25108 if (!Store->isSimple())
25115 SDValue Ptr0 = Store->getBasePtr();
25119 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25120 Store->getOriginalAlign(),
25121 Store->getMemOperand()->getFlags());
25122 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25123 Store->getPointerInfo().getWithOffset(HalfOffset),
25124 Store->getOriginalAlign(),
25125 Store->getMemOperand()->getFlags());
25133 SDValue StoredVal = Store->getValue();
25135 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25141 if (!Store->isSimple())
25152 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25156 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25157 Store->getPointerInfo().getWithOffset(Offset),
25158 Store->getOriginalAlign(),
25159 Store->getMemOperand()->getFlags());
25169 SDValue StoredVal = St->getValue();
25176 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25191 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25192 St->getPointerInfo(), St->getOriginalAlign(),
25193 St->getMemOperand()->getFlags());
25196 if (St->isTruncatingStore())
25199 // If this is a 256-bit store of concatenated ops, we are better off splitting
25200 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25226 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25234 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25235 St->getPointerInfo(), St->getOriginalAlign(),
25236 St->getMemOperand()->getFlags());
25240 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25242 St->getMemOperand());
25264 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25269 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25270 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25271 Ld->getMemOperand()->getFlags());
25274 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25310 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25327 Overflow, Op->getFlags());
25334 EFLAGS, Op->getFlags());
25341 // have a fall-through edge, because this requires an explicit
25343 if (Op.getNode()->hasOneUse()) {
25344 SDNode *User = *Op.getNode()->user_begin();
25348 if (User->getOpcode() == ISD::BR) {
25349 SDValue FalseBB = User->getOperand(1);
25351 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25360 CCVal, Cmp, Op->getFlags());
25363 Cmp, Op->getFlags());
25373 Cmp, Op->getFlags());
25376 Cmp, Op->getFlags());
25383 Cmp, Op->getFlags());
25394 Overflow, Op->getFlags());
25414 Op->getFlags());
25437 EVT VT = Node->getValueType(0);
25467 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25487 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25490 Register SPReg = RegInfo->getStackRegister();
25497 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25515 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25522 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25528 // gp_offset (0 - 6 * 8)
25529 // fp_offset (48 - 48 + 8 * 16)
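// Illustrative only: the SysV x86-64 va_list these offsets index into can be
// modeled by the C++ struct below (field names follow the psABI; the lowering
// itself just stores the raw i32/pointer fields described above).
struct X86_64VaListModel {
  unsigned gp_offset;      // next unused GP register slot, 0..48 in 8-byte steps
  unsigned fp_offset;      // next unused XMM register slot, 48..176 in 16-byte steps
  void *overflow_arg_area; // stack-passed (overflow) arguments
  void *reg_save_area;     // register save area spilled in the prologue
};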
25537 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25545 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25551 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25559 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25569 "LowerVAARG only handles 64-bit va_arg!");
25579 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25583 EVT ArgVT = Op.getNode()->getValueType(0);
25627 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25629 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25638 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25639 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25688 ShiftAmt = ElementType.getSizeInBits() - 1;
25694 && "Unknown target vector shift-by-constant node");
25734 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25739 // Peek through any zext node if we can get back to a 128-bit source.
25750 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25756 // If the shift amount has come from a scalar, then zero-extend the scalar
25765 // then we can zero-extend it by setting all the other mask elements to
25780 // Extract if the shift amount vector is larger than 128-bits.
25786 // Zero-extend bottom element to v2i64 vector type, either by extension or
25797 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25806 // Change opcode to non-immediate version.
25809 // The return type has to be a 128-bit type with the same element
25886 if (MaskConst->getZExtValue() & 0x1)
25907 if (!Fn->hasPersonalityFn())
25910 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25912 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25918 "can only recover FP for 32-bit MSVC EH personality functions");
25925 /// RegNodeBase = EntryEBP - RegNodeSize
25926 /// ParentFP = RegNodeBase - ParentFrameOffset
25940 if (!Fn->hasPersonalityFn())
25946 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25958 // RegNodeBase = EntryEBP - RegNodeSize
25959 // ParentFP = RegNodeBase - ParentFrameOffset
25970 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25976 unsigned RC = C->getZExtValue();
25990 RC = C->getZExtValue();
26010 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26013 switch(IntrData->Type) {
26016 // First, we check if the intrinsic may have non-default rounding mode,
26017 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26018 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26037 Opc = IntrData->Opc0;
26039 Opc = IntrData->Opc1;
26049 // First, we check if the intrinsic may have non-default rounding mode,
26050 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26051 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26063 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26071 Opc = IntrData->Opc0;
26073 Opc = IntrData->Opc1;
26086 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26088 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26092 // First, we check if the intrinsic may have non-default rounding mode,
26093 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26094 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26106 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26110 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26113 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26116 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26125 // - RC Opcode is specified and
26126 // - RC is not "current direction".
26127 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26140 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26151 Opc = IntrData->Opc0;
26153 Opc = IntrData->Opc1;
26165 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26167 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26168 // (2) With rounding mode and sae - 7 operands.
26182 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26190 unsigned Opc = IntrData->Opc0;
26212 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26214 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26229 Opc = IntrData->Opc0;
26231 Opc = IntrData->Opc1;
26244 if (IntrData->Opc1 != 0) {
26248 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26254 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26263 unsigned Opc = IntrData->Opc0;
26264 if (IntrData->Opc1 != 0) {
26267 Opc = IntrData->Opc1;
26284 Opc = IntrData->Opc0;
26286 Opc = IntrData->Opc1;
26300 unsigned Opc = IntrData->Opc0;
26301 if (IntrData->Opc1 != 0) {
26304 Opc = IntrData->Opc1;
26320 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26327 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26338 if (IntrData->Type == CFMA_OP_MASKZ)
26342 // - RC Opcode is specified and
26343 // - RC is not "current direction".
26345 if (IntrData->Opc1 != 0) {
26349 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26355 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26356 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26357 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26364 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26370 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26386 // First, we check if the intrinsic may have non-default rounding mode,
26387 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26388 if (IntrData->Opc1 != 0) {
26391 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26397 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26407 if (IntrData->Opc1 != 0) {
26410 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26416 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26428 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26443 auto ComiOpCode = IntrData->Opc0;
26514 // Catch shift-by-constant.
26516 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26518 CShAmt->getZExtValue(), DAG);
26521 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26535 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26545 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26549 unsigned Opc = IntrData->Opc0;
26550 if (IntrData->Opc1 != 0) {
26553 Opc = IntrData->Opc1;
26566 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26571 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26575 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26580 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26584 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26589 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26594 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26601 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26606 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26622 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26627 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26637 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
26642 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26652 unsigned Opc = IntrData->Opc0;
26666 Opc = IntrData->Opc1;
26678 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26684 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26840 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26855 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26868 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26888 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26890 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26893 // supported on 32-bit Windows, which isn't PIC.
26902 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26915 if (RegInfo->hasBasePointer(MF))
26916 Reg = RegInfo->getBaseRegister();
26918 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26920 Reg = RegInfo->getPtrSizedStackRegister(MF);
26922 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26939 Op->getOperand(1), Op->getOperand(2));
26960 // to 8-bits which may make it no longer out of bounds.
26961 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27012 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27034 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27052 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27067 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27090 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27104 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27121 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27135 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27152 /// Returns a Glue value which can be used to add extra copy-from-reg if the
27161 SDValue Chain = N->getOperand(0);
27165 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27166 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27191 // Merge the two 32-bit values into a 64-bit one.
27199 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27214 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27215 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27216 // and the EAX register is loaded with the low-order 32 bits.
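// A scalar sketch of the EDX:EAX merge described above (not the DAG nodes
// built here; just the value the buildpair/shift+or sequence produces):
static inline unsigned long long mergeEDXEAX(unsigned EAX, unsigned EDX) {
  return ((unsigned long long)EDX << 32) | EAX; // EDX = high half, EAX = low half
}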
27252 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27270 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27303 // 64-bit targets support extended Swift async frame setup,
27305 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27321 X86FI->setHasSwiftAsyncContext(true);
27322 SDValue Chain = Op->getOperand(0);
27329 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27334 if (!X86FI->getSwiftAsyncContextFrameIdx())
27335 X86FI->setSwiftAsyncContextFrameIdx(
27339 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27342 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27343 Op->getOperand(0));
27390 SDValue Chain = Op->getOperand(0);
27409 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27410 Op->getOperand(3), Op->getOperand(4));
27412 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27433 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27462 MachineMemOperand *MMO = MemIntr->getMemOperand();
27463 EVT MemVT = MemIntr->getMemoryVT();
27469 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27500 MachineMemOperand *MMO = MemIntr->getMemOperand();
27501 EVT MemVT = MemIntr->getMemoryVT();
27510 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27523 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27535 X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27598 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27604 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27618 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27624 unsigned Imm = Op2->getAsZExtVal();
27628 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27638 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27640 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27678 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27679 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27713 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27725 switch(IntrData->Type) {
27730 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27731 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27735 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27736 DAG.getConstant(1, dl, Op->getValueType(1)),
27739 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27752 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27774 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27781 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27793 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27809 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27815 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27816 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27819 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27820 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27834 EVT MemVT = MemIntr->getMemoryVT();
27836 uint16_t TruncationOp = IntrData->Opc0;
27841 MemIntr->getMemOperand());
27848 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27856 MemIntr->getMemOperand(), DAG);
27862 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27881 unsigned Depth = Op.getConstantOperandVal(0);
27885 if (Depth > 0) {
27888 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27915 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27916 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27919 int FrameAddrIndex = FuncInfo->getFAIndex();
27922 unsigned SlotSize = RegInfo->getSlotSize();
27925 FuncInfo->setFAIndex(FrameAddrIndex);
27931 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27933 unsigned Depth = Op.getConstantOperandVal(0);
27938 while (Depth--)
27966 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27982 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28013 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28021 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28042 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28075 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28081 // Large code-model.
28082 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28085 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28086 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28130 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28131 CallingConv::ID CC = Func->getCallingConv();
28144 FunctionType *FTy = Func->getFunctionType();
28145 const AttributeList &Attrs = Func->getAttributes();
28147 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28151 for (FunctionType::param_iterator I = FTy->param_begin(),
28152 E = FTy->param_end(); I != E; ++I, ++Idx)
28160 report_fatal_error("Nest register in use - reduce number of inreg"
28186 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28218 01 Round to -inf
28223 -1 Undefined
28227 3 Round to -inf
28229 To perform the conversion, we use a packed lookup table of the four 2-bit
28231 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
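// Scalar model of the packed-table trick above: indexing 0x2d two bits at a
// time maps the hardware rounding-control field RC to the FLT_ROUNDS value
// (00 nearest -> 1, 01 -inf -> 3, 10 +inf -> 2, 11 zero -> 0). Illustrative
// only; the lowering emits the equivalent shift/and DAG nodes.
static inline unsigned rcToFltRounds(unsigned RC) {
  return (0x2d >> (2 * (RC & 0x3))) & 0x3;
}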
28280 SDValue Chain = Op.getNode()->getOperand(0);
28303 SDValue NewRM = Op.getNode()->getOperand(1);
28306 uint64_t RM = CVal->getZExtValue();
28309 // clang-format off
28316 // clang-format on
28321 // 0 Round to 0 -> 11
28322 // 1 Round to nearest -> 00
28323 // 2 Round to +inf -> 10
28324 // 3 Round to -inf -> 01
28325 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
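// One scalar way to realize the mapping tabulated above (the lowering may
// compute the field differently): pack {11,00,10,01} into 0x63, index it with
// the FLT_ROUNDS-style value RM, then place the field at bits 11:10.
static inline unsigned rmToRoundingControlField(unsigned RM) {
  return ((0x63 >> (2 * (RM & 0x3))) & 0x3) << 10;
}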
28398 SDValue Chain = Op->getOperand(0);
28399 SDValue Ptr = Op->getOperand(1);
28401 EVT MemVT = Node->getMemoryVT();
28403 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28414 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28464 SDValue Chain = Op->getOperand(0);
28465 SDValue Ptr = Op->getOperand(1);
28467 EVT MemVT = Node->getMemoryVT();
28469 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28477 SDValue Chain = Op.getNode()->getOperand(0);
28483 // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
28492 // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
28520 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28522 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28524 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
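// The rotate immediates above are simply the OR of the two complementary
// shift matrices, mirroring the usual scalar identity (an illustrative
// sketch, valid for 0 < Amt < 8):
static inline unsigned char rotl8Model(unsigned char X, unsigned Amt) {
  return (unsigned char)((X << Amt) | (X >> (8 - Amt)));
}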
28573 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28587 // Per-nibble leading zero PSHUFB lookup table.
28667 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28671 // Decompose 256-bit ops into smaller 128-bit ops.
28675 // Decompose 512-bit ops into smaller 256-bit ops.
28705 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
28714 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28720 // Finally xor with NumBits-1.
28722 DAG.getConstant(NumBits - 1, dl, OpVT));
28773 "Only handle AVX 256-bit vector integer operation");
28799 // Handle a special-case with a bit-hack instead of cmp+select:
28800 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28805 if (C && C->getAPIntValue().isSignMask()) {
28806 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28807 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28814 // usubsat X, Y --> (X >u Y) ? X - Y : 0
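// Scalar sketches of both forms above for an 8-bit lane (SMIN == 0x80),
// assuming the usual arithmetic right shift on signed values:
static inline unsigned char usubsatSignMask8(unsigned char X) {
  unsigned char Mask = (unsigned char)((signed char)X >> 7); // X s>> (BW-1)
  return (unsigned char)((X ^ 0x80) & Mask); // == (X >= 0x80) ? X - 0x80 : 0
}
static inline unsigned char usubsat8(unsigned char X, unsigned char Y) {
  return X > Y ? (unsigned char)(X - Y) : 0; // generic compare+select form
}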
28853 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28854 // 8-bit integer abs to NEG and CMOV.
28863 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28872 "Only handle AVX 256-bit vector integer operation");
28936 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
28959 // Num xNaN +0 -0
28960 // --------------- ---------------
28962 // X --------------- X ---------------
28963 // xNaN | X | X/Y | -0 | +0 | -0 |
28964 // --------------- ---------------
28975 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28977 return CstOp->getAPIntValue() == Zero;
28978 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28979 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28980 for (const SDValue &OpVal : Op->op_values()) {
28986 if (!CstOp->getValueAPF().isZero())
28988 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28999 Op->getFlags().hasNoSignedZeros() ||
29012 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29031 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29062 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29070 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29100 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29101 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29113 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29114 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
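// Scalar sketches of the two expansions above for one 8-bit element; both
// compute the same absolute difference:
static inline unsigned char abdu8Select(unsigned char L, unsigned char R) {
  return L < R ? (unsigned char)(R - L) : (unsigned char)(L - R);
}
static inline unsigned char abds8Widen(signed char L, signed char R) {
  int D = (int)L - (int)R;                // sext to a wider type; no overflow
  return (unsigned char)(D < 0 ? -D : D); // abs, then truncate back to 8 bits
}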
29139 // Decompose 256-bit ops into 128-bit ops.
29149 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29175 for (auto [Idx, Val] : enumerate(B->ops())) {
29197 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29319 // and use pmullw to calculate the full 16-bit product.
29322 // pmulhw to calculate the full 16-bit product. This trick means we don't
29389 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29394 // Decompose 256-bit ops into 128-bit ops.
29418 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29419 9, -1, 11, -1, 13, -1, 15, -1};
29474 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29501 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29504 EVT OvfVT = Op->getValueType(1);
29613 // UMULO overflows if the high bits are non-zero.
29629 if (isa<ConstantSDNode>(Op->getOperand(1))) {
29637 switch (Op->getOpcode()) {
29638 // clang-format off
29644 // clang-format on
29652 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29653 EVT ArgVT = Op->getOperand(i).getValueType();
29657 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29662 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29692 bool IsStrict = Op->isStrictFPOpcode();
29701 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29702 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29725 bool IsStrict = Op->isStrictFPOpcode();
29734 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29735 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29747 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29758 // Return true if the required (according to Opcode) shift-imm form is natively
29787 // These instructions are defined together with shift-immediate.
29794 // Return true if the required (according to Opcode) variable-shift form is
29810 // vXi16 supported only on AVX-512, BWI
29849 ShiftAmt - 32, DAG);
29885 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29887 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29905 // If we're logical shifting an all-signbits value then we can just perform as
29921 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29957 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29966 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29993 int BaseShAmtIdx = -1;
29999 // vXi8 shifts - shift as v8i16 + mask result.
30010 // Create the mask using vXi16 shifts. For shift-rights we need to move
30030 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30143 // XOP has 128-bit variable logical/arithmetic shifts.
30144 // +ve/-ve Amt = shift left/right.
30155 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30156 // shifts per-lane and then shuffle the partial results back together.
30171 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30173 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30196 unsigned AmtA = UniqueCstAmt.begin()->first;
30197 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30198 const APInt &MaskA = UniqueCstAmt.begin()->second;
30199 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30234 SmallVector<SDValue, 32> AmtWideElts(Amt->op_begin(), Amt->op_end());
30238 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30265 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30266 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30302 // have vandps but that is an FP instruction and crossing FP<->int typically
30325 // FullMask = (1 << EltSizeInBits) - 1
30344 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30345 // are `EltSizeInBits-AmtWide` bits wide.
30347 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30349 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30351 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30352 // (Masked ^ SignBitMask) - SignBitMask
30355 // Masked + SignBitMask - SignBitMask
30358 // so sign extending should be a no-op.
30361 // Masked - SignBitmask - SignBitMask
30363 // This is equal to Masked - 2*SignBitMask which will correctly sign
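// The xor/subtract sequence above is the standard sign-extension-from-N-bits
// identity. Scalar sketch, assuming Masked holds an N-bit value whose upper
// bits are already zero (two's-complement wraparound does the rest):
static inline int signExtendFromBits(unsigned Masked, unsigned N) {
  unsigned SignBitMask = 1u << (N - 1);
  return (int)((Masked ^ SignBitMask) - SignBitMask);
}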
30386 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30400 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30402 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30425 // immediate shifts, else we need to zero-extend each lane to the lower i64
30438 // just zero-extending, but for SSE just duplicating the top 16-bits is
30442 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30443 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30444 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30445 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30449 {4, 5, 6, 7, -1, -1, -1, -1});
30466 // TODO - ideally shuffle combining would handle this.
30468 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30469 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30472 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30473 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30477 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30478 // look up the pre-computed shift values.
30512 // NOTE: We honor preferred vector width before promoting to 512-bits.
30603 // On pre-SSE41 targets we test for the sign bit by comparing to
30604 // zero - a negative value will set all bits of the lanes to true
30705 // If we have a constant shift amount, the non-SSE41 path is best as
30721 // On pre-SSE41 targets we splat the sign bit - a negative value will
30768 // Decompose 256-bit shifts into 128-bit shifts.
30814 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
30815 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
30821 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30822 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30831 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30833 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30834 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30855 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30867 // Split 256-bit integers on XOP/pre-AVX2 targets.
30868 // Split 512-bit integers on non 512-bit BWI targets.
30873 // Pre-mask the amount modulo using the wider vector.
30880 int ScalarAmtIdx = -1;
30900 // If per-element shifts are legal, fallback to generic expansion.
30905 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30906 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30922 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30946 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30947 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
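// Scalar sketches of the widened funnel-shift identities above for an 8-bit
// element (bw == 8):
static inline unsigned char fshl8(unsigned char X, unsigned char Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;         // concat X (high) : Y (low)
  return (unsigned char)((Wide << (Z & 7)) >> 8); // take the high 8 bits
}
static inline unsigned char fshr8(unsigned char X, unsigned char Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;
  return (unsigned char)(Wide >> (Z & 7));        // implicit trunc to 8 bits
}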
30950 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31013 // Else, fall-back on VPROLV/VPRORV.
31017 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31046 // Split 256-bit integers on XOP/pre-AVX2 targets.
31050 // XOP has 128-bit vector variable + immediate rotates.
31051 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31055 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31064 // Use general rotate by variable (per-element).
31068 // Rotate by a uniform constant - expand back to shifts.
31073 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31074 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31082 // Split 512-bit integers on non 512-bit BWI targets.
31096 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31100 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31101 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31103 int BaseRotAmtIdx = -1;
31124 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31125 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31126 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31147 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31148 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31177 // On pre-SSE41 targets we test for the sign bit by comparing to
31178 // zero - a negative value will set all bits of the lanes to true
31235 // Fallback for non-constants AVX2 vXi16 as well.
31270 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31271 // that can then be OR'd with the lower 32-bits.
31296 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31308 Type *MemType = SI->getValueOperand()->getType();
31310 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31312 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31316 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31328 Type *MemType = LI->getType();
31330 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31332 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31333 // can use movq to do the load. If we have X87 we can load into an 80-bit
31335 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31339 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31340 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31362 if (isPowerOf2_64(C->getZExtValue()))
31364 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31369 // Check if V is some power of 2 pattern known to be non-zero
31387 if (I->getOpcode() == Instruction::Shl) {
31389 // -X` and some other provable power of 2 patterns that we can use CTZ on
31392 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31393 // be provably a non-zero power of 2.
31396 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31399 if (ShiftVal->equalsInt(1))
31405 Value *BitV = I->getOperand(1);
31409 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31424 if (AI->use_empty())
31427 if (AI->getOperation() == AtomicRMWInst::Xor) {
31428 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31430 if (match(AI->getOperand(1), m_SignMask()))
31436 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31440 Instruction *I = AI->user_back();
31441 auto BitChange = FindSingleBitChange(AI->getValOperand());
31442 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31443 I->getOpcode() != Instruction::And ||
31444 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31445 AI->getParent() != I->getParent())
31448 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31451 if (AI == I->getOperand(OtherIdx))
31456 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31457 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31458 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31461 if (AI->getOperation() == AtomicRMWInst::And) {
31462 return ~C1->getValue() == C2->getValue()
31472 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31484 if (AI->getOperation() == AtomicRMWInst::And)
31500 switch (AI->getOperation()) {
31516 Instruction *I = AI->user_back();
31517 LLVMContext &Ctx = AI->getContext();
31518 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31521 auto BitTested = FindSingleBitChange(AI->getValOperand());
31525 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31527 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31528 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31538 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31540 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31547 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31548 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31550 // If the result is only used for zero/non-zero status then we don't need to
31552 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31554 if (ICmp->isEquality()) {
31555 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31556 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31559 if ((C0 ? C0 : C1)->isZero())
31569 I->replaceAllUsesWith(Result);
31570 I->eraseFromParent();
31571 AI->eraseFromParent();
31576 if (!AI->hasOneUse())
31579 Value *Op = AI->getOperand(1);
31581 Instruction *I = AI->user_back();
31582 AtomicRMWInst::BinOp Opc = AI->getOperation();
31587 if (match(I->user_back(),
31590 if (match(I->user_back(),
31600 if (match(I->user_back(),
31603 if (match(I->user_back(),
31613 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31616 if (match(I->user_back(),
31625 if (match(I->user_back(),
31628 if (match(I->user_back(),
31643 LLVMContext &Ctx = AI->getContext();
31644 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31646 TempI = AI->user_back();
31647 assert(TempI->hasOneUse() && "Must have one use");
31648 ICI = cast<ICmpInst>(TempI->user_back());
31651 ICmpInst::Predicate Pred = ICI->getPredicate();
31669 switch (AI->getOperation()) {
31688 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31691 IID, AI->getType(),
31692 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31694 ICI->replaceAllUsesWith(Result);
31695 ICI->eraseFromParent();
31697 TempI->eraseFromParent();
31698 AI->eraseFromParent();
31704 Type *MemType = AI->getType();
31708 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31713 AtomicRMWInst::BinOp Op = AI->getOperation();
31743 // These always require a non-trivial set of data operations on x86. We must
31752 Type *MemType = AI->getType();
31756 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31762 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31763 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31764 AI->use_empty())
31769 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31770 auto SSID = AI->getSyncScopeID();
31773 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
31776 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31798 // different cache-line to prevent cache-line bouncing. In practice it
31809 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31810 Loaded->setAtomic(Order, SSID);
31811 AI->replaceAllUsesWith(Loaded);
31812 AI->eraseFromParent();
31827 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31838 // c) To minimize concerns about cross thread stack usage - in particular,
31840 // captures state in the TOS frame and accesses it from many threads -
31845 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31849 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31889 // The only fence that needs an instruction is a sequentially-consistent
31890 // cross-thread fence.
31900 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31928 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31938 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31980 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31992 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32113 // http://wm.ite.pl/articles/sse-popcount.html
32116 // index into an in-register pre-computed pop count table. We then split up the
32117 // input vector in two new ones: (1) a vector with only the shifted-right
32120 // to index the in-register table. Next, both are added and the result is a
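// Scalar model of the in-register table lookup described above: split each
// byte into nibbles, look both up in the 16-entry popcount table (the same
// table PSHUFB holds in a vector register), and add the two results.
static inline unsigned char popcnt8NibbleLUT(unsigned char V) {
  static const unsigned char LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4};
  return (unsigned char)(LUT[V & 0xF] + LUT[V >> 4]);
}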
32171 // Decompose 256-bit ops into smaller 128-bit ops.
32175 // Decompose 512-bit ops into smaller 256-bit ops.
32210 unsigned ActiveBits = Known.getBitWidth() - LZ;
32211 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32213 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32225 // i3 CTPOP - perform LUT into i32 integer.
32240 // i4 CTPOP - perform LUT into i64 integer.
32257 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
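// Scalar sketches of the first two special cases above. For i2, x - (x >> 1)
// maps {0,1,2,3} to {0,1,1,2}; for i3, all eight 2-bit counts fit into a
// single packed constant (one possible packing shown; the lowering's exact
// constant may differ):
static inline unsigned ctpop2Model(unsigned X) { // X is a 2-bit value
  return X - (X >> 1);
}
static inline unsigned ctpop3Model(unsigned X) { // X is a 3-bit value
  return (0xE994u >> (2 * X)) & 0x3; // 0xE994 packs {0,1,1,2,1,2,2,3}
}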
32301 // Decompose 256-bit ops into smaller 128-bit ops.
32306 "Only 128-bit vector bitreverse lowering supported.");
32314 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32340 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32344 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32387 // 0-15 value (moved to the other nibble).
32422 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32439 // Xor the high and low 16-bits together using a 32-bit operation.
32448 // Xor the high and low 16-bits together using a 32-bit operation.
32453 // If the input is 16-bits, we need to extend to use an i32 shift below.
32457 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32458 // This should allow an h-reg to be used to save a shift.
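// Scalar sketch of the xor-folding above: the parity of a word equals the
// parity of the xor of its halves, so fold down to one byte. The lowering
// instead lets the final 8-bit XOR set PF and reads the flag; the tail folds
// below stand in for that flag read and return popcount(V) & 1.
static inline bool parity32Model(unsigned V) {
  V ^= V >> 16; // fold 32 -> 16 bits
  V ^= V >> 8;  // fold 16 -> 8 bits; the answer now lives in the low byte
  V &= 0xFF;
  V ^= V >> 4;
  V ^= V >> 2;
  V ^= V >> 1;
  return (V & 1) != 0;
}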
32475 switch (N->getOpcode()) {
32495 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32499 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32500 /*MemVT=*/N->getSimpleValueType(0), MMO);
32503 /// Lower atomic_load_ops into LOCK-prefixed operations.
32507 SDValue Chain = N->getOperand(0);
32508 SDValue LHS = N->getOperand(1);
32509 SDValue RHS = N->getOperand(2);
32510 unsigned Opc = N->getOpcode();
32511 MVT VT = N->getSimpleValueType(0);
32517 if (N->hasAnyUseOfValue(0)) {
32518 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32525 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32539 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32545 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32546 AN->getSyncScopeID() == SyncScope::System) {
32551 assert(!N->hasAnyUseOfValue(0));
32553 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32556 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32558 assert(!N->hasAnyUseOfValue(0));
32560 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32566 assert(!N->hasAnyUseOfValue(0));
32568 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32576 EVT VT = Node->getMemoryVT();
32579 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32594 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32595 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32596 Node->getMemOperand());
32604 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32608 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32610 MVT::i64, Node->getMemOperand());
32612 // First load this into an 80-bit X87 register using a stack temporary.
32615 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32618 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
32628 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32631 StoreOps, MVT::i64, Node->getMemOperand());
32645 // Convert seq_cst store -> xchg
32646 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32647 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32648 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
32649 Node->getOperand(0), Node->getOperand(2),
32650 Node->getOperand(1), Node->getMemOperand());
32656 MVT VT = N->getSimpleValueType(0);
32680 if (N->getValueType(1) == MVT::i1)
32683 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32777 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
32778 Ops.append(WidenNumElts - InNumElts, FillVal);
32790 "MGATHER/MSCATTER are supported on AVX-512 arch only");
32793 SDValue Src = N->getValue();
32798 SDValue Scale = N->getScale();
32799 SDValue Index = N->getIndex();
32800 SDValue Mask = N->getMask();
32801 SDValue Chain = N->getChain();
32802 SDValue BasePtr = N->getBasePtr();
32814 N->getMemoryVT(), N->getMemOperand());
32826 // If we don't have VLX and neither the passthru nor index is 512-bits, we
32830 // Determine how much we need to widen by to get a 512-bit type.
32847 N->getMemoryVT(), N->getMemOperand());
32856 SDValue Mask = N->getMask();
32858 SDValue PassThru = N->getPassThru();
32868 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32869 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32870 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32871 N->isExpandingLoad());
32877 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32878 "Expanding masked load is supported on AVX-512 target only!");
32880 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32881 "Expanding masked load is supported for 32 and 64-bit types only!");
32905 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32906 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32907 N->getExtensionType(), N->isExpandingLoad());
32919 SDValue DataToStore = N->getValue();
32922 SDValue Mask = N->getMask();
32925 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32926 "Compressing masked store is supported on AVX-512 target only!");
32928 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32929 "Compressing masked store is supported for 32 and 64-bit types only!");
32952 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32953 N->getOffset(), Mask, N->getMemoryVT(),
32954 N->getMemOperand(), N->getAddressingMode(),
32955 N->isTruncatingStore(), N->isCompressingStore());
32961 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32966 SDValue Index = N->getIndex();
32967 SDValue Mask = N->getMask();
32968 SDValue PassThru = N->getPassThru();
32977 // If we don't have VLX and neither the passthru nor index is 512-bits, we
32982 // Determine how much we need to widen by to get a 512-bit type.
33001 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33002 N->getScale() };
33004 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33005 N->getMemOperand());
33017 unsigned SrcAS = N->getSrcAddressSpace();
33019 assert(SrcAS != N->getDestAddressSpace() &&
33039 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33044 if (Op->getGluedNode())
33045 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33069 // We don't support non-data prefetch without PREFETCHI.
33079 SDValue Operand = N->getOperand(0);
33107 // sub-string, e.g. "$12" contains "$1"
33109 I = AsmStr.size() - OpNoStr1.size();
33164 // ->
33185 // ->
33200 // clang-format off
33356 // clang-format on
33366 unsigned Opc = N->getOpcode();
33371 N->dump(&DAG);
33375 EVT VT = N->getValueType(0);
33387 EVT VT = N->getValueType(0);
33393 {N->getOperand(0), Lo});
33395 {N->getOperand(0), Hi});
33407 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33410 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33414 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33428 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33430 // Bit count should fit in 32-bits, extract it as that and then zero
33441 EVT VT = N->getValueType(0);
33444 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33447 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33448 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33460 EVT VT = N->getValueType(0);
33465 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33466 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33471 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33483 // UMULO overflows if the high bits are non-zero.
33486 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33499 EVT VT = N->getValueType(0);
33500 EVT InVT = N->getOperand(0).getValueType();
33515 Ops[0] = N->getOperand(0);
33517 Ops[0] = N->getOperand(1);
33531 EVT VT = N->getValueType(0);
33536 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33538 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33542 {N->getOperand(0), LHS, RHS});
33554 EVT VT = N->getValueType(0);
33560 // TODO: Can we do something for non-splat?
33562 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33565 Ops0[0] = N->getOperand(0);
33580 MVT VT = N->getSimpleValueType(0);
33588 SDValue In = N->getOperand(0);
33599 Subtarget, N->getFlags())) {
33613 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33655 -1, -1, -1, -1, -1, -1, -1, -1 });
33678 assert(N->getValueType(0) == MVT::v8i8 &&
33683 EVT VT = N->getValueType(0);
33684 SDValue In = N->getOperand(0);
33691 // Custom split this so we can extend i8/i16->i32 invec. This is better
33692 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33733 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33759 EVT VT = N->getValueType(0);
33760 SDValue Op = N->getOperand(0);
33777 bool IsStrict = N->isStrictFPOpcode();
33779 EVT VT = N->getValueType(0);
33780 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33781 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33820 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33859 {N->getOperand(0), Src});
33918 // legalization to v8i32<-v8f64.
33925 Opc = N->getOpcode();
33931 {N->getOperand(0), Src});
33942 // Custom widen strict v2f32->v2i32 by padding with zeros.
33948 {N->getOperand(0), Src});
33966 // If we use a 128-bit result we might need to use a target specific node.
33985 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34023 bool IsStrict = N->isStrictFPOpcode();
34025 EVT VT = N->getValueType(0);
34026 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34040 {N->getOperand(0), Src});
34057 {N->getOperand(0), Src});
34082 {N->getOperand(0), Elt});
34099 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34114 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34119 {N->getOperand(0), Src});
34134 {N->getOperand(0), Or, VBias});
34141 // TODO: Are there any fast-math-flags to propagate here?
34149 bool IsStrict = N->isStrictFPOpcode();
34150 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34151 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34152 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34154 EVT VT = N->getValueType(0);
34194 assert(N->getValueType(0) == MVT::v2f32 &&
34198 bool IsStrict = N->isStrictFPOpcode();
34199 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34207 {N->getOperand(0), V});
34216 unsigned IntNo = N->getConstantOperandVal(1);
34244 EVT T = N->getValueType(0);
34248 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34252 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34253 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34260 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34265 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34270 // live-range.
34273 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34275 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34282 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34299 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34308 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34316 if (N->getValueType(0) == MVT::i128) {
34318 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34319 Node->getBasePtr(), Node->getMemOperand());
34324 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34333 // Then extract the lower 64-bits.
34336 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34338 MVT::i64, Node->getMemOperand());
34347 // then casts to i64. This avoids a 128-bit stack temporary being
34348 // created by type legalization if we were to cast v4f32->v2i64.
34357 // First load this into an 80-bit X87 register. This will put the whole
34360 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34363 Node->getMemOperand());
34371 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34410 EVT DstVT = N->getValueType(0);
34411 EVT SrcVT = N->getOperand(0).getValueType();
34413 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34414 // we can split using the k-register rather than memory.
34416 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34433 N->getOperand(0));
34442 EVT VT = N->getValueType(0);
34446 SDValue Index = Gather->getIndex();
34452 SDValue Mask = Gather->getMask();
34455 Gather->getPassThru(),
34464 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34465 Gather->getBasePtr(), Index, Gather->getScale() };
34468 Gather->getMemoryVT(), Gather->getMemOperand());
34477 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34479 MVT VT = N->getSimpleValueType(0);
34488 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34489 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34490 Ld->getMemOperand()->getFlags());
34502 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34504 MVT::i64, Ld->getMemOperand());
34515 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34524 assert(N->getSimpleValueType(0) == MVT::f16 &&
34527 SDValue VecOp = N->getOperand(0);
34529 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34531 N->getOperand(1));
35018 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35034 // If lower 4G is not available, then we must use rip-relative addressing.
35065 // These are non-commutative binops.
35101 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35103 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35104 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35109 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35115 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35144 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35145 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35149 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35170 // X86 has 8, 16, and 32-bit zero-extending loads.
35230 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35254 // Very little shuffling can be done for 64-bit vectors right now.
35266 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35287 // zero-extensions.
35294 //===----------------------------------------------------------------------===//
35296 //===----------------------------------------------------------------------===//
35303 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35313 for (MachineBasicBlock *Succ : BB->successors())
35314 if (Succ->isLiveIn(X86::EFLAGS))
35325 const BasicBlock *BB = MBB->getBasicBlock();
35326 MachineFunction::iterator I = ++MBB->getIterator();
35334 // s0 = -1
35344 MachineFunction *MF = MBB->getParent();
35345 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35346 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35347 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35348 MF->insert(I, mainMBB);
35349 MF->insert(I, fallMBB);
35350 MF->insert(I, sinkMBB);
35353 mainMBB->addLiveIn(X86::EFLAGS);
35354 fallMBB->addLiveIn(X86::EFLAGS);
35355 sinkMBB->addLiveIn(X86::EFLAGS);
35359 sinkMBB->splice(sinkMBB->begin(), MBB,
35360 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35361 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35363 MachineRegisterInfo &MRI = MF->getRegInfo();
35373 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35374 thisMBB->addSuccessor(mainMBB);
35375 thisMBB->addSuccessor(fallMBB);
35378 // mainDstReg := -1
35379 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35380 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35381 mainMBB->addSuccessor(sinkMBB);
35387 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35388 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35390 fallMBB->addSuccessor(sinkMBB);
35394 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35405 // Emit va_arg instruction on X86-64.
35407 // Operands to this pseudo-instruction:
35409 // 1-5) Input : va_list address (addr, i64mem)
35413 // 9 ) EFLAGS (implicit-def)
35428 MachineFunction *MF = MBB->getParent();
35436 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35437 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35438 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35439 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35443 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35445 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35504 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35505 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35506 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35507 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35509 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35512 MF->insert(MBBIter, offsetMBB);
35513 MF->insert(MBBIter, overflowMBB);
35514 MF->insert(MBBIter, endMBB);
35517 endMBB->splice(endMBB->begin(), thisMBB,
35518 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35519 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35522 thisMBB->addSuccessor(offsetMBB);
35523 thisMBB->addSuccessor(overflowMBB);
35526 offsetMBB->addSuccessor(endMBB);
35527 overflowMBB->addSuccessor(endMBB);
35531 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35540 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35542 .addImm(MaxOffset + 8 - ArgSizeA8);
35546 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35558 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35568 // Zero-extend the offset
35570 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35576 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35581 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35588 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35593 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
35603 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
35614 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35629 // aligned_addr = (addr + (align-1)) & ~(align-1)
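// Worked instance of the identity in the comment above: for addr = 0x1003 and
// align = 16, (0x1003 + 15) & ~15 == 0x1010. A scalar sketch of what the
// ADD/AND pair below computes (Align must be a power of two):
static inline unsigned long long alignUpModel(unsigned long long Addr,
                                              unsigned long long Align) {
  return (Addr + (Align - 1)) & ~(Align - 1);
}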
35632 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35635 .addImm(Alignment.value() - 1);
35639 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35642 .addImm(~(uint64_t)(Alignment.value() - 1));
35644 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
35649 // (the overflow address should be kept 8-byte aligned)
35653 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35660 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35671 BuildMI(*endMBB, endMBB->begin(), MIMD,
35672 TII->get(X86::PHI), DestReg)
35696 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35700 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35701 // together with other CMOV pseudo-opcodes into a single basic-block with
35746 MachineFunction *MF = TrueMBB->getParent();
35747 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35750 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35753 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35765 Register DestReg = MIIt->getOperand(0).getReg();
35766 Register Op1Reg = MIIt->getOperand(1).getReg();
35767 Register Op2Reg = MIIt->getOperand(2).getReg();
35772 if (MIIt->getOperand(3).getImm() == OppCC)
35776 Op1Reg = It->second.first;
35779 Op2Reg = It->second.second;
35782 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
35827 // because this custom-inserter would have generated:
35874 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35875 MachineFunction *F = ThisMBB->getParent();
35876 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35877 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35878 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35880 MachineFunction::iterator It = ++ThisMBB->getIterator();
35881 F->insert(It, FirstInsertedMBB);
35882 F->insert(It, SecondInsertedMBB);
35883 F->insert(It, SinkMBB);
35888 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35895 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35896 SinkMBB->addLiveIn(X86::EFLAGS);
35900 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35902 ThisMBB->end());
35903 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35906 ThisMBB->addSuccessor(FirstInsertedMBB);
35908 ThisMBB->addSuccessor(SinkMBB);
35910 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35912 FirstInsertedMBB->addSuccessor(SinkMBB);
35914 SecondInsertedMBB->addSuccessor(SinkMBB);
35918 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35922 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35932 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35956 // diamond control-flow pattern. The incoming instruction knows the
35965 // fallthrough --> FalseMBB
35967 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36000 // function - EmitLoweredCascadedSelect.
36014 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36015 (NextMIIt->getOperand(3).getImm() == CC ||
36016 NextMIIt->getOperand(3).getImm() == OppCC)) {
36018 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36024 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36025 NextMIIt->getOpcode() == MI.getOpcode() &&
36026 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36027 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36028 NextMIIt->getOperand(1).isKill()) {
36032 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36033 MachineFunction *F = ThisMBB->getParent();
36034 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36035 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36037 MachineFunction::iterator It = ++ThisMBB->getIterator();
36038 F->insert(It, FalseMBB);
36039 F->insert(It, SinkMBB);
36042 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36043 FalseMBB->setCallFrameSize(CallFrameSize);
36044 SinkMBB->setCallFrameSize(CallFrameSize);
36049 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36051 FalseMBB->addLiveIn(X86::EFLAGS);
36052 SinkMBB->addLiveIn(X86::EFLAGS);
36060 SinkMBB->push_back(MI.removeFromParent());
36063 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36065 ThisMBB->end());
36066 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36069 ThisMBB->addSuccessor(FalseMBB);
36071 ThisMBB->addSuccessor(SinkMBB);
36073 FalseMBB->addSuccessor(SinkMBB);
36076 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36087 ThisMBB->erase(MIItBegin, MIItEnd);
36102 MachineFunction *MF = MBB->getParent();
36106 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36110 MachineRegisterInfo &MRI = MF->getRegInfo();
36111 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36112 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36113 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36115 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36116 MF->insert(MBBIter, testMBB);
36117 MF->insert(MBBIter, blockMBB);
36118 MF->insert(MBBIter, tailMBB);
36129 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36133 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36141 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36145 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36148 testMBB->addSuccessor(blockMBB);
36149 testMBB->addSuccessor(tailMBB);
36155 // + ---- <- ------------ <- ------------- <- ------------ +
36157 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36159 // + <- ----------- <- ------------ <- ----------- <- ------------ +
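// A hedged C-level model of the probe loop sketched above (testMBB compares,
// blockMBB touches a page and moves the stack pointer down, tailMBB finishes);
// the page size, probe store and loop condition are simplified stand-ins:
#include <cstdint>
static void probedStackAlloc(uintptr_t &SP, uintptr_t FinalSP,
                             uintptr_t PageSize) {
  while (SP - FinalSP >= PageSize) {              // testMBB: CMP + JCC
    SP -= PageSize;                               // blockMBB: SUB
    *reinterpret_cast<volatile char *>(SP) = 0;   // blockMBB: probe store
  }
  SP = FinalSP;                                   // tailMBB: final COPY
}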
36165 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36168 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36173 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36174 blockMBB->addSuccessor(testMBB);
36177 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36181 tailMBB->splice(tailMBB->end(), MBB,
36182 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36183 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36184 MBB->addSuccessor(testMBB);
36196 MachineFunction *MF = BB->getParent();
36199 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36201 assert(MF->shouldSplitStack());
36225 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36226 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36227 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36229 MachineRegisterInfo &MRI = MF->getRegInfo();
36231 getRegClassFor(getPointerTy(MF->getDataLayout()));
36241 MachineFunction::iterator MBBIter = ++BB->getIterator();
36243 MF->insert(MBBIter, bumpMBB);
36244 MF->insert(MBBIter, mallocMBB);
36245 MF->insert(MBBIter, continueMBB);
36247 continueMBB->splice(continueMBB->begin(), BB,
36248 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36249 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36253 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36254 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36256 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36259 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36263 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36265 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36267 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36271 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36273 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36275 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36281 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36283 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36289 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36291 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36292 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36299 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36302 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36304 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36307 BB->addSuccessor(bumpMBB);
36308 BB->addSuccessor(mallocMBB);
36309 mallocMBB->addSuccessor(continueMBB);
36310 bumpMBB->addSuccessor(continueMBB);
36313 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36330 MachineFunction *MF = BB->getParent();
36336 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36339 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36346 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36347 assert(BB->succ_size() == 1);
36348 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36349 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36350 BB->addSuccessor(RestoreMBB);
36355 RestoreMBB->setIsEHPad(true);
36357 auto RestoreMBBI = RestoreMBB->begin();
36366 // our load from the relocation, sticking it in either RDI (x86-64)
36369 MachineFunction *F = BB->getParent();
36377 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36381 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36382 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36385 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36392 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36397 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36404 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36409 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36410 .addReg(TII->getGlobalBaseReg(F))
36416 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36445 // aliases and are doing non-trivial configuration of the thunk's body. For
36446 // example, the Linux kernel will do boot-time hot patching of the thunk
36452 // LLVM will generate calls to specific thunks, we merely make a best-effort
36457 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36460 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36463 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36466 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36469 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36477 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36480 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36483 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36486 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36489 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36492 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36499 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36515 // Find an available scratch register to hold the callee. On 64-bit, we can
36517 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36533 // Choose the first remaining non-zero available register.
36547 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36550 MI.setDesc(TII->get(Opc));
36551 MachineInstrBuilder(*BB->getParent(), &MI)
36571 MachineFunction *MF = MBB->getParent();
36573 MachineRegisterInfo &MRI = MF->getRegInfo();
36580 MVT PVT = getPointerTy(MF->getDataLayout());
36584 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36592 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36596 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36613 MachineFunction *MF = MBB->getParent();
36616 MachineRegisterInfo &MRI = MF->getRegInfo();
36618 const BasicBlock *BB = MBB->getBasicBlock();
36619 MachineFunction::iterator I = ++MBB->getIterator();
36631 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36638 MVT PVT = getPointerTy(MF->getDataLayout());
36645 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
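// A rough picture of the buffer this SjLj lowering reads and writes (offsets
// assume a 64-bit pointer type; the field names are illustrative only):
struct SjLjBufSketch {
  void *FramePtr;   // slot 0: frame pointer
  void *ResumeAddr; // slot 1 (LabelOffset): address of restoreMBB stored here
  void *StackPtr;   // slot 2 (SPOffset): stack pointer restored on longjmp
};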
36659 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36660 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36661 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36662 MF->insert(I, mainMBB);
36663 MF->insert(I, sinkMBB);
36664 MF->push_back(restoreMBB);
36665 restoreMBB->setMachineBlockAddressTaken();
36670 sinkMBB->splice(sinkMBB->begin(), MBB,
36671 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36672 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36678 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36687 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
36695 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
36696 .addReg(XII->getGlobalBaseReg(MF))
36705 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
36718 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36723 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
36727 MIB.addRegMask(RegInfo->getNoPreservedMask());
36728 thisMBB->addSuccessor(mainMBB);
36729 thisMBB->addSuccessor(restoreMBB);
36733 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
36734 mainMBB->addSuccessor(sinkMBB);
36737 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
36744 if (RegInfo->hasBasePointer(*MF)) {
36747 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36748 X86FI->setRestoreBasePointer(MF);
36749 Register FramePtr = RegInfo->getFrameRegister(*MF);
36750 Register BasePtr = RegInfo->getBaseRegister();
36752 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
36753 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36756 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36757 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36758 restoreMBB->addSuccessor(sinkMBB);
36773 MachineFunction *MF = MBB->getParent();
36775 MachineRegisterInfo &MRI = MF->getRegInfo();
36780 MVT PVT = getPointerTy(MF->getDataLayout());
36807 MachineFunction::iterator I = ++MBB->getIterator();
36808 const BasicBlock *BB = MBB->getBasicBlock();
36810 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36811 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36812 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36813 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36814 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36815 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36816 MF->insert(I, checkSspMBB);
36817 MF->insert(I, fallMBB);
36818 MF->insert(I, fixShadowMBB);
36819 MF->insert(I, fixShadowLoopPrepareMBB);
36820 MF->insert(I, fixShadowLoopMBB);
36821 MF->insert(I, sinkMBB);
36824 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36825 MBB->end());
36826 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36828 MBB->addSuccessor(checkSspMBB);
36832 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36836 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36846 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36851 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36854 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36857 checkSspMBB->addSuccessor(sinkMBB);
36858 checkSspMBB->addSuccessor(fallMBB);
36865 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36881 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36886 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36889 fallMBB->addSuccessor(sinkMBB);
36890 fallMBB->addSuccessor(fixShadowMBB);
36896 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36902 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36906 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36911 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36914 fixShadowMBB->addSuccessor(sinkMBB);
36915 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36920 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36927 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36929 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36935 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36942 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36946 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36949 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36952 fixShadowLoopMBB->addSuccessor(sinkMBB);
36953 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36962 MachineFunction *MF = MBB->getParent();
36964 MachineRegisterInfo &MRI = MF->getRegInfo();
36969 MVT PVT = getPointerTy(MF->getDataLayout());
36979 Register SP = RegInfo->getStackRegister();
36992 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36997 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37010 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37024 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37036 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37047 MachineFunction *MF = MBB->getParent();
37048 MachineRegisterInfo *MRI = &MF->getRegInfo();
37051 MVT PVT = getPointerTy(MF->getDataLayout());
37057 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37065 VR = MRI->createVirtualRegister(TRC);
37069 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37076 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37077 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37084 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37096 MachineFunction *MF = BB->getParent();
37097 MachineRegisterInfo *MRI = &MF->getRegInfo();
37099 int FI = MF->getFrameInfo().getFunctionContextIndex();
37119 if (!MF->hasCallSiteLandingPad(Sym))
37122 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37136 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37146 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37147 DispatchBB->setIsEHPad(true);
37149 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37150 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37151 DispatchBB->addSuccessor(TrapBB);
37153 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37154 DispatchBB->addSuccessor(DispContBB);
37157 MF->push_back(DispatchBB);
37158 MF->push_back(DispContBB);
37159 MF->push_back(TrapBB);
37167 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37168 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37170 const X86RegisterInfo &RI = TII->getRegisterInfo();
37176 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37177 MFI->setRestoreBasePointer(MF);
37182 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37183 MFI->getRestoreBasePointerOffset())
37186 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37191 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37192 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37194 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37197 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37202 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37203 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37206 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37213 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37221 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37229 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37230 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37231 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37234 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37241 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37244 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37248 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37256 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37268 DispContBB->addSuccessor(LP);
37272 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37277 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37278 MBB->succ_rend());
37281 if (MBBS->isEHPad()) {
37282 MBB->removeSuccessor(MBBS);
37287 MBB->addSuccessor(DispatchBB);
37289 // Find the invoke call and mark all of the callee-saved registers as
37313 // Mark all former landing pads as non-landing pads. The dispatch is the only
37316 LP->setIsEHPad(false);
37330 MachineFunction &MF = *BB->getParent();
37337 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37343 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37351 MachineFunction *MF = BB->getParent();
37414 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37415 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37419 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37420 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37425 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37426 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37432 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37433 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37438 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37439 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37444 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37449 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37454 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37465 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37484 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37485 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37489 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37490 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37494 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37495 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37500 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37501 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37506 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37507 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37513 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37518 // clang-format off
37529 // clang-format on
37533 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37537 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37581 // - which is ESI for i686 - register allocator would not be able to
37583 // - there never would be enough unreserved registers during regalloc
37588 // If it is not i686 or there is no base pointer - nothing to do here.
37589 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37596 assert(TRI->getBaseRegister() == X86::ESI &&
37600 MachineRegisterInfo &MRI = MF->getRegInfo();
37601 MVT SPTy = getPointerTy(MF->getDataLayout());
37615 while (RMBBI != BB->rend() &&
37616 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
37617 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
37618 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
37619 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
37624 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
37632 Register BasePtr = TRI->getBaseRegister();
37633 if (TRI->hasBasePointer(*MF) &&
37635 if (!BB->isLiveIn(BasePtr))
37636 BB->addLiveIn(BasePtr);
37639 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37640 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37642 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37644 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37651 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
37654 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
37663 Register BasePtr = TRI->getBaseRegister();
37667 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37668 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37670 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37672 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
37674 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
37677 if (!BB->isLiveIn(BasePtr)) {
37678 BB->addLiveIn(BasePtr);
37681 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37683 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37685 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37688 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37689 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37692 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37693 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
37702 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37703 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37704 MFI->setHasPreallocatedCall(true);
37706 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37710 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
37717 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37720 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37721 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37725 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
37792 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37803 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37805 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37806 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
37810 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37811 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
37841 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37849 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37900 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37917 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37954 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37992 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38003 //===----------------------------------------------------------------------===//
38005 //===----------------------------------------------------------------------===//
38033 // For vectors - if we have a constant, then try to sign extend.
38062 const APInt &Mask = C->getAPIntValue();
38064 // Clear all non-demanded bits initially.
38088 // and non-demanded bits.
38102 const SelectionDAG &DAG, unsigned Depth) {
38106 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38107 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38120 unsigned Depth) {
38129 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38130 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38131 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38132 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38142 unsigned Depth) {
38152 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38153 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38154 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38155 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38162 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38172 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38174 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38175 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38191 unsigned Depth) const {
38208 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38209 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38217 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38224 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38234 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38236 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38255 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38272 ShAmt = VT.getScalarSizeInBits() - 1;
38275 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38302 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38306 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38321 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38323 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38325 Known = DAG.computeKnownBits(Src, Depth + 1);
38331 Known = DAG.computeKnownBits(Src, Depth + 1);
38339 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38340 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38347 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38348 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38357 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38358 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38370 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38376 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38378 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38397 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38407 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38412 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38413 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38421 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38425 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38437 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38438 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38447 Known = DAG.computeKnownBits(Op0, Depth + 1);
38456 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38457 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38460 // The result will have at least as many trailing zeros as the non-mask
38466 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38494 // Truncations/Conversions - upper elements are known zero.
38509 // Strict Conversions - upper elements are known zero.
38549 Op, DemandedElts, Depth, DAG,
38558 switch (Op->getConstantOperandVal(0)) {
38568 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38580 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38592 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38601 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38631 // TODO - handle target shuffle ops with different value types.
38642 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38652 unsigned Depth) const {
38667 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38668 if (Tmp > (NumSrcBits - VTBits))
38669 return Tmp - (NumSrcBits - VTBits);
38681 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
38690 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
38691 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
38694 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
38704 if (Tmp > (SrcBits - VTBits))
38705 return Tmp - (SrcBits - VTBits);
38712 return DAG.ComputeNumSignBits(Src, Depth + 1);
38720 return VTBits; // Shifted all bits out --> zero.
38721 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38723 return 1; // Shifted all sign bits out --> unknown.
38724 return Tmp - ShiftVal.getZExtValue();
38730 if (ShiftVal.uge(VTBits - 1))
38732 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38738 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38749 // Vector compares return zero/all-bits result values.
38754 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38757 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38762 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38764 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38770 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38797 // TODO - handle target shuffle ops with different value types.
38807 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38820 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38821 return N->getOperand(0);
38830 if (!LN->isSimple())
38834 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38836 LN->getPointerInfo(), LN->getOriginalAlign(),
38837 LN->getMemOperand()->getFlags());
38851 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38854 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38856 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38867 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38885 unsigned Len = Scale - 1;
38911 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38915 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38946 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38968 "AVX512 required for 512-bit vector shuffles");
39014 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39031 // VPERMILPD can permute with a non-repeating shuffle.
39052 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39057 // Narrow the repeated mask to create 32-bit element permutes.
39095 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39209 // Use (SSE41) PACKUSWD if the leading zero bits extend down to the lowest 16-bits.
39218 // Use PACKUSBW if the leading zero bits extend down to the lowest 8-bits.
39225 // Use PACKSSWD if the sign bits extend to the lowest 16-bits.
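// A hedged sketch of the tests those comments describe, for a 32-bit to
// 16-bit element truncation; the known-bits / sign-bit counts come from the
// DAG analyses used elsewhere in this file:
#include "llvm/Support/KnownBits.h"
static bool fitsPACKUS16(const llvm::KnownBits &Known) {
  // PACKUSWD preserves a lane iff its upper 16 bits are already zero.
  return Known.countMinLeadingZeros() >= 16;
}
static bool fitsPACKSS16(unsigned NumSignBits) {
  // PACKSSWD preserves a lane iff it already fits as a signed 16-bit value,
  // i.e. at least 17 of the 32 bits are copies of the sign bit.
  return NumSignBits >= 17;
}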
39251 // non-blended source element is zero in each case.
39461 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39462 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39465 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39466 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39469 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39470 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39477 int ShufMask[4] = {-1, -1, -1, -1};
39505 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39514 /// chain of single-use x86 shuffle instructions and accumulated the combined
39518 /// instruction but should only be used to replace chains over a certain depth.
39520 ArrayRef<int> BaseMask, int Depth,
39567 (RootVT.isFloatingPoint() && Depth >= 1) ||
39571 // is different from the root element size - this would prevent writemasks
39575 if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39576 Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39593 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39608 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39614 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39615 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39635 int PermMask[4] = {-1, -1, -1, -1};
39639 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39672 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39680 // Handle 128-bit lane shuffles of 256-bit vectors.
39686 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39695 // If we're inserting the low subvector, an insert-subvector 'concat'
39700 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39708 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39725 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39728 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39747 // For masks that have been widened to 128-bit elements or more,
39748 // narrow back down to 64-bit elements.
39759 // TODO - variable shuffles might need this to be widened again.
39789 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39790 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39799 // Attempt to match against broadcast-from-vector.
39808 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39815 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39828 if (Depth == 0 && Root.getOpcode() == Shuffle)
39840 if (Depth == 0 && Root.getOpcode() == Shuffle)
39860 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39873 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39890 if (Depth == 0 && Root.getOpcode() == Shuffle)
39904 if (Depth == 0 && Root.getOpcode() == Shuffle)
39922 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39932 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39952 if (Depth == 0 && Root.getOpcode() == Opc)
39969 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39985 // Don't try to re-form single instruction chains under any circumstances now
39987 if (Depth < 1)
39990 // Depth threshold above which we can efficiently use variable mask shuffles.
39996 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39998 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40000 // higher depth before combining them.
40002 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40011 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40020 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40035 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40036 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40047 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40060 Inputs, Root, BaseMask, Depth, HasVariableMask,
40065 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40066 // (non-VLX will pad to 512-bit shuffles).
40085 // See if we can combine a single input shuffle with zeros to a bit-mask,
40113 // the 128-bit lanes use the variable mask to VPERMILPS.
40130 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40136 // Bits[3] - Match Bit.
40137 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40138 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
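// Illustrative only: composing one VPERMIL2PS selector element from a source
// index (0 or 1) and an element-within-lane index, following the bit layout
// documented above (the match/zero bit handling is omitted):
static int makeVPerm2PSIndexSketch(unsigned SrcIdx, unsigned EltInLane) {
  // Bits[2] = source select, Bits[1:0] = element within the 128-bit lane.
  return int((SrcIdx << 2) | (EltInLane & 0x3));
}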
40145 VPerm2Idx.push_back(-1);
40198 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40199 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40204 // Bits[4:0] - Byte Index (0 - 31)
40205 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
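// Illustrative only: one VPPERM control byte per result byte, following the
// layout above; a negative mask element here stands for a zeroed output byte:
#include <cstdint>
static uint8_t makeVPPERMControlSketch(int MaskElt) {
  if (MaskElt < 0)
    return 0x80;                               // Bits[7:5] = 4 -> ZERO
  return static_cast<uint8_t>(MaskElt & 0x1F); // Bits[4:0] = byte index 0-31
}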
40233 Inputs, Root, BaseMask, Depth, HasVariableMask,
40238 // (non-VLX will pad to 512-bit shuffles)
40268 // -->
40271 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40320 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40370 // elements, and shrink them to the half-width mask. It does this in a loop
40387 // Increase depth for every upper subvector we've peeked through.
40388 Depth += AdjustedMasks;
40400 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40458 // the HOP args are pre-shuffled.
40459 // TODO: Generalize to any sized/depth chain.
40470 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40484 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40504 int PostMask[4] = {-1, -1, -1, -1};
40526 SDValue BC1 = BC[BC.size() - 1];
40550 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40564 M -= NumHalfEltsPerLane;
40567 M -= NumHalfEltsPerLane;
40597 // If we are post-shuffling a 256-bit hop and not requiring the upper
40598 // elements, then try to narrow to a 128-bit hop directly.
40647 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40719 /// of single-use shuffle instructions, build a generic model of the cumulative
40726 /// special-purpose shuffle.
40736 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40742 /// combine-ordering. To fix this, we should do the redundant instruction
40746 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40758 // Bound the depth of our recursive combine because this is ultimately
40760 if (Depth >= MaxDepth)
40769 return SDValue(); // Bail if we hit a non-simple non-vector.
40783 OpDemandedElts.setBit(M - BaseIdx);
40786 // Op is smaller than Root - extract the demanded elts for the subvector.
40791 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40805 OpZero, DAG, Depth, false)) {
40843 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40850 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40879 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40885 // Match failed - should we replace an existing Op?
40892 return Ops.size() - 1;
40898 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40907 // This function can be performance-critical, so we rely on the power-of-2
40909 // bit-masks and shifts.
40911 "Non-power-of-2 shuffle mask sizes");
40913 "Non-power-of-2 shuffle mask sizes");
40924 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40925 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40926 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40947 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40957 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40966 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40969 (RootMaskedIdx & (OpRatio - 1));
40971 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
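// A minimal sketch of the power-of-two index scaling used above, assuming
// RootRatio == MaskWidth / RootMask.size() and that both widths are powers
// of two, so wrapping can be done with a cheap mask instead of a modulo:
static unsigned scaleRootIndexSketch(unsigned RootMaskElt, unsigned i,
                                     unsigned RootRatioLog2,
                                     unsigned MaskWidth) {
  unsigned RootRatio = 1u << RootRatioLog2;
  // Widen the root element to the combined width, keeping i's sub-position.
  unsigned Idx = (RootMaskElt << RootRatioLog2) + (i & (RootRatio - 1));
  return Idx & (MaskWidth - 1); // stay in range of the wider mask
}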
41030 // the remaining recursion depth.
41031 if (Ops.size() < (MaxDepth - Depth)) {
41040 if (Ops[i].getNode()->hasOneUse() ||
41046 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41058 // If constant fold failed and we only have constants - then we have
41059 // multiple uses by a single non-variable shuffle - just bail.
41060 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41091 int OpEltIdx = MaskElt - Lo;
41103 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41130 // Re-resolve - we might have repeated subvector sources.
41138 // elements, and shrink them to the half-width mask. It does this in a loop
41157 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41176 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41184 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41190 /// Get the PSHUF-style mask from PSHUF node.
41193 /// PSHUF-style masks that can be reused with such instructions.
41202 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41209 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41210 "Mask doesn't repeat in high 128-bit lanes!");
41224 M -= 4;
41241 "Called with something other than an x86 128-bit half shuffle!");
41243 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41264 // dword shuffle, and the high words are self-contained.
41274 // dword shuffle, and the low words are self-contained.
41284 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41290 // Search for a half-shuffle which we can combine with.
41294 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41365 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41372 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41374 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41418 // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41488 // the blend mask is the same in the 128-bit subvectors (or can widen to
41498 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41513 // TODO - move this to TLI like isBinOp?
41524 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41525 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41543 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41544 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41545 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41574 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41630 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41631 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41709 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41786 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41808 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41817 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41825 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41826 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41837 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41838 // If we're re-broadcasting a smaller type then broadcast with that type and
41852 // Reduce broadcast source vector to lowest 128-bits.
41857 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41862 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41872 for (SDNode *User : Src->users())
41873 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41874 Src == User->getOperand(0) &&
41875 User->getValueSizeInBits(0).getFixedValue() >
41881 // vbroadcast(scalarload X) -> vbroadcast_load X
41887 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41890 LN->getMemoryVT(), LN->getMemOperand());
41917 if (LN->isSimple()) {
41919 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41922 LN->getPointerInfo(), LN->getOriginalAlign(),
41923 LN->getMemOperand()->getFlags());
41935 if (LN->getMemoryVT().getSizeInBits() == 16) {
41937 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41940 LN->getMemoryVT(), LN->getMemOperand());
41959 LN->isSimple()) {
41963 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
41964 SDValue Ops[] = { LN->getChain(), Ptr };
41967 LN->getPointerInfo().getWithOffset(Offset),
41968 LN->getOriginalAlign(),
41969 LN->getMemOperand()->getFlags());
41978 // vbroadcast(vzload X) -> vbroadcast_load X
41981 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41983 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41986 LN->getMemoryVT(), LN->getMemOperand());
41994 // vbroadcast(vector load X) -> vbroadcast_load
42000 if (LN->isSimple()) {
42002 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42005 LN->getPointerInfo(), LN->getOriginalAlign(),
42006 LN->getMemOperand()->getFlags());
42037 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42039 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42042 LN->getMemoryVT(), LN->getMemOperand());
42069 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42072 // Create a vector constant - scalar constant followed by zeros.
42077 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42084 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
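// Illustrative shape of the constant-pool payload built here: for a v4i32
// vzext_movl(scalar_to_vector(42)) the pool entry is the scalar followed by
// zeros, so the whole node becomes one aligned constant load.
#include <array>
static constexpr std::array<int, 4> VZextMovlExample = {42, 0, 0, 0};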
42093 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42119 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42137 // --> m3 = blend(m1,m2)
42173 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42207 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42222 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42225 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
42227 SDValue LHS = N->getOperand(0);
42228 SDValue RHS = N->getOperand(1);
42229 uint64_t Mask = N->getConstantOperandVal(2);
42250 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42251 SDValue LHS = N->getOperand(0);
42252 SDValue RHS = N->getOperand(1);
42260 N->getOperand(2)));
42264 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42269 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42301 if (N0->hasOneUse()) {
42336 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42385 // Zero/UNDEF insertion - zero out element and remove dependency.
42451 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42452 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42453 MemIntr->getBasePtr(),
42454 MemIntr->getMemOperand());
42492 M = (M < 0 ? M : M & (Mask.size() - 1));
42509 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
42531 // Nuke no-op shuffles that show up after combining.
42546 // dwords as otherwise it would have been removed as a no-op.
42561 // only works when we have a PSHUFD followed by two half-shuffles.
42610 int ParitySrc[2] = {-1, -1};
42649 EVT VT = N->getValueType(0);
42655 // We only handle target-independent shuffles.
42658 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42661 SDValue V1 = N->getOperand(0);
42662 SDValue V2 = N->getOperand(1);
42671 if (!V1->hasOneUse() || !V2->hasOneUse())
42678 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42679 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42680 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42684 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42685 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42686 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42690 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42696 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42697 : V2->getOpcode() == ISD::FADD;
42708 // We only handle target-independent shuffles.
42711 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42714 MVT VT = N->getSimpleValueType(0);
42720 SDValue Op0 = N->getOperand(0);
42721 SDValue Op1 = N->getOperand(1);
42733 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42745 /// Try to combine a shuffle into a target-specific add-sub or
42746 /// mul-add-sub node.
42758 MVT VT = N->getSimpleValueType(0);
42770 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42772 // X86 targets with 512-bit ADDSUB instructions!
42787 // if we can express this as a single-source shuffle, that's preferable.
42794 EVT VT = N->getValueType(0);
42796 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42806 SDValue N0 = N->getOperand(0);
42807 SDValue N1 = N->getOperand(1);
42822 for (int Elt : SVOp->getMask())
42823 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42834 EVT VT = Shuf->getValueType(0);
42835 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42841 ArrayRef<int> Mask = Shuf->getMask();
42846 // (half-index output is 0 or 2).
42853 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42855 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42858 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42859 Shuf->getOperand(1), HalfMask, HalfIdx1,
42873 EVT VT = N->getValueType(0);
42894 if (isTargetShuffle(N->getOpcode())) {
42900 // instructions into higher-order shuffles. We do this after combining
42908 // TODO - merge this into combineX86ShufflesRecursively.
42913 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42914 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42928 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42941 Depth + 1))
42949 if (!Load || !Load->getBasePtr().hasOneUse())
42956 Type *CTy = C->getType();
42957 if (!CTy->isVectorTy() ||
42958 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42961 // Handle scaling for i64 elements on 32-bit targets.
42962 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42971 Constant *Elt = C->getAggregateElement(i);
42973 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42989 Load->getAlign());
42995 TargetLoweringOpt &TLO, unsigned Depth) const {
43009 Depth + 1))
43012 Depth + 1))
43027 Depth + 1))
43030 Depth + 1))
43038 Depth + 1))
43042 Depth + 1))
43059 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43061 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43074 // We only need the bottom 64-bits of the (128-bit) shift amount.
43080 // only the bottom 64-bits are ever used.
43081 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43082 unsigned UseOpc = Use->getOpcode();
43085 Use->getOperand(0) != Amt;
43092 Depth + 1, AssumeSingleUse))
43102 Depth + 1))
43105 // Fold shift(0,x) -> 0
43113 Src, DemandedElts, TLO.DAG, Depth + 1))
43128 Depth + 1))
43131 // Fold shift(0,x) -> 0
43137 Depth + 1))
43150 Depth + 1))
43153 Depth + 1))
43160 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43161 unsigned ShiftAmt = Amt->getZExtValue();
43173 int Diff = ShiftAmt - C1;
43175 Diff = -Diff;
43188 Depth + 1))
43199 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43200 unsigned ShiftAmt = Amt->getZExtValue();
43212 int Diff = ShiftAmt - C1;
43214 Diff = -Diff;
43227 Depth + 1))
43255 // We can't assume an undef src element gives an undef dst - the
43276 Depth + 1))
43279 Depth + 1))
43284 TLO.DAG, Depth + 1);
43286 TLO.DAG, Depth + 1);
43305 Depth + 1))
43319 Depth + 1))
43323 Depth + 1))
43326 // TODO - pass on known zero/undef.
43329 // TODO - we should do this for all target/faux shuffles ops.
43332 TLO.DAG, Depth + 1);
43334 TLO.DAG, Depth + 1);
43356 Depth + 1))
43360 Depth + 1))
43363 // TODO - pass on known zero/undef.
43369 TLO.DAG, Depth + 1);
43371 TLO.DAG, Depth + 1);
43389 Depth + 1))
43407 SelZero, TLO, Depth + 1))
43413 LHSZero, TLO, Depth + 1))
43418 RHSZero, TLO, Depth + 1))
43430 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43441 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43442 Mem->getMemOperand());
43465 Depth + 1))
43468 // TODO - we should do this for all target/faux shuffles ops.
43470 Src, SrcElts, TLO.DAG, Depth + 1))
43476 Depth))
43483 Depth))
43489 Depth))
43494 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43495 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43496 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43502 // See if 512-bit ops only use the bottom 128-bits.
43525 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43527 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43528 MemIntr->getMemOperand());
43537 EVT MemVT = MemIntr->getMemoryVT();
43541 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43542 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43641 // (Non-Lane Crossing) Target Shuffles.
43704 OpZero, TLO.DAG, Depth, false))
43747 int M = OpMask[i] - Lo;
43752 // TODO - Propagate input undef/zero elts.
43755 TLO, Depth + 1))
43761 // We need to convert the depth to something combineX86ShufflesRecursively
43762 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43767 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43775 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43789 unsigned Depth) const {
43802 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43813 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43826 KnownLHS, TLO, Depth + 1))
43829 KnownRHS, TLO, Depth + 1))
43832 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
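// Worked form of the fold above on a single 64-bit lane: PMULUDQ multiplies
// the zero-extended low 32 bits of each operand, so a multiply by 1 is just a
// zero-extension of the low 32 bits, i.e. an AND with (1 << 32) - 1:
#include <cstdint>
static uint64_t pmuludqLane(uint64_t X, uint64_t Y) {
  // == (X & 0xffffffff) whenever Y == 1.
  return uint64_t(uint32_t(X)) * uint64_t(uint32_t(Y));
}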
43843 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43845 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43860 Known, TLO, Depth + 1))
43864 OriginalDemandedElts, Known2, TLO, Depth + 1))
43881 unsigned ShAmt = Op1->getAsZExtVal();
43894 int Diff = ShAmt - Shift2Amt;
43908 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43909 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43910 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43914 TLO, Depth + 1))
43924 // Attempt to avoid multi-use ops if we don't need anything from them.
43926 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43938 unsigned ShAmt = Op1->getAsZExtVal();
43945 TLO, Depth + 1))
43955 // Attempt to avoid multi-use ops if we don't need anything from them.
43957 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43969 unsigned ShAmt = Op1->getAsZExtVal();
43979 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43994 TLO, Depth + 1))
44002 if (Known.Zero[BitWidth - ShAmt - 1] ||
44008 if (Known.One[BitWidth - ShAmt - 1])
44012 // Attempt to avoid multi-use ops if we don't need anything from them.
44014 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44029 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44031 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44033 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44051 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44052 unsigned Idx = CIdx->getZExtValue();
44056 // bits from the implicit zext - simplify to zero.
44064 KnownZero, TLO, Depth + 1))
44069 KnownVec, TLO, Depth + 1))
44073 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44089 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44090 unsigned Idx = CIdx->getZExtValue();
44098 KnownVec, TLO, Depth + 1))
44104 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44116 // TODO - add known bits handling.
44124 KnownLHS, TLO, Depth + 1))
44127 KnownRHS, TLO, Depth + 1))
44130 // Attempt to avoid multi-use ops if we don't need anything from them.
44132 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44134 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44141 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44149 TLO, Depth + 1))
44156 Src->hasOneUse()) {
44168 // icmp sgt(0, R) == ashr(R, BitWidth-1).
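// A worked check of the identity above on 8-bit lanes: a signed "0 > R"
// vector compare produces an all-ones lane exactly when R is negative, which
// is also what an arithmetic shift by BitWidth-1 yields (this sketch assumes
// the usual sign-propagating right shift on signed values):
#include <cassert>
#include <cstdint>
static int8_t signMask8(int8_t R) {
  int8_t ByCompare = (0 > R) ? int8_t(-1) : int8_t(0); // lane-wise SETGT model
  int8_t ByShift = int8_t(R >> 7);                     // ashr(R, 7)
  assert(ByCompare == ByShift && "identity holds for every 8-bit value");
  return ByShift;
}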
44184 // See if we only demand bits from the lower 128-bit vector.
44195 TLO, Depth + 1))
44199 Known.Zero.setHighBits(BitWidth - NumElts);
44205 Depth + 1))
44208 if (KnownSrc.One[SrcBits - 1])
44210 else if (KnownSrc.Zero[SrcBits - 1])
44213 // Attempt to avoid multi-use ops if we don't need anything from it.
44215 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44230 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44231 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44233 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44239 OriginalDemandedElts, Known2, TLO, Depth + 1))
44242 OriginalDemandedElts, Known, TLO, Depth + 1))
44254 // Only bottom 16-bits of the control bits are required.
44257 uint64_t Val1 = Cst1->getZExtValue();
44266 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44267 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44277 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44288 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44304 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44308 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44317 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44322 // The result will have at least as many trailing zeros as the non-mask
44330 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44335 SelectionDAG &DAG, unsigned Depth) const {
44347 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44348 !DemandedElts[CIdx->getZExtValue()])
44358 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44359 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44360 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44371 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44383 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
44395 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44396 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44411 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44450 Op, DemandedBits, DemandedElts, DAG, Depth);
44455 bool PoisonOnly, unsigned Depth) const {
44479 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
44487 Op, DemandedElts, DAG, PoisonOnly, Depth);
44492 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44514 switch (Op->getConstantOperandVal(0)) {
44525 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44532 unsigned Depth) const {
44544 DAG, Depth);
44580 // clang-format off
44585 // clang-format on
44590 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44601 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44652 // ->
44685 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44694 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44718 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44724 // avoid sign-extending to this type entirely.
44738 // sign-extend to a 256-bit operation to avoid truncation.
44748 // sign-extend to a 256-bit operation to match the compare.
44749 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44750 // 256-bit because the shuffle is cheaper than sign extending the result of
44761 // it is not profitable to sign-extend to 256-bit because this will
44762 // require an extra cross-lane shuffle which is more expensive than
44763 // truncating the result of the compare to 128-bits.
44815 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
44825 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44830 // Only do this if we have k-registers.
44834 EVT DstVT = N->getValueType(0);
44835 SDValue Op = N->getOperand(0);
44881 unsigned NumElts = BV->getNumOperands();
44882 SDValue Splat = BV->getSplatValue();
44906 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44914 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44922 // Use PSHUFW to repeat 16-bit elements.
44933 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44961 unsigned Depth = 0) {
44962 if (Depth >= SelectionDAG::MaxRecursionDepth)
44963 return SDValue(); // Limit search depth.
44978 if (C->isZero())
44980 if (C->isAllOnes())
44991 Subtarget, Depth + 1))
45004 Subtarget, Depth + 1))
45016 Subtarget, Depth + 1))
45018 Subtarget, Depth + 1))
45031 Depth + 1))
45034 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45040 if (Depth > 0)
45050 SDValue N0 = N->getOperand(0);
45051 EVT VT = N->getValueType(0);
45057 // ->
45088 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45092 SmallVector<SDValue, 4> Ops(N0->ops());
45137 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45147 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45150 MemVT, BCast->getMemOperand());
45158 // avoiding store-load conversions.
45167 // Handle zero-extension of i32 with MOVD.
45172 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45196 // Detect bitcasts of 64-bit build vectors and convert to a
45235 if (C->isAllOnes())
45237 if (C->isZero())
45243 // Turn it into a sign bit compare that produces a k-register. This avoids
45284 // remove GPR<->K-register crossings.
45289 // floating-point operand into a floating-point logic operation. This may
45295 // clang-format off
45300 // clang-format on
45315 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45324 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45347 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45354 return (BV && BV->isConstant());
45372 SDValue AbsOp1 = Abs->getOperand(0);
45379 // Check if the operands of the sub are zero-extended from vectors of i8.
45409 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45442 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45471 EVT ExtractVT = Extract->getValueType(0);
45490 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45517 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45545 EVT ExtractVT = Extract->getValueType(0);
45573 // Special case for (pre-legalization) vXi1 reductions.
45577 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45580 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45581 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45611 // FIXME: Better handling of k-registers or 512-bit vectors?
45652 // parity -> (PARITY(MOVMSK X))
45660 // any_of -> MOVMSK != 0
45664 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45671 // negate to get the final 0/-1 mask value.
45683 EVT ExtractVT = Extract->getValueType(0);
45689 EVT VT = Extract->getOperand(0).getValueType();
45698 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
45729 for (unsigned i = Stages - StageBias; i > 0; --i) {
45730 SmallVector<int, 16> Mask(DpElems, -1);
45731 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45746 Extract->getOperand(1));
45755 EVT ExtractVT = Extract->getValueType(0);
45761 EVT VT = Extract->getOperand(0).getValueType();
45783 // abs-diff pattern.
45787 // Check whether we have an abs-diff pattern feeding into the select.
45803 for (unsigned i = Stages - 3; i > 0; --i) {
45804 SmallVector<int, 16> Mask(SadElems, -1);
45805 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45820 Extract->getOperand(1));
45824 // integer, that requires a potentially expensive XMM -> GPR transfer.
45829 // to a single-use of the loaded vector. For the reasons above, we
45835 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45839 EVT VT = N->getValueType(0);
45841 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
45842 return Use->getOpcode() == ISD::STORE ||
45843 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45844 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45851 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
45853 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
45855 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45856 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45858 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45859 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45876 SDValue Src = N->getOperand(0);
45877 SDValue Idx = N->getOperand(1);
45879 EVT VT = N->getValueType(0);
45889 const APInt &IdxC = N->getConstantOperandAPInt(1);
45903 // TODO support non-zero offsets.
45917 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45919 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45920 MemIntr->getBasePtr(),
45921 MemIntr->getPointerInfo(),
45922 MemIntr->getOriginalAlign(),
45923 MemIntr->getMemOperand()->getFlags());
45958 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45962 // We can only legally extract other elements from 128-bit vectors and in
45963 // certain circumstances, depending on SSE-level.
45973 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45977 Idx &= (NumEltsPerLane - 1);
46030 // If narrowing/widening failed, see if we can extract+zero-extend.
46041 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46063 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46076 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46077 SDValue Vec = ExtElt->getOperand(0);
46078 SDValue Index = ExtElt->getOperand(1);
46079 EVT VT = ExtElt->getValueType(0);
46083 // non-zero element because the shuffle+scalar op will be cheaper?
46088 // extract, the condition code), so deal with those as a special-case.
46094 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46117 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46129 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46162 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46165 for (SDValue Op : Vec->ops())
46179 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46191 SDValue Index = ExtElt->getOperand(1);
46195 EVT VT = ExtElt->getValueType(0);
46204 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46222 // vXi8 mul reduction - promote to vXi16 mul reduction.
46245 {4, 5, 6, 7, -1, -1, -1, -1}));
46248 {2, 3, -1, -1, -1, -1, -1, -1}));
46251 {1, -1, -1, -1, -1, -1, -1, -1}));
46256 // vXi8 add reduction - sub 128-bit vector.
46265 // Must be a >=128-bit vector with pow2 elements.
46269 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46281 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46290 // If the source vector values are 0-255, then we can use PSADBW to
46327 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
46342 // 256-bit horizontal instructions operate on 128-bit chunks rather than
46345 // TODO: We could extend this to handle 512-bit or even longer vectors.
46358 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46369 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
46376 SDValue InputVector = N->getOperand(0);
46377 SDValue EltIdx = N->getOperand(1);
46381 EVT VT = N->getValueType(0);
46383 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46388 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46399 uint64_t Idx = CIdx->getZExtValue();
46405 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
46414 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
46425 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46436 // TODO - Remove this once we can handle the implicit zero-extension of
46459 // pre-legalization,
46483 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
46502 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46503 Use->getOperand(0).getResNo() == ResNo &&
46504 Use->getValueType(0) == MVT::i1) {
46506 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46512 if (all_of(InputVector->users(), IsBoolExtract) &&
46518 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46520 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46532 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
46563 // Input type must be extending a bool vector (bit-casted from a scalar
46585 // must split it down into sub-sections for broadcasting. For example:
46586 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46587 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46614 // For smaller scalar integers, we can simply any-extend it to the vector
46636 // zero-extension.
46640 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46643 /// If a vector select has an operand that is -1 or 0, try to simplify the
46650 SDValue Cond = N->getOperand(0);
46651 SDValue LHS = N->getOperand(1);
46652 SDValue RHS = N->getOperand(2);
46657 if (N->getOpcode() != ISD::VSELECT)
46680 // vector floating-point selects.
46698 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46711 // vselect Cond, 111..., 000... -> Cond
46718 // vselect Cond, 111..., X -> or Cond, X
46725 // vselect Cond, X, 000... -> and Cond, X
46732 // vselect Cond, 000..., X -> andn Cond, X
46736 // The canonical form differs for i1 vectors - x86andnp is not used
46749 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46750 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46754 unsigned Opcode = N->getOpcode();
46758 // TODO: Split 512-bit vectors too?
46759 EVT VT = N->getValueType(0);
46764 SDValue Cond = N->getOperand(0);
46765 SDValue TVal = N->getOperand(1);
46766 SDValue FVal = N->getOperand(2);
46782 SDValue Cond = N->getOperand(0);
46783 SDValue LHS = N->getOperand(1);
46784 SDValue RHS = N->getOperand(2);
46792 EVT VT = N->getValueType(0);
46797 // this with a wider condition value (post-legalization it becomes an i8),
46802 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46804 // TODO: For constants that overflow or do not differ by power-of-2 or small
46806 const APInt &TrueVal = TrueC->getAPIntValue();
46807 const APInt &FalseVal = FalseC->getAPIntValue();
46809 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46812 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46835 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46838 // Multiply condition by the difference if non-one.
46842 // Add the base if non-zero.
46843 if (!FalseC->isZero())
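// Illustrative sketch, not from X86ISelLowering.cpp: the scalar identity
// behind the constant-select lowering noted above, with Cond zero-extended to
// 0 or 1 (and assuming TC - FC does not overflow). Helper name is invented.
#include <cstdint>
constexpr int64_t SelectViaMul(bool Cond, int64_t TC, int64_t FC) {
  // select Cond, TC, FC == zext(Cond) * (TC - FC) + FC
  return static_cast<int64_t>(Cond) * (TC - FC) + FC;
}
static_assert(SelectViaMul(true, 7, 3) == 7, "Cond ? TC : FC");
static_assert(SelectViaMul(false, 7, 3) == 3, "Cond ? TC : FC");
static_assert(SelectViaMul(false, -5, 100) == 100, "negative constants too");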
46852 /// If this is a *dynamic* select (non-constant condition) and we can match
46861 SDValue Cond = N->getOperand(0);
46862 if ((N->getOpcode() != ISD::VSELECT &&
46863 N->getOpcode() != X86ISD::BLENDV) ||
46869 EVT VT = N->getValueType(0);
46875 // cases where a *dynamic* blend will fail even though a constant-condition
46878 // Potentially, we should combine constant-condition vselect nodes
46879 // pre-legalization into shuffles and not mark as many types as custom
46883 // FIXME: We don't support i16-element blends currently. We could and
46885 // rather than just the high bit and using an i8-element blend.
46894 // There are no 512-bit blend instructions that use sign bits.
46899 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46904 for (SDUse &Use : Cond->uses())
46905 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
46906 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
46927 for (SDNode *U : Cond->users()) {
46928 if (U->getOpcode() == X86ISD::BLENDV)
46931 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46932 Cond, U->getOperand(1), U->getOperand(2));
46942 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
46943 N->getOperand(1), N->getOperand(2));
46955 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46958 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46959 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
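// Illustrative sketch, not from X86ISelLowering.cpp: the conditional-negate
// identity above, with M restricted to 0 or all-ones as the surrounding code
// asserts. Helper name is invented.
#include <cstdint>
constexpr int32_t CondNegate(int32_t X, int32_t M) {
  // M == 0 -> X, M == -1 -> -X: (X ^ M) + (M & 1) in modular arithmetic.
  return static_cast<int32_t>(
      (static_cast<uint32_t>(X) ^ static_cast<uint32_t>(M)) +
      static_cast<uint32_t>(M & 1));
}
static_assert(CondNegate(42, 0) == 42, "M == 0 keeps X");
static_assert(CondNegate(42, -1) == -42, "M == -1 negates X");
static_assert(CondNegate(-7, -1) == 7, "M == -1 negates X");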
46970 "Mask must be zero/all-bits");
46978 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46979 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46996 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46999 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47001 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47014 if (N->getOpcode() != ISD::VSELECT)
47017 SDValue Cond = N->getOperand(0);
47018 SDValue LHS = N->getOperand(1);
47019 SDValue RHS = N->getOperand(2);
47031 // (vselect M, L, R) -> (vselect ~M, R, L)
47033 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
47040 /// Do target-specific dag combines on SELECT and VSELECT nodes.
47045 SDValue Cond = N->getOperand(0);
47046 SDValue LHS = N->getOperand(1);
47047 SDValue RHS = N->getOperand(2);
47066 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47067 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47079 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
47082 N->getOpcode() == X86ISD::BLENDV))
47086 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47089 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
47121 // ignored in unsafe-math mode).
47129 bool IsStrict = Cond->isStrictFPOpcode();
47131 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47204 // Check for x CC y ? y : x -- a min/max with reversed arms.
47274 DL, {N->getValueType(0), MVT::Other},
47279 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47287 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47288 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47290 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47314 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47317 // AVX512 - Extend select to merge with target shuffle.
47318 // select(mask, extract_subvector(shuffle(x)), y) -->
47320 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47354 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47359 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47362 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47363 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47366 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
47377 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47378 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47397 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47399 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47405 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47411 // clang-format off
47417 // clang-format on
47430 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47442 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47454 // with out-of-bounds clamping.
47458 // to bitwidth-1 for unsigned shifts, effectively performing a maximum left
47459 // shift of bitwidth-1 positions, and returns zero for unsigned right shifts
47460 // exceeding bitwidth-1.
47461 if (N->getOpcode() == ISD::VSELECT) {
47463 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
47464 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
47475 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
47476 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
47502 // select(~Cond, X, Y) -> select(Cond, Y, X)
47505 return DAG.getNode(N->getOpcode(), DL, VT,
47508 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
47518 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47521 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47528 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47537 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47567 // This can lower using a vector shift bit-hack rather than mask and compare.
47569 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47573 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47575 // The 'and' mask must be composed of power-of-2 constants.
47578 if (C && C->getAPIntValue().isPowerOf2()) {
47579 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47585 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47586 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47587 // 16-bit lacks a proper blendv.
47595 return C->getAPIntValue().isPowerOf2();
47597 // Create a left-shift constant to get the mask bits over to the sign-bit.
47602 ShlVals.push_back(EltBitWidth - 1 -
47603 MaskVal->getAPIntValue().exactLogBase2());
47605 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
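// Illustrative sketch, not from X86ISelLowering.cpp: why shifting a power-of-2
// mask bit up to the sign bit lets a sign-bit test replace the compare in the
// swap above. C = 1 << Log2C is assumed to be a power of two, as the
// surrounding code requires; helper name is invented.
#include <cstdint>
constexpr bool MaskBitSetViaShift(uint32_t X, unsigned Log2C) {
  // (X & (1u << Log2C)) != 0  <=>  sign bit of (X << (31 - Log2C)) is set.
  return static_cast<int32_t>(X << (31 - Log2C)) < 0;
}
static_assert(MaskBitSetViaShift(0b1010, 1) == true, "bit 1 is set");
static_assert(MaskBitSetViaShift(0b1010, 2) == false, "bit 2 is clear");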
47626 // This combine only operates on CMP-like nodes.
47628 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47638 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47639 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47640 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47641 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47645 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47646 // - LHS != 1
47665 APInt Addend = OpRHSC->getAPIntValue();
47667 Addend = -Addend;
47673 APInt Comparison = CmpRHSC->getAPIntValue();
47674 APInt NegAddend = -Addend;
47689 APInt DecComparison = Comparison - 1;
47711 AN->getMemOperand());
47727 else if (CC == X86::COND_G && Addend == -1)
47729 else if (CC == X86::COND_LE && Addend == -1)
47752 // CMP(X,0) -> signbit test
47757 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
47763 // OR(X,Y) -> see if only one operand contributes to the signbit.
47764 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
47812 // This combine only operates on CMP-like nodes.
47814 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47838 if (C->getZExtValue() == 1) {
47841 } else if (C->getZExtValue() != 0)
47851 int OpIdx = -1;
47903 if (FVal && FVal->getZExtValue() != 0) {
47904 if (FVal->getZExtValue() != 1)
47911 if (FValIsFalse && TVal->getZExtValue() != 1)
47913 if (!FValIsFalse && TVal->getZExtValue() != 0)
47932 if (Cond->getOpcode() == X86ISD::CMP) {
47933 if (!isNullConstant(Cond->getOperand(1)))
47936 Cond = Cond->getOperand(0);
47942 switch (Cond->getOpcode()) {
47950 SetCC0 = Cond->getOperand(0);
47951 SetCC1 = Cond->getOperand(1);
47958 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47961 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47962 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47963 Flags = SetCC0->getOperand(1);
47967 // When legalizing carry, we create carries via add X, -1
47997 CarryOp1.getNode()->hasOneUse() &&
48001 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48052 // testc -> testz.
48056 // !testc -> !testz.
48060 // testz -> testc.
48064 // !testz -> !testc.
48069 // testnzc -> testnzc (no change).
48085 // TESTC(X,~X) == TESTC(X,-1)
48095 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48136 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48173 // TESTZ(-1,X) == TESTZ(X,X)
48177 // TESTZ(X,-1) == TESTZ(X,X)
48181 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48209 // Handle eq/ne against -1 (all_of).
48220 const APInt &CmpVal = CmpConstant->getAPIntValue();
48251 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48254 // signbits extend down to all the sub-elements as well.
48268 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48277 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48278 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48279 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48280 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48298 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48299 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48300 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48301 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48313 // Check for 256-bit split vector cases.
48341 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48355 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48383 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48414 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48415 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48416 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
48417 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
48467 EVT VT = N->getValueType(0);
48468 SDValue FalseOp = N->getOperand(0);
48469 SDValue TrueOp = N->getOperand(1);
48470 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48471 SDValue Cond = N->getOperand(3);
48473 // cmov X, X, ?, ? --> X
48497 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
48503 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
48506 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48510 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48512 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48518 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
48520 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48525 FalseC->getValueType(0), Cond);
48534 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48557 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48564 // Add the base if non-zero.
48565 if (FalseC->getAPIntValue() != 0)
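// Illustrative sketch, not from X86ISelLowering.cpp: the scalar identities
// behind the constant-CMOV folds above, with the condition materialized as
// 0 or 1 by SETcc. Helper names are invented.
#include <cstdint>
constexpr uint64_t Pow2OrZero(bool Cond, unsigned Log2TC) {
  // Cond ? (1 << Log2TC) : 0  ==  zext(Cond) << Log2TC
  return static_cast<uint64_t>(Cond) << Log2TC;
}
constexpr uint64_t CstPlusOneOrCst(bool Cond, uint64_t Cst) {
  // Cond ? Cst + 1 : Cst  ==  zext(Cond) + Cst
  return static_cast<uint64_t>(Cond) + Cst;
}
static_assert(Pow2OrZero(true, 3) == 8 && Pow2OrZero(false, 3) == 0, "C ? 8 : 0");
static_assert(CstPlusOneOrCst(true, 41) == 42 && CstPlusOneOrCst(false, 41) == 41,
              "Cond ? cst+1 : cst");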
48575 // (select (x != c), e, c) -> select (x != c), e, x),
48576 // (select (x == c), c, e) -> select (x == c), x, e)
48580 // The rationale for this change is that the conditional-move from a constant
48581 // needs two instructions, however, conditional-move from a register needs
48585 // some instruction-combining opportunities. This opt needs to be
48619 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48624 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48625 EVT CondVT = Cond->getValueType(0);
48628 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48637 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48638 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48674 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48675 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48676 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48677 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48713 EVT VT = N->getOperand(0).getValueType();
48717 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48721 SDValue Opd = N->getOperand(i);
48729 // When ranges are from -128 ~ 127, use MULS8 mode.
48735 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48763 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48769 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
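// Illustrative sketch, not from X86ISelLowering.cpp: why the narrower multiply
// modes above are lossless. The extreme products of sign-extended i8 operands
// fit in i16, and the extreme products of i16 operands fit in i32.
#include <cstdint>
static_assert(-128 * -128 <= INT16_MAX && 127 * -128 >= INT16_MIN,
              "i8 x i8 products fit in i16 (MULS8)");
static_assert(-32768LL * -32768LL <= INT32_MAX && 32767LL * -32768LL >= INT32_MIN,
              "i16 x i16 products fit in i32 (MULS16)");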
48793 SDValue N0 = N->getOperand(0);
48794 SDValue N1 = N->getOperand(1);
48795 EVT VT = N->getOperand(0).getValueType();
48847 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48852 N->getOperand(0));
48857 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48862 N->getOperand(0));
48880 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48905 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48914 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48917 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48918 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48920 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48941 EVT VT = N->getValueType(0);
48957 SDValue N0 = N->getOperand(0);
48958 SDValue N1 = N->getOperand(1);
48993 // Mask off upper 16-bits of sign-extended constants.
48996 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49001 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49011 N->isOnlyUserOf(Op.getNode())) {
49018 N->isOnlyUserOf(Op.getNode())) {
49048 EVT VT = N->getValueType(0);
49056 SDValue N0 = N->getOperand(0);
49057 SDValue N1 = N->getOperand(1);
49059 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49060 // 32-bits. We can lower with this if the sign bits stretch that far.
49088 EVT VT = N->getValueType(0);
49104 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49113 return DAG.getNegative(N->getOperand(0), DL, VT);
49132 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49137 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49163 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49164 N->user_begin()->getOpcode() == ISD::ADD))
49172 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49175 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49193 if (isPowerOf2_64(AbsMulAmt - 1)) {
49196 ISD::ADD, DL, VT, N->getOperand(0),
49197 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49198 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49202 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49204 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49208 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49210 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49211 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49215 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49216 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49219 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49222 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49224 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49228 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
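// Illustrative sketch, not from X86ISelLowering.cpp: the shift/add
// decompositions used in this region for multiplies by constants near powers
// of two, and by sums of two powers of two. All arithmetic is modulo 2^32,
// matching the wrapping DAG semantics; helper name is invented.
#include <cstdint>
constexpr bool MulDecomposesOK(uint32_t X) {
  return X * 9u == (X << 3) + X &&        // 2^3 + 1
         X * 7u == (X << 3) - X &&        // 2^3 - 1
         X * 10u == (X << 3) + (X + X) && // 2^3 + 2
         X * 6u == (X << 3) - (X + X) &&  // 2^3 - 2
         X * 40u == (X << 5) + (X << 3);  // 2^5 + 2^3
}
static_assert(MulDecomposesOK(1) && MulDecomposesOK(12345) &&
              MulDecomposesOK(0xDEADBEEFu), "mul-by-constant decompositions");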
49231 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49234 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49235 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49244 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49247 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49267 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49274 SDValue ShiftOperand = N->getOperand(0);
49279 EVT VT = N->getValueType(0);
49285 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
49309 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49316 SDValue N0 = N->getOperand(0);
49317 SDValue N1 = N->getOperand(1);
49324 // with out-of-bounds clamping.
49330 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49336 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49344 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49351 Mask <<= N1C->getAPIntValue();
49353 // We can handle cases concerning bit-widening nodes containing setcc_c if
49359 // zext(setcc_c) -> i32 0x0000FFFF
49360 // c1 -> i32 0x0000FFFF
49361 // c2 -> i32 0x00000001
49362 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49363 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
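// Illustrative sketch, not from X86ISelLowering.cpp: the worked example above,
// showing why the shl/and reassociation is not always value-preserving for the
// bit-widened setcc_c value.
#include <cstdint>
constexpr uint32_t SetCCZext = 0x0000FFFFu; // zext(setcc_c)
constexpr uint32_t C1 = 0x0000FFFFu;
constexpr uint32_t C2 = 1;
static_assert(((SetCCZext & C1) << C2) == 0x0001FFFEu, "shl (and X, c1), c2");
static_assert((SetCCZext & (C1 << C2)) == 0x0000FFFEu, "and X, (c1 << c2)");
static_assert(((SetCCZext & C1) << C2) != (SetCCZext & (C1 << C2)),
              "the two forms differ, so the fold must be guarded");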
49384 SDValue N0 = N->getOperand(0);
49385 SDValue N1 = N->getOperand(1);
49393 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
49397 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
49402 // into (SHL (sext_in_reg X), ShlConst - SraConst)
49404 // or (SRA (sext_in_reg X), SraConst - ShlConst)
49406 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
49422 APInt ShlConst = N01->getAsAPIntVal();
49423 APInt SraConst = N1->getAsAPIntVal();
49433 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
49434 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
49442 DAG.getConstant(ShlConst - SraConst, DL, CVT));
49444 DAG.getConstant(SraConst - ShlConst, DL, CVT));
49453 SDValue N0 = N->getOperand(0);
49454 SDValue N1 = N->getOperand(1);
49463 // with out-of-bounds clamping.
49469 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
49475 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
49489 // TODO: This is a generic DAG combine that became an x86-only combine to
49490 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
49491 // and-not ('andn').
49500 // If we can shrink the constant mask below 8-bits or 32-bits, then this
49502 // from improved known-bits analysis or instruction selection.
49503 APInt MaskVal = AndC->getAPIntValue();
49512 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
49517 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
49527 unsigned Opcode = N->getOpcode();
49531 EVT VT = N->getValueType(0);
49532 SDValue N0 = N->getOperand(0);
49533 SDValue N1 = N->getOperand(1);
49537 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
49539 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
49554 // shuffle to a v4X64 width - we can probably relax this in the future.
49572 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
49597 int PostShuffle[4] = {-1, -1, -1, -1};
49629 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49672 unsigned Opcode = N->getOpcode();
49676 EVT VT = N->getValueType(0);
49677 SDValue N0 = N->getOperand(0);
49678 SDValue N1 = N->getOperand(1);
49691 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49692 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49742 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49746 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
49782 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49823 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49824 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49828 MVT VT = N->getSimpleValueType(0);
49829 SDValue LHS = N->getOperand(0);
49830 SDValue RHS = N->getOperand(1);
49832 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
49833 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49836 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49855 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49861 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49871 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49872 X86ISD::VSRL == N->getOpcode()) &&
49874 EVT VT = N->getValueType(0);
49875 SDValue N0 = N->getOperand(0);
49876 SDValue N1 = N->getOperand(1);
49878 // Shift zero -> zero.
49888 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49904 unsigned Opcode = N->getOpcode();
49909 EVT VT = N->getValueType(0);
49910 SDValue N0 = N->getOperand(0);
49911 SDValue N1 = N->getOperand(1);
49917 // (shift undef, X) -> 0
49923 unsigned ShiftVal = N->getConstantOperandVal(1);
49927 ShiftVal = NumBitsPerElt - 1;
49930 // (shift X, 0) -> X
49934 // (shift 0, C) -> 0
49940 // (VSRAI -1, C) -> -1
49953 NewShiftVal = NumBitsPerElt - 1;
49959 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49963 // (shl (add X, X), C) -> (shl X, (C + 1))
49977 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
49982 N0->hasOneUse()) {
50027 if (N->isOnlyUserOf(N0.getNode())) {
50031 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50035 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50057 EVT VT = N->getValueType(0);
50058 unsigned Opcode = N->getOpcode();
50064 SDValue Vec = N->getOperand(0);
50065 SDValue Scl = N->getOperand(1);
50066 SDValue Idx = N->getOperand(2);
50068 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50092 /// OR -> CMPNEQSS.
50101 SDValue N0 = N->getOperand(0);
50102 SDValue N1 = N->getOperand(1);
50111 SDValue CMP00 = CMP0->getOperand(0);
50112 SDValue CMP01 = CMP0->getOperand(1);
50119 for (const SDNode *U : N->users()) {
50123 switch (U->getOpcode()) {
50163 N->getSimpleValueType(0));
50173 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50174 // 64-bit integer, since that's not a legal type. Since
50200 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50202 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50204 MVT VT = N->getSimpleValueType(0);
50209 SDValue N0 = N->getOperand(0);
50210 SDValue N1 = N->getOperand(1);
50228 /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50229 /// ->
50234 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50236 EVT VT = N->getValueType(0);
50245 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50246 // end-users are ISD::AND including cases
50248 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50249 !SVN->getOperand(1).isUndef()) {
50252 SDValue IVEN = SVN->getOperand(0);
50257 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50265 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50266 SVN->getOperand(1), SVN->getMask());
50272 SDValue N0 = N->getOperand(0);
50273 SDValue N1 = N->getOperand(1);
50319 SelectionDAG &DAG, unsigned Depth) {
50321 if (Depth >= SelectionDAG::MaxRecursionDepth)
50334 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50348 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50367 // register. In most cases we actually compare or select YMM-sized registers
50370 // Even with AVX-512 this is still useful for removing casts around logical
50403 // clang-format off
50408 // clang-format on
50413 /// If both input operands of a logic op are being cast from floating-point
50414 /// types or FP compares, try to convert this into a floating-point logic node
50449 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50450 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50460 // logic (setcc N00, N01), (setcc N10, N11) -->
50478 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50479 // to reduce XMM->GPR traffic.
50508 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50527 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50547 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50584 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
50586 /// with a shift-right to eliminate loading the vector constant mask value.
50589 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50590 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50596 // shift and "andn". This saves a materialization of a -1 vector constant.
50599 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50604 if (N->getValueType(0) == VT &&
50620 VT.getScalarSizeInBits() - 1, DAG);
50642 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50644 return DAG.getBitcast(N->getValueType(0), Shift);
50650 if (Ld->isIndexed())
50653 SDValue Base = Ld->getBasePtr();
50669 /// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
50674 MVT VT = N->getSimpleValueType(0);
50696 // 'and-load' sequence.
50700 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50707 MVT VT = Node->getSimpleValueType(0);
50717 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
50720 const Value *MemOp = Ld->getMemOperand()->getValue();
50729 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50730 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50731 Constant *Init = GV->getInitializer();
50732 Type *Ty = Init->getType();
50734 !Ty->getArrayElementType()->isIntegerTy() ||
50735 Ty->getArrayElementType()->getScalarSizeInBits() !=
50737 Ty->getArrayNumElements() >
50738 Ty->getArrayElementType()->getScalarSizeInBits())
50742 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50745 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50746 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50754 // Do the transformation (For 32-bit type):
50755 // -> (and (load arr[idx]), inp)
50756 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
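// Illustrative sketch, not from X86ISelLowering.cpp: the identity the and-load
// transformation above relies on. For a table arr[j] = 2^j - 1, the loaded
// mask equals all-ones shifted right by (32 - j) for j in [1, 31]; j == 0
// would be a shift by 32, which C++ leaves undefined. Helper name is invented.
#include <cstdint>
constexpr bool MaskTableMatchesSrl() {
  for (unsigned j = 1; j < 32; ++j) {
    uint32_t TableEntry = (1u << j) - 1;        // arr[j] = 2^j - 1
    uint32_t SrlForm = 0xFFFFFFFFu >> (32 - j); // srl 0xFFFFFFFF, (sub 32, j)
    if (TableEntry != SrlForm)
      return false;
  }
  return true;
}
static_assert(MaskTableMatchesSrl(), "arr[j] == 0xFFFFFFFF >> (32 - j)");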
50758 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
50777 // where the setcc will freely 0 upper bits of k-register. We can replace the
50782 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50784 EVT VT = N->getValueType(0);
50788 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50795 SDValue Src = N->getOperand(0);
50827 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50861 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50866 // Only do this re-ordering if op has one use.
50874 if (Depth++ >= kMaxDepth)
50879 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50881 Op.getOperand(1 - OpIdx));
50897 // BLSR: (and x, (add x, -1))
50898 // BLSMSK: (xor x, (add x, -1))
50907 EVT VT = N->getValueType(0);
50913 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50918 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50919 N->getOperand(1 - OpIdx), 0))
50930 // ->
50936 // ->
50942 SDValue SetCC = N->getOperand(0);
50948 SDNode *BrCond = *Flag->user_begin();
50949 if (BrCond->getOpcode() != X86ISD::BRCOND)
50952 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
50959 if (N->getOpcode() == X86ISD::SUB)
50960 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
50964 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
50970 SmallVector<SDValue> Ops(BrCond->op_values());
50971 if (isNullConstant(N->getOperand(1)))
50973 else if (isOneConstant(N->getOperand(1)))
50979 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
50980 // Avoid self-assign error b/c CC1 can be `e/ne`.
50990 // ->
50994 // ->
51002 SDValue SetCC0 = N->getOperand(0);
51003 SDValue SetCC1 = N->getOperand(1);
51008 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51034 bool IsOR = N->getOpcode() == ISD::OR;
51045 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51069 SDValue N0 = N->getOperand(0);
51070 SDValue N1 = N->getOperand(1);
51071 EVT VT = N->getValueType(0);
51083 // Use a 32-bit and+zext if upper bits known zero.
51095 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51118 // `(-x << C0) & C1`
51120 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
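// Illustrative sketch, not from X86ISelLowering.cpp: the rewrite above for the
// simple case where C1 is a low-bit mask 2^N - 1 and C0 < N, so Pow2_Ceil(C1)
// is 2^N and the multiplier becomes 2^N - 2^C0. Helper name is invented.
#include <cstdint>
constexpr bool NegShlMatchesMul(uint32_t X, unsigned C0, unsigned N) {
  uint32_t C1 = (1u << N) - 1;              // low-bit mask
  uint32_t MulAmt = (1u << N) - (1u << C0); // Pow2_Ceil(C1) - (1 << C0)
  // Both sides agree modulo 2^N, which is all the AND with C1 keeps.
  return (((0u - X) << C0) & C1) == ((X * MulAmt) & C1);
}
static_assert(NegShlMatchesMul(0x12345678u, 3, 16), "C0 = 3, C1 = 0xFFFF");
static_assert(NegShlMatchesMul(0xDEADBEEFu, 5, 24), "C0 = 5, C1 = 0xFFFFFF");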
51133 const APInt &MulC = N01C->getAPIntValue();
51134 const APInt &AndC = N1C->getAPIntValue();
51135 APInt MulCLowBit = MulC & (-MulC);
51140 assert(MulCLowBitLog != -1 &&
51155 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51158 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51161 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51164 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51189 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51190 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51197 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51203 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51205 if (isOneConstant(N1) && N0->hasOneUse()) {
51209 Src.getOperand(0)->hasOneUse())
51257 // We can't assume an undef src element gives an undef dst - the
51278 if (N->getOpcode() != ISD::DELETED_NODE)
51293 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51301 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51320 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
51335 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
51339 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51341 MVT VT = N->getSimpleValueType(0);
51346 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51347 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51370 // TODO - add UNDEF elts support.
51378 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
51379 // VPTERNLOG is only available as vXi32/64-bit types.
51392 SDValue X = N->getOperand(0);
51401 if (N->getOpcode() != ISD::OR)
51404 SDValue N0 = N->getOperand(0);
51405 SDValue N1 = N->getOperand(1);
51426 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
51442 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51444 EVT VT = N->getValueType(0);
51498 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
51499 // The result of the shift is true or false, and on X86, the 32-bit
51517 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
51521 return (N->getOpcode() == ISD::OR && N->hasOneUse());
51524 // Check the zero extend is extending to 32-bit or more. The code generated by
51525 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
51527 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
51528 !isORCandidate(N->getOperand(0)))
51533 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
51534 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
51535 N->getOperand(1).getOpcode() == X86ISD::CMP &&
51536 isNullConstant(N->getOperand(1).getOperand(1)) &&
51537 N->getOperand(1).getValueType().bitsGE(MVT::i32);
51540 SDNode *OR = N->getOperand(0).getNode();
51541 SDValue LHS = OR->getOperand(0);
51542 SDValue RHS = OR->getOperand(1);
51549 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
51550 LHS = OR->getOperand(0);
51551 RHS = OR->getOperand(1);
51575 LHS = OR->getOperand(0);
51576 RHS = OR->getOperand(1);
51578 if (RHS->getOpcode() == ISD::OR)
51586 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
51592 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
51594 SDValue NotOp = And0_L->getOperand(0);
51601 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
51602 EVT VT = And1_L->getValueType(0);
51613 /// "and-not" operation. This function is intended to be called from a
51616 // Note that masked-merge variants using XOR or ADD expressions are
51618 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
51619 SDValue N0 = Node->getOperand(0);
51620 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
51622 SDValue N1 = Node->getOperand(1);
51623 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
51627 SDValue N00 = N0->getOperand(0);
51628 SDValue N01 = N0->getOperand(1);
51629 SDValue N10 = N1->getOperand(0);
51630 SDValue N11 = N1->getOperand(1);
51653 // Look through a one-use zext.
51670 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51674 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51675 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51676 // This is a complicated way to get -1 or 0 from the carry flag:
51677 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51678 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51684 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51685 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51690 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51691 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
51693 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51704 // X + SETB Z --> adc X, 0
51705 // X - SETB Z --> sbb X, 0
51721 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51725 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51735 // X + SETAE --> sbb X, -1
51736 // X - SETAE --> adc X, -1
51743 // X + SETBE --> sbb X, -1
51744 // X - SETBE --> adc X, -1
51751 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51755 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51775 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51778 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
51780 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
51781 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
51782 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
51783 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
51792 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
51794 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
51795 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
51796 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
51797 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
51815 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
51816 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
51821 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
51822 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
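// Illustrative sketch, not from X86ISelLowering.cpp: the scalar value the
// neg/sbb idiom above materializes. NEG sets CF to (Z != 0), CMP Z, 1 sets CF
// to (Z == 0), and SBB of a register with itself yields -CF, i.e. 0 or
// all-ones. Helper name is invented.
#include <cstdint>
constexpr int32_t SbbSelfAfterNeg(uint32_t Z) {
  bool CF = (Z != 0);                  // carry flag after "neg Z"
  return 0 - static_cast<int32_t>(CF); // sbb %eax, %eax -> 0 or -1
}
static_assert(SbbSelfAfterNeg(0) == 0, "Z == 0 -> 0");
static_assert(SbbSelfAfterNeg(123) == -1, "Z != 0 -> -1");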
51832 bool IsSub = N->getOpcode() == ISD::SUB;
51833 SDValue X = N->getOperand(0);
51834 SDValue Y = N->getOperand(1);
51835 EVT VT = N->getValueType(0);
51865 bool N1COdd = N1C->getZExtValue() & 1;
51872 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
51889 SDValue N0 = N->getOperand(0);
51890 SDValue N1 = N->getOperand(1);
51891 EVT VT = N->getValueType(0);
51903 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51929 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51932 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51935 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51938 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51954 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
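// Illustrative sketch, not from X86ISelLowering.cpp: the OR-with-negated-setcc
// identity above, with SetCC being 0 or 1. Helper names are invented.
#include <cstdint>
constexpr uint64_t OrNegSetCC(bool SetCC, uint64_t C) {
  return (0ull - SetCC) | C; // SetCC ? all-ones : C
}
constexpr uint64_t LeaForm(bool SetCC, uint64_t C) {
  return static_cast<uint64_t>(!SetCC) * (C + 1) - 1;
}
static_assert(OrNegSetCC(true, 0x1234) == LeaForm(true, 0x1234), "SetCC == 1");
static_assert(OrNegSetCC(false, 0x1234) == LeaForm(false, 0x1234), "SetCC == 0");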
51964 uint64_t Val = CN->getZExtValue();
51979 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51980 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51981 // iff the upper elements of the non-shifted arg are zero.
52029 if (N->getOpcode() != ISD::DELETED_NODE)
52040 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52047 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52049 /// SETGT(X, -1)
52052 EVT ResultType = N->getValueType(0);
52056 SDValue N0 = N->getOperand(0);
52057 SDValue N1 = N->getOperand(1);
52079 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52082 // Create a greater-than comparison against -1.
52100 /// xor (sra X, elt_size(X)-1), -1
52102 /// pcmpgt X, -1
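// Illustrative sketch, not from X86ISelLowering.cpp: the two sign-bit tests
// above, written out for one 32-bit lane. Helper names are invented; the
// second form assumes two's-complement arithmetic right shift as on x86.
#include <cstdint>
constexpr bool LogicalSrlForm(int32_t X) {
  // xor(trunc(srl(X, 31)), 1) is 1 exactly when X > -1.
  return ((static_cast<uint32_t>(X) >> 31) ^ 1u) == static_cast<uint32_t>(X > -1);
}
constexpr bool ArithmeticSraForm(int32_t X) {
  // xor(sra(X, 31), -1) is all-ones exactly when X > -1 (pcmpgt X, -1).
  return ((X >> 31) ^ -1) == (X > -1 ? -1 : 0);
}
static_assert(LogicalSrlForm(5) && LogicalSrlForm(-5) && LogicalSrlForm(0), "srl form");
static_assert(ArithmeticSraForm(5) && ArithmeticSraForm(-5) && ArithmeticSraForm(0),
              "sra form");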
52108 EVT VT = N->getValueType(0);
52113 // clang-format off
52123 // clang-format on
52128 SDValue Shift = N->getOperand(0);
52129 SDValue Ones = N->getOperand(1);
52138 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52141 // Create a greater-than comparison against -1. We don't use the more obvious
52142 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52239 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52240 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52242 // clip to 0-255.
52256 // For 256-bit or smaller vectors, we require VLX.
52258 // If the result type is 256-bits or larger and we have disabled 512-bit
52271 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52334 EVT RegVT = Ld->getValueType(0);
52335 SDValue Ptr = Ld->getBasePtr();
52336 SDValue Chain = Ld->getChain();
52337 ISD::LoadExtType Ext = Ld->getExtensionType();
52339 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52362 for (SDNode *User : Chain->users()) {
52365 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52366 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52368 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
52369 User->getValueSizeInBits(0).getFixedValue() >
52371 EVT UserVT = User->getValueType(0);
52372 SDValue UserPtr = UserLd->getBasePtr();
52378 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52379 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52408 EVT RegVT = Ld->getValueType(0);
52409 EVT MemVT = Ld->getMemoryVT();
52413 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
52414 // into two 16-byte operations. Also split non-temporal aligned loads on
52415 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
52416 ISD::LoadExtType Ext = Ld->getExtensionType();
52420 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
52421 Ld->getAlign() >= Align(16)) ||
52423 *Ld->getMemOperand(), &Fast) &&
52430 SDValue Ptr1 = Ld->getBasePtr();
52436 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
52437 Ld->getOriginalAlign(),
52438 Ld->getMemOperand()->getFlags());
52439 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
52440 Ld->getPointerInfo().getWithOffset(HalfOffset),
52441 Ld->getOriginalAlign(),
52442 Ld->getMemOperand()->getFlags());
52450 // Bool vector load - attempt to cast to an integer, as we have good
52457 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52458 Ld->getPointerInfo(),
52459 Ld->getOriginalAlign(),
52460 Ld->getMemOperand()->getFlags());
52468 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52470 SDValue Ptr = Ld->getBasePtr();
52471 SDValue Chain = Ld->getChain();
52472 for (SDNode *User : Chain->users()) {
52475 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52476 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
52477 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
52478 !User->hasAnyUseOfValue(1) &&
52479 User->getValueSizeInBits(0).getFixedValue() >
52493 unsigned AddrSpace = Ld->getAddressSpace();
52497 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52499 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52500 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
52501 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
52502 Ld->getMemOperand()->getFlags());
52511 /// Otherwise, return -1.
52521 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52522 return -1;
52524 int TrueIndex = -1;
52525 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52527 const SDValue &Op = BV->getOperand(i);
52532 return -1;
52533 if (ConstNode->getAPIntValue().countr_one() >= 1) {
52536 return -1;
52551 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52557 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52559 Addr = MaskedOp->getBasePtr();
52567 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52572 /// If exactly one element of the mask is set for a non-extending masked load,
52574 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52581 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52594 EVT VT = ML->getValueType(0);
52604 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52605 ML->getPointerInfo().getWithOffset(Offset),
52606 Alignment, ML->getMemOperand()->getFlags());
52608 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52620 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52621 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52625 EVT VT = ML->getValueType(0);
52631 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52632 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52633 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52635 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52636 ML->getMemOperand());
52637 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52638 ML->getPassThru());
52644 // (for example, vblendvps -> vblendps).
52646 // Don't try this if the pass-through operand is already undefined. That would
52648 if (ML->getPassThru().isUndef())
52651 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52654 // The new masked load has an undef pass-through operand. The select uses the
52655 // original pass-through operand.
52657 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52658 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52659 ML->getAddressingMode(), ML->getExtensionType());
52660 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52661 ML->getPassThru());
52672 if (Mld->isExpandingLoad())
52675 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52686 // If the mask value has been legalized to a non-boolean vector, try to
52688 SDValue Mask = Mld->getMask();
52690 EVT VT = Mld->getValueType(0);
52694 if (N->getOpcode() != ISD::DELETED_NODE)
52701 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52702 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52703 Mld->getAddressingMode(), Mld->getExtensionType());
52709 /// If exactly one element of the mask is set for a non-truncating masked store,
52711 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52716 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52728 SDValue Value = MS->getValue();
52740 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52741 MS->getPointerInfo().getWithOffset(Offset),
52742 Alignment, MS->getMemOperand()->getFlags());
52749 if (Mst->isCompressingStore())
52752 EVT VT = Mst->getValue().getValueType();
52756 if (Mst->isTruncatingStore())
52762 // If the mask value has been legalized to a non-boolean vector, try to
52764 SDValue Mask = Mst->getMask();
52768 if (N->getOpcode() != ISD::DELETED_NODE)
52774 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52775 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52776 Mst->getMemoryVT(), Mst->getMemOperand(),
52777 Mst->getAddressingMode());
52780 SDValue Value = Mst->getValue();
52781 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52783 Mst->getMemoryVT())) {
52784 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52785 Mst->getBasePtr(), Mst->getOffset(), Mask,
52786 Mst->getMemoryVT(), Mst->getMemOperand(),
52787 Mst->getAddressingMode(), true);
52797 EVT StVT = St->getMemoryVT();
52799 SDValue StoredVal = St->getValue();
52810 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52811 St->getPointerInfo(), St->getOriginalAlign(),
52812 St->getMemOperand()->getFlags());
52816 // This will avoid a copy to k-register.
52823 return DAG.getStore(St->getChain(), dl, Val,
52824 St->getBasePtr(), St->getPointerInfo(),
52825 St->getOriginalAlign(),
52826 St->getMemOperand()->getFlags());
52837 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52838 St->getPointerInfo(), St->getOriginalAlign(),
52839 St->getMemOperand()->getFlags());
52846 // If it's a v64i1 store without 64-bit support, we need two stores.
52849 StoredVal->ops().slice(0, 32));
52852 StoredVal->ops().slice(32, 32));
52855 SDValue Ptr0 = St->getBasePtr();
52859 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52860 St->getOriginalAlign(),
52861 St->getMemOperand()->getFlags());
52863 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52864 St->getPointerInfo().getWithOffset(4),
52865 St->getOriginalAlign(),
52866 St->getMemOperand()->getFlags());
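// Minimal sketch of the v64i1 split above, assuming a little-endian 32-bit
// target without 64-bit GPRs (illustration, not the lowering itself): the 64
// mask bits are stored as two 32-bit halves at offsets 0 and 4.
#include <cstdint>
#include <cstring>
static void StoreMask64AsTwo32(uint8_t *Ptr, uint64_t MaskBits) {
  uint32_t Lo = static_cast<uint32_t>(MaskBits);
  uint32_t Hi = static_cast<uint32_t>(MaskBits >> 32);
  std::memcpy(Ptr, &Lo, sizeof(Lo));      // low 32 bits at offset 0
  std::memcpy(Ptr + 4, &Hi, sizeof(Hi));  // high 32 bits at offset 4
}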
52871 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52872 St->getPointerInfo(), St->getOriginalAlign(),
52873 St->getMemOperand()->getFlags());
52876 // Convert scalar fabs/fneg load-store to integer equivalents.
52893 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
52894 St->getPointerInfo(), St->getOriginalAlign(),
52895 St->getMemOperand()->getFlags());
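// Bit-level picture of the fabs/fneg store combine above (hypothetical
// helpers, illustration only): once the value is viewed as an integer, fabs
// is an AND that clears the sign bit and fneg is an XOR that flips it, so no
// FP instruction is needed on the store path.
#include <cstdint>
#include <cstring>
static uint32_t FabsAsIntF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits & 0x7fffffffu;  // clear the sign bit
}
static uint32_t FnegAsIntF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits ^ 0x80000000u;  // flip the sign bit
}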
52899 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52900 // Sandy Bridge, perform two 16-byte stores.
52904 *St->getMemOperand(), &Fast) &&
52913 // Split under-aligned vector non-temporal stores.
52914 if (St->isNonTemporal() && StVT == VT &&
52915 St->getAlign().value() < VT.getStoreSize()) {
52916 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52925 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52935 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52937 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52938 St->getValue().getOpcode() == ISD::TRUNCATE &&
52939 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52941 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52943 St->getValue().getOperand(0));
52944 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52945 MVT::v16i8, St->getMemOperand());
52949 if (!St->isTruncatingStore() &&
52955 return EmitTruncSStore(IsSigned, St->getChain(),
52956 dl, StoredVal.getOperand(0), St->getBasePtr(),
52957 VT, St->getMemOperand(), DAG);
52961 if (!St->isTruncatingStore()) {
52983 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52984 TruncVT, St->getMemOperand());
52993 if (St->isTruncatingStore() && VT.isVector()) {
52995 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52996 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52997 dl, Val, St->getBasePtr(),
52998 St->getMemoryVT(), St->getMemOperand(), DAG);
52999 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53001 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53002 dl, Val, St->getBasePtr(),
53003 St->getMemoryVT(), St->getMemOperand(), DAG);
53010 unsigned AddrSpace = St->getAddressSpace();
53014 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53016 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53018 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53019 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
53020 St->getAAInfo());
53027 Subtarget.hasCF() && St->isSimple()) {
53037 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53038 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53056 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53059 St->getMemOperand());
53062 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53067 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53079 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53080 cast<LoadSDNode>(St->getValue())->isSimple() &&
53081 St->getChain().hasOneUse() && St->isSimple()) {
53082 auto *Ld = cast<LoadSDNode>(St->getValue());
53088 if (!Ld->hasNUsesOfValue(1, 0))
53094 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53095 Ld->getBasePtr(), Ld->getMemOperand());
53099 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53100 St->getMemOperand());
53103 // This is similar to the above case, but here we handle a scalar 64-bit
53104 // integer store that is extracted from a vector on a 32-bit target.
53105 // If we have SSE2, then we can treat it like a floating-point double
53110 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53111 SDValue OldExtract = St->getOperand(1);
53118 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53119 St->getPointerInfo(), St->getOriginalAlign(),
53120 St->getMemOperand()->getFlags());
53131 SDValue StoredVal = N->getOperand(1);
53133 EVT MemVT = St->getMemoryVT();
53141 if (N->getOpcode() != ISD::DELETED_NODE)
53158 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53160 /// A horizontal-op B, for some already available A and B, and if so then LHS is
53178 // which is A horizontal-op B.
53220 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53277 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53278 // so we just repeat the inner loop if this is a 256-bit op.
53299 // Compute the post-shuffle mask index based on where the element
53303 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53305 // The low half of the 128-bit result must choose from A.
53306 // The high half of the 128-bit result must choose from B,
53322 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53330 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53333 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53334 llvm::any_of(NewRHS->users(), FoundHorizUser));
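// Reference semantics for the 128-bit horizontal add described above
// (hypothetical helper, illustration only): the low half takes adjacent pairs
// from A and the high half adjacent pairs from B, i.e.
// < a0+a1, a2+a3, b0+b1, b2+b3 >.
static void HorizontalAdd4f(const float A[4], const float B[4], float Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}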
53352 EVT VT = N->getValueType(0);
53353 unsigned Opcode = N->getOpcode();
53358 return N->hasOneUse() &&
53359 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53360 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53361 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53369 SDValue LHS = N->getOperand(0);
53370 SDValue RHS = N->getOperand(1);
53386 SDValue LHS = N->getOperand(0);
53387 SDValue RHS = N->getOperand(1);
53411 // <i32 -2147483648[float -0.000000e+00]> 0
53413 // <(load 4 from constant-pool)> t0, t29
53424 EVT VT = N->getValueType(0);
53425 SDValue LHS = N->getOperand(0);
53426 SDValue RHS = N->getOperand(1);
53428 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
53430 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
53432 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
53481 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53482 !AllowContract(N->getFlags()))
53485 EVT VT = N->getValueType(0);
53489 SDValue LHS = N->getOperand(0);
53490 SDValue RHS = N->getOperand(1);
53495 &HasNoSignedZero](SDValue N) -> bool {
53500 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53508 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
53509 HasNoSignedZero(Op0->getFlags())) ||
53510 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
53533 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53537 /// Do target-specific dag combines on floating-point adds/subs.
53551 EVT VT = N->getValueType(0);
53552 SDValue Src = N->getOperand(0);
53565 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53567 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53573 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53574 SDValue Src = N->getOperand(0);
53578 EVT VT = N->getValueType(0);
53615 // In most cases it's only worth pre-truncating if we're only facing the cost
53620 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
53658 // Only handle vXi16 types that are at least 128-bits unless they will be
53674 // Count leading sign/zero bits on both inputs - if there are enough then
53675 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53724 // adjacent pairs of 16-bit products, and saturates the result before
53725 // truncating to 16-bits.
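// Reference semantics of the PMADDUBSW operation this pattern is matched into
// (hypothetical helper, illustration only): unsigned bytes from the first
// source are multiplied by signed bytes from the second, adjacent products are
// summed, and the sum is saturated to a signed 16-bit result.
#include <algorithm>
#include <cstdint>
static int16_t PmaddubswPair(uint8_t A0, int8_t B0, uint8_t A1, int8_t B1) {
  int32_t Sum = int32_t(A0) * B0 + int32_t(A1) * B1;
  Sum = std::clamp<int32_t>(Sum, INT16_MIN, INT16_MAX);  // signed saturation
  return static_cast<int16_t>(Sum);
}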
53818 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53819 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53820 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53821 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53874 EVT VT = N->getValueType(0);
53875 SDValue Src = N->getOperand(0);
53878 // Attempt to pre-truncate inputs to arithmetic ops instead.
53912 EVT VT = N->getValueType(0);
53913 SDValue In = N->getOperand(0);
53931 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53938 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53939 if (N->getOpcode() == ISD::FNEG)
53940 return N->getOperand(0);
53943 if (Depth > SelectionDAG::MaxRecursionDepth)
53946 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53949 EVT VT = Op->getValueType(0);
53959 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53962 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53965 cast<ShuffleVectorSDNode>(Op)->getMask());
53970 // -V, INDEX).
53975 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54005 // Only allow bitcast from correctly-sized constant.
54021 // clang-format off
54035 // clang-format on
54041 // clang-format off
54059 // clang-format on
54066 // clang-format off
54076 // clang-format on
54083 /// Do target-specific dag combines on floating point negations.
54087 EVT OrigVT = N->getValueType(0);
54102 // use of a constant by performing (-0 - A*B) instead.
54105 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54125 unsigned Depth) const {
54127 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54135 SDNodeFlags Flags = Op.getNode()->getFlags();
54160 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54170 // Fill in the non-negated ops with the original values.
54179 ForCodeSize, Cost, Depth + 1))
54185 ForCodeSize, Cost, Depth);
54190 MVT VT = N->getSimpleValueType(0);
54201 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54202 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54204 switch (N->getOpcode()) {
54205 // clang-format off
54211 // clang-format on
54218 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54220 if (N->getOpcode() != ISD::XOR)
54223 SDValue LHS = N->getOperand(0);
54224 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54228 X86::CondCode(LHS->getConstantOperandVal(0)));
54230 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
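// Boolean identity behind the xor(setcc, 1) fold above (hypothetical helper,
// illustration only): flipping a SETCC result with xor 1 is the same as
// testing the inverted condition code.
static bool XorOneInvertsSetcc(int A, int B) {
  return ((A == B) ^ 1) == (A != B);  // holds for every A and B
}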
54235 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54240 EVT VT = N->getValueType(0);
54245 SDValue N0 = N->getOperand(0);
54246 SDValue N1 = N->getOperand(1);
54258 } else if (N->getOpcode() == ISD::SUB) {
54271 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54292 SDValue N0 = N->getOperand(0);
54293 SDValue N1 = N->getOperand(1);
54294 EVT VT = N->getValueType(0);
54308 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54311 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54314 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54317 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54330 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54336 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54347 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
54358 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54359 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
54362 N0.getOperand(0).getOpcode() == N->getOpcode()) {
54366 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
54383 SDValue N0 = N->getOperand(0);
54384 EVT VT = N->getValueType(0);
54386 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
54397 ReverseMask[I] = (NumElts - 1) - I;
54411 unsigned Opcode = N->getOpcode();
54412 SDValue N0 = N->getOperand(0);
54413 SDValue N1 = N->getOperand(1);
54414 EVT VT = N->getValueType(0);
54418 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
54435 EVT VT = N->getValueType(0);
54438 // TODO - Constant Folding.
54457 /// to be used as a replacement operand with operations (eg, bitwise-and) where
54472 SDValue N0 = N->getOperand(0);
54473 SDValue N1 = N->getOperand(1);
54474 EVT VT = N->getValueType(0);
54487 return C && C->getConstantFPValue()->isAllOnesValue();
54490 // fand (fxor X, -1), Y --> fandn X, Y
54494 // fand X, (fxor Y, -1) --> fandn Y, X
54501 /// Do target-specific dag combines on X86ISD::FAND nodes.
54504 // FAND(0.0, x) -> 0.0
54505 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54508 // FAND(x, 0.0) -> 0.0
54509 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54518 /// Do target-specific dag combines on X86ISD::FANDN nodes.
54521 // FANDN(0.0, x) -> x
54522 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54523 return N->getOperand(1);
54525 // FANDN(x, 0.0) -> 0.0
54526 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54532 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
54536 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54538 // F[X]OR(0.0, x) -> x
54539 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54540 return N->getOperand(1);
54542 // F[X]OR(x, 0.0) -> x
54543 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54544 return N->getOperand(0);
54552 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
54554 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54561 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
54564 switch (N->getOpcode()) {
54570 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54571 N->getOperand(0), N->getOperand(1));
54576 EVT VT = N->getValueType(0);
54588 SDValue Op0 = N->getOperand(0);
54589 SDValue Op1 = N->getOperand(1);
54591 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54595 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54596 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54598 // If one of the operands is known non-NaN use the native min/max instructions
54599 // with the non-NaN input as second operand.
54601 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54603 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54617 //            ----------------
54618 //     Num    |  Max  |  Op0 |
54619 // Op0        ----------------
54620 //     NaN    |  Op1  |  NaN |
54621 //            ----------------
54642 EVT VT = N->getValueType(0);
54650 SDValue In = N->getOperand(0);
54654 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54655 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54661 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54676 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
54677 EVT VT = N->getValueType(0);
54680 SDValue In = N->getOperand(IsStrict ? 1 : 0);
54684 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54693 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54694 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54698 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54710 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
54714 SDValue N0 = N->getOperand(0);
54715 SDValue N1 = N->getOperand(1);
54716 MVT VT = N->getSimpleValueType(0);
54722 // ANDNP(undef, x) -> 0
54723 // ANDNP(x, undef) -> 0
54727 // ANDNP(0, x) -> x
54731 // ANDNP(x, 0) -> 0
54735 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54745 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
54773 if (N0->hasOneUse()) {
54803 // We can't assume an undef src element gives an undef dst - the
54825 if (N->getOpcode() != ISD::DELETED_NODE)
54832 if (N1->hasOneUse()) {
54833 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
54838 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
54858 SDValue N1 = N->getOperand(1);
54864 if (N->getOpcode() != ISD::DELETED_NODE)
54874 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54875 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54877 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54881 if (N->getOpcode() != ISD::DELETED_NODE)
54888 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54893 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54894 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54897 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54914 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54916 EVT DstVT = N->getValueType(0);
54918 SDValue N0 = N->getOperand(0);
54919 SDValue N1 = N->getOperand(1);
54920 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54976 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54981 EVT VT = N->getValueType(0);
54982 SDValue N0 = N->getOperand(0);
54983 SDValue N1 = N->getOperand(1);
54984 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54988 // both SSE and AVX2 since there is no sign-extended shift right
54989 // operation on a vector with 64-bit elements.
54990 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55016 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55017 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55023 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55024 Ext->getOpcode() != ISD::ZERO_EXTEND)
55028 EVT VT = Ext->getValueType(0);
55032 SDValue Add = Ext->getOperand(0);
55038 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55039 bool NSW = Add->getFlags().hasNoSignedWrap();
55040 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55062 for (auto *User : Ext->users()) {
55063 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55072 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55073 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55077 // sign-extended.
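// Scalar illustration of the sext(add_nsw(x, C)) fold above (hypothetical
// helper): when the narrow add cannot overflow, which is what the nsw flag
// guarantees, extending after the add gives the same value as adding the
// extended operands.
#include <cstdint>
static bool SextOfNswAdd(int8_t X, int8_t C) {
  int Wide = int(X) + int(C);
  if (Wide < INT8_MIN || Wide > INT8_MAX)
    return true;  // the narrow add would wrap; not the case being folded
  return int32_t(int8_t(Wide)) == int32_t(X) + int32_t(C);
}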
55085 // operands and the result of CMOV is not used anywhere else - promote CMOV
55088 // (or more) pseudo-CMOVs only when they go one-after-another and
55092 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55093 // promotion is also good in terms of code-size.
55094 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55097 SDValue CMovN = Extend->getOperand(0);
55101 EVT TargetVT = Extend->getValueType(0);
55102 unsigned ExtendOpcode = Extend->getOpcode();
55145 SDValue N0 = N->getOperand(0);
55146 EVT VT = N->getValueType(0);
55169 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55181 if (N->getOpcode() == ISD::ZERO_EXTEND)
55190 SDValue N0 = N->getOperand(0);
55191 EVT VT = N->getValueType(0);
55194 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55197 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55198 N0->getOperand(1));
55220 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55251 return User->getOpcode() != ISD::FMA &&
55252 User->getOpcode() != ISD::STRICT_FMA;
55254 if (llvm::any_of(V->users(), IsNotFMA))
55260 for (const SDValue &Op : V->op_values()) {
55262 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55275 if (llvm::any_of(NV->users(), IsNotFMA))
55282 for (const SDValue &Op : V->op_values()) {
55284 if (Cst->isNegative())
55296 EVT VT = N->getValueType(0);
55298 bool IsStrict = N->isTargetOpcode()
55299 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55300 : N->isStrictFPOpcode();
55307 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55308 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55309 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55311 // If the operation allows fast-math and the target does not support FMA,
55313 SDNodeFlags Flags = N->getFlags();
55367 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
55369 // Propagate fast-math-flags to new FMA node.
55372 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
55374 {N->getOperand(0), A, B, C});
55376 if (N->getNumOperands() == 4)
55377 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
55382 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
55383 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
55387 EVT VT = N->getValueType(0);
55392 SDValue N2 = N->getOperand(2);
55398 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
55400 if (N->getNumOperands() == 4)
55401 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55402 NegN2, N->getOperand(3));
55403 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55411 SDValue N0 = N->getOperand(0);
55412 EVT VT = N->getValueType(0);
55414 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55416 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
55418 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
55419 N0->getOperand(1));
55439 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
55470 /// pre-promote its result type since vXi1 vectors don't get promoted
55489 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55490 const SDValue LHS = N->getOperand(0);
55491 const SDValue RHS = N->getOperand(1);
55492 EVT VT = N->getValueType(0);
55509 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55510 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
55512 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55527 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55528 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55530 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55545 // cmpeq(trunc(x),C) --> cmpeq(x,C)
55546 // cmpne(trunc(x),C) --> cmpne(x,C)
55562 // icmp eq Abs(X) C ->
55563 // (icmp eq A, C) | (icmp eq A, -C)
55564 // icmp ne Abs(X) C ->
55565 // (icmp ne A, C) & (icmp ne A, -C)
55571 const APInt &CInt = C->getAPIntValue();
55577 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
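// Scalar identity behind the |X| == C expansion above, for a non-negative C
// (hypothetical helper, illustration only; the combine handles the remaining
// cases explicitly):
#include <cstdint>
static bool AbsEqualsExpandsToTwoCompares(int32_t X, int32_t C) {
  if (C < 0)
    return true;  // not the case being illustrated
  // ISD::ABS-style absolute value: wraps for INT32_MIN instead of being UB.
  int32_t A = X < 0 ? int32_t(0u - uint32_t(X)) : X;
  return (A == C) == (X == C || X == -C);  // holds for all X
}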
55695 // -> `(icmp ult (add x, -C), 2)`
55699 // in worse codegen. So, undo the middle-end transform and go back to `(or
55722 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
55739 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
55740 (CC == ISD::SETUGE && (-CmpC) == 2)) {
55756 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55762 // X pred 0.0 --> X pred -X
55778 SDValue Src = N->getOperand(0);
55780 MVT VT = N->getSimpleValueType(0);
55800 // Look through int->fp bitcasts that don't change the element width.
55806 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55817 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55828 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55829 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55831 // Use KnownBits to determine if only a single bit is non-zero
55845 // vXi8 shifts - we only care about the signbit so can use PSLLW.
55861 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55862 if (N->isOnlyUserOf(Src.getNode())) {
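// Bit-level picture of the movmsk(not(x)) fold above (hypothetical helper,
// illustration only): complementing the vector flips every sign bit, so the
// collected mask is the complement of the original over the low lane-count
// bits.
#include <immintrin.h>
static bool MovmskOfNot(__m128 X) {
  __m128 NotX = _mm_xor_ps(X, _mm_castsi128_ps(_mm_set1_epi32(-1)));
  return _mm_movemask_ps(NotX) == (~_mm_movemask_ps(X) & 0xF);
}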
55895 MVT VT = N->getSimpleValueType(0);
55910 SDValue Mask = MemOp->getMask();
55917 if (N->getOpcode() != ISD::DELETED_NODE)
55932 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55933 Gather->getMask(), Base, Index, Scale } ;
55934 return DAG.getMaskedGather(Gather->getVTList(),
55935 Gather->getMemoryVT(), DL, Ops,
55936 Gather->getMemOperand(),
55937 Gather->getIndexType(),
55938 Gather->getExtensionType());
55941 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55942 Scatter->getMask(), Base, Index, Scale };
55943 return DAG.getMaskedScatter(Scatter->getVTList(),
55944 Scatter->getMemoryVT(), DL,
55945 Ops, Scatter->getMemOperand(),
55946 Scatter->getIndexType(),
55947 Scatter->isTruncatingStore());
55954 SDValue Index = GorS->getIndex();
55955 SDValue Base = GorS->getBasePtr();
55956 SDValue Scale = GorS->getScale();
55962 // Shrink constant indices if they are larger than 32-bits.
55970 if (BV->isConstant() && IndexWidth > 32 &&
55971 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55985 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56000 uint64_t ScaleAmt = Scale->getAsZExtVal();
56003 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
56004 // FIXME: Allow non-constant?
56007 APInt Adder = C->getAPIntValue() * ScaleAmt;
56018 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
56046 SDValue Mask = GorS->getMask();
56050 if (N->getOpcode() != ISD::DELETED_NODE)
56063 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56064 SDValue EFLAGS = N->getOperand(1);
56077 SDValue EFLAGS = N->getOperand(3);
56078 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56085 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56086 N->getOperand(1), Cond, Flags);
56095 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56099 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56106 EVT VT = N->getValueType(0);
56107 bool IsStrict = N->isStrictFPOpcode();
56109 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56116 // make the transformation for non-constant splats as well, but it's unclear
56121 if (!BV->isConstant())
56126 EVT IntVT = BV->getValueType(0);
56131 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56132 {N->getOperand(0), SDValue(BV, 0)});
56134 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56137 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
56148 /// If we are converting a value to floating-point, try to replace scalar
56155 SDValue Trunc = N->getOperand(0);
56171 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
56180 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
56185 bool IsStrict = N->isStrictFPOpcode();
56186 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56187 EVT VT = N->getValueType(0);
56193 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
56194 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
56196 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56197 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
56213 {N->getOperand(0), P});
56217 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
56218 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
56219 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
56229 {N->getOperand(0), P});
56236 SDNodeFlags Flags = N->getFlags();
56240 {N->getOperand(0), Op0});
56252 bool IsStrict = N->isStrictFPOpcode();
56257 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56258 EVT VT = N->getValueType(0);
56264 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
56265 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
56267 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56268 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
56284 {N->getOperand(0), P});
56288 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
56289 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
56290 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
56298 {N->getOperand(0), P});
56308 if (NumSignBits >= (BitWidth - 31)) {
56317 {N->getOperand(0), Trunc});
56325 { 0, 2, -1, -1 });
56328 {N->getOperand(0), Shuf});
56334 // a 32-bit target where SSE doesn't support i64->FP operations.
56348 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
56351 Subtarget.getTargetLowering()->BuildFILD(
56352 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
56353 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
56374 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
56375 EVT SrcVT = N->getOperand(0).getValueType();
56376 EVT DstVT = N->getValueType(0);
56384 N->getOperand(0), V2F32Value);
56398 for (const SDNode *User : Flags->users()) {
56400 switch (User->getOpcode()) {
56406 CC = (X86::CondCode)User->getConstantOperandVal(0);
56410 CC = (X86::CondCode)User->getConstantOperandVal(2);
56415 // clang-format off
56423 // clang-format on
56433 for (const SDNode *User : Flags->users()) {
56435 switch (User->getOpcode()) {
56449 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
56461 if (!isNullConstant(N->getOperand(1)))
56469 SDValue Op = N->getOperand(0);
56486 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
56501 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
56529 // Peek through any zero-extend if we're only testing for a zero result.
56547 // i32 truncated op to prevent partial-reg compares of promoted ops.
56607 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56611 SDValue LHS = N->getOperand(0);
56612 SDValue RHS = N->getOperand(1);
56614 bool IsSub = X86ISD::SUB == N->getOpcode();
56617 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
56622 if (!N->hasAnyUseOfValue(1)) {
56630 SDVTList VTs = DAG.getVTList(N->getValueType(0));
56635 if (GenericAddSub->hasOneUse() &&
56636 GenericAddSub->user_begin()->isOnlyUserOf(N))
56644 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56653 SDValue LHS = N->getOperand(0);
56654 SDValue RHS = N->getOperand(1);
56655 SDValue BorrowIn = N->getOperand(2);
56658 MVT VT = N->getSimpleValueType(0);
56663 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56666 !N->hasAnyUseOfValue(1))
56667 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56676 SDValue LHS = N->getOperand(0);
56677 SDValue RHS = N->getOperand(1);
56678 SDValue CarryIn = N->getOperand(2);
56684 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56690 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56695 EVT VT = N->getValueType(0);
56696 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56705 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56708 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56710 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56711 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
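// Arithmetic identity behind the ADC constant fold above (hypothetical
// helper, illustration only; note the combine also requires the node's
// carry-out to be unused):
#include <cstdint>
static bool AdcConstantsCommute(uint32_t C1, uint32_t C2, uint32_t CarryIn) {
  return C1 + C2 + CarryIn == 0u + (C1 + C2) + CarryIn;  // same low 32 bits
}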
56717 MVT VT = N->getSimpleValueType(0);
56722 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56724 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56725 !N->hasAnyUseOfValue(1))
56726 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56777 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56778 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56953 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56962 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57000 /// earlier folds that may be used to turn select-of-constants into logic hacks.
57004 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57005 // better because we eliminate 1-2 instructions. This transform is still
57008 // immediate asm operands (fit in 32-bits).
57021 SDValue Cmov = N->getOperand(0);
57022 SDValue OtherOp = N->getOperand(1);
57033 EVT VT = N->getValueType(0);
57040 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57044 all_of(N->users(), [&](SDNode *Use) {
57046 return MemNode && MemNode->getBasePtr().getNode() == N;
57048 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57059 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57069 EVT VT = N->getValueType(0);
57070 SDValue Op0 = N->getOperand(0);
57071 SDValue Op1 = N->getOperand(1);
57088 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57104 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
57124 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
57141 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
57142 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
57144 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
57145 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
57152 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
57153 // condition comes from the subtract node that produced -X. This matches the
57173 // Get the X and -X from the negate.
57189 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
57206 SDValue Op0 = N->getOperand(0);
57207 SDValue Op1 = N->getOperand(1);
57211 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
57213 EVT VT = N->getValueType(0);
57216 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
57221 APInt NewImm = Op0C->getAPIntValue() - 1;
57235 // ->
57237 if (N->getConstantOperandVal(3) != X86::COND_NE)
57240 SDValue Sub = N->getOperand(4);
57249 SmallVector<SDValue, 5> Ops(N->op_values());
57253 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
57254 cast<MemSDNode>(N)->getMemoryVT(),
57255 cast<MemSDNode>(N)->getMemOperand());
57261 EVT VT = N->getValueType(0);
57262 SDValue Op0 = N->getOperand(0);
57263 SDValue Op1 = N->getOperand(1);
57275 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
57278 Op1->hasOneUse()) {
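// Two's-complement identity behind the sub(C1, xor(X, C2)) rewrite above
// (hypothetical helper, illustration only): since -(X ^ C2) == (X ^ ~C2) + 1,
// the subtract can be re-expressed as an add of xor(X, ~C2) and C1 + 1.
#include <cstdint>
static bool SubOfXorBecomesAddOfXor(uint32_t X, uint32_t C1, uint32_t C2) {
  return C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1);  // holds for all inputs
}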
57293 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
57294 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
57296 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57297 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
57301 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
57303 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
57305 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57306 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
57322 unsigned Opcode = N->getOpcode();
57326 SDValue LHS = N->getOperand(0);
57327 SDValue RHS = N->getOperand(1);
57328 MVT VT = N->getSimpleValueType(0);
57338 // PCMPEQ(X,UNDEF) -> UNDEF
57339 // PCMPGT(X,UNDEF) -> 0
57340 // PCMPGT(UNDEF,X) -> 0
57417 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
57427 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
57436 // extract_subvector(broadcast(x))) -> broadcast(x)
57438 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
57446 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
57450 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
57469 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
57480 // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
57492 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
57543 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
57547 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
57789 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58038 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
58039 *FirstLd->getMemOperand(), &Fast) &&
58087 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
58096 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
58113 EVT VT = N->getValueType(0);
58114 EVT SrcVT = N->getOperand(0).getValueType();
58116 SmallVector<SDValue, 4> Ops(N->ops());
58125 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
58126 if (I == (E - 1)) {
58152 MVT OpVT = N->getSimpleValueType(0);
58157 SDValue Vec = N->getOperand(0);
58158 SDValue SubVec = N->getOperand(1);
58160 uint64_t IdxVal = N->getConstantOperandVal(2);
58197 Ins.getOperand(1), N->getOperand(2));
58206 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
58213 SubVec.getOperand(1), N->getOperand(2));
58275 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
58278 MemIntr->getMemoryVT(),
58279 MemIntr->getMemOperand());
58302 /// is a common pattern for AVX1 integer code because 256-bit selects may be
58303 /// legal, but there is almost no integer math/logic available for 256-bit.
58308 SDValue Sel = Ext->getOperand(0);
58315 // TODO: This can be extended to handle extraction to 256-bits.
58316 MVT VT = Ext->getSimpleValueType(0);
58324 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
58331 unsigned ExtIdx = Ext->getConstantOperandVal(1);
58360 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
58362 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
58370 if (!N->getValueType(0).isSimple())
58373 MVT VT = N->getSimpleValueType(0);
58374 SDValue InVec = N->getOperand(0);
58375 unsigned IdxVal = N->getConstantOperandVal(1);
58391 SDValue NotOp = V->getOperand(0);
58396 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
58399 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
58419 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
58421 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) -> EXTRACT_SUBVECTOR(V,C1+C2)
58438 InVec.getOperand(0), N->getOperand(1));
58439 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
58455 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
58624 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
58640 EVT VT = N->getValueType(0);
58641 SDValue Src = N->getOperand(0);
58672 if (Ld->getExtensionType() == Ext &&
58673 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
58710 // to remove XMM->GPR->XMM moves.
58720 for (SDNode *User : Src->users())
58721 if (User->getOpcode() == X86ISD::VBROADCAST &&
58722 Src == User->getOperand(0)) {
58725 User->getValueSizeInBits(0).getFixedValue();
58747 Amt->getZExtValue(), DAG);
58759 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
58778 SDValue LHS = N->getOperand(0);
58779 SDValue RHS = N->getOperand(1);
58784 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
58789 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
58803 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
58809 LHS.getOperand(0), { 0, -1, 1, -1 });
58811 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58813 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
58819 RHS.getOperand(0), { 0, -1, 1, -1 });
58821 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58830 MVT VT = N->getSimpleValueType(0);
58831 SDValue LHS = N->getOperand(0);
58832 SDValue RHS = N->getOperand(1);
58833 unsigned Opc = N->getOpcode();
58876 EVT VT = N->getValueType(0);
58877 SDValue In = N->getOperand(0);
58878 unsigned Opcode = N->getOpcode();
58887 if (Ld->isSimple()) {
58895 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
58896 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
58903 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
58908 // -> EXTEND_VECTOR_INREG(X).
58909 // TODO: Handle non-zero subvector indices.
58916 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
58943 EVT VT = N->getValueType(0);
58945 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
58949 // --> extract_subvector(kshiftr(X,C1+C2),0)
58950 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
58951 if (N->getOpcode() == X86ISD::KSHIFTR) {
58953 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
58954 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
58955 SDValue Src = N->getOperand(0).getOperand(0);
58956 uint64_t Amt = N->getConstantOperandVal(1) +
58957 N->getOperand(0).getConstantOperandVal(1);
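// Shift identity behind the kshiftr(kshiftr(X, C1), C2) fold above
// (hypothetical helper, illustration only; the combine also keeps C1 + C2
// within the mask width):
#include <cstdint>
static bool TwoRightShiftsCombine(uint16_t Mask, unsigned C1, unsigned C2) {
  return ((Mask >> C1) >> C2) == (Mask >> (C1 + C2));  // for C1 + C2 < 16
}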
58983 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
58986 if (N->getValueType(0) != MVT::f32 ||
58987 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
58992 N->getOperand(0).getOperand(0));
59003 EVT VT = N->getValueType(0);
59004 bool IsStrict = N->isStrictFPOpcode();
59005 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59068 {N->getOperand(0), Src});
59099 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
59100 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
59104 if (N->hasAnyUseOfValue(1))
59109 SDValue Ptr = MemIntrin->getBasePtr();
59110 SDValue Chain = MemIntrin->getChain();
59111 EVT VT = N->getSimpleValueType(0);
59112 EVT MemVT = MemIntrin->getMemoryVT();
59116 for (SDNode *User : Ptr->users())
59117 if (User != N && User->getOpcode() == N->getOpcode() &&
59118 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
59119 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
59120 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
59122 !User->hasAnyUseOfValue(1) &&
59123 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
59138 bool IsStrict = N->isStrictFPOpcode();
59139 EVT VT = N->getValueType(0);
59140 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59161 bool IsOp0Strict = Op0->isStrictFPOpcode();
59203 {N->getOperand(0), Src, Rnd});
59225 SDValue Src = N->getOperand(0);
59231 if (LN->isSimple()) {
59232 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
59233 LN->getBasePtr(),
59234 LN->getPointerInfo(),
59235 LN->getOriginalAlign(),
59236 LN->getMemOperand()->getFlags());
59247 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
59263 for (const SDValue &Arg : N->op_values()) {
59270 SDVTList VTs = N->getVTList();
59281 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
59284 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
59298 unsigned IntNo = N->getConstantOperandVal(0);
59301 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59312 unsigned IntNo = N->getConstantOperandVal(1);
59315 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59326 unsigned IntNo = N->getConstantOperandVal(1);
59329 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59338 switch (N->getOpcode()) {
59339 // clang-format off
59530 // clang-format on
59540 // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
59554 // TODO: Almost no 8-bit ops are desirable because they have no actual
59555 // size/speed advantages vs. 32-bit ops, but they do have a major
59558 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
59559 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
59560 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
59601 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
59603 // In case control-flow branch protection is enabled, we need to add
59621 EVT VT = LogicOp->getValueType(0);
59622 EVT OpVT = SETCC0->getOperand(0).getValueType();
59635 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
59647 // 8-bit multiply-by-constant can usually be expanded to something cheaper
59655 SDNode *User = *Op->user_begin();
59660 return Ld->getBasePtr() == St->getBasePtr();
59668 SDNode *User = *Op->user_begin();
59669 if (User->getOpcode() != ISD::ATOMIC_STORE)
59673 return Ld->getBasePtr() == St->getBasePtr();
59679 SDNode *User = *Op->user_begin();
59680 EVT VT = User->getValueType(0);
59681 return (User->getOpcode() == ISD::ZERO_EXTEND &&
59737 //===----------------------------------------------------------------------===//
59739 //===----------------------------------------------------------------------===//
59777 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
59779 const std::string &AsmStr = IA->getAsmString();
59781 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
59782 if (!Ty || Ty->getBitWidth() % 16 != 0)
59785 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
59808 // rorw $$8, ${0:w} --> llvm.bswap.i16
59809 if (CI->getType()->isIntegerTy(16) &&
59810 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59814 StringRef ConstraintsStr = IA->getConstraintString();
59822 if (CI->getType()->isIntegerTy(32) &&
59823 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59828 StringRef ConstraintsStr = IA->getConstraintString();
59835 if (CI->getType()->isIntegerTy(64)) {
59836 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
59840 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
59975 Type *Ty = CallOperandVal->getType();
59991 if (CallOperandVal->getType()->isIntegerTy())
59997 if (Ty->isFloatingPointTy())
60001 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60012 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60013 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
60014 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
60019 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60024 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60044 if (CallOperandVal->getType()->isIntegerTy())
60050 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
60054 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60055 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
60060 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60065 if (C->getZExtValue() <= 31)
60070 if (C->getZExtValue() <= 63)
60075 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
60080 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
60085 if (C->getZExtValue() <= 3)
60090 if (C->getZExtValue() <= 0xff)
60100 if ((C->getSExtValue() >= -0x80000000LL) &&
60101 (C->getSExtValue() <= 0x7fffffffLL))
60106 if (C->getZExtValue() <= 0xffffffff)
60148 // Extend to 32-bits
60166 if (C->getZExtValue() <= 31) {
60167 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60175 if (C->getZExtValue() <= 63) {
60176 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60184 if (isInt<8>(C->getSExtValue())) {
60185 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60193 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
60194 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
60195 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
60203 if (C->getZExtValue() <= 3) {
60204 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60212 if (C->getZExtValue() <= 255) {
60213 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60221 if (C->getZExtValue() <= 127) {
60222 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60229 // 32-bit signed value
60232 C->getSExtValue())) {
60234 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
60247 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
60248 BA->getValueType(0)));
60251 if (Op->getOpcode() == ISD::ADD &&
60252 isa<ConstantSDNode>(Op->getOperand(1))) {
60253 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
60254 Op = Op->getOperand(0);
60257 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
60258 GA->getValueType(0), Offset));
60263 // 32-bit unsigned value
60266 C->getZExtValue())) {
60267 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60279 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
60283 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
60284 : CST->getSExtValue();
60296 // If we are in non-pic codegen mode, we allow the address of a global (with
60302 Subtarget.classifyGlobalReference(GA->getGlobal())))
60389 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
60410 // 32-bit fallthrough
60704 // Map st(0) -> st(7) -> ST0
60714 return std::make_pair(X86::FP0 + Constraint[4] - '0',
60723 // flags -> EFLAGS
60727 // dirflag -> DF
60733 // fpsr -> FPSW
60741 // Make sure it isn't a register that requires 64-bit mode.
60744 TRI->getEncodingValue(Res.first) >= 8) {
60745 // Register requires REX prefix, but we're in 32-bit mode.
60751 TRI->getEncodingValue(Res.first) & 0x10) {
60757 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
60760 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
60785 // Model GCC's behavior here and select a fixed pair of 32-bit
60806 if (RC && RC->contains(DestReg))
60825 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
60827 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
60829 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
60862 // integer division, leaving the division as-is is a loss even in terms of
60875 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
60876 AFI->setIsSplitCSR(true);
60883 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
60888 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
60889 MachineBasicBlock::iterator MBBI = Entry->begin();
60897 Register NewVR = MRI->createVirtualRegister(RC);
60899 // FIXME: this currently does not emit CFI pseudo-instructions, it works
60900 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
60902 // CFI pseudo-instructions.
60904 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
60906 Entry->addLiveIn(*I);
60907 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
60910 // Insert the copy-back instructions right before the terminator.
60912 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
60913 TII->get(TargetOpcode::COPY), *I)
60926 assert(MBBI->isCall() && MBBI->getCFIType() &&
60932 switch (MBBI->getOpcode()) {
60939 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
60944 assert(MBBI->isCall() &&
60946 if (OrigCall->shouldUpdateAdditionalCallInfo())
60948 MBBI->setCFIType(MF, OrigCall->getCFIType());
60949 OrigCall->eraseFromParent();
60956 MachineOperand &Target = MBBI->getOperand(0);
60958 switch (MBBI->getOpcode()) {
60971 // 64-bit indirect thunk calls.
60981 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
60983 .addImm(MBBI->getCFIType())
60997 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61001 if (MF.getFunction().hasFnAttribute("probe-stack"))
61002 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
61003 "inline-asm";
61017 if (MF.getFunction().hasFnAttribute("probe-stack"))
61018 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
61023 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61037 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
61042 if (ML && ML->isInnermost() &&