1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
68 #define DEBUG_TYPE "x86-isel"
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "alignment set by x86-experimental-pref-loop-alignment."),
79 "x86-br-merging-base-cost", cl::init(2),
85 "will be merged, and above which conditionals will be split. Set to -1 "
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
96 WidenShift("x86-widen-shift", cl::init(true),
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
107 "the instruction cost threshold. Set to -1 to never merge likely "
112 "x86-br-merging-unlikely-bias", cl::init(-1),
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
124 "mul-constant-optimization", cl::init(true),
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143 // default expansion to a no-op.
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
156 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
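A minimal standalone sketch of the lowering this comment describes, using a hypothetical helper name (not part of this file): the value is converted as if signed, which is what FILD does, and 2^64 is added back when the sign bit was set.

  #include <cstdint>

  // Illustration only: uint64 -> double as a signed conversion (FILD)
  // followed by a conditional FADD of 2^64.
  static double u64_to_f64(uint64_t X) {
    double D = static_cast<double>(static_cast<int64_t>(X)); // FILD is signed
    if (static_cast<int64_t>(X) < 0)                         // sign bit was set
      D += 18446744073709551616.0;                           // compensate: +2^64
    return D;
  }
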
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
373 // Without SSE, i64->f64 goes through memory.
381 // the two-result form to trivial CSE, which is able to combine x/y and x%y
384 // Scalar integer multiply-high is also lowered to use two-result
386 // (low) operations are left as Legal, as there are single-result
387 // instructions for this in x86. Using the two-result multiply instructions
459 // Special handling for half-precision floating point conversions.
528 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
547 // 64-bit shl, sra, srl (iff 32-bit x86)
576 // All CPUs supporting AVX will atomically load/store aligned 128-bit
585 // FIXME - use subtarget debug flags
672 // Disable f32->f64 extload as we can only generate this in one instruction
675 // non-optsize case.
801 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
811 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
812 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
820 // Handle constrained floating-point operations of scalar.
859 // clang-format off
871 // clang-format on
885 // Handle constrained floating-point operations of scalar.
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
925 // clang-format off
933 // clang-format on
986 // clang-format off
1000 // clang-format on
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1067 // with -msoft-float, disable use of MMX as well.
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1280 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1289 // Add 32-bit vector stores to help vectorization opportunities.
1397 // FIXME: Do we need to handle scalar-to-vector here?
1423 // We need to scalarize v4i64->v4i32 uint_to_fp using cvtsi2ss, but we can
1545 // These types need custom splitting if their input is a 128-bit vector.
1653 // when we have a 256-bit wide blend with immediate.
1675 // (result) is 128-bit but the source is 256-bit wide.
1681 // Custom lower several nodes for 256-bit types.
1735 // available with AVX512. 512-bit vectors are in a separate block controlled
1764 // There is no byte sized k-register load or store without AVX512DQ.
1777 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1810 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1811 // elements. 512-bits can be disabled based on prefer-vector-width and
1812 // required-vector-width function attributes.
1894 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1895 // to 512-bit rather than use the AVX2 instructions so that we can use
1896 // k-masks.
1919 // Extends from v64i1 masks to 512-bit vectors.
2032 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2044 // (result) is 256-bit but the source is 512-bit wide.
2045 // 128-bit was made Legal under AVX1.
2111 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2114 // These operations are handled on non-VLX by artificially widening in
2229 // Extends from v32i1 masks to 256-bit vectors.
2239 // These operations are handled on non-VLX by artificially widening in
2241 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2514 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2523 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2532 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2537 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2557 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2561 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2562 // than generic legalization for 64-bit multiplication-with-overflow, though.
2604 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2605 // is. We should promote the value to 64-bits to solve this.
2606 // This is what the CRT headers do - `fmodf` is an inline header
2610 // clang-format off
2631 // clang-format on
2633 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2641 // We have target-specific dag combine patterns for the following nodes:
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2720 // Default loop alignment, which can be overridden by -align-loops.
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2731 // Default to having -disable-strictnode-mutation on
2735 // This has so far only been implemented for 64-bit MachO.
2776 //===----------------------------------------------------------------------===//
2778 //===----------------------------------------------------------------------===//
2790 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2793 // TODO: If this is a non-temporal load and the target has an instruction
2806 // We cannot replace a wide volatile load with a broadcast-from-memory,
2809 return !Ld->isVolatile() ||
2810 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2817 SDNode *User = *Op->user_begin();
2818 while (User->getOpcode() == ISD::BITCAST) {
2819 if (!User->hasOneUse())
2821 User = *User->user_begin();
2828 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2903 int ReturnAddrIndex = FuncInfo->getRAIndex();
2907 unsigned SlotSize = RegInfo->getSlotSize();
2909 -(int64_t)SlotSize,
2911 FuncInfo->setRAIndex(ReturnAddrIndex);
2923 // If we don't have a symbolic displacement - we don't have any extra
2929 // 64-bit offsets.
2939 // For other non-large code models we assume that the latest small object is 16MB
2968 // clang-format off
2980 // clang-format on
2984 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2992 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2993 // X > -1 -> X == 0, jump !sign.
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2998 // X < 0 -> X == 0, jump on sign.
3001 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3002 // X >= 0 -> X == 0, jump on !sign.
3005 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3006 // X < 1 -> X <= 0
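The rewrites sketched in the comments above can be restated as plain boolean identities (illustration only, hypothetical helper names); each right-hand side can be decided from the sign/zero flags of a single CMP or TEST against zero.

  // X > -1  ==  !(X < 0)   -> sign flag clear
  // X < 0                  -> sign flag set
  // X >= 0  ==  !(X < 0)   -> sign flag clear
  // X < 1   ==  (X <= 0)
  static bool gtMinusOne(int X) { return X > -1; }
  static bool ltZero(int X)     { return X < 0;  }
  static bool geZero(int X)     { return X >= 0; }
  static bool ltOne(int X)      { return X < 1;  }
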
3041 // clang-format off
3042 default: llvm_unreachable("Condcode should be pre-legalized away");
3063 // clang-format on
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3129 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3140 unsigned Size = I.getType()->getScalarSizeInBits();
3141 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3175 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3185 switch (IntrData->Type) {
3191 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3193 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3210 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3222 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3251 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3253 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3255 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3258 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3262 // can be store-folded. Therefore, it's probably not worth splitting the load.
3263 EVT VT = Load->getValueType(0);
3264 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3265 for (SDUse &Use : Load->uses()) {
3273 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3274 User->user_begin()->getOpcode() != ISD::STORE)
3277 // All non-chain uses are extract + store.
3288 assert(Ty->isIntegerTy());
3290 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3298 // a floating-point compare and we have blendv or conditional move, then it is
3299 // cheaper to select instead of doing a cross-register move and creating a
3326 // through type legalization on 32-bit targets so we would need to special
3333 // most implementations, sub-vXi32 vector multiplies are always fast,
3342 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3343 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
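As an illustration of why the check above accepts these constants: a multiplier that is one away from a power of two (or the negation of one) needs only a single shift plus one add/sub. The helper names below are hypothetical.

  #include <cstdint>

  // Illustration only: constants of the form 2^k +/- 1 and their negations.
  static uint32_t mul7(uint32_t X)  { return (X << 3) - X; }        // MulC + 1 == 8
  static uint32_t mul9(uint32_t X)  { return (X << 3) + X; }        // MulC - 1 == 8
  static uint32_t mulM7(uint32_t X) { return X - (X << 3); }        // 1 - MulC == 8
  static uint32_t mulM9(uint32_t X) { return 0 - ((X << 3) + X); }  // -(MulC + 1) == 8
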
3364 // TODO - do we have any exceptions?
3392 (!Ty->isVectorTy() &&
3393 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3469 // There are only 32-bit and 64-bit forms for 'andn'.
3473 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3516 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3538 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3558 // at least imm32 mask (or be zext i32 -> i64).
3560 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3563 // We can only benefit if the mask requires at least 7 bits. We
3570 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3572 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3583 // Non-vector type and we have a zext mask with SRL.
3606 return N->getOpcode() != ISD::FP_EXTEND;
3611 assert(((N->getOpcode() == ISD::SHL &&
3612 N->getOperand(0).getOpcode() == ISD::SRL) ||
3613 (N->getOpcode() == ISD::SRL &&
3614 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3615 "Expected shift-shift mask");
3617 EVT VT = N->getValueType(0);
3620 // Only fold if the shift values are equal - so it folds to AND.
3621 // TODO - we should fold if either is a non-uniform vector but we don't do
3622 // the fold for non-splats yet.
3623 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3635 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3671 // TODO: Allow 64-bit type for 32-bit target.
3672 // TODO: 512-bit types should be allowed, but make sure that those
3765 /// Return true if every element in Mask is an in-place blend/select mask or is
3777 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3823 /// shuffle masks. The latter have the special property of a '-2' representing
3824 /// a zero-ed lane of a vector.
3839 // a pair of values. If we find such a case, use the non-undef mask's value.
3883 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3937 // Use an UNDEF node if MaskElt == -1.
3938 // Split 64-bit constants in the 32-bit mode.
4025 // available, use a floating-point +0.0 instead.
4079 // This is the index of the first element of the vectorWidth-bit chunk
4081 IdxVal &= ~(ElemsPerChunk - 1);
4086 Vec->ops().slice(IdxVal, ElemsPerChunk));
4098 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4100 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4102 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4111 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4134 // This is the index of the first element of the vectorWidth-bit chunk
4136 IdxVal &= ~(ElemsPerChunk - 1);
4142 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4144 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4146 /// we want. It need not be aligned to a 128-bit boundary. That makes
4163 // If the upper 128-bits of a build vector are already undef/zero, then try to
4164 // widen from the lower 128-bits.
4167 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4219 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4220 Ops.append(N->op_begin(), N->op_end());
4224 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4225 SDValue Src = N->getOperand(0);
4226 SDValue Sub = N->getOperand(1);
4227 const APInt &Idx = N->getConstantOperandAPInt(2);
4313 // If this is a splat value (with no-undefs) then use the lower subvector,
4351 // Make sure we only try to split 256/512-bit types to avoid creating
4377 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4378 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4423 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4435 // AVX512 broadcasts 32/64-bit operands.
4436 // TODO: Support float once getAVX512Node is used by fp-ops.
4447 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4482 // Perform the 512-bit op then extract the bottom subvector.
4488 /// Insert an i1-subvector into an i1-vector.
4558 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4564 unsigned ShiftLeft = NumElems - SubVecNumElems;
4565 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4591 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4606 unsigned ShiftLeft = NumElems - SubVecNumElems;
4607 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4633 unsigned LowShift = NumElems - IdxVal;
4671 "Expected a 128/256/512-bit vector type");
4685 // For 256-bit vectors, we only need the lower (128-bit) input half.
4686 // For 512-bit vectors, we only need the lower input half or quarter.
4726 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4728 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4729 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
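A standalone sketch (not the DAG helper itself) of how such a mask can be built; for NumElts == 8 it yields exactly the Lo/Hi patterns shown above.

  #include <vector>

  // Duplicate each element of the chosen half, ignoring 128-bit lane limits.
  static std::vector<int> makeUnpackMask(int NumElts, bool Lo) {
    std::vector<int> Mask;
    int Base = Lo ? 0 : NumElts / 2;
    for (int I = 0; I != NumElts / 2; ++I) {
      Mask.push_back(Base + I);
      Mask.push_back(Base + I);
    }
    return Mask; // e.g. NumElts == 8, Lo: {0,0,1,1,2,2,3,3}
  }
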
4794 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4828 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4870 // TODO: Add support for non-zero offsets.
4873 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4875 return CNode->getConstVal();
4881 return getTargetConstantFromBasePtr(Load->getBasePtr());
4976 Mask = CInt->getValue();
4980 Mask = CFP->getValueAPF().bitcastToAPInt();
4984 Type *Ty = CDS->getType();
4985 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4986 Type *EltTy = CDS->getElementType();
4987 bool IsInteger = EltTy->isIntegerTy();
4989 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4992 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4993 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4995 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4997 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5014 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5019 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5029 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5040 Type *CstTy = Cst->getType();
5041 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5042 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5045 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5053 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5064 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5067 SDValue Ptr = MemIntr->getBasePtr();
5079 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5088 SDValue Ptr = MemIntr->getBasePtr();
5092 Type *CstTy = Cst->getType();
5093 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5094 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5095 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5098 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5105 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5128 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5134 // If bitcasts to larger elements we might lose track of undefs - don't
5159 // TODO - support extract_subvector through bitcasts.
5181 // TODO - support shuffle through bitcasts.
5185 ArrayRef<int> Mask = SVN->getMask();
5214 if (UndefElts1[M - NumElts])
5216 EltBits.push_back(EltBits1[M - NumElts]);
5233 int SplatIndex = -1;
5238 SplatIndex = -1;
5287 // Helper to attempt to return a cheaper, bit-inverted version of \p V.
5293 // Match not(xor X, -1) -> X.
5299 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5309 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5320 // Don't fold min_signed_value -> (min_signed_value - 1)
5324 Elt -= 1;
5335 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5347 // Match not(or(not(X),not(Y))) -> and(X, Y).
5350 // TODO: Handle cases with single NOT operand -> ANDNP
5361 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5370 unsigned Repetitions = 1u << (NumStages - 1);
5418 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5424 /// It is an error to call this with non-empty Mask/Ops vectors.
5447 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5454 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5461 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5512 "Only 32-bit and 64-bit elements are supported!");
5515 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5525 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5534 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5541 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5554 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5570 // We only decode broadcasts of same-sized vectors, peeking through to
5604 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5618 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5625 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5651 unsigned CtrlImm = CtrlOp->getZExtValue();
5713 // inputs that are actually the same node. Re-map the mask to always point
5718 M -= Mask.size();
5720 // If we didn't already add operands in the opcode-specific code, default to
5786 int Scale = Size / V->getNumOperands();
5793 APInt Val = Cst->getAPIntValue();
5798 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5809 int Scale = V->getNumOperands() / Size;
5888 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5889 // TODO: We currently only set UNDEF for integer types - floats use the same
5903 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5997 const SelectionDAG &DAG, unsigned Depth,
6007 const SelectionDAG &DAG, unsigned Depth,
6026 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6037 // Attempt to decode as a per-byte mask.
6048 // We can't assume an undef src element gives an undef dst - the other src
6073 Depth + 1, true) ||
6075 Depth + 1, true))
6105 if (!N->isOnlyUserOf(Sub.getNode()))
6132 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6134 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6158 Depth + 1, ResolveKnownElts))
6210 // Check we have an in-range constant insertion index.
6298 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6300 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6303 // PACKSS then it was likely being used for sign-extension for a
6305 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6310 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6318 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6320 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6363 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6383 Mask[i + j] = i + j - ByteShift;
6387 Mask[i + j - ByteShift] = i + j;
6413 Mask[i + j] = i + j - ByteShift;
6417 Mask[i + j - ByteShift] = i + j;
6429 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6462 // We can only handle all-signbits extensions.
6518 M -= MaskWidth;
6529 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6548 const SelectionDAG &DAG, unsigned Depth,
6550 if (Depth >= SelectionDAG::MaxRecursionDepth)
6551 return false; // Limit search depth.
6562 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6573 const SelectionDAG &DAG, unsigned Depth,
6577 KnownZero, DAG, Depth, ResolveKnownElts);
6582 const SelectionDAG &DAG, unsigned Depth = 0,
6590 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6602 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6603 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6606 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6609 SDValue Ops[] = {Mem->getChain(), Ptr};
6613 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6621 SelectionDAG &DAG, unsigned Depth) {
6622 if (Depth >= SelectionDAG::MaxRecursionDepth)
6623 return SDValue(); // Limit search depth.
6631 int Elt = SV->getMaskElt(Index);
6636 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6637 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6659 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6670 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6671 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6680 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6687 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6695 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6701 // For insert_vector_elt - either return the index matching scalar or recurse
6707 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6748 assert(0 == i && "Expected insertion into zero-index");
6771 // SSE4.1 - use PINSRB to insert each byte directly.
6778 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6779 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6888 assert(Zeroable.size() - Zeroable.count() > 1 &&
6889 "We expect at least two non-zero elements!");
6902 // Make sure that this node is extracting from a 128-bit vector.
6927 Elt = Op->getOperand(EltIdx);
6958 SDValue Current = Op->getOperand(i);
6959 SDValue SrcVector = Current->getOperand(0);
6968 assert(V1.getNode() && "Expected at least two non-zero elements!");
7005 SDValue Ptr = LD->getBasePtr();
7006 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7008 EVT PVT = LD->getValueType(0);
7012 int FI = -1;
7015 FI = FINode->getIndex();
7019 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7026 // FIXME: 256-bit vector instructions don't require a strict alignment,
7029 SDValue Chain = LD->getChain();
7050 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7057 int EltNo = (Offset - StartOffset) >> 2;
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7076 if (!BaseLd->isSimple())
7090 uint64_t Amt = AmtC->getZExtValue();
7104 uint64_t Idx = IdxC->getZExtValue();
7119 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7129 int LastLoadedElt = -1;
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7171 // Handle Special Cases - all undef or undef/zero.
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7190 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7207 EltIdx - FirstLoadedElt);
7228 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7229 assert(LDBase->isSimple() &&
7232 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7233 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7242 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7245 // LOAD - all consecutive load/undefs (must start/end with a load or be
7256 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7258 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7268 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7274 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7305 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7319 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7321 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7322 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7330 // BROADCAST - match the smallest possible repetition pattern, load that
7340 // Don't attempt a 1:N subvector broadcast - it should be caught by
7401 // are consecutive, non-overlapping, and in the right order.
7424 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7448 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7473 for (auto *U : N->users()) {
7474 unsigned Opc = U->getOpcode();
7476 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7478 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7484 if (N->hasOneUse()) {
7487 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7507 // TODO: Splats could be generated for non-AVX CPUs using SSE
7508 // instructions, but there's less potential gain for only 128-bit vectors.
7512 MVT VT = BVOp->getSimpleValueType(0);
7522 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7566 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7571 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7605 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7621 if (!Ld || NumElts - NumUndefElts != 1)
7632 // TODO: Handle broadcasts of non-constant sequences.
7634 // Make sure that all of the users of a non-constant load are from the
7636 // FIXME: Is the use count needed for non-constant, non-load case?
7637 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7655 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7670 C = CI->getConstantIntValue();
7672 C = CF->getConstantFPValue();
7678 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7689 // Handle AVX2 in-register broadcasts.
7698 // Make sure the non-chain result is only used by this build vector.
7699 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7706 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7709 LN->getMemoryVT(), LN->getMemOperand());
7714 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7720 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7723 LN->getMemoryVT(), LN->getMemOperand());
7742 int Idx = ExtIdx->getAsZExtVal();
7746 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7757 SDValue ShuffleVec = SVOp->getOperand(0);
7762 int ShuffleIdx = SVOp->getMaskElt(Idx);
7783 SmallVector<int, 8> Mask(NumElems, -1);
7803 // Quit if non-constant index.
7871 int SplatIdx = -1;
7877 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7888 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7962 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7963 /// may not match the layout of an x86 256-bit horizontal instruction.
7977 /// horizontal operations, but the index-matching logic is incorrect for that.
7979 /// code because it is only used for partial h-op matching now?
7984 EVT VT = N->getValueType(0);
7985 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7993 unsigned NumElts = LastIdx - BaseIdx;
7999 SDValue Op = N->getOperand(i + BaseIdx);
8002 if (Op->isUndef()) {
8010 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8063 /// Emit a sequence of two 128-bit horizontal add/sub followed by
8067 /// This function expects two 256-bit vectors called V0 and V1.
8068 /// At first, each vector is split into two separate 128-bit vectors.
8069 /// Then, the resulting 128-bit vectors are used to implement two
8074 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8077 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8078 /// horizontal binop dag node would take as input the lower 128-bit of V1
8079 /// and the upper 128-bit of V1.
8085 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8086 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8092 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8093 /// the upper 128-bits of the result.
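To make the Lo/Hi wiring described above concrete, here is a scalar model (illustration only, not LLVM code) of what one 128-bit horizontal add computes: it pairs adjacent elements of each input.

  #include <array>

  using V4F = std::array<float, 4>;

  // One 128-bit HADD of two v4f32 inputs: {A0+A1, A2+A3, B0+B1, B2+B3}.
  static V4F hadd128(const V4F &A, const V4F &B) {
    return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
  }
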
8114 if (!isUndefLO && !V0->isUndef())
8116 if (!isUndefHI && !V1->isUndef())
8120 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8123 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8132 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8133 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8141 MVT VT = BV->getSimpleValueType(0);
8151 // Odd-numbered elements in the input build vector are obtained from
8153 // Even-numbered elements in the input build vector are obtained from
8157 SDValue Op = BV->getOperand(i);
8260 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8267 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8301 MVT VT = BV->getSimpleValueType(0);
8314 // There are no known X86 targets with 512-bit ADDSUB instructions!
8333 MVT VT = BV->getSimpleValueType(0);
8338 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8339 // half of the result is calculated independently from the 128-bit halves of
8340 // the inputs, so that makes the index-checking logic below more complicated.
8349 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8361 // clang-format off
8367 // clang-format on
8380 // The source vector is chosen based on which 64-bit half of the
8424 // This is free (examples: zmm --> xmm, xmm --> ymm).
8425 MVT VT = BV->getSimpleValueType(0);
8440 if (BV->getOperand(i).isUndef())
8460 // We need at least 2 non-undef elements to make this worthwhile by default.
8462 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8467 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8469 MVT VT = BV->getSimpleValueType(0);
8480 // Try harder to match 256-bit ops by using extract/concat.
8490 if (BV->getOperand(i)->isUndef())
8494 if (BV->getOperand(i)->isUndef())
8530 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8584 MVT VT = Op->getSimpleValueType(0);
8590 unsigned Opcode = Op->getOperand(0).getOpcode();
8592 if (Opcode != Op->getOperand(i).getOpcode())
8608 // Don't do this if the buildvector is a splat - we'd replace one
8610 if (Op->getSplatValue())
8618 for (SDValue Elt : Op->ops()) {
8639 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8667 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8668 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8669 // vpcmpeqd on 256-bit vectors.
8702 // Zero-extend the index elements within the vector.
8739 // e.g. v4i32 -> v16i8 (Scale = 4)
8788 // SSE41 can compare v2i64 - select between indices 0 and 1.
8940 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8941 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8945 // ->
8950 // construction of vectors with constant-0 elements.
8957 // This is done by checking that the i-th build_vector operand is of the form:
8971 SDValue ExtractedIndex = Op->getOperand(1);
8988 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9032 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9039 NumConstants--;
9060 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9062 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9063 // and blend the FREEZE-UNDEF operands back in.
9064 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9067 SmallVector<int, 16> BlendMask(NumElems, -1);
9071 BlendMask[i] = -1;
9098 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099 UpperElems = NumElems - (NumElems / 4);
9102 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9104 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9121 // If we are inserting one variable into a vector of non-zero constants, try
9125 // constants. Insertion into a zero vector is handled as a special-case
9127 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9131 // Create an all-constant vector. The variable element in the old
9142 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9144 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9165 unsigned InsertC = InsIndex->getAsZExtVal();
9170 // There's no good way to insert into the high elements of a >128-bit
9173 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9182 // Special case for single non-zero, non-undef, element.
9187 // If we have a constant or non-constant insertion into the low element of
9233 // is a non-constant being inserted into an element other than the low one,
9252 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9259 // handled, so this is best done with a single constant-pool load.
9268 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9274 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9300 // For AVX-length vectors, build the individual 128-bit pieces and use
9307 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9309 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9315 // Let legalizer expand 2-wide build_vectors.
9383 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9406 // our (non-undef) elements to the full vector width with the element in the
9427 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9435 // 256-bit AVX can use the vinsertf128 instruction
9436 // to create 256-bit vectors from two other 128-bit ones.
9444 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9471 // If we have more than 2 non-zeros, build each half separately.
9474 ArrayRef<SDUse> Ops = Op->ops();
9502 // k-register.
9529 // If we are inserting a non-zero vector and there are zeros in LSBs and undef
9533 Log2_64(NonZeros) != NumOperands - 1) {
9545 // If there are zero or one non-zeros we can handle this very simply.
9559 ArrayRef<SDUse> Ops = Op->ops();
9590 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9591 // from two other 128-bit ones.
9593 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9597 //===----------------------------------------------------------------------===//
9606 //===----------------------------------------------------------------------===//
9608 /// Tiny helper function to identify a no-op mask.
9611 /// array input, which is assumed to be a single-input shuffle mask of the kind
9614 /// in-place shuffle are 'no-op's.
9617 assert(Mask[i] >= -1 && "Out of bound mask element!");
9627 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9643 /// Test whether there are elements crossing 128-bit lanes in this
9650 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9663 int SrcLane = -1;
9678 /// Test whether a shuffle mask is equivalent within each sub-lane.
9681 /// lane-relative shuffle in each sub-lane. This trivially implies
9682 /// that it is also not lane-crossing. It may however involve a blend from the
9686 /// non-trivial to compute in the face of undef lanes. The representation is
9687 /// suitable for use with existing 128-bit shuffles as entries from the second
9693 RepeatedMask.assign(LaneSize, -1);
9703 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9708 // This is the first non-undef entry in this slot of a 128-bit lane.
9717 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9730 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9737 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9760 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9765 // This is the first non-undef entry in this slot of a 128-bit lane.
9774 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9843 /// each element of the mask is either -1 (signifying undef) or the value given
9853 assert(Mask[i] >= -1 && "Out of bound mask element!");
9859 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9860 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9872 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9889 // Check for out-of-range target shuffle mask indices.
9916 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9925 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9926 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9956 // Create 128-bit vector type based on mask size.
9989 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9991 /// This helper function produces an 8-bit shuffle immediate corresponding to
9998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10004 // If the mask only uses one non-undef element, then fully 'splat' it to
10006 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
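A hedged sketch of the encoding this helper produces: two bits per lane, so lane i of the result selects source element (Imm >> (2*i)) & 3. Filling undef lanes with the first defined index is a simplification for illustration.

  #include <array>

  static unsigned getShuffleImm4(std::array<int, 4> Mask) {
    int Fill = 0;
    for (int M : Mask)
      if (M >= 0) { Fill = M; break; }
    unsigned Imm = 0;
    for (int I = 0; I != 4; ++I) {
      int M = Mask[I] >= 0 ? Mask[I] : Fill;
      Imm |= static_cast<unsigned>(M) << (2 * I);
    }
    return Imm; // e.g. {3,2,1,0} -> 0x1B
  }
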
10027 // Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10031 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10034 // If the mask only uses one non-undef element, then fully 'splat' it to
10036 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10068 // The function looks for a sub-mask in which the nonzero elements are in
10069 // increasing order. If such a sub-mask exists, the function returns true.
10073 int NextElement = -1;
10077 assert(Mask[i] >= -1 && "Out of bound mask element!");
10291 /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10292 /// followed by unpack 256-bit.
10308 // This is a "natural" unpack operation (rather than the 128-bit sectored
10309 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10336 unsigned UpperElts = NumElts - NumSrcElts;
10386 // Non-VLX targets must truncate from a 512-bit type, so we need to
10433 unsigned UpperElts = NumElts - NumSrcElts;
10482 // TODO: Support non-BWI VPMOVWB truncations?
10497 unsigned UpperElts = NumElts - NumSrcElts;
10524 // and truncate from the double-sized src.
10567 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10578 "We should only be called with masks with a power-of-2 size!");
10580 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10583 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10600 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
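A simplified standalone sketch of this stride test (the real code additionally tracks an Offset into the sources and a larger modulus): result element I must read input element (I << N) modulo the mask size, i.e. the shuffle is equivalent to repeatedly dropping every other element.

  #include <cstdint>
  #include <vector>

  static bool isPow2StrideMask(const std::vector<int> &Mask, unsigned N) {
    uint64_t ModMask = (uint64_t)Mask.size() - 1; // assumes power-of-2 size
    for (uint64_t I = 0; I != (uint64_t)Mask.size(); ++I) {
      if (Mask[I] < 0)
        continue; // undef lanes are compatible with any stride
      if ((uint64_t)Mask[I] != ((I << N) & ModMask))
        return false;
    }
    return true;
  }
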
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10709 // Don't lower multi-stage packs on AVX512, truncation is better.
10714 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10784 return SDValue(); // No non-zeroable elements!
10839 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10864 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10892 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10917 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10928 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10932 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10939 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10946 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10956 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10972 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10975 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11003 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11004 // allow that load-folding possibility.
11013 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11015 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11027 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11050 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11062 /// a single-input permutation.
11065 /// then reduce the shuffle to a single-input permutation.
11073 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11074 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11101 /// a single-input permutation.
11104 /// then reduce the shuffle to a single-input (wider) permutation.
11130 NormM -= NumElts;
11155 SmallVector<int, 32> PermuteMask(NumElts, -1);
11162 NormM -= NumElts;
11170 assert(PermuteMask[Elt] != -1 &&
11196 // This routine only supports 128-bit integer dual input vectors.
11208 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11209 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11265 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11274 // half-crossings are created.
11277 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11285 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11297 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11333 M -= NumElts;
11344 // TODO - it might be worth doing this for unary shuffles if the permute
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11427 SmallVector<int, 32> V1Mask(NumElts, -1);
11428 SmallVector<int, 32> V2Mask(NumElts, -1);
11429 SmallVector<int, 32> FinalMask(NumElts, -1);
11438 V2Mask[i] = M - NumElts;
11447 // and change \p InputMask to be a no-op (identity) mask.
11468 // It is possible that the shuffle for one of the inputs is already a no-op.
11469 // See if we can simplify non-no-op shuffles into broadcasts,
11478 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11480 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11481 // pre-shuffle first is a better strategy.
11501 // Unpack/rotate failed - try again with variable blends.
11513 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11516 V1Mask.assign(NumElts, -1);
11517 V2Mask.assign(NumElts, -1);
11518 FinalMask.assign(NumElts, -1);
11526 V2Mask[i + (j / 2)] = M - NumElts;
11540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11549 return -1;
11574 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11582 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11607 // [-1, 12, 13, 14, -1, -1, 1, -1]
11608 // [-1, -1, -1, -1, -1, -1, 1, 2]
11610 // [-1, 4, 5, 6, -1, -1, 9, -1]
11611 // [-1, 4, 5, 6, -1, -1, -1, -1]
11622 int StartIdx = i - (M % NumElts);
11625 return -1;
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11636 return -1;
11653 return -1;
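A simplified, single-input sketch of the matching logic above: every defined mask element must agree on one rotation amount R such that result[i] == input[(i + R) % NumElts]; -1 means no consistent rotation exists.

  #include <vector>

  static int matchRotation(const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    int Rotation = -1;
    for (int I = 0; I != NumElts; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue; // undef elements place no constraint
      int StartIdx = I - (M % NumElts);
      int Candidate = StartIdx == 0 ? 0
                      : StartIdx < 0 ? -StartIdx
                                     : NumElts - StartIdx;
      if (Rotation < 0)
        Rotation = Candidate;
      else if (Rotation != Candidate)
        return -1; // elements disagree on the rotation amount
    }
    return Rotation < 0 ? 0 : Rotation;
  }
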
11673 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11684 /// specified as a *right shift* because x86 is little-endian, it is a *left
11690 return -1;
11692 // PALIGNR works on 128-bit lanes.
11695 return -1;
11699 return -1;
11712 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11728 "512-bit PALIGNR requires BWI instructions");
11735 "Rotate-based lowering only supports 128-bit lowering!");
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11742 int LoByteShift = 16 - ByteRotation;
11763 /// specified as a *right shift* because x86 is little-endian, it is a *left
11771 "Only 32-bit and 64-bit elements are supported!");
11773 // 128/256-bit vectors are only supported with VLX.
11775 && "VLX required for 128/256-bit vectors");
11783 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11785 // TODO: We can probably make this more aggressive and use shift-pairs like
11797 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11800 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11806 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11821 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11822 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11847 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11848 // 01234567 --> 4567zzzz --> zzzzz456
11849 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11883 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11886 /// bit-wise element shifts and the byte shift across an entire 128-bit double
11889 /// PSHL : (little-endian) left bit shift.
11891 /// [ -1, 4, zz, -1 ]
11892 /// PSRL : (little-endian) right bit shift.
11894 /// [ -1, -1, 7, zz]
11895 /// PSLLDQ : (little-endian) left byte shift
11897 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
11898 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
11899 /// PSRLDQ : (little-endian) right byte shift
11901 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
11902 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
11913 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11923 unsigned Len = Scale - Shift;
11925 return -1;
11945 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11962 return -1;
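A simplified standalone model of the shift matching sketched above: a zero-fill left shift by Shift elements must have zeroable low lanes and a plain slide for the rest. The real matcher also handles right shifts and works per 128-bit lane.

  #include <vector>

  static bool isZeroFillLeftShift(const std::vector<int> &Mask,
                                  const std::vector<bool> &Zeroable, int Shift) {
    int N = (int)Mask.size();
    for (int I = 0; I != N; ++I) {
      if (I < Shift) {
        if (!Zeroable[I] && Mask[I] >= 0)
          return false; // low lanes must be zero/undef
      } else if (Mask[I] >= 0 && Mask[I] != I - Shift) {
        return false; // remaining lanes must be a simple slide
      }
    }
    return true;
  }
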
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12026 int Idx = -1;
12039 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12041 Idx = M - i;
12059 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12089 int Len = Hi - Idx;
12101 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12104 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12107 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12178 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12181 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12189 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12202 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12208 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12209 -1};
12216 int PSHUFDMask[4] = {Offset / 2, -1,
12217 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12230 // to 64-bits.
12278 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12280 ShMask[i - AlignToUnpack] = i;
12282 Offset -= AlignToUnpack;
12290 Offset -= (NumElements / 2);
12310 /// match this pattern. It will use all of the micro-architectural details it
12311 /// can to emit an efficient lowering. It handles both blends with all-zero
12312 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12315 /// The reason we have dedicated lowering for zext-style shuffles is that they
12326 "Exceeds 32-bit integer zero extension limit");
12329 // Define a helper function to check a particular ext-scale and lower to it if
12331 auto Lower = [&](int Scale) -> SDValue {
12356 Offset = M - (i / Scale);
12358 return SDValue(); // Flip-flopping inputs.
12360 // Offset must start in the lowest 128-bit lane or at the start of an
12373 return SDValue(); // Non-consecutive strided elements.
12377 // If we fail to find an input, we have a zero-shuffle which should always
12392 // The widest scale possible for extending is to a 64-bit integer.
12406 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12411 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12465 return V->hasOneUse() &&
12493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12503 // Bail if a non-zero V1 isn't used in place.
12506 V1Mask[V2Index] = -1;
12516 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12522 // Using zext to expand a narrow element won't work for non-zero
12528 // Zero-extend directly to i32.
12533 // and OR with the zero-extended scalar.
12554 // this. We can't support integer vectors or non-zero targets cheaply.
12555 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12602 /// Try to lower broadcast of a single - truncated - integer element,
12616 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12617 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12643 // If we're extracting non-least-significant bits, shift so we can truncate.
12660 // This routine only handles 128-bit shufps.
12662 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12677 /// Test whether the specified input (0 or 1) is in-place blended by the
12692 /// If we are extracting two 128-bit halves of a vector and shuffling the
12693 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12694 /// multi-shuffle lowering.
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12734 NewMask.append(NumElts, -1);
12736 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12739 // This is free: ymm -> xmm.
12747 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12812 BitOffset -= BeginOffset;
12822 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12848 cast<LoadSDNode>(V)->isSimple()) {
12849 // We do not check for one-use of the vector load because a broadcast load
12855 SDValue BaseAddr = Ld->getOperand(1);
12858 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12864 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12867 SDValue Ops[] = {Ld->getChain(), NewAddr};
12871 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12876 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12878 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12884 // We can only broadcast from the zero-element of a vector register,
12885 // but it can be advantageous to broadcast from the zero-element of a
12890 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12894 // If we are broadcasting an element from the lowest 128-bit subvector, try
12899 "Unexpected bit-offset");
12905 // Only broadcast the zero-element of a 128-bit subvector.
12910 "Unexpected bit-offset");
12937 // We only support broadcasting from 128-bit vectors to minimize the
12939 // 128-bits, removing as many bitcasts as possible.
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12987 // We can only insert a single non-zeroable element.
13000 // Don't bother if we have no (non-zeroable) element for insertion.
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13018 // the zero mask and the V2 insertion - so remove V1 dependency.
13060 /// Handle lowering of 2-lane 64-bit floating point shuffles.
13062 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
13098 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13121 // blend patterns if a zero-blend above didn't work.
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13145 /// Handle lowering of 2-lane 64-bit integer shuffles.
13147 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13178 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13218 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13259 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13271 NewMask[V2Index] -= 4;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13295 NewMask[2] -= 4;
13296 NewMask[3] -= 4;
13301 NewMask[0] -= 4;
13302 NewMask[1] -= 4;
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13338 /// Lower 4-lane 32-bit floating point shuffles.
13405 // There are special ways we can lower some single-element blends. However, we
13406 // have custom ways we can lower more complex single-element blends below that
13408 // when the V2 input is targeting element 0 of the mask -- that is the fast
13443 /// Lower 4-lane i32 vector shuffles.
13445 /// We try to handle these with integer-domain shuffles where we can, but for
13477 // Try to use broadcast unless the mask only has one non-undef element.
13510 // There are special ways we can lower some single-element blends.
13533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13572 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13584 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13586 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13587 /// vector, form the analogous 128-bit 8-element Mask.
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13621 int NumHToL = LoInputs.size() - NumLToL;
13622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13623 int NumHToH = HiInputs.size() - NumLToH;
13629 // If we are shuffling values from one half - check how many different DWORD
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13694 // and an existing 2-into-2 on the other half. In this case we may have to
13695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13699 // half than the one we target for fixing) will be fixed when we re-enter this
13703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
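As a standalone illustration of the remapping worked through above (the helper names here are hypothetical, not part of this file), applying a dword permutation to the shuffle input forces every word-level mask entry to be rewritten to that word's new position:

  #include <array>

  // New position of word W after the input's dwords are permuted by DMask.
  static int newWordIndex(int W, const std::array<int, 4> &DMask) {
    for (int D = 0; D != 4; ++D)
      if (DMask[D] == W / 2)          // dword DMask[D] now occupies slot D
        return 2 * D + (W & 1);
    return -1;
  }

  // Rewrite a v8i16 shuffle mask after pre-shuffling the input with PSHUFD.
  static std::array<int, 8> remapWordMask(const std::array<int, 8> &Mask,
                                          const std::array<int, 4> &DMask) {
    std::array<int, 8> Out;
    for (int I = 0; I != 8; ++I)
      Out[I] = Mask[I] < 0 ? -1 : newWordIndex(Mask[I], DMask);
    return Out;
  }

With DMask = {0, 2, 1, 3}, the mask [0, 1, 2, 7, 4, 5, 6, 3] becomes [0, 1, 4, 7, 2, 3, 6, 5] and [3, 7, 1, 0, 2, 7, 3, 5] becomes [5, 7, 1, 0, 4, 7, 5, 3], matching the two examples above.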
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13754 // to balance this to ensure we don't form a 3-1 shuffle in the other
13822 // Recurse back into this routine to re-compute state now that this isn't
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
13841 // original halves. This will then dictate the targets of the cross-half
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13877 // Now gather the cross-half inputs and place them into a free dword of
13880 // look more like the 3-1 fixing operation.
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13920 // Note that this correctly re-maps both when we do a swap and when
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13935 // And just directly shift any other-half mask elements to be same-half
13940 M = M - SourceOffset + DestOffset;
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13962 // We have two non-adjacent or clobbered inputs we need to extract from
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13992 // (because there are no off-half inputs to this half) and there is no
13994 // swap an input with a non-input.
14072 M -= 4;
14080 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14133 /// Generic lowering of 8-lane i16 shuffles.
14135 /// This handles both single-input shuffles and combined shuffle/blends with
14140 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14204 "All single-input shuffles should be canonicalized to be V1-input "
14219 // There are special ways we can lower some single-element blends.
14269 // Check if this is part of a 256-bit vector truncation.
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14315 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14342 // We can always bit-blend if we have to, so the fallback strategy is to
14343 // decompose into single-input permutes and blends/unpacks.
14348 /// Lower 8-lane 16-bit floating point shuffles.
14377 // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14378 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14406 M += (Scale - 1) * NumElts;
14429 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14430 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14478 // For single-input shuffles, there are some nicer lowering tricks we can use.
14494 // Notably, this handles splat and partial-splat shuffles more efficiently.
14495 // However, it only makes sense if the pre-duplication shuffle simplifies
14497 // express the pre-duplication shuffle as an i16 shuffle.
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14608 // blends but after all of the single-input lowerings. If the single input
14617 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14630 // do so. This avoids using them to handle blends-with-zero which is
14643 // FIXME: It might be worth trying to detect if the unpack-feeding
14650 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14661 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14671 // There are special ways we can lower some single-element blends.
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14728 // Handle multi-input cases by blending/unpacking single-input shuffles.
14733 // The fallback path for single-input shuffles widens this into two v8i16
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14782 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
14784 /// This routine breaks down the specific type of 128-bit shuffle and
14819 /// Generic routine to split vector shuffle into half-sized shuffles.
14828 "Only for 256-bit or wider vector shuffles!");
14840 // Use splitVector/extractSubVector so that split build-vectors just build two
14853 // Now create two 4-way blends of these half-width vectors.
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14885 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14886 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14887 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14891 V2BlendMask[i] = M - NumElements;
14904 // a minimal number of high-level vector shuffle nodes.
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14948 /// This is provided as a good fallback for many lowerings of non-single-input
14949 /// shuffles with more than one 128-bit lane. In those cases, we want to select
14950 /// between splitting the shuffle into 128-bit components and stitching those
14951 /// back together vs. extracting the single-input shuffles and blending those
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14985 // If the inputs all stem from a single 128-bit lane of each input, then we
15001 // requires that the decomposed single-input shuffles don't end up here.
15007 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15036 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15037 /// a lane permutation followed by a per-lane permutation.
15039 /// This is mainly for cases where we can have non-repeating permutes
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15101 // TODO - isShuffleMaskInputInPlace could be extended to something like
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15142 // Then attempt a solution with 64-bit sublanes (vpermq).
15146 // If that doesn't work and we have fast variable cross-lane shuffle,
15147 // attempt 32-bit sublanes (vpermd).
15168 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15172 /// single-input cross lane shuffle which is lower than any other fully general
15173 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15178 // FIXME: This should probably be generalized for 512-bit vectors as well.
15179 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15190 // If there are only inputs from one 128-bit lane, splitting will in fact be
15208 // TODO - we could support shuffling V2 in the Flipped input.
15216 "In-lane shuffle mask expected");
15224 // Flip the lanes, and shuffle the results which should now be in-lane.
15233 /// Handle lowering 2-lane 128-bit shuffles.
15281 // Blends are faster and handle all the non-lane-crossing cases.
15289 // Check for patterns which can be matched with a single insert of a 128-bit
15295 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15317 // Otherwise form a 128-bit permutation. After accounting for undefs,
15318 // convert the 64-bit shuffle mask selection values into 128-bit
15323 // [1:0] - select 128 bits from sources for low half of destination
15324 // [2] - ignore
15325 // [3] - zero low half of destination
15326 // [5:4] - select 128 bits from sources for high half of destination
15327 // [6] - ignore
15328 // [7] - zero high half of destination
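A minimal sketch of how such an immediate can be assembled from the bit layout listed above (a hypothetical helper, not the routine in this file):

  // Build a VPERM2X128-style immediate: LoLane/HiLane select one of the four
  // 128-bit source lanes (0-3 across the two sources); the zero flags map to
  // bits 3 and 7 as described in the comment above.
  static unsigned buildPerm2X128Imm(int LoLane, int HiLane,
                                    bool ZeroLo = false, bool ZeroHi = false) {
    unsigned Imm = 0;
    Imm |= unsigned(LoLane & 0x3);        // [1:0] source lane for low half
    Imm |= (ZeroLo ? 1u : 0u) << 3;       // [3]   zero low half
    Imm |= unsigned(HiLane & 0x3) << 4;   // [5:4] source lane for high half
    Imm |= (ZeroHi ? 1u : 0u) << 7;       // [7]   zero high half
    return Imm;
  }

For example, selecting V1's high lane for the low half and V2's low lane for the high half yields 0x21, the familiar cross-lane permute immediate.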
15347 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
15365 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15366 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15371 int Srcs[2] = {-1, -1};
15372 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15471 SmallVector<int, 16> NewMask(NumElts, -1);
15475 int M = -1;
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15492 int M = -1;
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15508 NewMask[i] = -1;
15540 HalfIdx1 = -1;
15541 HalfIdx2 = -1;
15615 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15623 "Expected 256-bit or 512-bit vector");
15673 // Always extract lowers when setting lower - these are all free subreg ops.
15679 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15698 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15711 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15713 // AVX2 has efficient 64-bit element cross-lane shuffles.
15717 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15729 /// Handle case where shuffle sources are coming from the same 128-bit lane and
15730 /// every lane can be represented as the same repeating mask - allowing us to
15749 // accounting for UNDEFs but only references the lowest 128-bit
15767 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15775 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15790 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15805 // can form a repeating shuffle mask (local to each sub-lane). At the same
15806 // time, determine the source sub-lane for each destination sub-lane.
15807 int TopSrcSubLane = -1;
15808 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15814 // Extract the sub-lane mask, check that it all comes from the same lane
15816 int SrcLane = -1;
15817 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15830 // Whole sub-lane is UNDEF.
15834 // Attempt to match against the candidate repeated sub-lane masks.
15850 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15860 // Track the topmost source sub-lane - by setting the remaining to
15868 // Bail if we failed to find a matching repeated sub-lane mask.
15876 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15889 // Shuffle each source sub-lane to its destination.
15890 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15911 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15912 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15914 // Otherwise we can only permute whole 128-bit lanes.
15951 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15992 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16045 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16047 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
16048 // pair of 256-bit shuffles and makes sure the masks are consecutive.
16083 for (SDNode *User : V1->users())
16084 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16085 User->getOperand(1) == V2)
16090 // Find out which half of the 512-bit shuffles is each smaller shuffle
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16122 /// Handle lowering of 4-lane 64-bit floating point shuffles.
16124 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16149 // Non-half-crossing single input shuffles can be lowered with an
16162 // Try to create an in-lane repeating shuffle mask and then shuffle the
16168 // Try to permute the lanes and then use a per-lane permute.
16210 // Try to create an in-lane repeating shuffle mask and then shuffle the
16216 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16242 /// Handle lowering of 4-lane 64-bit integer shuffles.
16276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16330 // Try to create an in-lane repeating shuffle mask and then shuffle the
16341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16355 /// Handle lowering of 8-lane 32-bit floating point shuffles.
16357 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16389 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16415 // Try to create an in-lane repeating shuffle mask and then shuffle the
16422 // two 128-bit lanes use the variable mask to VPERMILPS.
16437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16459 // For non-AVX512, if the Mask is of 16-bit elements in each lane then try to split
16477 /// Handle lowering of 8-lane 32-bit integer shuffles.
16506 // For non-AVX512, if the Mask is of 16-bit elements in each lane then try to split
16535 // If the shuffle mask is repeated in each 128-bit lane we can use more
16536 // efficient instructions that mirror the shuffles across the two 128-bit
16579 // Try to create an in-lane repeating shuffle mask and then shuffle the
16586 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16592 // generate a cross-lane VPERMD instruction.
16608 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16619 /// Handle lowering of 16-lane 16-bit integer shuffles.
16673 // Try to create an in-lane repeating shuffle mask and then shuffle the
16685 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16690 // There are no generalized cross-lane shuffle operations available on i16
16703 // As this is a single-input shuffle, the repeated mask should be
16715 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16725 // Try to permute the lanes and then use a per-lane permute.
16742 /// Handle lowering of 32-lane 8-bit integer shuffles.
16802 // Try to create an in-lane repeating shuffle mask and then shuffle the
16808 // There are no generalized cross-lane shuffle operations available on i8
16811 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16832 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16838 // Try to permute the lanes and then use a per-lane permute.
16863 /// High-level routine to lower various 256-bit x86 vector shuffles.
16865 /// This routine either breaks down the specific type of a 256-bit x86 vector
16866 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
16887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16889 // querying in the per-vector-type lowering routines. With AVX1 we have
16890 // essentially *zero* ability to manipulate a 256-bit vector with integer
16897 // for masking/blending then decompose into 128-bit vectors.
16935 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16939 /// Try to lower a vector shuffle as a 128-bit shuffles.
16951 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16969 // Check for patterns which can be matched with a single insert of a 256-bit
16982 // See if this is an insertion of the lower 128-bits of V2 into V1.
16984 int V2Index = -1;
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16997 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17012 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17024 int PermMask[4] = {-1, -1, -1, -1};
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17045 /// Handle lowering of 8-lane 64-bit floating point shuffles.
17060 // Non-half-crossing single input shuffles can be lowered with an
17099 /// Handle lowering of 16-lane 32-bit floating point shuffles.
17108 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17144 // Try to create an in-lane repeating shuffle mask and then shuffle the
17151 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17166 /// Handle lowering of 8-lane 64-bit integer shuffles.
17183 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17185 // 128-bit lanes.
17239 /// Handle lowering of 16-lane 32-bit integer shuffles.
17269 // If the shuffle mask is repeated in each 128-bit lane we can use more
17270 // efficient instructions that mirror the shuffles across the four 128-bit
17318 // Try to create an in-lane repeating shuffle mask and then shuffle the
17336 /// Handle lowering of 32-lane 16-bit integer shuffles.
17344 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17381 // As this is a single-input shuffle, the repeated mask should be
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17407 /// Handle lowering of 64-lane 8-bit integer shuffles.
17415 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17459 // Try to create an in-lane repeating shuffle mask and then shuffle the
17474 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17501 /// High-level routine to lower various 512-bit x86 vector shuffles.
17503 /// This routine either breaks down the specific type of a 512-bit x86 vector
17504 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17576 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17588 int ShiftAmt = -1;
17597 // The first non-undef element determines our shift amount.
17599 ShiftAmt = M - i;
17604 // All non-undef elements must shift by the same amount.
17605 if (ShiftAmt != M - i)
17619 // Returns the shift amount if possible or -1 if not. This is a simplified
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17636 unsigned Len = Size - Shift;
17647 return -1;
17652 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17653 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17668 int Src = -1;
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17718 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17720 ShiftAmt += WideElts - NumElts;
17734 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17737 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17756 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17761 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17762 // 256-bit operation available.
17766 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17767 // 256-bit operation available.
17871 // are preferable to blendw/blendvb/masked-mov.
17879 switch (V->getOpcode()) {
17898 if (!V->hasOneUse())
17916 /// Top-level lowering for x86 vector shuffles.
17926 ArrayRef<int> OrigMask = SVOp->getMask();
17948 // Check for non-undef masks pointing at an undef vector and make the masks
17956 M = -1;
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17967 // We actually see shuffles that are entirely re-arrangements of a set of
17982 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17990 // TODO: Avoid lowering directly from this top-level function: make this
17991 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18005 // Modify the new Mask to take all zeros from the all-zero vector.
18006 // Choose indices that are blend-friendly.
18009 "V2's non-undef elements are used?!");
18015 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18082 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18083 // compressed as 512-bit vectors in AVX512F.
18135 // Only non-legal VSELECTs reach this lowering, convert those into generic
18136 // shuffles and re-use the shuffle lowering path for blends.
18167 // Try to lower this to a blend-style vector shuffle. This can handle all
18173 // with patterns on the mask registers on AVX-512.
18190 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18191 // into an i1 condition so that we can use the mask-based 512-bit blend
18232 // VSELECT-matching blend, return Op, but if we need to expand, return
18278 unsigned IdxVal = Idx->getAsZExtVal();
18292 SDNode *User = *Op.getNode()->user_begin();
18293 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18294 (User->getOpcode() != ISD::BITCAST ||
18295 User->getValueType(0) != MVT::i32))
18309 /// AVX-512 feature.
18326 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18340 unsigned IdxVal = IdxC->getZExtValue();
18357 MVT VT = N->getSimpleValueType(0);
18360 for (SDNode *User : N->users()) {
18361 switch (User->getOpcode()) {
18365 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18405 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18413 // | Uops | 0 - DV | 5 | 6 | 7 | |
18414 // ---------------------------------------------
18425 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18426 // ---------------------------------------------------------
18427 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18428 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18435 unsigned IdxVal = IdxC->getZExtValue();
18437 // If this is a 256-bit vector result, first extract the 128-bit vector and
18438 // then extract the element from the 128-bit vector.
18440 // Get the 128-bit vector.
18449 IdxVal &= ElemsPerChunk - 1;
18480 // Only extract a single element from a v16i8 source - determine the common
18481 // DWORD/WORD that all extractions share, and extract the sub-byte.
18487 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18518 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18535 int Mask[2] = { 1, -1 };
18545 /// AVX-512 feature.
18566 // Copy into a k-register, extract to v1i1 and insert_subvector.
18598 // possible vector indices, and FP insertion has less gpr->simd traffic.
18618 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18623 if (N2C->getAPIntValue().uge(NumElts))
18625 uint64_t IdxVal = N2C->getZExtValue();
18631 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18656 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18659 // With a 256-bit vector, we can insert into the zero element efficiently
18664 // doing anyway after extracting to a 128-bit vector.
18675 "Vectors will always have power-of-two number of elements.");
18677 // If we are not inserting into the low 128-bit vector chunk,
18691 // Get the desired 128-bit vector chunk.
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18704 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18757 // If this is an insertion of 32-bits into the low 32-bits of
18762 // generate insertps because blendps does not have a 32-bit memory
18792 // If this is a 256-bit vector result, first insert into a 128-bit
18793 // vector and then insert into the 256-bit vector.
18795 // Insert into a 128-bit vector.
18802 // Insert the 128-bit vector.
18854 // References to absolute symbols are never PC-relative.
18855 if (GV && GV->isAbsoluteSymbolRef())
18858 // The following OpFlags under RIP-rel PIC use RIP.
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18909 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18960 GV = G->getGlobal();
18961 Offset = G->getOffset();
18964 ExternalSym = ES->getSymbol();
18984 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19017 // If there was a non-zero offset that we didn't fold, create an explicit
19046 if (TGA->hasOneUse()) {
19048 SDNode *TLSDescOp = *TGA->user_begin();
19049 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19052 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19053 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19057 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19062 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19063 GA->getOffset(), OperandFlags);
19134 MFI->incNumLocalDynamicTLSAccesses();
19154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19155 GA->getValueType(0),
19156 GA->getOffset(), OperandFlags);
19169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19178 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19196 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19199 GA->getOffset(), OperandFlags);
19226 const GlobalValue *GV = GA->getGlobal();
19267 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19268 GA->getValueType(0),
19269 GA->getOffset(), OpFlag);
19312 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19313 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19352 GA->getValueType(0),
19353 GA->getOffset(), X86II::MO_SECREL);
19380 // offset and returning `true` for TLS-desc currently duplicates both
19381 // which is detrimental :-/
19397 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19407 bool IsStrict = Op->isStrictFPOpcode();
19419 // Using 256-bit to ensure result is 128-bits for f32 case.
19440 // Try to use a packed vector operation to handle i64 on 32-bit targets.
19448 bool IsStrict = Op->isStrictFPOpcode();
19500 /// round-trip between XMM and GPR.
19512 // See if we have a 128-bit vector cast op for this type of cast.
19521 // If we are extracting from a non-zero element, first shuffle the source
19524 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19528 // If the source vector is wider than 128-bits, extract the low part. Do not
19533 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19534 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19541 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19558 // See if we have 128-bit vector cast instructions for this type of cast.
19571 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19577 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19593 bool IsStrict = Op->isStrictFPOpcode();
19594 MVT VT = Op->getSimpleValueType(0);
19595 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19604 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19618 {Op->getOperand(0), Src});
19632 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19633 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19681 bool IsStrict = Op->isStrictFPOpcode();
19683 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19720 bool IsStrict = Op->isStrictFPOpcode();
19723 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19793 // Bitcasting to f64 here allows us to do a single 64-bit store from
19795 // with two 32-bit stores.
19858 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19867 /// 64-bit unsigned integer to double expansion.
19871 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19873 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19874 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19906 // Load the 64-bit value into an XMM register.
19919 // TODO: Are there any fast-math-flags to propagate here?
19927 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
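For reference, a scalar C++ sketch of the classic two-constant expansion this routine refers to (illustrative only; assumes IEEE-754 doubles and hex float literals, and is not the DAG code itself):

  #include <cstdint>
  #include <cstring>

  static double u64ToF64Sketch(uint64_t V) {
    // Plant each 32-bit half in the mantissa of a double whose exponent
    // encodes 2^52 (low half) or 2^84 (high half).
    uint64_t LoBits = 0x4330000000000000ULL | (V & 0xffffffffu);
    uint64_t HiBits = 0x4530000000000000ULL | (V >> 32);
    double Lo, Hi;
    std::memcpy(&Lo, &LoBits, sizeof Lo);
    std::memcpy(&Hi, &HiBits, sizeof Hi);
    Lo -= 0x1.0p52;   // exact: leaves the low 32 bits as a double
    Hi -= 0x1.0p84;   // exact: leaves (V >> 32) * 2^32 as a double
    return Hi + Lo;   // one rounding, mirroring the final horizontal add
  }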
19935 /// 32-bit unsigned integer to float expansion.
19939 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19944 // Load the 32-bit value into an XMM register.
19961 if (Op.getNode()->isStrictFPOpcode()) {
19963 // TODO: Are there any fast-math-flags to propagate here?
19979 // TODO: Are there any fast-math-flags to propagate here?
19992 bool IsStrict = Op->isStrictFPOpcode();
20005 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20024 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20042 bool IsStrict = Op->isStrictFPOpcode();
20043 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20049 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20051 MVT VT = Op->getSimpleValueType(0);
20053 // v8i32->v8f64 is legal with AVX512 so just return it.
20070 {Op->getOperand(0), V});
20085 Op->getSimpleValueType(0) == MVT::v4f64) {
20118 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20125 if (VecFloatVT != Op->getSimpleValueType(0))
20129 // - The vector of constants:
20130 // -- 0x4b000000
20131 // -- 0x53000000
20132 // - A shift:
20133 // -- v >> 16
20176 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20178 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20181 // TODO: Are there any fast-math-flags to propagate here?
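A scalar sketch of the algorithm spelled out above (the 0x4b000000/0x53000000 constants and the v >> 16 split); illustrative only, assuming IEEE-754 floats:

  #include <cstdint>
  #include <cstring>

  static float u32ToF32Sketch(uint32_t V) {
    uint32_t Lo = (V & 0xffffu) | 0x4b000000u;   // low 16 bits + 2^23 exponent
    uint32_t Hi = (V >> 16)     | 0x53000000u;   // high 16 bits + 2^39 exponent
    float FLo, FHi;
    std::memcpy(&FLo, &Lo, sizeof FLo);
    std::memcpy(&FHi, &Hi, sizeof FHi);
    FHi -= 0x1.0p39f + 0x1.0p23f;                // strip both bias terms
    return FHi + FLo;                            // single rounding of the sum
  }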
20199 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20219 bool IsStrict = Op->isStrictFPOpcode();
20225 MVT DstVT = Op->getSimpleValueType(0);
20249 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20253 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20267 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20268 // infinity. It produces -0.0, so disable under strictfp.
20272 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20281 // Make a 64-bit buffer, and use it to build an FILD.
20283 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20304 // Bitcasting to f64 here allows us to do a single 64-bit store from
20306 // with two 32-bit stores.
20330 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20345 // TODO: Are there any fast-math-flags to propagate here?
20380 bool IsStrict = Op->isStrictFPOpcode();
20396 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20402 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20412 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20433 // FistSrc = (Value - FltOfs);
20434 // Fist-to-mem64 FistSrc
20435 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20497 // FIXME This causes a redundant load/store if the SSE-class value is already
20564 // v8i16 -> v8i32
20565 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20566 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20569 // v4i32 -> v4i64
20570 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20571 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20577 // Short-circuit if we can determine that each 128-bit half is the same value.
20580 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20609 MVT VT = Op->getSimpleValueType(0);
20610 SDValue In = Op->getOperand(0);
20620 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20633 // Widen to 512-bits if VLX is not supported.
20654 // Extract back to 128/256-bit if we widened.
20678 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20679 /// within each 128-bit lane.
20711 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20719 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20720 // On pre-AVX512, pack the src in both halves to help value tracking.
20749 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20757 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20758 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20764 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20775 // If 512bit -> 128bit truncate another stage.
20781 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20784 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20799 /// e.g. trunc <8 x i32> X to <8 x i16> -->
20845 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20846 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20853 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20870 // Pre-SSE41 we can only use PACKUSWB.
20873 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20878 // Truncate with PACKSS if we are truncating a vector with sign-bits
20883 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20891 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20900 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20904 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20911 /// This function lowers a vector truncation of 'extended sign-bits' or
20912 /// 'extended zero-bits' values.
20979 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20980 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20988 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21007 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21031 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21043 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21056 // We either have 8 elements or we're allowed to use 512-bit vectors.
21063 ShiftInx = InVT.getScalarSizeInBits() - 1;
21093 // truncate the remainder. We'd rather produce two 64-bit results and
21106 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21110 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21113 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21128 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21140 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21151 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21153 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21170 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21174 -1, -1, -1, -1, -1, -1, -1, -1,
21176 -1, -1, -1, -1, -1, -1, -1, -1 };
21181 static const int ShufMask2[] = {0, 2, -1, -1};
21196 llvm_unreachable("All 256->128 cases should have been handled above!");
21206 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21209 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
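A branchy scalar sketch of that Small/Big split (illustrative only; as the surrounding lines suggest, the vector lowering computes both halves and blends on the sign bit rather than branching):

  #include <cstdint>

  static uint32_t fpToU32Sketch(double X) {
    if (X < 2147483648.0)                      // "Small": fits the signed range
      return static_cast<uint32_t>(static_cast<int32_t>(X));
    // "Big": bias into the signed range, convert, then add the bias back.
    return static_cast<uint32_t>(static_cast<int32_t>(X - 2147483648.0)) +
           0x80000000u;
  }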
21223 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21232 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21238 bool IsStrict = Op->isStrictFPOpcode();
21241 MVT VT = Op->getSimpleValueType(0);
21243 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21272 // Widen to 512-bits.
21278 // TODO: Should we just do this for non-strict as well?
21341 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21365 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21372 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21382 // TODO: Should we just do this for non-strict as well?
21404 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21412 // TODO: Should we just do this for non-strict as well?
21436 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21458 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21496 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21497 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21513 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21524 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21618 EVT DstVT = N->getValueType(0);
21619 SDValue Src = N->getOperand(0);
21637 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21666 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21669 SDValue Src = Node->getOperand(0);
21676 EVT DstVT = Node->getValueType(0);
21684 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21697 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21710 // floating-point values.
21813 bool IsStrict = Op->isStrictFPOpcode();
21821 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21852 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21914 {Op->getOperand(0), Res});
21926 {Op->getOperand(0), Res});
21931 bool IsStrict = Op->isStrictFPOpcode();
22004 // FIXME: Should we use zeros for upper elements for non-strict?
22023 bool IsStrict = Op->isStrictFPOpcode();
22052 bool IsStrict = Op->isStrictFPOpcode();
22068 // FIXME: Should we use zeros for upper elements for non-strict?
22136 // clang-format off
22143 // clang-format on
22162 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22163 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22170 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22171 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22172 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22173 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22222 for (SDNode *User : Op->users())
22223 if (User->getOpcode() == ISD::FNEG)
22235 // decide if we should generate a 16-byte constant mask when we only need 4 or
22239 // generate a 16-byte vector constant and logic op even for the scalar case.
22240 // Using a 16-byte mask allows folding the load of the mask with
22266 // For the scalar case extend to a 128-bit vector, perform the logic op,
22298 // Perform all scalar logic operations as 16-byte vectors because there are no
22327 APFloat APF = Op0CN->getValueAPF();
22365 // instruction. Since the shift amount is in-range-or-undefined, we know
22376 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22390 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22449 /// Try to map a 128-bit or larger integer comparison to vector instructions
22466 // logically-combined vector-sized operands compared to zero. This pattern may
22483 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22484 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22525 auto ScalarToVector = [&](SDValue X) -> SDValue {
22553 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22586 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22587 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22589 "Non 128-bit vector on pre-SSE41 target");
22600 /// are supported when the pointer SrcMask is non-null.
22601 /// TODO - move this to SelectionDAG?
22619 if (I->getOpcode() == unsigned(BinOp)) {
22620 Opnds.push_back(I->getOperand(0));
22621 Opnds.push_back(I->getOperand(1));
22622 // Re-evaluate the number of nodes to be traversed.
22627 // Quit if we encounter a non-EXTRACT_VECTOR_ELT node.
22628 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22632 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22636 SDValue Src = I->getOperand(0);
22641 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22650 unsigned CIdx = Idx->getZExtValue();
22651 if (M->second[CIdx])
22653 M->second.setBit(CIdx);
22659 SrcMask->push_back(SrcOpMap[SrcOp]);
22682 // Quit if not convertible to legal scalar or 128/256-bit vector.
22686 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22703 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22726 // Without PTEST, a masked v2i64 or-reduction is not faster than
22733 // Split down to 128/256/512-bit vector.
22750 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22759 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22806 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22817 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22832 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22835 // Check whether we're masking/truncating an OR-reduction result, in which
22850 Mask = Cst->getAPIntValue();
22861 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22869 // Quit if not splittable to scalar/128/256/512-bit vector.
22875 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22891 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22911 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22916 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22925 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22947 for (SDUse &Use : Op->uses()) {
22950 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22952 UOpNo = User->use_begin()->getOperandNo();
22953 User = User->use_begin()->getUser();
22956 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22957 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22963 // Transform to an x86-specific ALU node with flags if there is a chance of
22967 for (SDNode *U : Op->users())
22968 if (U->getOpcode() != ISD::CopyToReg &&
22969 U->getOpcode() != ISD::SETCC &&
22970 U->getOpcode() != ISD::STORE)
22996 switch (Op->getOpcode()) {
23001 if (Op.getNode()->getFlags().hasNoSignedWrap())
23026 // non-casted variable when we check for possible users.
23042 // Otherwise use a regular EFLAGS-setting instruction.
23044 // clang-format off
23051 // clang-format on
23066 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23067 Op->getOperand(1)).getValue(1);
23079 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23108 // Don't do this if the immediate can fit in 8-bits.
23109 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23110 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23132 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
23142 // 0-x == y --> x+y == 0
23143 // 0-x != y --> x+y != 0
23151 // x == 0-y --> x+y == 0
23152 // x != 0-y --> x+y != 0
23173 if (N->getOpcode() == ISD::FDIV)
23176 EVT FPVT = N->getValueType(0);
23179 // This indicates a non-free bitcast.
23181 // integer vector anyways for the int->fp cast.
23206 /// The minimum architected relative accuracy is 2^-12. We need one
23207 /// Newton-Raphson step to have a good float result (24 bits of precision).
23216 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23217 // It is likely not profitable to do this for f64 because a double-precision
23233 // There is no FSQRT for 512-bits, but there is RSQRT14.
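The single refinement step mentioned above is the standard Newton-Raphson iteration for 1/sqrt(a); a scalar sketch (illustrative only):

  static float refineRsqrt(float A, float Est /* ~12-bit rsqrt estimate */) {
    return Est * (1.5f - 0.5f * A * Est * Est);  // roughly doubles precision
  }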
23260 /// The minimum architected relative accuracy is 2^-12. We need one
23261 /// Newton-Raphson step to have a good float result (24 bits of precision).
23268 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23269 // It is likely not profitable to do this for f64 because a double-precision
23281 // real-world code. These defaults are intended to match GCC behavior.
23288 // There is no FSQRT for 512-bits, but there is RCP14.
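Likewise for the reciprocal estimate, one Newton-Raphson step (scalar sketch, illustrative only):

  static float refineRcp(float A, float Est /* ~12-bit rcp estimate */) {
    return Est * (2.0f - A * Est);               // roughly doubles precision
  }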
23326 if (isIntDivCheap(N->getValueType(0), Attr))
23338 EVT VT = N->getValueType(0);
23344 // If the divisor is 2 or -2, the default expansion is better.
23346 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23375 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23383 uint64_t AndRHSVal = AndRHS->getZExtValue();
23421 // Check if pre-AVX condcode can be performed by a single FCMP op.
23426 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23434 // 0 - EQ
23435 // 1 - LT
23436 // 2 - LE
23437 // 3 - UNORD
23438 // 4 - NEQ
23439 // 5 - NLT
23440 // 6 - NLE
23441 // 7 - ORD
23443 // clang-format off
23465 // clang-format on
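For orientation, the 3-bit predicate space enumerated above written out as a hypothetical enum (these names are illustrative, not the backend's):

  enum SSEFPCmpImm : unsigned {
    CMP_EQ    = 0, // equal, ordered
    CMP_LT    = 1, // less-than, ordered
    CMP_LE    = 2, // less-or-equal, ordered
    CMP_UNORD = 3, // unordered (at least one operand is NaN)
    CMP_NEQ   = 4, // not-equal, unordered
    CMP_NLT   = 5, // not-less-than, unordered
    CMP_NLE   = 6, // not-less-or-equal, unordered
    CMP_ORD   = 7  // ordered (neither operand is NaN)
  };

Greater-than style predicates have no direct encoding here, so they are typically obtained by swapping the operands and using CMP_LT/CMP_LE.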
23489 /// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23522 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23549 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23550 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23554 const APInt &EltC = Elt->getAPIntValue();
23561 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23590 // Only do this pre-AVX since vpcmp* is no longer destructive.
23604 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23613 // Psubus is better than flip-sign because it requires no inversion.
23633 MVT VT = Op->getSimpleValueType(0);
23634 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23648 // Break 256-bit FP vector compare into smaller ones.
23652 // Break 512-bit FP vector compare into smaller ones.
23678 // compare like we do for non-strict, we might trigger spurious exceptions
23693 // floating-point vector result that matches the operand type. This allows
23721 SignalCmp->setFlags(Op->getFlags());
23812 // The non-AVX512 code below works under the assumption that source and
23819 // In AVX-512 architecture setcc returns mask with i1 elements,
23831 // clang-format off
23843 // clang-format on
23854 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23866 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23870 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23872 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23878 DAG.getConstant(BitWidth - 1, dl, VT));
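A scalar sanity check of this fold, using the same shift amounts as the code above (32-bit case; illustrative only, and relies on arithmetic right shift of signed values, which x86 targets provide):

  #include <cstdint>

  // For a power-of-two C = 1 << Log2C:  (X & C) == C  ?  -1 : 0
  static int32_t testBitSplat(uint32_t X, unsigned Log2C) {
    unsigned ShiftAmt = 32 - Log2C - 1;                // BitWidth - log2(C) - 1
    return static_cast<int32_t>(X << ShiftAmt) >> 31;  // SHL, then SRA(BW - 1)
  }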
23883 // Break 256-bit integer vector compare into smaller ones.
23887 // Break 512-bit integer vector compare into smaller ones.
23893 // not-of-PCMPEQ:
23894 // X != INT_MIN --> X >s INT_MIN
23895 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23896 // +X != 0 --> +X >s 0
23908 // If both operands are known non-negative, then an unsigned compare is the
23923 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23931 // X < C --> X <= (C-1) --> X == umin(X, C-1)
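As an intrinsics-level illustration of the umax rewrite above for unsigned bytes (SSE2 has no unsigned pcmpgt; assumes the caller already formed the C+1 splat):

  #include <emmintrin.h>

  // X >u C  <=>  umax(X, C + 1) == X, element-wise for unsigned bytes.
  static __m128i cmpGtEpu8(__m128i X, __m128i CPlus1) {
    return _mm_cmpeq_epi8(_mm_max_epu8(X, CPlus1), X);
  }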
23941 // clang-format off
23947 // clang-format on
23953 // If the logical-not of the result is required, perform that now.
24009 // If the i64 elements are sign-extended enough to be representable as i32
24069 // Make sure the lower and upper halves are both all-ones.
24093 // If the logical-not of the result is required, perform that now.
24175 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24209 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24220 // (seteq (add X, -1), -1). Similar for setne.
24249 MVT VT = Op->getSimpleValueType(0);
24253 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24259 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24281 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24284 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24287 // encoding size - so it must either already be a i8 or i32 immediate, or it
24292 const APInt &Op1Val = Op1C->getAPIntValue();
24352 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24425 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24426 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24451 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24477 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24480 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24484 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2)))
24537 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24538 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24539 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24540 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24541 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24542 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24543 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24550 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24563 // 'X - 1' sets the carry flag if X == 0.
24564 // '0 - X' sets the carry flag if X != 0.
24565 // Convert the carry flag to a -1/0 mask with sbb:
24566 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24567 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24568 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24569 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
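A scalar sketch of the first of these rewrites (illustrative only; the borrow that sbb consumes is modelled explicitly here):

  #include <cstdint>

  // select (X != 0), -1, Y  -->  0 - X sets CF iff X != 0; sbb builds the mask.
  static uint64_t selectAllOnesOrY(uint64_t X, uint64_t Y) {
    uint64_t Borrow = (X != 0) ? 1 : 0;  // carry flag of the 0 - X subtraction
    uint64_t Mask = 0 - Borrow;          // sbb reg, reg  ->  0 or ~0
    return Mask | Y;
  }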
24607 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24611 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24679 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24680 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24681 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24682 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24683 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24684 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24685 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24686 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
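The last two entries can be checked with a short scalar sketch (32-bit case; relies on arithmetic right shift of negative values, which x86 provides):

  #include <cstdint>

  static int32_t selNegOrZero(int32_t X) { return (X >> 31) & X; }   // x < 0 ? x : 0
  static int32_t selPosOrZero(int32_t X) { return ~(X >> 31) & X; }  // x > 0 ? x : 0

The x > 0 form also happens to be correct for x == 0, since both sides are zero there.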
24694 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24695 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24711 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24714 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24719 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24720 unsigned ShCt = VT.getSizeInBits() - 1;
24736 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
24753 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24793 // a < b ? -1 : 0 -> RES = ~setcc_carry
24794 // a < b ? 0 : -1 -> RES = setcc_carry
24795 // a >= b ? -1 : 0 -> RES = setcc_carry
24796 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24798 unsigned CondCode = CC->getAsZExtVal();
24846 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24852 MVT VT = Op->getSimpleValueType(0);
24853 SDValue In = Op->getOperand(0);
24869 // Widen to 512-bits if VLX is not supported.
24896 // Extract back to 128/256-bit if we widened.
24906 SDValue In = Op->getOperand(0);
24919 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24924 SDValue In = Op->getOperand(0);
24925 MVT VT = Op->getSimpleValueType(0);
24945 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24946 // For 512-bit vectors, we need 128-bits or 256-bits.
24949 // at least 128-bits.
24955 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24957 // need to be handled here for 256/512-bit results.
24959 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24972 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24974 assert(VT.is256BitVector() && "256-bit vector expected");
24994 // If the source elements are already all-signbits, we don't need to extend,
25006 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25023 Mask[i * Scale + (Scale - 1)] = i;
25028 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25046 MVT VT = Op->getSimpleValueType(0);
25047 SDValue In = Op->getOperand(0);
25079 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25080 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25086 SmallVector<int,8> ShufMask(NumElems, -1);
25096 /// Change a vector store into a pair of half-size vector stores.
25098 SDValue StoredVal = Store->getValue();
25101 "Expecting 256/512-bit op");
25108 if (!Store->isSimple())
25115 SDValue Ptr0 = Store->getBasePtr();
25119 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25120 Store->getOriginalAlign(),
25121 Store->getMemOperand()->getFlags());
25122 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25123 Store->getPointerInfo().getWithOffset(HalfOffset),
25124 Store->getOriginalAlign(),
25125 Store->getMemOperand()->getFlags());
25133 SDValue StoredVal = Store->getValue();
25135 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25141 if (!Store->isSimple())
25152 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25156 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25157 Store->getPointerInfo().getWithOffset(Offset),
25158 Store->getOriginalAlign(),
25159 Store->getMemOperand()->getFlags());
25169 SDValue StoredVal = St->getValue();
25176 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25191 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25192 St->getPointerInfo(), St->getOriginalAlign(),
25193 St->getMemOperand()->getFlags());
25196 if (St->isTruncatingStore())
25199 // If this is a 256-bit store of concatenated ops, we are better off splitting
25200 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25226 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25234 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25235 St->getPointerInfo(), St->getOriginalAlign(),
25236 St->getMemOperand()->getFlags());
25240 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25242 St->getMemOperand());
25264 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25269 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25270 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25271 Ld->getMemOperand()->getFlags());
25274 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25310 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25327 Overflow, Op->getFlags());
25334 EFLAGS, Op->getFlags());
25341 // have a fall-through edge, because this requires an explicit
25343 if (Op.getNode()->hasOneUse()) {
25344 SDNode *User = *Op.getNode()->user_begin();
25348 if (User->getOpcode() == ISD::BR) {
25349 SDValue FalseBB = User->getOperand(1);
25351 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25360 CCVal, Cmp, Op->getFlags());
25363 Cmp, Op->getFlags());
25373 Cmp, Op->getFlags());
25376 Cmp, Op->getFlags());
25383 Cmp, Op->getFlags());
25394 Overflow, Op->getFlags());
25414 Op->getFlags());
25437 EVT VT = Node->getValueType(0);
25467 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25487 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25490 Register SPReg = RegInfo->getStackRegister();
25497 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25515 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25522 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25528 // gp_offset (0 - 6 * 8)
25529 // fp_offset (48 - 48 + 8 * 16)
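// Illustrative only: the SysV x86-64 va_list these offsets index into can be
// modeled by the C++ struct below (field names follow the psABI; the lowering
// itself just stores the raw i32/pointer fields described above).
struct X86_64VaListModel {
  unsigned gp_offset;      // next unused GP register slot, 0..48 in 8-byte steps
  unsigned fp_offset;      // next unused XMM register slot, 48..176 in 16-byte steps
  void *overflow_arg_area; // stack-passed (overflow) arguments
  void *reg_save_area;     // register save area spilled in the prologue
};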
25537 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25545 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25551 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25559 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25569 "LowerVAARG only handles 64-bit va_arg!");
25579 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25583 EVT ArgVT = Op.getNode()->getValueType(0);
25627 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25629 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25638 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25639 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25688 ShiftAmt = ElementType.getSizeInBits() - 1;
25694 && "Unknown target vector shift-by-constant node");
25734 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25739 // Peek through any zext node if we can get back to a 128-bit source.
25750 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25756 // If the shift amount has come from a scalar, then zero-extend the scalar
25765 // then we can zero-extend it by setting all the other mask elements to
25780 // Extract if the shift amount vector is larger than 128-bits.
25786 // Zero-extend bottom element to v2i64 vector type, either by extension or
25797 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25806 // Change opcode to non-immediate version.
25809 // The return type has to be a 128-bit type with the same element
25886 if (MaskConst->getZExtValue() & 0x1)
25907 if (!Fn->hasPersonalityFn())
25910 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25912 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25918 "can only recover FP for 32-bit MSVC EH personality functions");
25925 /// RegNodeBase = EntryEBP - RegNodeSize
25926 /// ParentFP = RegNodeBase - ParentFrameOffset
25940 if (!Fn->hasPersonalityFn())
25946 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25958 // RegNodeBase = EntryEBP - RegNodeSize
25959 // ParentFP = RegNodeBase - ParentFrameOffset
25970 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25976 unsigned RC = C->getZExtValue();
25990 RC = C->getZExtValue();
26010 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26013 switch(IntrData->Type) {
26016 // First, we check if the intrinsic may have non-default rounding mode,
26017 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26018 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26037 Opc = IntrData->Opc0;
26039 Opc = IntrData->Opc1;
26049 // First, we check if the intrinsic may have non-default rounding mode,
26050 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26051 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26063 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26071 Opc = IntrData->Opc0;
26073 Opc = IntrData->Opc1;
26086 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26088 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26092 // First, we check if the intrinsic may have non-default rounding mode,
26093 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26094 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26106 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26110 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26113 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26116 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26125 // - RC Opcode is specified and
26126 // - RC is not "current direction".
26127 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26140 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26151 Opc = IntrData->Opc0;
26153 Opc = IntrData->Opc1;
26165 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26167 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26168 // (2) With rounding mode and sae - 7 operands.
26182 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26190 unsigned Opc = IntrData->Opc0;
26212 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26214 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26229 Opc = IntrData->Opc0;
26231 Opc = IntrData->Opc1;
26244 if (IntrData->Opc1 != 0) {
26248 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26254 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26263 unsigned Opc = IntrData->Opc0;
26264 if (IntrData->Opc1 != 0) {
26267 Opc = IntrData->Opc1;
26284 Opc = IntrData->Opc0;
26286 Opc = IntrData->Opc1;
26300 unsigned Opc = IntrData->Opc0;
26301 if (IntrData->Opc1 != 0) {
26304 Opc = IntrData->Opc1;
26320 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26327 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26338 if (IntrData->Type == CFMA_OP_MASKZ)
26342 // - RC Opcode is specified and
26343 // - RC is not "current direction".
26345 if (IntrData->Opc1 != 0) {
26349 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26355 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26356 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26357 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26364 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26370 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26386 // First, we check if the intrinsic may have non-default rounding mode,
26387 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26388 if (IntrData->Opc1 != 0) {
26391 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26397 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26407 if (IntrData->Opc1 != 0) {
26410 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26416 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26428 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26443 auto ComiOpCode = IntrData->Opc0;
26514 // Catch shift-by-constant.
26516 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26518 CShAmt->getZExtValue(), DAG);
26521 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26535 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26545 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26549 unsigned Opc = IntrData->Opc0;
26550 if (IntrData->Opc1 != 0) {
26553 Opc = IntrData->Opc1;
26566 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26571 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26575 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26580 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26584 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26589 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26594 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26601 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26606 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26622 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26627 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26637 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
26642 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26652 unsigned Opc = IntrData->Opc0;
26666 Opc = IntrData->Opc1;
26678 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26684 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26840 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26855 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26868 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26888 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26890 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26893 // supported on 32-bit Windows, which isn't PIC.
26902 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26915 if (RegInfo->hasBasePointer(MF))
26916 Reg = RegInfo->getBaseRegister();
26918 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26920 Reg = RegInfo->getPtrSizedStackRegister(MF);
26922 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26939 Op->getOperand(1), Op->getOperand(2));
26960 // to 8-bits which may make it no longer out of bounds.
26961 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27012 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27034 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27052 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27067 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27090 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27104 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27121 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27135 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27152 /// Returns a Glue value which can be used to add extra copy-from-reg if the
27161 SDValue Chain = N->getOperand(0);
27165 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27166 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27191 // Merge the two 32-bit values into a 64-bit one.
27199 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27214 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27215 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27216 // and the EAX register is loaded with the low-order 32 bits.
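// A scalar sketch of the EDX:EAX merge described above (not the DAG nodes
// built here; just the value the buildpair/shift+or sequence produces):
static inline unsigned long long mergeEDXEAX(unsigned EAX, unsigned EDX) {
  return ((unsigned long long)EDX << 32) | EAX; // EDX = high half, EAX = low half
}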
27252 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27270 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27303 // 64-bit targets support extended Swift async frame setup,
27305 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27321 X86FI->setHasSwiftAsyncContext(true);
27322 SDValue Chain = Op->getOperand(0);
27329 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27334 if (!X86FI->getSwiftAsyncContextFrameIdx())
27335 X86FI->setSwiftAsyncContextFrameIdx(
27339 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27342 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27343 Op->getOperand(0));
27390 SDValue Chain = Op->getOperand(0);
27409 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27410 Op->getOperand(3), Op->getOperand(4));
27412 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27433 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27462 MachineMemOperand *MMO = MemIntr->getMemOperand();
27463 EVT MemVT = MemIntr->getMemoryVT();
27469 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27500 MachineMemOperand *MMO = MemIntr->getMemOperand();
27501 EVT MemVT = MemIntr->getMemoryVT();
27510 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27523 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27535 X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27598 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27604 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27618 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27624 unsigned Imm = Op2->getAsZExtVal();
27628 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27638 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27640 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27678 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27679 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27713 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27725 switch(IntrData->Type) {
27730 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27731 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27735 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27736 DAG.getConstant(1, dl, Op->getValueType(1)),
27739 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27752 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27774 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27781 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27793 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27809 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27815 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27816 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27819 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27820 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27834 EVT MemVT = MemIntr->getMemoryVT();
27836 uint16_t TruncationOp = IntrData->Opc0;
27841 MemIntr->getMemOperand());
27848 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27856 MemIntr->getMemOperand(), DAG);
27862 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27881 unsigned Depth = Op.getConstantOperandVal(0);
27885 if (Depth > 0) {
27888 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27915 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27916 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27919 int FrameAddrIndex = FuncInfo->getFAIndex();
27922 unsigned SlotSize = RegInfo->getSlotSize();
27925 FuncInfo->setFAIndex(FrameAddrIndex);
27931 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27933 unsigned Depth = Op.getConstantOperandVal(0);
27938 while (Depth--)
27966 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27982 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28013 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28021 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28042 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28075 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28081 // Large code-model.
28082 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28085 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28086 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28130 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28131 CallingConv::ID CC = Func->getCallingConv();
28144 FunctionType *FTy = Func->getFunctionType();
28145 const AttributeList &Attrs = Func->getAttributes();
28147 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28151 for (FunctionType::param_iterator I = FTy->param_begin(),
28152 E = FTy->param_end(); I != E; ++I, ++Idx)
28160 report_fatal_error("Nest register in use - reduce number of inreg"
28186 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28218 01 Round to -inf
28223 -1 Undefined
28227 3 Round to -inf
28229 To perform the conversion, we use a packed lookup table of the four 2-bit
28231 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
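// Scalar model of the packed-table trick above: indexing 0x2d two bits at a
// time maps the hardware rounding-control field RC to the FLT_ROUNDS value
// (00 nearest -> 1, 01 -inf -> 3, 10 +inf -> 2, 11 zero -> 0). Illustrative
// only; the lowering emits the equivalent shift/and DAG nodes.
static inline unsigned rcToFltRounds(unsigned RC) {
  return (0x2d >> (2 * (RC & 0x3))) & 0x3;
}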
28280 SDValue Chain = Op.getNode()->getOperand(0);
28303 SDValue NewRM = Op.getNode()->getOperand(1);
28306 uint64_t RM = CVal->getZExtValue();
28309 // clang-format off
28316 // clang-format on
28321 // 0 Round to 0 -> 11
28322 // 1 Round to nearest -> 00
28323 // 2 Round to +inf -> 10
28324 // 3 Round to -inf -> 01
28325 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
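// One scalar way to realize the mapping tabulated above (the lowering may
// compute the field differently): pack {11,00,10,01} into 0x63, index it with
// the FLT_ROUNDS-style value RM, then place the field at bits 11:10.
static inline unsigned rmToRoundingControlField(unsigned RM) {
  return ((0x63 >> (2 * (RM & 0x3))) & 0x3) << 10;
}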
28398 SDValue Chain = Op->getOperand(0);
28399 SDValue Ptr = Op->getOperand(1);
28401 EVT MemVT = Node->getMemoryVT();
28403 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28414 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28464 SDValue Chain = Op->getOperand(0);
28465 SDValue Ptr = Op->getOperand(1);
28467 EVT MemVT = Node->getMemoryVT();
28469 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28477 SDValue Chain = Op.getNode()->getOperand(0);
28483 // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
28492 // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
28520 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28522 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28524 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
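// The rotate immediates above are simply the OR of the two complementary
// shift matrices, mirroring the usual scalar identity (an illustrative
// sketch, valid for 0 < Amt < 8):
static inline unsigned char rotl8Model(unsigned char X, unsigned Amt) {
  return (unsigned char)((X << Amt) | (X >> (8 - Amt)));
}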
28573 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28587 // Per-nibble leading zero PSHUFB lookup table.
28667 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28671 // Decompose 256-bit ops into smaller 128-bit ops.
28675 // Decompose 512-bit ops into smaller 256-bit ops.
28705 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
28714 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28720 // Finally xor with NumBits-1.
28722 DAG.getConstant(NumBits - 1, dl, OpVT));
28773 "Only handle AVX 256-bit vector integer operation");
28799 // Handle a special-case with a bit-hack instead of cmp+select:
28800 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28805 if (C && C->getAPIntValue().isSignMask()) {
28806 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28807 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28814 // usubsat X, Y --> (X >u Y) ? X - Y : 0
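// Scalar sketches of both forms above for an 8-bit lane (SMIN == 0x80),
// assuming the usual arithmetic right shift on signed values:
static inline unsigned char usubsatSignMask8(unsigned char X) {
  unsigned char Mask = (unsigned char)((signed char)X >> 7); // X s>> (BW-1)
  return (unsigned char)((X ^ 0x80) & Mask); // == (X >= 0x80) ? X - 0x80 : 0
}
static inline unsigned char usubsat8(unsigned char X, unsigned char Y) {
  return X > Y ? (unsigned char)(X - Y) : 0; // generic compare+select form
}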
28853 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28854 // 8-bit integer abs to NEG and CMOV.
28863 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28872 "Only handle AVX 256-bit vector integer operation");
28936 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
28959 // Num xNaN +0 -0
28960 // --------------- ---------------
28962 // X --------------- X ---------------
28963 // xNaN | X | X/Y | -0 | +0 | -0 |
28964 // --------------- ---------------
28975 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28977 return CstOp->getAPIntValue() == Zero;
28978 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28979 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28980 for (const SDValue &OpVal : Op->op_values()) {
28986 if (!CstOp->getValueAPF().isZero())
28988 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28999 Op->getFlags().hasNoSignedZeros() ||
29012 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29031 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29062 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29070 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29100 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29101 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29113 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29114 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
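// Scalar sketches of the two expansions above for one 8-bit element; both
// compute the same absolute difference:
static inline unsigned char abdu8Select(unsigned char L, unsigned char R) {
  return L < R ? (unsigned char)(R - L) : (unsigned char)(L - R);
}
static inline unsigned char abds8Widen(signed char L, signed char R) {
  int D = (int)L - (int)R;                // sext to a wider type; no overflow
  return (unsigned char)(D < 0 ? -D : D); // abs, then truncate back to 8 bits
}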
29139 // Decompose 256-bit ops into 128-bit ops.
29149 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29175 for (auto [Idx, Val] : enumerate(B->ops())) {
29197 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29319 // and use pmullw to calculate the full 16-bit product.
29322 // pmulhw to calculate the full 16-bit product. This trick means we don't
29389 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29394 // Decompose 256-bit ops into 128-bit ops.
29418 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29419 9, -1, 11, -1, 13, -1, 15, -1};
29474 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29501 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29504 EVT OvfVT = Op->getValueType(1);
29613 // UMULO overflows if the high bits are non-zero.
29629 if (isa<ConstantSDNode>(Op->getOperand(1))) {
29637 switch (Op->getOpcode()) {
29638 // clang-format off
29644 // clang-format on
29652 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29653 EVT ArgVT = Op->getOperand(i).getValueType();
29657 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29662 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29692 bool IsStrict = Op->isStrictFPOpcode();
29701 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29702 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29725 bool IsStrict = Op->isStrictFPOpcode();
29734 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29735 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29747 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29758 // Return true if the required (according to Opcode) shift-imm form is natively
29787 // These instructions are defined together with shift-immediate.
29794 // Return true if the required (according to Opcode) variable-shift form is
29810 // vXi16 supported only on AVX-512, BWI
29849 ShiftAmt - 32, DAG);
29885 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29887 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29905 // If we're logical shifting an all-signbits value then we can just perform as
29921 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29957 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29966 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29993 int BaseShAmtIdx = -1;
29999 // vXi8 shifts - shift as v8i16 + mask result.
30010 // Create the mask using vXi16 shifts. For shift-rights we need to move
30030 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30143 // XOP has 128-bit variable logical/arithmetic shifts.
30144 // +ve/-ve Amt = shift left/right.
30155 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30156 // shifts per-lane and then shuffle the partial results back together.
30171 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30173 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30196 unsigned AmtA = UniqueCstAmt.begin()->first;
30197 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30198 const APInt &MaskA = UniqueCstAmt.begin()->second;
30199 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30234 SmallVector<SDValue, 32> AmtWideElts(Amt->op_begin(), Amt->op_end());
30238 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30265 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30266 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30302 // have vandps but that is an FP instruction and crossing FP<->int typically
30325 // FullMask = (1 << EltSizeInBits) - 1
30344 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30345 // are `EltSizeInBits-AmtWide` bits wide.
30347 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30349 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30351 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30352 // (Masked ^ SignBitMask) - SignBitMask
30355 // Masked + SignBitMask - SignBitMask
30358 // so sign extending should be a no-op.
30361 // Masked - SignBitmask - SignBitMask
30363 // This is equal to Masked - 2*SignBitMask which will correctly sign
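// The xor/subtract sequence above is the standard sign-extension-from-N-bits
// identity. Scalar sketch, assuming Masked holds an N-bit value whose upper
// bits are already zero (two's-complement wraparound does the rest):
static inline int signExtendFromBits(unsigned Masked, unsigned N) {
  unsigned SignBitMask = 1u << (N - 1);
  return (int)((Masked ^ SignBitMask) - SignBitMask);
}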
30386 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30400 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30402 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30425 // immediate shifts, else we need to zero-extend each lane to the lower i64
30438 // just zero-extending, but for SSE just duplicating the top 16-bits is
30442 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30443 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30444 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30445 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30449 {4, 5, 6, 7, -1, -1, -1, -1});
30466 // TODO - ideally shuffle combining would handle this.
30468 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30469 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30472 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30473 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30477 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30478 // look up the pre-computed shift values.
30512 // NOTE: We honor preferred vector width before promoting to 512-bits.
30603 // On pre-SSE41 targets we test for the sign bit by comparing to
30604 // zero - a negative value will set all bits of the lanes to true
30705 // If we have a constant shift amount, the non-SSE41 path is best as
30721 // On pre-SSE41 targets we splat the sign bit - a negative value will
30768 // Decompose 256-bit shifts into 128-bit shifts.
30814 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
30815 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
30821 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30822 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30831 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30833 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30834 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30855 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30867 // Split 256-bit integers on XOP/pre-AVX2 targets.
30868 // Split 512-bit integers on non 512-bit BWI targets.
30873 // Pre-mask the amount modulo using the wider vector.
30880 int ScalarAmtIdx = -1;
30900 // If per-element shifts are legal, fallback to generic expansion.
30905 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30906 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30922 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30946 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30947 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
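// Scalar sketches of the widened funnel-shift identities above for an 8-bit
// element (bw == 8):
static inline unsigned char fshl8(unsigned char X, unsigned char Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;         // concat X (high) : Y (low)
  return (unsigned char)((Wide << (Z & 7)) >> 8); // take the high 8 bits
}
static inline unsigned char fshr8(unsigned char X, unsigned char Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;
  return (unsigned char)(Wide >> (Z & 7));        // implicit trunc to 8 bits
}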
30950 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31013 // Else, fall-back on VPROLV/VPRORV.
31017 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31046 // Split 256-bit integers on XOP/pre-AVX2 targets.
31050 // XOP has 128-bit vector variable + immediate rotates.
31051 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31055 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31064 // Use general rotate by variable (per-element).
31068 // Rotate by a uniform constant - expand back to shifts.
31073 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31074 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31082 // Split 512-bit integers on non 512-bit BWI targets.
31096 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31100 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31101 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31103 int BaseRotAmtIdx = -1;
31124 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31125 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31126 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31147 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31148 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31177 // On pre-SSE41 targets we test for the sign bit by comparing to
31178 // zero - a negative value will set all bits of the lanes to true
31235 // Fallback for non-constants AVX2 vXi16 as well.
31270 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31271 // that can then be OR'd with the lower 32-bits.
31296 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31308 Type *MemType = SI->getValueOperand()->getType();
31310 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31312 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31316 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31328 Type *MemType = LI->getType();
31330 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31332 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31333 // can use movq to do the load. If we have X87 we can load into an 80-bit
31335 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31339 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31340 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31362 if (isPowerOf2_64(C->getZExtValue()))
31364 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31369 // Check if V is some power of 2 pattern known to be non-zero
31387 if (I->getOpcode() == Instruction::Shl) {
31389 // -X` and some other provable power of 2 patterns that we can use CTZ on
31392 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31393 // be provably a non-zero power of 2.
31396 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31399 if (ShiftVal->equalsInt(1))
31405 Value *BitV = I->getOperand(1);
31409 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31424 if (AI->use_empty())
31427 if (AI->getOperation() == AtomicRMWInst::Xor) {
31428 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31430 if (match(AI->getOperand(1), m_SignMask()))
31436 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31440 Instruction *I = AI->user_back();
31441 auto BitChange = FindSingleBitChange(AI->getValOperand());
31442 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31443 I->getOpcode() != Instruction::And ||
31444 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31445 AI->getParent() != I->getParent())
31448 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31451 if (AI == I->getOperand(OtherIdx))
31456 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31457 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31458 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31461 if (AI->getOperation() == AtomicRMWInst::And) {
31462 return ~C1->getValue() == C2->getValue()
31472 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31484 if (AI->getOperation() == AtomicRMWInst::And)
31500 switch (AI->getOperation()) {
31516 Instruction *I = AI->user_back();
31517 LLVMContext &Ctx = AI->getContext();
31518 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31521 auto BitTested = FindSingleBitChange(AI->getValOperand());
31525 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31527 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31528 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31538 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31540 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31547 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31548 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31550 // If the result is only used for zero/non-zero status then we don't need to
31552 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31554 if (ICmp->isEquality()) {
31555 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31556 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31559 if ((C0 ? C0 : C1)->isZero())
31569 I->replaceAllUsesWith(Result);
31570 I->eraseFromParent();
31571 AI->eraseFromParent();
31576 if (!AI->hasOneUse())
31579 Value *Op = AI->getOperand(1);
31581 Instruction *I = AI->user_back();
31582 AtomicRMWInst::BinOp Opc = AI->getOperation();
31587 if (match(I->user_back(),
31590 if (match(I->user_back(),
31600 if (match(I->user_back(),
31603 if (match(I->user_back(),
31613 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31616 if (match(I->user_back(),
31625 if (match(I->user_back(),
31628 if (match(I->user_back(),
31643 LLVMContext &Ctx = AI->getContext();
31644 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31646 TempI = AI->user_back();
31647 assert(TempI->hasOneUse() && "Must have one use");
31648 ICI = cast<ICmpInst>(TempI->user_back());
31651 ICmpInst::Predicate Pred = ICI->getPredicate();
31669 switch (AI->getOperation()) {
31688 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31691 IID, AI->getType(),
31692 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31694 ICI->replaceAllUsesWith(Result);
31695 ICI->eraseFromParent();
31697 TempI->eraseFromParent();
31698 AI->eraseFromParent();
31704 Type *MemType = AI->getType();
31708 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31713 AtomicRMWInst::BinOp Op = AI->getOperation();
31743 // These always require a non-trivial set of data operations on x86. We must
31752 Type *MemType = AI->getType();
31756 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31762 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31763 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31764 AI->use_empty())
31769 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31770 auto SSID = AI->getSyncScopeID();
31773 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
31776 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31798 // different cache-line to prevent cache-line bouncing. In practice it
31809 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31810 Loaded->setAtomic(Order, SSID);
31811 AI->replaceAllUsesWith(Loaded);
31812 AI->eraseFromParent();
31827 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31838 // c) To minimize concerns about cross thread stack usage - in particular,
31840 // captures state in the TOS frame and accesses it from many threads -
31845 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31849 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31889 // The only fence that needs an instruction is a sequentially-consistent
31890 // cross-thread fence.
31900 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31928 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31938 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31980 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31992 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32113 // http://wm.ite.pl/articles/sse-popcount.html
32116 // index into an in-register pre-computed pop count table. We then split up the
32117 // input vector in two new ones: (1) a vector with only the shifted-right
32120 // to index the in-register table. Next, both are added and the result is a
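// Scalar model of the in-register table lookup described above: split each
// byte into nibbles, look both up in the 16-entry popcount table (the same
// table PSHUFB holds in a vector register), and add the two results.
static inline unsigned char popcnt8NibbleLUT(unsigned char V) {
  static const unsigned char LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4};
  return (unsigned char)(LUT[V & 0xF] + LUT[V >> 4]);
}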
32171 // Decompose 256-bit ops into smaller 128-bit ops.
32175 // Decompose 512-bit ops into smaller 256-bit ops.
32210 unsigned ActiveBits = Known.getBitWidth() - LZ;
32211 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32213 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32225 // i3 CTPOP - perform LUT into i32 integer.
32240 // i4 CTPOP - perform LUT into i64 integer.
32257 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
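// Scalar sketches of the first two special cases above. For i2, x - (x >> 1)
// maps {0,1,2,3} to {0,1,1,2}; for i3, all eight 2-bit counts fit into a
// single packed constant (one possible packing shown; the lowering's exact
// constant may differ):
static inline unsigned ctpop2Model(unsigned X) { // X is a 2-bit value
  return X - (X >> 1);
}
static inline unsigned ctpop3Model(unsigned X) { // X is a 3-bit value
  return (0xE994u >> (2 * X)) & 0x3; // 0xE994 packs {0,1,1,2,1,2,2,3}
}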
32301 // Decompose 256-bit ops into smaller 128-bit ops.
32306 "Only 128-bit vector bitreverse lowering supported.");
32314 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32340 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32344 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32387 // 0-15 value (moved to the other nibble).
32422 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32439 // Xor the high and low 16-bits together using a 32-bit operation.
32448 // Xor the high and low 16-bits together using a 32-bit operation.
32453 // If the input is 16-bits, we need to extend to use an i32 shift below.
32457 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32458 // This should allow an h-reg to be used to save a shift.
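// Scalar sketch of the xor-folding above: the parity of a word equals the
// parity of the xor of its halves, so fold down to one byte. The lowering
// instead lets the final 8-bit XOR set PF and reads the flag; the tail folds
// below stand in for that flag read and return popcount(V) & 1.
static inline bool parity32Model(unsigned V) {
  V ^= V >> 16; // fold 32 -> 16 bits
  V ^= V >> 8;  // fold 16 -> 8 bits; the answer now lives in the low byte
  V &= 0xFF;
  V ^= V >> 4;
  V ^= V >> 2;
  V ^= V >> 1;
  return (V & 1) != 0;
}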
32475 switch (N->getOpcode()) {
32495 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32499 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32500 /*MemVT=*/N->getSimpleValueType(0), MMO);
32503 /// Lower atomic_load_ops into LOCK-prefixed operations.
32507 SDValue Chain = N->getOperand(0);
32508 SDValue LHS = N->getOperand(1);
32509 SDValue RHS = N->getOperand(2);
32510 unsigned Opc = N->getOpcode();
32511 MVT VT = N->getSimpleValueType(0);
32517 if (N->hasAnyUseOfValue(0)) {
32518 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32525 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32539 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32545 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32546 AN->getSyncScopeID() == SyncScope::System) {
32551 assert(!N->hasAnyUseOfValue(0));
32553 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32556 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32558 assert(!N->hasAnyUseOfValue(0));
32560 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32566 assert(!N->hasAnyUseOfValue(0));
32568 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32576 EVT VT = Node->getMemoryVT();
32579 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32594 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32595 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32596 Node->getMemOperand());
32604 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32608 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32610 MVT::i64, Node->getMemOperand());
32612 // First load this into an 80-bit X87 register using a stack temporary.
32615 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32618 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
32628 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32631 StoreOps, MVT::i64, Node->getMemOperand());
32645 // Convert seq_cst store -> xchg
32646 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32647 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32648 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
32649 Node->getOperand(0), Node->getOperand(2),
32650 Node->getOperand(1), Node->getMemOperand());
32656 MVT VT = N->getSimpleValueType(0);
32680 if (N->getValueType(1) == MVT::i1)
32683 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32777 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
32778 Ops.append(WidenNumElts - InNumElts, FillVal);
32790 "MGATHER/MSCATTER are supported on AVX-512 arch only");
32793 SDValue Src = N->getValue();
32798 SDValue Scale = N->getScale();
32799 SDValue Index = N->getIndex();
32800 SDValue Mask = N->getMask();
32801 SDValue Chain = N->getChain();
32802 SDValue BasePtr = N->getBasePtr();
32814 N->getMemoryVT(), N->getMemOperand());
32826 // If we don't have VLX and neither the passthru nor index is 512-bits, we
32830 // Determine how much we need to widen by to get a 512-bit type.
32847 N->getMemoryVT(), N->getMemOperand());
32856 SDValue Mask = N->getMask();
32858 SDValue PassThru = N->getPassThru();
32868 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32869 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32870 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32871 N->isExpandingLoad());
32877 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32878 "Expanding masked load is supported on AVX-512 target only!");
32880 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32881 "Expanding masked load is supported for 32 and 64-bit types only!");
32905 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32906 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32907 N->getExtensionType(), N->isExpandingLoad());
32919 SDValue DataToStore = N->getValue();
32922 SDValue Mask = N->getMask();
32925 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32926 "Compressing masked store is supported on AVX-512 target only!");
32928 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32929 "Compressing masked store is supported for 32 and 64-bit types only!");
32952 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32953 N->getOffset(), Mask, N->getMemoryVT(),
32954 N->getMemOperand(), N->getAddressingMode(),
32955 N->isTruncatingStore(), N->isCompressingStore());
32961 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32966 SDValue Index = N->getIndex();
32967 SDValue Mask = N->getMask();
32968 SDValue PassThru = N->getPassThru();
32977 // If we don't have VLX and neither the passthru nor index is 512-bits, we
32982 // Determine how much we need to widen by to get a 512-bit type.
33001 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33002 N->getScale() };
33004 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33005 N->getMemOperand());
33017 unsigned SrcAS = N->getSrcAddressSpace();
33019 assert(SrcAS != N->getDestAddressSpace() &&
33039 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33044 if (Op->getGluedNode())
33045 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33069 // We don't support non-data prefetch without PREFETCHI.
33079 SDValue Operand = N->getOperand(0);
33107 // sub-string, e.g. "$12" contains "$1"
33109 I = AsmStr.size() - OpNoStr1.size();
33164 // ->
33185 // ->
33200 // clang-format off
33356 // clang-format on
33366 unsigned Opc = N->getOpcode();
33371 N->dump(&DAG);
33375 EVT VT = N->getValueType(0);
33387 EVT VT = N->getValueType(0);
33393 {N->getOperand(0), Lo});
33395 {N->getOperand(0), Hi});
33407 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33410 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33414 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33428 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33430 // Bit count should fit in 32-bits, extract it as that and then zero
33441 EVT VT = N->getValueType(0);
33444 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33447 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33448 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33460 EVT VT = N->getValueType(0);
33465 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33466 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33471 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33483 // UMULO overflows if the high bits are non-zero.
33486 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33499 EVT VT = N->getValueType(0);
33500 EVT InVT = N->getOperand(0).getValueType();
33515 Ops[0] = N->getOperand(0);
33517 Ops[0] = N->getOperand(1);
33531 EVT VT = N->getValueType(0);
33536 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33538 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33542 {N->getOperand(0), LHS, RHS});
33554 EVT VT = N->getValueType(0);
33560 // TODO: Can we do something for non-splat?
33562 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33565 Ops0[0] = N->getOperand(0);
33580 MVT VT = N->getSimpleValueType(0);
33588 SDValue In = N->getOperand(0);
33599 Subtarget, N->getFlags())) {
33613 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33655 -1, -1, -1, -1, -1, -1, -1, -1 });
33678 assert(N->getValueType(0) == MVT::v8i8 &&
33683 EVT VT = N->getValueType(0);
33684 SDValue In = N->getOperand(0);
33691 // Custom split this so we can extend i8/i16->i32 invec. This is better
33692 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33733 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33759 EVT VT = N->getValueType(0);
33760 SDValue Op = N->getOperand(0);
33777 bool IsStrict = N->isStrictFPOpcode();
33779 EVT VT = N->getValueType(0);
33780 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33781 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33820 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33859 {N->getOperand(0), Src});
33918 // legalization to v8i32<-v8f64.
33925 Opc = N->getOpcode();
33931 {N->getOperand(0), Src});
33942 // Custom widen strict v2f32->v2i32 by padding with zeros.
33948 {N->getOperand(0), Src});
33966 // If we use a 128-bit result we might need to use a target specific node.
33985 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34023 bool IsStrict = N->isStrictFPOpcode();
34025 EVT VT = N->getValueType(0);
34026 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34040 {N->getOperand(0), Src});
34057 {N->getOperand(0), Src});
34082 {N->getOperand(0), Elt});
34099 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34114 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34119 {N->getOperand(0), Src});
34134 {N->getOperand(0), Or, VBias});
34141 // TODO: Are there any fast-math-flags to propagate here?
34149 bool IsStrict = N->isStrictFPOpcode();
34150 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34151 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34152 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34154 EVT VT = N->getValueType(0);
34194 assert(N->getValueType(0) == MVT::v2f32 &&
34198 bool IsStrict = N->isStrictFPOpcode();
34199 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34207 {N->getOperand(0), V});
34216 unsigned IntNo = N->getConstantOperandVal(1);
34244 EVT T = N->getValueType(0);
34248 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34252 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34253 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34260 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34265 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34270 // live-range.
34273 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34275 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34282 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34299 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34308 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34316 if (N->getValueType(0) == MVT::i128) {
34318 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34319 Node->getBasePtr(), Node->getMemOperand());
34324 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34333 // Then extract the lower 64-bits.
34336 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34338 MVT::i64, Node->getMemOperand());
34347 // then casts to i64. This avoids a 128-bit stack temporary being
34348 // created by type legalization if we were to cast v4f32->v2i64.
34357 // First load this into an 80-bit X87 register. This will put the whole
34360 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34363 Node->getMemOperand());
34371 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34410 EVT DstVT = N->getValueType(0);
34411 EVT SrcVT = N->getOperand(0).getValueType();
34413 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34414 // we can split using the k-register rather than memory.
34416 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34433 N->getOperand(0));
34442 EVT VT = N->getValueType(0);
34446 SDValue Index = Gather->getIndex();
34452 SDValue Mask = Gather->getMask();
34455 Gather->getPassThru(),
34464 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34465 Gather->getBasePtr(), Index, Gather->getScale() };
34468 Gather->getMemoryVT(), Gather->getMemOperand());
34477 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34479 MVT VT = N->getSimpleValueType(0);
34488 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34489 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34490 Ld->getMemOperand()->getFlags());
34502 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34504 MVT::i64, Ld->getMemOperand());
34515 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34524 assert(N->getSimpleValueType(0) == MVT::f16 &&
34527 SDValue VecOp = N->getOperand(0);
34529 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34531 N->getOperand(1));
35018 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35034 // If lower 4G is not available, then we must use rip-relative addressing.
35065 // These are non-commutative binops.
35101 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35103 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35104 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35109 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35115 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35144 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35145 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35149 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35170 // X86 has 8, 16, and 32-bit zero-extending loads.
35230 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35254 // Very little shuffling can be done for 64-bit vectors right now.
35266 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35287 // zero-extensions.
35294 //===----------------------------------------------------------------------===//
35296 //===----------------------------------------------------------------------===//
35303 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35313 for (MachineBasicBlock *Succ : BB->successors())
35314 if (Succ->isLiveIn(X86::EFLAGS))
35325 const BasicBlock *BB = MBB->getBasicBlock();
35326 MachineFunction::iterator I = ++MBB->getIterator();
35334 // s0 = -1
35344 MachineFunction *MF = MBB->getParent();
35345 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35346 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35347 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35348 MF->insert(I, mainMBB);
35349 MF->insert(I, fallMBB);
35350 MF->insert(I, sinkMBB);
35353 mainMBB->addLiveIn(X86::EFLAGS);
35354 fallMBB->addLiveIn(X86::EFLAGS);
35355 sinkMBB->addLiveIn(X86::EFLAGS);
35359 sinkMBB->splice(sinkMBB->begin(), MBB,
35360 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35361 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35363 MachineRegisterInfo &MRI = MF->getRegInfo();
35373 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35374 thisMBB->addSuccessor(mainMBB);
35375 thisMBB->addSuccessor(fallMBB);
35378 // mainDstReg := -1
35379 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35380 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35381 mainMBB->addSuccessor(sinkMBB);
35387 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35388 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35390 fallMBB->addSuccessor(sinkMBB);
35394 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35405 // Emit va_arg instruction on X86-64.
35407 // Operands to this pseudo-instruction:
35409 // 1-5) Input : va_list address (addr, i64mem)
35413 // 9 ) EFLAGS (implicit-def)
35428 MachineFunction *MF = MBB->getParent();
35436 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35437 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35438 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35439 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35443 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35445 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35504 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35505 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35506 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35507 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35509 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35512 MF->insert(MBBIter, offsetMBB);
35513 MF->insert(MBBIter, overflowMBB);
35514 MF->insert(MBBIter, endMBB);
35517 endMBB->splice(endMBB->begin(), thisMBB,
35518 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35519 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35522 thisMBB->addSuccessor(offsetMBB);
35523 thisMBB->addSuccessor(overflowMBB);
35526 offsetMBB->addSuccessor(endMBB);
35527 overflowMBB->addSuccessor(endMBB);
35531 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35540 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35542 .addImm(MaxOffset + 8 - ArgSizeA8);
35546 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35558 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35568 // Zero-extend the offset
35570 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35576 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35581 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35588 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35593 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
35603 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
35614 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35629 // aligned_addr = (addr + (align-1)) & ~(align-1)
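// Worked instance of the identity in the comment above: for addr = 0x1003 and
// align = 16, (0x1003 + 15) & ~15 == 0x1010. A scalar sketch of what the
// ADD/AND pair below computes (Align must be a power of two):
static inline unsigned long long alignUpModel(unsigned long long Addr,
                                              unsigned long long Align) {
  return (Addr + (Align - 1)) & ~(Align - 1);
}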
35632 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35635 .addImm(Alignment.value() - 1);
35639 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35642 .addImm(~(uint64_t)(Alignment.value() - 1));
35644 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
35649 // (the overflow address should be kept 8-byte aligned)
35653 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35660 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35671 BuildMI(*endMBB, endMBB->begin(), MIMD,
35672 TII->get(X86::PHI), DestReg)
35696 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35700 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35701 // together with other CMOV pseudo-opcodes into a single basic-block with
35746 MachineFunction *MF = TrueMBB->getParent();
35747 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35750 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35753 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35765 Register DestReg = MIIt->getOperand(0).getReg();
35766 Register Op1Reg = MIIt->getOperand(1).getReg();
35767 Register Op2Reg = MIIt->getOperand(2).getReg();
35772 if (MIIt->getOperand(3).getImm() == OppCC)
35776 Op1Reg = It->second.first;
35779 Op2Reg = It->second.second;
35782 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
35827 // because this custom-inserter would have generated:
35874 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35875 MachineFunction *F = ThisMBB->getParent();
35876 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35877 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35878 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35880 MachineFunction::iterator It = ++ThisMBB->getIterator();
35881 F->insert(It, FirstInsertedMBB);
35882 F->insert(It, SecondInsertedMBB);
35883 F->insert(It, SinkMBB);
35888 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35895 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35896 SinkMBB->addLiveIn(X86::EFLAGS);
35900 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35902 ThisMBB->end());
35903 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35906 ThisMBB->addSuccessor(FirstInsertedMBB);
35908 ThisMBB->addSuccessor(SinkMBB);
35910 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35912 FirstInsertedMBB->addSuccessor(SinkMBB);
35914 SecondInsertedMBB->addSuccessor(SinkMBB);
35918 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35922 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35932 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35956 // diamond control-flow pattern. The incoming instruction knows the
35965 // fallthrough --> FalseMBB
35967 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36000 // function - EmitLoweredCascadedSelect.
36014 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36015 (NextMIIt->getOperand(3).getImm() == CC ||
36016 NextMIIt->getOperand(3).getImm() == OppCC)) {
36018 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36024 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36025 NextMIIt->getOpcode() == MI.getOpcode() &&
36026 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36027 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36028 NextMIIt->getOperand(1).isKill()) {
36032 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36033 MachineFunction *F = ThisMBB->getParent();
36034 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36035 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36037 MachineFunction::iterator It = ++ThisMBB->getIterator();
36038 F->insert(It, FalseMBB);
36039 F->insert(It, SinkMBB);
36042 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36043 FalseMBB->setCallFrameSize(CallFrameSize);
36044 SinkMBB->setCallFrameSize(CallFrameSize);
36049 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36051 FalseMBB->addLiveIn(X86::EFLAGS);
36052 SinkMBB->addLiveIn(X86::EFLAGS);
36060 SinkMBB->push_back(MI.removeFromParent());
36063 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36065 ThisMBB->end());
36066 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36069 ThisMBB->addSuccessor(FalseMBB);
36071 ThisMBB->addSuccessor(SinkMBB);
36073 FalseMBB->addSuccessor(SinkMBB);
36076 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36087 ThisMBB->erase(MIItBegin, MIItEnd);
36102 MachineFunction *MF = MBB->getParent();
36106 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36110 MachineRegisterInfo &MRI = MF->getRegInfo();
36111 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36112 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36113 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36115 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36116 MF->insert(MBBIter, testMBB);
36117 MF->insert(MBBIter, blockMBB);
36118 MF->insert(MBBIter, tailMBB);
36129 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36133 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36141 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36145 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36148 testMBB->addSuccessor(blockMBB);
36149 testMBB->addSuccessor(tailMBB);
36155 // + ---- <- ------------ <- ------------- <- ------------ +
36157 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36159 // + <- ----------- <- ------------ <- ----------- <- ------------ +
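// A hedged C-level model of the probe loop sketched above (testMBB compares,
// blockMBB touches a page and moves the stack pointer down, tailMBB finishes);
// the page size, probe store and loop condition are simplified stand-ins:
#include <cstdint>
static void probedStackAlloc(uintptr_t &SP, uintptr_t FinalSP,
                             uintptr_t PageSize) {
  while (SP - FinalSP >= PageSize) {              // testMBB: CMP + JCC
    SP -= PageSize;                               // blockMBB: SUB
    *reinterpret_cast<volatile char *>(SP) = 0;   // blockMBB: probe store
  }
  SP = FinalSP;                                   // tailMBB: final COPY
}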
36165 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36168 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36173 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36174 blockMBB->addSuccessor(testMBB);
36177 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36181 tailMBB->splice(tailMBB->end(), MBB,
36182 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36183 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36184 MBB->addSuccessor(testMBB);
36196 MachineFunction *MF = BB->getParent();
36199 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36201 assert(MF->shouldSplitStack());
36225 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36226 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36227 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36229 MachineRegisterInfo &MRI = MF->getRegInfo();
36231 getRegClassFor(getPointerTy(MF->getDataLayout()));
36241 MachineFunction::iterator MBBIter = ++BB->getIterator();
36243 MF->insert(MBBIter, bumpMBB);
36244 MF->insert(MBBIter, mallocMBB);
36245 MF->insert(MBBIter, continueMBB);
36247 continueMBB->splice(continueMBB->begin(), BB,
36248 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36249 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36253 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36254 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36256 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36259 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36263 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36265 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36267 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36271 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36273 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36275 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36281 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36283 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36289 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36291 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36292 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36299 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36302 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36304 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36307 BB->addSuccessor(bumpMBB);
36308 BB->addSuccessor(mallocMBB);
36309 mallocMBB->addSuccessor(continueMBB);
36310 bumpMBB->addSuccessor(continueMBB);
36313 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36330 MachineFunction *MF = BB->getParent();
36336 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36339 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36346 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36347 assert(BB->succ_size() == 1);
36348 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36349 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36350 BB->addSuccessor(RestoreMBB);
36355 RestoreMBB->setIsEHPad(true);
36357 auto RestoreMBBI = RestoreMBB->begin();
36366 // our load from the relocation, sticking it in either RDI (x86-64)
36369 MachineFunction *F = BB->getParent();
36377 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36381 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36382 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36385 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36392 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36397 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36404 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36409 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36410 .addReg(TII->getGlobalBaseReg(F))
36416 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36445 // aliases and are doing non-trivial configuration of the thunk's body. For
36446 // example, the Linux kernel will do boot-time hot patching of the thunk
36452 // LLVM will generate calls to specific thunks, we merely make a best-effort
36457 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36460 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36463 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36466 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36469 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36477 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36480 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36483 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36486 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36489 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36492 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36499 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36515 // Find an available scratch register to hold the callee. On 64-bit, we can
36517 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36533 // Choose the first remaining non-zero available register.
36547 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36550 MI.setDesc(TII->get(Opc));
36551 MachineInstrBuilder(*BB->getParent(), &MI)
36571 MachineFunction *MF = MBB->getParent();
36573 MachineRegisterInfo &MRI = MF->getRegInfo();
36580 MVT PVT = getPointerTy(MF->getDataLayout());
36584 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36592 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36596 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36613 MachineFunction *MF = MBB->getParent();
36616 MachineRegisterInfo &MRI = MF->getRegInfo();
36618 const BasicBlock *BB = MBB->getBasicBlock();
36619 MachineFunction::iterator I = ++MBB->getIterator();
36631 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36638 MVT PVT = getPointerTy(MF->getDataLayout());
36645 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
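// A rough picture of the buffer this SjLj lowering reads and writes (offsets
// assume a 64-bit pointer type; the field names are illustrative only):
struct SjLjBufSketch {
  void *FramePtr;   // slot 0: frame pointer
  void *ResumeAddr; // slot 1 (LabelOffset): address of restoreMBB stored here
  void *StackPtr;   // slot 2 (SPOffset): stack pointer restored on longjmp
};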
36659 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36660 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36661 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36662 MF->insert(I, mainMBB);
36663 MF->insert(I, sinkMBB);
36664 MF->push_back(restoreMBB);
36665 restoreMBB->setMachineBlockAddressTaken();
36670 sinkMBB->splice(sinkMBB->begin(), MBB,
36671 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36672 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36678 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36687 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
36695 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
36696 .addReg(XII->getGlobalBaseReg(MF))
36705 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
36718 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36723 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
36727 MIB.addRegMask(RegInfo->getNoPreservedMask());
36728 thisMBB->addSuccessor(mainMBB);
36729 thisMBB->addSuccessor(restoreMBB);
36733 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
36734 mainMBB->addSuccessor(sinkMBB);
36737 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
36744 if (RegInfo->hasBasePointer(*MF)) {
36747 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36748 X86FI->setRestoreBasePointer(MF);
36749 Register FramePtr = RegInfo->getFrameRegister(*MF);
36750 Register BasePtr = RegInfo->getBaseRegister();
36752 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
36753 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36756 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36757 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36758 restoreMBB->addSuccessor(sinkMBB);
36773 MachineFunction *MF = MBB->getParent();
36775 MachineRegisterInfo &MRI = MF->getRegInfo();
36780 MVT PVT = getPointerTy(MF->getDataLayout());
36807 MachineFunction::iterator I = ++MBB->getIterator();
36808 const BasicBlock *BB = MBB->getBasicBlock();
36810 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36811 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36812 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36813 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36814 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36815 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36816 MF->insert(I, checkSspMBB);
36817 MF->insert(I, fallMBB);
36818 MF->insert(I, fixShadowMBB);
36819 MF->insert(I, fixShadowLoopPrepareMBB);
36820 MF->insert(I, fixShadowLoopMBB);
36821 MF->insert(I, sinkMBB);
36824 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36825 MBB->end());
36826 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36828 MBB->addSuccessor(checkSspMBB);
36832 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36836 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36846 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36851 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36854 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36857 checkSspMBB->addSuccessor(sinkMBB);
36858 checkSspMBB->addSuccessor(fallMBB);
36865 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36881 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36886 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36889 fallMBB->addSuccessor(sinkMBB);
36890 fallMBB->addSuccessor(fixShadowMBB);
36896 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36902 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36906 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36911 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36914 fixShadowMBB->addSuccessor(sinkMBB);
36915 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36920 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36927 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36929 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36935 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36942 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36946 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36949 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36952 fixShadowLoopMBB->addSuccessor(sinkMBB);
36953 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36962 MachineFunction *MF = MBB->getParent();
36964 MachineRegisterInfo &MRI = MF->getRegInfo();
36969 MVT PVT = getPointerTy(MF->getDataLayout());
36979 Register SP = RegInfo->getStackRegister();
36992 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36997 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37010 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37024 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37036 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37047 MachineFunction *MF = MBB->getParent();
37048 MachineRegisterInfo *MRI = &MF->getRegInfo();
37051 MVT PVT = getPointerTy(MF->getDataLayout());
37057 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37065 VR = MRI->createVirtualRegister(TRC);
37069 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37076 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37077 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37084 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37096 MachineFunction *MF = BB->getParent();
37097 MachineRegisterInfo *MRI = &MF->getRegInfo();
37099 int FI = MF->getFrameInfo().getFunctionContextIndex();
37119 if (!MF->hasCallSiteLandingPad(Sym))
37122 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37136 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37146 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37147 DispatchBB->setIsEHPad(true);
37149 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37150 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37151 DispatchBB->addSuccessor(TrapBB);
37153 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37154 DispatchBB->addSuccessor(DispContBB);
37157 MF->push_back(DispatchBB);
37158 MF->push_back(DispContBB);
37159 MF->push_back(TrapBB);
37167 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37168 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37170 const X86RegisterInfo &RI = TII->getRegisterInfo();
37176 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37177 MFI->setRestoreBasePointer(MF);
37182 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37183 MFI->getRestoreBasePointerOffset())
37186 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37191 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37192 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37194 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37197 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37202 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37203 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37206 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37213 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37221 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37229 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37230 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37231 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37234 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37241 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37244 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37248 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37256 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37268 DispContBB->addSuccessor(LP);
37272 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37277 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37278 MBB->succ_rend());
37281 if (MBBS->isEHPad()) {
37282 MBB->removeSuccessor(MBBS);
37287 MBB->addSuccessor(DispatchBB);
37289 // Find the invoke call and mark all of the callee-saved registers as
37313 // Mark all former landing pads as non-landing pads. The dispatch is the only
37316 LP->setIsEHPad(false);
37330 MachineFunction &MF = *BB->getParent();
37337 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37343 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37351 MachineFunction *MF = BB->getParent();
37414 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37415 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37419 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37420 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37425 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37426 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37432 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37433 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37438 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37439 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37444 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37449 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37454 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37465 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37484 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37485 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37489 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37490 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37494 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37495 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37500 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37501 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37506 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37507 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37513 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37518 // clang-format off
37529 // clang-format on
37533 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37537 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37581 // - which is ESI for i686 - register allocator would not be able to
37583 // - there never would be enough unreserved registers during regalloc
37588 // If it is not i686 or there is no base pointer - nothing to do here.
37589 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37596 assert(TRI->getBaseRegister() == X86::ESI &&
37600 MachineRegisterInfo &MRI = MF->getRegInfo();
37601 MVT SPTy = getPointerTy(MF->getDataLayout());
37615 while (RMBBI != BB->rend() &&
37616 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
37617 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
37618 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
37619 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
37624 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
37632 Register BasePtr = TRI->getBaseRegister();
37633 if (TRI->hasBasePointer(*MF) &&
37635 if (!BB->isLiveIn(BasePtr))
37636 BB->addLiveIn(BasePtr);
37639 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37640 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37642 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37644 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37651 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
37654 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
37663 Register BasePtr = TRI->getBaseRegister();
37667 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37668 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37670 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37672 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
37674 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
37677 if (!BB->isLiveIn(BasePtr)) {
37678 BB->addLiveIn(BasePtr);
37681 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37683 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37685 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37688 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37689 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37692 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37693 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
37702 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37703 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37704 MFI->setHasPreallocatedCall(true);
37706 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37710 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
37717 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37720 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37721 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37725 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
37792 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37803 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37805 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37806 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
37810 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37811 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
37841 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37849 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37900 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37917 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37954 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37992 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38003 //===----------------------------------------------------------------------===//
38005 //===----------------------------------------------------------------------===//
38033 // For vectors - if we have a constant, then try to sign extend.
38062 const APInt &Mask = C->getAPIntValue();
38064 // Clear all non-demanded bits initially.
38088 // and non-demanded bits.
38102 const SelectionDAG &DAG, unsigned Depth) {
38106 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38107 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38120 unsigned Depth) {
38129 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38130 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38131 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38132 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38142 unsigned Depth) {
38152 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38153 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38154 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38155 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38162 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38172 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38174 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38175 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38191 unsigned Depth) const {
38208 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38209 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38217 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38224 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38234 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38236 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38255 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38272 ShAmt = VT.getScalarSizeInBits() - 1;
38275 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38302 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38306 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38321 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38323 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38325 Known = DAG.computeKnownBits(Src, Depth + 1);
38331 Known = DAG.computeKnownBits(Src, Depth + 1);
38339 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38340 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38347 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38348 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38357 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38358 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38370 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38376 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38378 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38397 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38407 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38412 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38413 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38421 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38425 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38437 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38438 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38447 Known = DAG.computeKnownBits(Op0, Depth + 1);
38456 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38457 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38460 // The result will have at least as many trailing zeros as the non-mask
38466 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38494 // Truncations/Conversions - upper elements are known zero.
38509 // Strict Conversions - upper elements are known zero.
38549 Op, DemandedElts, Depth, DAG,
38558 switch (Op->getConstantOperandVal(0)) {
38568 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38580 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38592 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38601 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38631 // TODO - handle target shuffle ops with different value types.
38642 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38652 unsigned Depth) const {
38667 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38668 if (Tmp > (NumSrcBits - VTBits))
38669 return Tmp - (NumSrcBits - VTBits);
38681 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
38690 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
38691 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
38694 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
38704 if (Tmp > (SrcBits - VTBits))
38705 return Tmp - (SrcBits - VTBits);
38712 return DAG.ComputeNumSignBits(Src, Depth + 1);
38720 return VTBits; // Shifted all bits out --> zero.
38721 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38723 return 1; // Shifted all sign bits out --> unknown.
38724 return Tmp - ShiftVal.getZExtValue();
38730 if (ShiftVal.uge(VTBits - 1))
38732 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38738 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38749 // Vector compares return zero/all-bits result values.
38754 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38757 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38762 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38764 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38770 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38797 // TODO - handle target shuffle ops with different value types.
38807 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38820 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38821 return N->getOperand(0);
38830 if (!LN->isSimple())
38834 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38836 LN->getPointerInfo(), LN->getOriginalAlign(),
38837 LN->getMemOperand()->getFlags());
38851 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38854 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38856 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38867 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38885 unsigned Len = Scale - 1;
38911 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38915 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38946 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38968 "AVX512 required for 512-bit vector shuffles");
39014 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39031 // VPERMILPD can permute with a non-repeating shuffle.
39052 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39057 // Narrow the repeated mask to create 32-bit element permutes.
39095 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39209 // Use (SSE41) PACKUSWD if the leading zero bits extend down to the lowest 16-bits.
39218 // Use PACKUSBW if the leading zero bits extend down to the lowest 8-bits.
39225 // Use PACKSSWD if the sign bits extend to the lowest 16-bits.
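// A hedged sketch of the tests those comments describe, for a 32-bit to
// 16-bit element truncation; the known-bits / sign-bit counts come from the
// DAG analyses used elsewhere in this file:
#include "llvm/Support/KnownBits.h"
static bool fitsPACKUS16(const llvm::KnownBits &Known) {
  // PACKUSWD preserves a lane iff its upper 16 bits are already zero.
  return Known.countMinLeadingZeros() >= 16;
}
static bool fitsPACKSS16(unsigned NumSignBits) {
  // PACKSSWD preserves a lane iff it already fits as a signed 16-bit value,
  // i.e. at least 17 of the 32 bits are copies of the sign bit.
  return NumSignBits >= 17;
}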
39251 // non-blended source element is zero in each case.
39461 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39462 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39465 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39466 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39469 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39470 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39477 int ShufMask[4] = {-1, -1, -1, -1};
39505 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39514 /// chain of single-use x86 shuffle instructions and accumulated the combined
39518 /// instruction but should only be used to replace chains over a certain depth.
39520 ArrayRef<int> BaseMask, int Depth,
39567 (RootVT.isFloatingPoint() && Depth >= 1) ||
39571 // is different from the root element size - this would prevent writemasks
39575 if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39576 Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39593 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39608 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39614 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39615 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39635 int PermMask[4] = {-1, -1, -1, -1};
39639 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39672 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39680 // Handle 128-bit lane shuffles of 256-bit vectors.
39686 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39695 // If we're inserting the low subvector, an insert-subvector 'concat'
39700 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39708 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39725 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39728 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39747 // For masks that have been widened to 128-bit elements or more,
39748 // narrow back down to 64-bit elements.
39759 // TODO - variable shuffles might need this to be widened again.
39789 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39790 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39799 // Attempt to match against broadcast-from-vector.
39808 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39815 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39828 if (Depth == 0 && Root.getOpcode() == Shuffle)
39840 if (Depth == 0 && Root.getOpcode() == Shuffle)
39860 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39873 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39890 if (Depth == 0 && Root.getOpcode() == Shuffle)
39904 if (Depth == 0 && Root.getOpcode() == Shuffle)
39922 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39932 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39952 if (Depth == 0 && Root.getOpcode() == Opc)
39969 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39985 // Don't try to re-form single instruction chains under any circumstances now
39987 if (Depth < 1)
39990 // Depth threshold above which we can efficiently use variable mask shuffles.
39996 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39998 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40000 // higher depth before combining them.
40002 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40011 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40020 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40035 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40036 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40047 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40060 Inputs, Root, BaseMask, Depth, HasVariableMask,
40065 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40066 // (non-VLX will pad to 512-bit shuffles).
40085 // See if we can combine a single input shuffle with zeros to a bit-mask,
40113 // the 128-bit lanes use the variable mask to VPERMILPS.
40130 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40136 // Bits[3] - Match Bit.
40137 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40138 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
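// Illustrative only: composing one VPERMIL2PS selector element from a source
// index (0 or 1) and an element-within-lane index, following the bit layout
// documented above (the match/zero bit handling is omitted):
static int makeVPerm2PSIndexSketch(unsigned SrcIdx, unsigned EltInLane) {
  // Bits[2] = source select, Bits[1:0] = element within the 128-bit lane.
  return int((SrcIdx << 2) | (EltInLane & 0x3));
}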
40145 VPerm2Idx.push_back(-1);
40198 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40199 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40204 // Bits[4:0] - Byte Index (0 - 31)
40205 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
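// Illustrative only: one VPPERM control byte per result byte, following the
// layout above; a negative mask element here stands for a zeroed output byte:
#include <cstdint>
static uint8_t makeVPPERMControlSketch(int MaskElt) {
  if (MaskElt < 0)
    return 0x80;                               // Bits[7:5] = 4 -> ZERO
  return static_cast<uint8_t>(MaskElt & 0x1F); // Bits[4:0] = byte index 0-31
}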
40233 Inputs, Root, BaseMask, Depth, HasVariableMask,
40238 // (non-VLX will pad to 512-bit shuffles)
40268 // -->
40271 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40320 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40370 // elements, and shrink them to the half-width mask. It does this in a loop
40387 // Increase depth for every upper subvector we've peeked through.
40388 Depth += AdjustedMasks;
40400 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40458 // the HOP args are pre-shuffled.
40459 // TODO: Generalize to any sized/depth chain.
40470 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40484 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40504 int PostMask[4] = {-1, -1, -1, -1};
40526 SDValue BC1 = BC[BC.size() - 1];
40550 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40564 M -= NumHalfEltsPerLane;
40567 M -= NumHalfEltsPerLane;
40597 // If we are post-shuffling a 256-bit hop and not requiring the upper
40598 // elements, then try to narrow to a 128-bit hop directly.
40647 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40719 /// of single-use shuffle instructions, build a generic model of the cumulative
40726 /// special-purpose shuffle.
40736 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40742 /// combine-ordering. To fix this, we should do the redundant instruction
40746 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40758 // Bound the depth of our recursive combine because this is ultimately
40760 if (Depth >= MaxDepth)
40769 return SDValue(); // Bail if we hit a non-simple non-vector.
40783 OpDemandedElts.setBit(M - BaseIdx);
40786 // Op is smaller than Root - extract the demanded elts for the subvector.
40791 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40805 OpZero, DAG, Depth, false)) {
40843 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40850 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40879 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40885 // Match failed - should we replace an existing Op?
40892 return Ops.size() - 1;
40898 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40907 // This function can be performance-critical, so we rely on the power-of-2
40909 // bit-masks and shifts.
40911 "Non-power-of-2 shuffle mask sizes");
40913 "Non-power-of-2 shuffle mask sizes");
40924 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40925 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40926 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40947 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40957 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40966 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40969 (RootMaskedIdx & (OpRatio - 1));
40971 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
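// A minimal sketch of the power-of-two index scaling used above, assuming
// RootRatio == MaskWidth / RootMask.size() and that both widths are powers
// of two, so wrapping can be done with a cheap mask instead of a modulo:
static unsigned scaleRootIndexSketch(unsigned RootMaskElt, unsigned i,
                                     unsigned RootRatioLog2,
                                     unsigned MaskWidth) {
  unsigned RootRatio = 1u << RootRatioLog2;
  // Widen the root element to the combined width, keeping i's sub-position.
  unsigned Idx = (RootMaskElt << RootRatioLog2) + (i & (RootRatio - 1));
  return Idx & (MaskWidth - 1); // stay in range of the wider mask
}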
41030 // the remaining recursion depth.
41031 if (Ops.size() < (MaxDepth - Depth)) {
41040 if (Ops[i].getNode()->hasOneUse() ||
41046 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41058 // If constant fold failed and we only have constants - then we have
41059 // multiple uses by a single non-variable shuffle - just bail.
41060 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41091 int OpEltIdx = MaskElt - Lo;
41103 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41130 // Re-resolve - we might have repeated subvector sources.
41138 // elements, and shrink them to the half-width mask. It does this in a loop
41157 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41176 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41184 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41190 /// Get the PSHUF-style mask from PSHUF node.
41193 /// PSHUF-style masks that can be reused with such instructions.
41202 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41209 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41210 "Mask doesn't repeat in high 128-bit lanes!");
41224 M -= 4;
41241 "Called with something other than an x86 128-bit half shuffle!");
41243 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41264 // dword shuffle, and the high words are self-contained.
41274 // dword shuffle, and the low words are self-contained.
41284 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41290 // Search for a half-shuffle which we can combine with.
41294 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41365 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41372 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41374 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41418 // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41488 // the blend mask is the same in the 128-bit subvectors (or can widen to
41498 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41513 // TODO - move this to TLI like isBinOp?
41524 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41525 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41543 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41544 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41545 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41574 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41630 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41631 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41709 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41786 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41808 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41817 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41825 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41826 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41837 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41838 // If we're re-broadcasting a smaller type then broadcast with that type and
41852 // Reduce broadcast source vector to lowest 128-bits.
41857 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41862 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41872 for (SDNode *User : Src->users())
41873 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41874 Src == User->getOperand(0) &&
41875 User->getValueSizeInBits(0).getFixedValue() >
41881 // vbroadcast(scalarload X) -> vbroadcast_load X
41887 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41890 LN->getMemoryVT(), LN->getMemOperand());
41917 if (LN->isSimple()) {
41919 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41922 LN->getPointerInfo(), LN->getOriginalAlign(),
41923 LN->getMemOperand()->getFlags());
41935 if (LN->getMemoryVT().getSizeInBits() == 16) {
41937 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41940 LN->getMemoryVT(), LN->getMemOperand());
41959 LN->isSimple()) {
41963 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
41964 SDValue Ops[] = { LN->getChain(), Ptr };
41967 LN->getPointerInfo().getWithOffset(Offset),
41968 LN->getOriginalAlign(),
41969 LN->getMemOperand()->getFlags());
41978 // vbroadcast(vzload X) -> vbroadcast_load X
41981 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41983 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41986 LN->getMemoryVT(), LN->getMemOperand());
41994 // vbroadcast(vector load X) -> vbroadcast_load
42000 if (LN->isSimple()) {
42002 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42005 LN->getPointerInfo(), LN->getOriginalAlign(),
42006 LN->getMemOperand()->getFlags());
42037 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42039 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42042 LN->getMemoryVT(), LN->getMemOperand());
42069 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42072 // Create a vector constant - scalar constant followed by zeros.
42077 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42084 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
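// Illustrative shape of the constant-pool payload built here: for a v4i32
// vzext_movl(scalar_to_vector(42)) the pool entry is the scalar followed by
// zeros, so the whole node becomes one aligned constant load.
#include <array>
static constexpr std::array<int, 4> VZextMovlExample = {42, 0, 0, 0};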
42093 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42119 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42137 // --> m3 = blend(m1,m2)
42173 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42207 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42222 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42225 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
42227 SDValue LHS = N->getOperand(0);
42228 SDValue RHS = N->getOperand(1);
42229 uint64_t Mask = N->getConstantOperandVal(2);
42250 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42251 SDValue LHS = N->getOperand(0);
42252 SDValue RHS = N->getOperand(1);
42260 N->getOperand(2)));
42264 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42269 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42301 if (N0->hasOneUse()) {
42336 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42385 // Zero/UNDEF insertion - zero out element and remove dependency.
42451 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42452 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42453 MemIntr->getBasePtr(),
42454 MemIntr->getMemOperand());
42492 M = (M < 0 ? M : M & (Mask.size() - 1));
42509 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
42531 // Nuke no-op shuffles that show up after combining.
42546 // dwords as otherwise it would have been removed as a no-op.
42561 // only works when we have a PSHUFD followed by two half-shuffles.
42610 int ParitySrc[2] = {-1, -1};
42649 EVT VT = N->getValueType(0);
42655 // We only handle target-independent shuffles.
42658 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42661 SDValue V1 = N->getOperand(0);
42662 SDValue V2 = N->getOperand(1);
42671 if (!V1->hasOneUse() || !V2->hasOneUse())
42678 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42679 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42680 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42684 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42685 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42686 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42690 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42696 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42697 : V2->getOpcode() == ISD::FADD;
42708 // We only handle target-independent shuffles.
42711 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42714 MVT VT = N->getSimpleValueType(0);
42720 SDValue Op0 = N->getOperand(0);
42721 SDValue Op1 = N->getOperand(1);
42733 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42745 /// Try to combine a shuffle into a target-specific add-sub or
42746 /// mul-add-sub node.
42758 MVT VT = N->getSimpleValueType(0);
42770 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42772 // X86 targets with 512-bit ADDSUB instructions!
42787 // if we can express this as a single-source shuffle, that's preferable.
42794 EVT VT = N->getValueType(0);
42796 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42806 SDValue N0 = N->getOperand(0);
42807 SDValue N1 = N->getOperand(1);
42822 for (int Elt : SVOp->getMask())
42823 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42834 EVT VT = Shuf->getValueType(0);
42835 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42841 ArrayRef<int> Mask = Shuf->getMask();
42846 // (half-index output is 0 or 2).
42853 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42855 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42858 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42859 Shuf->getOperand(1), HalfMask, HalfIdx1,
42873 EVT VT = N->getValueType(0);
42894 if (isTargetShuffle(N->getOpcode())) {
42900 // instructions into higher-order shuffles. We do this after combining
42908 // TODO - merge this into combineX86ShufflesRecursively.
42913 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42914 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42928 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42941 Depth + 1))
42949 if (!Load || !Load->getBasePtr().hasOneUse())
42956 Type *CTy = C->getType();
42957 if (!CTy->isVectorTy() ||
42958 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42961 // Handle scaling for i64 elements on 32-bit targets.
42962 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42971 Constant *Elt = C->getAggregateElement(i);
42973 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42989 Load->getAlign());
42995 TargetLoweringOpt &TLO, unsigned Depth) const {
43009 Depth + 1))
43012 Depth + 1))
43027 Depth + 1))
43030 Depth + 1))
43038 Depth + 1))
43042 Depth + 1))
43059 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43061 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43074 // We only need the bottom 64-bits of the (128-bit) shift amount.
43080 // only the bottom 64-bits are ever used.
43081 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43082 unsigned UseOpc = Use->getOpcode();
43085 Use->getOperand(0) != Amt;
43092 Depth + 1, AssumeSingleUse))
43102 Depth + 1))
43105 // Fold shift(0,x) -> 0
43113 Src, DemandedElts, TLO.DAG, Depth + 1))
43128 Depth + 1))
43131 // Fold shift(0,x) -> 0
43137 Depth + 1))
43150 Depth + 1))
43153 Depth + 1))
43160 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43161 unsigned ShiftAmt = Amt->getZExtValue();
43173 int Diff = ShiftAmt - C1;
43175 Diff = -Diff;
43188 Depth + 1))
43199 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43200 unsigned ShiftAmt = Amt->getZExtValue();
43212 int Diff = ShiftAmt - C1;
43214 Diff = -Diff;
43227 Depth + 1))
43255 // We can't assume an undef src element gives an undef dst - the
43276 Depth + 1))
43279 Depth + 1))
43284 TLO.DAG, Depth + 1);
43286 TLO.DAG, Depth + 1);
43305 Depth + 1))
43319 Depth + 1))
43323 Depth + 1))
43326 // TODO - pass on known zero/undef.
43329 // TODO - we should do this for all target/faux shuffles ops.
43332 TLO.DAG, Depth + 1);
43334 TLO.DAG, Depth + 1);
43356 Depth + 1))
43360 Depth + 1))
43363 // TODO - pass on known zero/undef.
43369 TLO.DAG, Depth + 1);
43371 TLO.DAG, Depth + 1);
43389 Depth + 1))
43407 SelZero, TLO, Depth + 1))
43413 LHSZero, TLO, Depth + 1))
43418 RHSZero, TLO, Depth + 1))
43430 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43441 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43442 Mem->getMemOperand());
43465 Depth + 1))
43468 // TODO - we should do this for all target/faux shuffles ops.
43470 Src, SrcElts, TLO.DAG, Depth + 1))
43476 Depth))
43483 Depth))
43489 Depth))
43494 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43495 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43496 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43502 // See if 512-bit ops only use the bottom 128-bits.
43525 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43527 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43528 MemIntr->getMemOperand());
43537 EVT MemVT = MemIntr->getMemoryVT();
43541 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43542 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43641 // (Non-Lane Crossing) Target Shuffles.
43704 OpZero, TLO.DAG, Depth, false))
43747 int M = OpMask[i] - Lo;
43752 // TODO - Propagate input undef/zero elts.
43755 TLO, Depth + 1))
43761 // We need to convert the depth to something combineX86ShufflesRecursively
43762 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43767 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43775 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43789 unsigned Depth) const {
43802 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43813 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43826 KnownLHS, TLO, Depth + 1))
43829 KnownRHS, TLO, Depth + 1))
43832 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
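// Worked form of the fold above on a single 64-bit lane: PMULUDQ multiplies
// the zero-extended low 32 bits of each operand, so a multiply by 1 is just a
// zero-extension of the low 32 bits, i.e. an AND with (1 << 32) - 1:
#include <cstdint>
static uint64_t pmuludqLane(uint64_t X, uint64_t Y) {
  // == (X & 0xffffffff) whenever Y == 1.
  return uint64_t(uint32_t(X)) * uint64_t(uint32_t(Y));
}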
43843 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43845 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43860 Known, TLO, Depth + 1))
43864 OriginalDemandedElts, Known2, TLO, Depth + 1))
43881 unsigned ShAmt = Op1->getAsZExtVal();
43894 int Diff = ShAmt - Shift2Amt;
43908 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43909 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43910 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43914 TLO, Depth + 1))
43924 // Attempt to avoid multi-use ops if we don't need anything from them.
43926 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43938 unsigned ShAmt = Op1->getAsZExtVal();
43945 TLO, Depth + 1))
43955 // Attempt to avoid multi-use ops if we don't need anything from them.
43957 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43969 unsigned ShAmt = Op1->getAsZExtVal();
43979 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43994 TLO, Depth + 1))
44002 if (Known.Zero[BitWidth - ShAmt - 1] ||
44008 if (Known.One[BitWidth - ShAmt - 1])
44012 // Attempt to avoid multi-use ops if we don't need anything from them.
44014 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44029 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44031 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44033 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44051 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44052 unsigned Idx = CIdx->getZExtValue();
44056 // bits from the implicit zext - simplify to zero.
44064 KnownZero, TLO, Depth + 1))
44069 KnownVec, TLO, Depth + 1))
44073 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44089 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44090 unsigned Idx = CIdx->getZExtValue();
44098 KnownVec, TLO, Depth + 1))
44104 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44116 // TODO - add known bits handling.
44124 KnownLHS, TLO, Depth + 1))
44127 KnownRHS, TLO, Depth + 1))
44130 // Attempt to avoid multi-use ops if we don't need anything from them.
44132 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44134 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44141 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44149 TLO, Depth + 1))
44156 Src->hasOneUse()) {
44168 // icmp sgt(0, R) == ashr(R, BitWidth-1).
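// A worked check of the identity above on 8-bit lanes: a signed "0 > R"
// vector compare produces an all-ones lane exactly when R is negative, which
// is also what an arithmetic shift by BitWidth-1 yields (this sketch assumes
// the usual sign-propagating right shift on signed values):
#include <cassert>
#include <cstdint>
static int8_t signMask8(int8_t R) {
  int8_t ByCompare = (0 > R) ? int8_t(-1) : int8_t(0); // lane-wise SETGT model
  int8_t ByShift = int8_t(R >> 7);                     // ashr(R, 7)
  assert(ByCompare == ByShift && "identity holds for every 8-bit value");
  return ByShift;
}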
44184 // See if we only demand bits from the lower 128-bit vector.
44195 TLO, Depth + 1))
44199 Known.Zero.setHighBits(BitWidth - NumElts);
44205 Depth + 1))
44208 if (KnownSrc.One[SrcBits - 1])
44210 else if (KnownSrc.Zero[SrcBits - 1])
44213 // Attempt to avoid multi-use ops if we don't need anything from it.
44215 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44230 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44231 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44233 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44239 OriginalDemandedElts, Known2, TLO, Depth + 1))
44242 OriginalDemandedElts, Known, TLO, Depth + 1))
44254 // Only bottom 16-bits of the control bits are required.
44257 uint64_t Val1 = Cst1->getZExtValue();
44266 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44267 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44277 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44288 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44304 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44308 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44317 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44322 // The result will have at least as many trailing zeros as the non-mask
44330 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44335 SelectionDAG &DAG, unsigned Depth) const {
44347 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44348 !DemandedElts[CIdx->getZExtValue()])
44358 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44359 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44360 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44371 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44383 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
44395 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44396 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44411 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44450 Op, DemandedBits, DemandedElts, DAG, Depth);
44455 bool PoisonOnly, unsigned Depth) const {
44479 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
44487 Op, DemandedElts, DAG, PoisonOnly, Depth);
44492 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44514 switch (Op->getConstantOperandVal(0)) {
44525 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44532 unsigned Depth) const {
44544 DAG, Depth);
44580 // clang-format off
44585 // clang-format on
44590 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44601 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44652 // ->
44685 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44694 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44718 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44724 // avoid sign-extending to this type entirely.
44738 // sign-extend to a 256-bit operation to avoid truncation.
44748 // sign-extend to a 256-bit operation to match the compare.
44749 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44750 // 256-bit because the shuffle is cheaper than sign extending the result of
44761 // it is not profitable to sign-extend to 256-bit because this will
44762 // require an extra cross-lane shuffle which is more expensive than
44763 // truncating the result of the compare to 128-bits.
44815 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
44825 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44830 // Only do this if we have k-registers.
44834 EVT DstVT = N->getValueType(0);
44835 SDValue Op = N->getOperand(0);
44881 unsigned NumElts = BV->getNumOperands();
44882 SDValue Splat = BV->getSplatValue();
44906 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44914 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44922 // Use PSHUFW to repeat 16-bit elements.
44933 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44961 unsigned Depth = 0) {
44962 if (Depth >= SelectionDAG::MaxRecursionDepth)
44963 return SDValue(); // Limit search depth.
44978 if (C->isZero())
44980 if (C->isAllOnes())
44991 Subtarget, Depth + 1))
45004 Subtarget, Depth + 1))
45016 Subtarget, Depth + 1))
45018 Subtarget, Depth + 1))
45031 Depth + 1))
45034 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45040 if (Depth > 0)
45050 SDValue N0 = N->getOperand(0);
45051 EVT VT = N->getValueType(0);
45057 // ->
45088 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45092 SmallVector<SDValue, 4> Ops(N0->ops());
45137 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45147 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45150 MemVT, BCast->getMemOperand());
45158 // avoiding store-load conversions.
45167 // Handle zero-extension of i32 with MOVD.
45172 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45196 // Detect bitcasts of 64-bit build vectors and convert to a
45235 if (C->isAllOnes())
45237 if (C->isZero())
45243 // Turn it into a sign bit compare that produces a k-register. This avoids
45284 // remove GPR<->K-register crossings.
45289 // floating-point operand into a floating-point logic operation. This may
45295 // clang-format off
45300 // clang-format on
45315 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45324 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45347 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45354 return (BV && BV->isConstant());
45372 SDValue AbsOp1 = Abs->getOperand(0);
45379 // Check if the operands of the sub are zero-extended from vectors of i8.
45409 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45442 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45471 EVT ExtractVT = Extract->getValueType(0);
45490 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45517 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45545 EVT ExtractVT = Extract->getValueType(0);
45573 // Special case for (pre-legalization) vXi1 reductions.
45577 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45580 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45581 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45611 // FIXME: Better handling of k-registers or 512-bit vectors?
45652 // parity -> (PARITY(MOVMSK X))
45660 // any_of -> MOVMSK != 0
45664 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45671 // negate to get the final 0/-1 mask value.
45683 EVT ExtractVT = Extract->getValueType(0);
45689 EVT VT = Extract->getOperand(0).getValueType();
45698 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
45729 for (unsigned i = Stages - StageBias; i > 0; --i) {
45730 SmallVector<int, 16> Mask(DpElems, -1);
45731 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45746 Extract->getOperand(1));
45755 EVT ExtractVT = Extract->getValueType(0);
45761 EVT VT = Extract->getOperand(0).getValueType();
45783 // abs-diff pattern.
45787 // Check whether we have an abs-diff pattern feeding into the select.
45803 for (unsigned i = Stages - 3; i > 0; --i) {
45804 SmallVector<int, 16> Mask(SadElems, -1);
45805 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45820 Extract->getOperand(1));
45824 // integer, that requires a potentially expensive XMM -> GPR transfer.
45829 // to a single-use of the loaded vector. For the reasons above, we
45835 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45839 EVT VT = N->getValueType(0);
45841 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
45842 return Use->getOpcode() == ISD::STORE ||
45843 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45844 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45851 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
45853 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
45855 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45856 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45858 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45859 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45876 SDValue Src = N->getOperand(0);
45877 SDValue Idx = N->getOperand(1);
45879 EVT VT = N->getValueType(0);
45889 const APInt &IdxC = N->getConstantOperandAPInt(1);
45903 // TODO support non-zero offsets.
45917 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45919 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45920 MemIntr->getBasePtr(),
45921 MemIntr->getPointerInfo(),
45922 MemIntr->getOriginalAlign(),
45923 MemIntr->getMemOperand()->getFlags());
45958 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45962 // We can only legally extract other elements from 128-bit vectors and in
45963 // certain circumstances, depending on SSE-level.
45973 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45977 Idx &= (NumEltsPerLane - 1);
46030 // If narrowing/widening failed, see if we can extract+zero-extend.
46041 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46063 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46076 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46077 SDValue Vec = ExtElt->getOperand(0);
46078 SDValue Index = ExtElt->getOperand(1);
46079 EVT VT = ExtElt->getValueType(0);
46083 // non-zero element because the shuffle+scalar op will be cheaper?
46088 // extract, the condition code), so deal with those as a special-case.
46094 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46117 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46129 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46162 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46165 for (SDValue Op : Vec->ops())
46179 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46191 SDValue Index = ExtElt->getOperand(1);
46195 EVT VT = ExtElt->getValueType(0);
46204 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46222 // vXi8 mul reduction - promote to vXi16 mul reduction.
46245 {4, 5, 6, 7, -1, -1, -1, -1}));
46248 {2, 3, -1, -1, -1, -1, -1, -1}));
46251 {1, -1, -1, -1, -1, -1, -1, -1}));
46256 // vXi8 add reduction - sub 128-bit vector.
46265 // Must be a >=128-bit vector with pow2 elements.
46269 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46281 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46290 // If the source vector values are 0-255, then we can use PSADBW to
46327 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
46342 // 256-bit horizontal instructions operate on 128-bit chunks rather than
46345 // TODO: We could extend this to handle 512-bit or even longer vectors.
46358 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46369 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
46376 SDValue InputVector = N->getOperand(0);
46377 SDValue EltIdx = N->getOperand(1);
46381 EVT VT = N->getValueType(0);
46383 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46388 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46399 uint64_t Idx = CIdx->getZExtValue();
46405 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
46414 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
46425 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46436 // TODO - Remove this once we can handle the implicit zero-extension of
46459 // pre-legalization,
46483 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
46502 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46503 Use->getOperand(0).getResNo() == ResNo &&
46504 Use->getValueType(0) == MVT::i1) {
46506 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46512 if (all_of(InputVector->users(), IsBoolExtract) &&
46518 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46520 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46532 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
46563 // Input type must be extending a bool vector (bit-casted from a scalar
46585 // must split it down into sub-sections for broadcasting. For example:
46586 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46587 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46614 // For smaller scalar integers, we can simply any-extend it to the vector
46636 // zero-extension.
46640 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46643 /// If a vector select has an operand that is -1 or 0, try to simplify the
46650 SDValue Cond = N->getOperand(0);
46651 SDValue LHS = N->getOperand(1);
46652 SDValue RHS = N->getOperand(2);
46657 if (N->getOpcode() != ISD::VSELECT)
46680 // vector floating-point selects.
46698 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46711 // vselect Cond, 111..., 000... -> Cond
46718 // vselect Cond, 111..., X -> or Cond, X
46725 // vselect Cond, X, 000... -> and Cond, X
46732 // vselect Cond, 000..., X -> andn Cond, X
46736 // The canonical form differs for i1 vectors - x86andnp is not used
46749 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46750 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46754 unsigned Opcode = N->getOpcode();
46758 // TODO: Split 512-bit vectors too?
46759 EVT VT = N->getValueType(0);
46764 SDValue Cond = N->getOperand(0);
46765 SDValue TVal = N->getOperand(1);
46766 SDValue FVal = N->getOperand(2);
46782 SDValue Cond = N->getOperand(0);
46783 SDValue LHS = N->getOperand(1);
46784 SDValue RHS = N->getOperand(2);
46792 EVT VT = N->getValueType(0);
46797 // this with a wider condition value (post-legalization it becomes an i8),
46802 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46804 // TODO: For constants that overflow or do not differ by power-of-2 or small
46806 const APInt &TrueVal = TrueC->getAPIntValue();
46807 const APInt &FalseVal = FalseC->getAPIntValue();
46809 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46812 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46835 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46838 // Multiply condition by the difference if non-one.
46842 // Add the base if non-zero.
46843 if (!FalseC->isZero())
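// Illustrative sketch, not from X86ISelLowering.cpp: the scalar identity
// behind the constant-select lowering noted above, with Cond zero-extended to
// 0 or 1 (and assuming TC - FC does not overflow). Helper name is invented.
#include <cstdint>
constexpr int64_t SelectViaMul(bool Cond, int64_t TC, int64_t FC) {
  // select Cond, TC, FC == zext(Cond) * (TC - FC) + FC
  return static_cast<int64_t>(Cond) * (TC - FC) + FC;
}
static_assert(SelectViaMul(true, 7, 3) == 7, "Cond ? TC : FC");
static_assert(SelectViaMul(false, 7, 3) == 3, "Cond ? TC : FC");
static_assert(SelectViaMul(false, -5, 100) == 100, "negative constants too");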
46852 /// If this is a *dynamic* select (non-constant condition) and we can match
46861 SDValue Cond = N->getOperand(0);
46862 if ((N->getOpcode() != ISD::VSELECT &&
46863 N->getOpcode() != X86ISD::BLENDV) ||
46869 EVT VT = N->getValueType(0);
46875 // cases where a *dynamic* blend will fail even though a constant-condition
46878 // Potentially, we should combine constant-condition vselect nodes
46879 // pre-legalization into shuffles and not mark as many types as custom
46883 // FIXME: We don't support i16-element blends currently. We could and
46885 // rather than just the high bit and using an i8-element blend.
46894 // There are no 512-bit blend instructions that use sign bits.
46899 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46904 for (SDUse &Use : Cond->uses())
46905 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
46906 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
46927 for (SDNode *U : Cond->users()) {
46928 if (U->getOpcode() == X86ISD::BLENDV)
46931 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46932 Cond, U->getOperand(1), U->getOperand(2));
46942 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
46943 N->getOperand(1), N->getOperand(2));
46955 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46958 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46959 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
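// Illustrative sketch, not from X86ISelLowering.cpp: the conditional-negate
// identity above, with M restricted to 0 or all-ones as the surrounding code
// asserts. Helper name is invented.
#include <cstdint>
constexpr int32_t CondNegate(int32_t X, int32_t M) {
  // M == 0 -> X, M == -1 -> -X: (X ^ M) + (M & 1) in modular arithmetic.
  return static_cast<int32_t>(
      (static_cast<uint32_t>(X) ^ static_cast<uint32_t>(M)) +
      static_cast<uint32_t>(M & 1));
}
static_assert(CondNegate(42, 0) == 42, "M == 0 keeps X");
static_assert(CondNegate(42, -1) == -42, "M == -1 negates X");
static_assert(CondNegate(-7, -1) == 7, "M == -1 negates X");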
46970 "Mask must be zero/all-bits");
46978 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46979 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46996 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46999 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47001 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47014 if (N->getOpcode() != ISD::VSELECT)
47017 SDValue Cond = N->getOperand(0);
47018 SDValue LHS = N->getOperand(1);
47019 SDValue RHS = N->getOperand(2);
47031 // (vselect M, L, R) -> (vselect ~M, R, L)
47033 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
47040 /// Do target-specific dag combines on SELECT and VSELECT nodes.
47045 SDValue Cond = N->getOperand(0);
47046 SDValue LHS = N->getOperand(1);
47047 SDValue RHS = N->getOperand(2);
47066 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47067 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47079 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
47082 N->getOpcode() == X86ISD::BLENDV))
47086 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47089 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
47121 // ignored in unsafe-math mode).
47129 bool IsStrict = Cond->isStrictFPOpcode();
47131 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47204 // Check for x CC y ? y : x -- a min/max with reversed arms.
47274 DL, {N->getValueType(0), MVT::Other},
47279 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47287 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47288 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47290 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47314 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47317 // AVX512 - Extend select to merge with target shuffle.
47318 // select(mask, extract_subvector(shuffle(x)), y) -->
47320 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47354 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47359 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47362 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47363 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47366 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
47377 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47378 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47397 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47399 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47405 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47411 // clang-format off
47417 // clang-format on
47430 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47442 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47454 // with out-of-bounds clamping.
47458 // to bitwidth-1 for unsigned shifts, effectively performing a maximum left
47459 // shift of bitwidth-1 positions, and returns zero for unsigned right shifts
47460 // exceeding bitwidth-1.
47461 if (N->getOpcode() == ISD::VSELECT) {
47463 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
47464 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
47475 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
47476 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
47502 // select(~Cond, X, Y) -> select(Cond, Y, X)
47505 return DAG.getNode(N->getOpcode(), DL, VT,
47508 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
47518 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47521 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47528 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47537 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47567 // This can lower using a vector shift bit-hack rather than mask and compare.
47569 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47573 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47575 // The 'and' mask must be composed of power-of-2 constants.
47578 if (C && C->getAPIntValue().isPowerOf2()) {
47579 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47585 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47586 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47587 // 16-bit lacks a proper blendv.
47595 return C->getAPIntValue().isPowerOf2();
47597 // Create a left-shift constant to get the mask bits over to the sign-bit.
47602 ShlVals.push_back(EltBitWidth - 1 -
47603 MaskVal->getAPIntValue().exactLogBase2());
47605 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
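// Illustrative sketch, not from X86ISelLowering.cpp: why shifting a power-of-2
// mask bit up to the sign bit lets a sign-bit test replace the compare in the
// swap above. C = 1 << Log2C is assumed to be a power of two, as the
// surrounding code requires; helper name is invented.
#include <cstdint>
constexpr bool MaskBitSetViaShift(uint32_t X, unsigned Log2C) {
  // (X & (1u << Log2C)) != 0  <=>  sign bit of (X << (31 - Log2C)) is set.
  return static_cast<int32_t>(X << (31 - Log2C)) < 0;
}
static_assert(MaskBitSetViaShift(0b1010, 1) == true, "bit 1 is set");
static_assert(MaskBitSetViaShift(0b1010, 2) == false, "bit 2 is clear");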
47626 // This combine only operates on CMP-like nodes.
47628 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47638 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47639 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47640 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47641 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47645 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47646 // - LHS != 1
47665 APInt Addend = OpRHSC->getAPIntValue();
47667 Addend = -Addend;
47673 APInt Comparison = CmpRHSC->getAPIntValue();
47674 APInt NegAddend = -Addend;
47689 APInt DecComparison = Comparison - 1;
47711 AN->getMemOperand());
47727 else if (CC == X86::COND_G && Addend == -1)
47729 else if (CC == X86::COND_LE && Addend == -1)
47752 // CMP(X,0) -> signbit test
47757 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
47763 // OR(X,Y) -> see if only one operand contributes to the signbit.
47764 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
47812 // This combine only operates on CMP-like nodes.
47814 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47838 if (C->getZExtValue() == 1) {
47841 } else if (C->getZExtValue() != 0)
47851 int OpIdx = -1;
47903 if (FVal && FVal->getZExtValue() != 0) {
47904 if (FVal->getZExtValue() != 1)
47911 if (FValIsFalse && TVal->getZExtValue() != 1)
47913 if (!FValIsFalse && TVal->getZExtValue() != 0)
47932 if (Cond->getOpcode() == X86ISD::CMP) {
47933 if (!isNullConstant(Cond->getOperand(1)))
47936 Cond = Cond->getOperand(0);
47942 switch (Cond->getOpcode()) {
47950 SetCC0 = Cond->getOperand(0);
47951 SetCC1 = Cond->getOperand(1);
47958 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47961 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47962 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47963 Flags = SetCC0->getOperand(1);
47967 // When legalizing carry, we create carries via add X, -1
47997 CarryOp1.getNode()->hasOneUse() &&
48001 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48052 // testc -> testz.
48056 // !testc -> !testz.
48060 // testz -> testc.
48064 // !testz -> !testc.
48069 // testnzc -> testnzc (no change).
48085 // TESTC(X,~X) == TESTC(X,-1)
48095 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48136 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48173 // TESTZ(-1,X) == TESTZ(X,X)
48177 // TESTZ(X,-1) == TESTZ(X,X)
48181 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48209 // Handle eq/ne against -1 (all_of).
48220 const APInt &CmpVal = CmpConstant->getAPIntValue();
48251 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48254 // signbits extend down to all the sub-elements as well.
48268 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48277 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48278 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48279 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48280 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48298 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48299 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48300 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48301 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48313 // Check for 256-bit split vector cases.
48341 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48355 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48383 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48414 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48415 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48416 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
48417 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
48467 EVT VT = N->getValueType(0);
48468 SDValue FalseOp = N->getOperand(0);
48469 SDValue TrueOp = N->getOperand(1);
48470 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48471 SDValue Cond = N->getOperand(3);
48473 // cmov X, X, ?, ? --> X
48497 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
48503 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
48506 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48510 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48512 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48518 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
48520 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48525 FalseC->getValueType(0), Cond);
48534 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48557 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48564 // Add the base if non-zero.
48565 if (FalseC->getAPIntValue() != 0)
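// Illustrative sketch, not from X86ISelLowering.cpp: the scalar identities
// behind the constant-CMOV folds above, with the condition materialized as
// 0 or 1 by SETcc. Helper names are invented.
#include <cstdint>
constexpr uint64_t Pow2OrZero(bool Cond, unsigned Log2TC) {
  // Cond ? (1 << Log2TC) : 0  ==  zext(Cond) << Log2TC
  return static_cast<uint64_t>(Cond) << Log2TC;
}
constexpr uint64_t CstPlusOneOrCst(bool Cond, uint64_t Cst) {
  // Cond ? Cst + 1 : Cst  ==  zext(Cond) + Cst
  return static_cast<uint64_t>(Cond) + Cst;
}
static_assert(Pow2OrZero(true, 3) == 8 && Pow2OrZero(false, 3) == 0, "C ? 8 : 0");
static_assert(CstPlusOneOrCst(true, 41) == 42 && CstPlusOneOrCst(false, 41) == 41,
              "Cond ? cst+1 : cst");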
48575 // (select (x != c), e, c) -> select (x != c), e, x),
48576 // (select (x == c), c, e) -> select (x == c), x, e)
48580 // The rationale for this change is that the conditional-move from a constant
48581 // needs two instructions, however, conditional-move from a register needs
48585 // some instruction-combining opportunities. This opt needs to be
48619 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48624 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48625 EVT CondVT = Cond->getValueType(0);
48628 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48637 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48638 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48674 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48675 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48676 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48677 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48713 EVT VT = N->getOperand(0).getValueType();
48717 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48721 SDValue Opd = N->getOperand(i);
48729 // When ranges are from -128 ~ 127, use MULS8 mode.
48735 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48763 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48769 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
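// Illustrative sketch, not from X86ISelLowering.cpp: why the narrower multiply
// modes above are lossless. The extreme products of sign-extended i8 operands
// fit in i16, and the extreme products of i16 operands fit in i32.
#include <cstdint>
static_assert(-128 * -128 <= INT16_MAX && 127 * -128 >= INT16_MIN,
              "i8 x i8 products fit in i16 (MULS8)");
static_assert(-32768LL * -32768LL <= INT32_MAX && 32767LL * -32768LL >= INT32_MIN,
              "i16 x i16 products fit in i32 (MULS16)");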
48793 SDValue N0 = N->getOperand(0);
48794 SDValue N1 = N->getOperand(1);
48795 EVT VT = N->getOperand(0).getValueType();
48847 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48852 N->getOperand(0));
48857 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48862 N->getOperand(0));
48880 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48905 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48914 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48917 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48918 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48920 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48941 EVT VT = N->getValueType(0);
48957 SDValue N0 = N->getOperand(0);
48958 SDValue N1 = N->getOperand(1);
48993 // Mask off upper 16-bits of sign-extended constants.
48996 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49001 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49011 N->isOnlyUserOf(Op.getNode())) {
49018 N->isOnlyUserOf(Op.getNode())) {
49048 EVT VT = N->getValueType(0);
49056 SDValue N0 = N->getOperand(0);
49057 SDValue N1 = N->getOperand(1);
49059 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49060 // 32-bits. We can lower with this if the sign bits stretch that far.
49088 EVT VT = N->getValueType(0);
49104 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49113 return DAG.getNegative(N->getOperand(0), DL, VT);
49132 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49137 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49163 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49164 N->user_begin()->getOpcode() == ISD::ADD))
49172 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49175 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49193 if (isPowerOf2_64(AbsMulAmt - 1)) {
49196 ISD::ADD, DL, VT, N->getOperand(0),
49197 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49198 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49202 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49204 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49208 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49210 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49211 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49215 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49216 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49219 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49222 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49224 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49228 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
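// Illustrative sketch, not from X86ISelLowering.cpp: the shift/add
// decompositions used in this region for multiplies by constants near powers
// of two, and by sums of two powers of two. All arithmetic is modulo 2^32,
// matching the wrapping DAG semantics; helper name is invented.
#include <cstdint>
constexpr bool MulDecomposesOK(uint32_t X) {
  return X * 9u == (X << 3) + X &&        // 2^3 + 1
         X * 7u == (X << 3) - X &&        // 2^3 - 1
         X * 10u == (X << 3) + (X + X) && // 2^3 + 2
         X * 6u == (X << 3) - (X + X) &&  // 2^3 - 2
         X * 40u == (X << 5) + (X << 3);  // 2^5 + 2^3
}
static_assert(MulDecomposesOK(1) && MulDecomposesOK(12345) &&
              MulDecomposesOK(0xDEADBEEFu), "mul-by-constant decompositions");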
49231 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49234 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49235 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49244 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49247 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49267 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49274 SDValue ShiftOperand = N->getOperand(0);
49279 EVT VT = N->getValueType(0);
49285 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
49309 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49316 SDValue N0 = N->getOperand(0);
49317 SDValue N1 = N->getOperand(1);
49324 // with out-of-bounds clamping.
49330 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49336 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49344 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49351 Mask <<= N1C->getAPIntValue();
49353 // We can handle cases concerning bit-widening nodes containing setcc_c if
49359 // zext(setcc_c) -> i32 0x0000FFFF
49360 // c1 -> i32 0x0000FFFF
49361 // c2 -> i32 0x00000001
49362 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49363 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
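// Illustrative sketch, not from X86ISelLowering.cpp: the worked example above,
// showing why the shl/and reassociation is not always value-preserving for the
// bit-widened setcc_c value.
#include <cstdint>
constexpr uint32_t SetCCZext = 0x0000FFFFu; // zext(setcc_c)
constexpr uint32_t C1 = 0x0000FFFFu;
constexpr uint32_t C2 = 1;
static_assert(((SetCCZext & C1) << C2) == 0x0001FFFEu, "shl (and X, c1), c2");
static_assert((SetCCZext & (C1 << C2)) == 0x0000FFFEu, "and X, (c1 << c2)");
static_assert(((SetCCZext & C1) << C2) != (SetCCZext & (C1 << C2)),
              "the two forms differ, so the fold must be guarded");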
49384 SDValue N0 = N->getOperand(0);
49385 SDValue N1 = N->getOperand(1);
49393 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
49397 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
49402 // into (SHL (sext_in_reg X), ShlConst - SraConst)
49404 // or (SRA (sext_in_reg X), SraConst - ShlConst)
49406 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
49422 APInt ShlConst = N01->getAsAPIntVal();
49423 APInt SraConst = N1->getAsAPIntVal();
49433 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
49434 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
49442 DAG.getConstant(ShlConst - SraConst, DL, CVT));
49444 DAG.getConstant(SraConst - ShlConst, DL, CVT));
49453 SDValue N0 = N->getOperand(0);
49454 SDValue N1 = N->getOperand(1);
49463 // with out-of-bounds clamping.
49469 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
49475 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
49489 // TODO: This is a generic DAG combine that became an x86-only combine to
49490 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
49491 // and-not ('andn').
49500 // If we can shrink the constant mask below 8-bits or 32-bits, then this
49502 // from improved known-bits analysis or instruction selection.
49503 APInt MaskVal = AndC->getAPIntValue();
49512 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
49517 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
49527 unsigned Opcode = N->getOpcode();
49531 EVT VT = N->getValueType(0);
49532 SDValue N0 = N->getOperand(0);
49533 SDValue N1 = N->getOperand(1);
49537 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
49539 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
49554 // shuffle to a v4X64 width - we can probably relax this in the future.
49572 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
49597 int PostShuffle[4] = {-1, -1, -1, -1};
49629 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49672 unsigned Opcode = N->getOpcode();
49676 EVT VT = N->getValueType(0);
49677 SDValue N0 = N->getOperand(0);
49678 SDValue N1 = N->getOperand(1);
49691 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49692 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49742 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49746 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
49782 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49823 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49824 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49828 MVT VT = N->getSimpleValueType(0);
49829 SDValue LHS = N->getOperand(0);
49830 SDValue RHS = N->getOperand(1);
49832 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
49833 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49836 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49855 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49861 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49871 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49872 X86ISD::VSRL == N->getOpcode()) &&
49874 EVT VT = N->getValueType(0);
49875 SDValue N0 = N->getOperand(0);
49876 SDValue N1 = N->getOperand(1);
49878 // Shift zero -> zero.
49888 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49904 unsigned Opcode = N->getOpcode();
49909 EVT VT = N->getValueType(0);
49910 SDValue N0 = N->getOperand(0);
49911 SDValue N1 = N->getOperand(1);
49917 // (shift undef, X) -> 0
49923 unsigned ShiftVal = N->getConstantOperandVal(1);
49927 ShiftVal = NumBitsPerElt - 1;
49930 // (shift X, 0) -> X
49934 // (shift 0, C) -> 0
49940 // (VSRAI -1, C) -> -1
49953 NewShiftVal = NumBitsPerElt - 1;
49959 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49963 // (shl (add X, X), C) -> (shl X, (C + 1))
49977 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
49982 N0->hasOneUse()) {
50027 if (N->isOnlyUserOf(N0.getNode())) {
50031 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50035 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50057 EVT VT = N->getValueType(0);
50058 unsigned Opcode = N->getOpcode();
50064 SDValue Vec = N->getOperand(0);
50065 SDValue Scl = N->getOperand(1);
50066 SDValue Idx = N->getOperand(2);
50068 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50092 /// OR -> CMPNEQSS.
50101 SDValue N0 = N->getOperand(0);
50102 SDValue N1 = N->getOperand(1);
50111 SDValue CMP00 = CMP0->getOperand(0);
50112 SDValue CMP01 = CMP0->getOperand(1);
50119 for (const SDNode *U : N->users()) {
50123 switch (U->getOpcode()) {
50163 N->getSimpleValueType(0));
50173 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50174 // 64-bit integer, since that's not a legal type. Since
50200 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50202 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50204 MVT VT = N->getSimpleValueType(0);
50209 SDValue N0 = N->getOperand(0);
50210 SDValue N1 = N->getOperand(1);
50228 /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50229 /// ->
50234 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50236 EVT VT = N->getValueType(0);
50245 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50246 // end-users are ISD::AND including cases
50248 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50249 !SVN->getOperand(1).isUndef()) {
50252 SDValue IVEN = SVN->getOperand(0);
50257 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50265 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50266 SVN->getOperand(1), SVN->getMask());
50272 SDValue N0 = N->getOperand(0);
50273 SDValue N1 = N->getOperand(1);
50319 SelectionDAG &DAG, unsigned Depth) {
50321 if (Depth >= SelectionDAG::MaxRecursionDepth)
50334 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50348 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50367 // register. In most cases we actually compare or select YMM-sized registers
50370 // Even with AVX-512 this is still useful for removing casts around logical
50403 // clang-format off
50408 // clang-format on
50413 /// If both input operands of a logic op are being cast from floating-point
50414 /// types or FP compares, try to convert this into a floating-point logic node
50449 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50450 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50460 // logic (setcc N00, N01), (setcc N10, N11) -->
50478 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50479 // to reduce XMM->GPR traffic.
50508 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50527 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50547 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50584 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
50586 /// with a shift-right to eliminate loading the vector constant mask value.
50589 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50590 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50596 // shift and "andn". This saves a materialization of a -1 vector constant.
50599 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50604 if (N->getValueType(0) == VT &&
50620 VT.getScalarSizeInBits() - 1, DAG);
50642 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50644 return DAG.getBitcast(N->getValueType(0), Shift);
50650 if (Ld->isIndexed())
50653 SDValue Base = Ld->getBasePtr();
50669 /// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
50674 MVT VT = N->getSimpleValueType(0);
50696 // 'and-load' sequence.
50700 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50707 MVT VT = Node->getSimpleValueType(0);
50717 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
50720 const Value *MemOp = Ld->getMemOperand()->getValue();
50729 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50730 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50731 Constant *Init = GV->getInitializer();
50732 Type *Ty = Init->getType();
50734 !Ty->getArrayElementType()->isIntegerTy() ||
50735 Ty->getArrayElementType()->getScalarSizeInBits() !=
50737 Ty->getArrayNumElements() >
50738 Ty->getArrayElementType()->getScalarSizeInBits())
50742 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50745 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50746 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50754 // Do the transformation (For 32-bit type):
50755 // -> (and (load arr[idx]), inp)
50756 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
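// Illustrative sketch, not from X86ISelLowering.cpp: the identity the and-load
// transformation above relies on. For a table arr[j] = 2^j - 1, the loaded
// mask equals all-ones shifted right by (32 - j) for j in [1, 31]; j == 0
// would be a shift by 32, which C++ leaves undefined. Helper name is invented.
#include <cstdint>
constexpr bool MaskTableMatchesSrl() {
  for (unsigned j = 1; j < 32; ++j) {
    uint32_t TableEntry = (1u << j) - 1;        // arr[j] = 2^j - 1
    uint32_t SrlForm = 0xFFFFFFFFu >> (32 - j); // srl 0xFFFFFFFF, (sub 32, j)
    if (TableEntry != SrlForm)
      return false;
  }
  return true;
}
static_assert(MaskTableMatchesSrl(), "arr[j] == 0xFFFFFFFF >> (32 - j)");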
50758 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
50777 // where the setcc will freely 0 upper bits of k-register. We can replace the
50782 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50784 EVT VT = N->getValueType(0);
50788 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50795 SDValue Src = N->getOperand(0);
50827 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50861 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50866 // Only do this re-ordering if op has one use.
50874 if (Depth++ >= kMaxDepth)
50879 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50881 Op.getOperand(1 - OpIdx));
50897 // BLSR: (and x, (add x, -1))
50898 // BLSMSK: (xor x, (add x, -1))
50907 EVT VT = N->getValueType(0);
50913 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50918 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50919 N->getOperand(1 - OpIdx), 0))
50930 // ->
50936 // ->
50942 SDValue SetCC = N->getOperand(0);
50948 SDNode *BrCond = *Flag->user_begin();
50949 if (BrCond->getOpcode() != X86ISD::BRCOND)
50952 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
50959 if (N->getOpcode() == X86ISD::SUB)
50960 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
50964 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
50970 SmallVector<SDValue> Ops(BrCond->op_values());
50971 if (isNullConstant(N->getOperand(1)))
50973 else if (isOneConstant(N->getOperand(1)))
50979 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
50980 // Avoid self-assign error b/c CC1 can be `e/ne`.
50990 // ->
50994 // ->
51002 SDValue SetCC0 = N->getOperand(0);
51003 SDValue SetCC1 = N->getOperand(1);
51008 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51034 bool IsOR = N->getOpcode() == ISD::OR;
51045 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51069 SDValue N0 = N->getOperand(0);
51070 SDValue N1 = N->getOperand(1);
51071 EVT VT = N->getValueType(0);
51083 // Use a 32-bit and+zext if upper bits known zero.
51095 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51118 // `(-x << C0) & C1`
51120 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
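// Illustrative sketch, not from X86ISelLowering.cpp: the rewrite above for the
// simple case where C1 is a low-bit mask 2^N - 1 and C0 < N, so Pow2_Ceil(C1)
// is 2^N and the multiplier becomes 2^N - 2^C0. Helper name is invented.
#include <cstdint>
constexpr bool NegShlMatchesMul(uint32_t X, unsigned C0, unsigned N) {
  uint32_t C1 = (1u << N) - 1;              // low-bit mask
  uint32_t MulAmt = (1u << N) - (1u << C0); // Pow2_Ceil(C1) - (1 << C0)
  // Both sides agree modulo 2^N, which is all the AND with C1 keeps.
  return (((0u - X) << C0) & C1) == ((X * MulAmt) & C1);
}
static_assert(NegShlMatchesMul(0x12345678u, 3, 16), "C0 = 3, C1 = 0xFFFF");
static_assert(NegShlMatchesMul(0xDEADBEEFu, 5, 24), "C0 = 5, C1 = 0xFFFFFF");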
51133 const APInt &MulC = N01C->getAPIntValue();
51134 const APInt &AndC = N1C->getAPIntValue();
51135 APInt MulCLowBit = MulC & (-MulC);
51140 assert(MulCLowBitLog != -1 &&
51155 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51158 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51161 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51164 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51189 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51190 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51197 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51203 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51205 if (isOneConstant(N1) && N0->hasOneUse()) {
51209 Src.getOperand(0)->hasOneUse())
51257 // We can't assume an undef src element gives an undef dst - the
51278 if (N->getOpcode() != ISD::DELETED_NODE)
51293 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51301 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51320 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
51335 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
51339 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51341 MVT VT = N->getSimpleValueType(0);
51346 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51347 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51370 // TODO - add UNDEF elts support.
51378 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
51379 // VPTERNLOG is only available as vXi32/64-bit types.
51392 SDValue X = N->getOperand(0);
51401 if (N->getOpcode() != ISD::OR)
51404 SDValue N0 = N->getOperand(0);
51405 SDValue N1 = N->getOperand(1);
51426 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
51442 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51444 EVT VT = N->getValueType(0);
51498 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
51499 // The result of the shift is true or false, and on X86, the 32-bit
51517 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
51521 return (N->getOpcode() == ISD::OR && N->hasOneUse());
51524 // Check the zero extend is extending to 32-bit or more. The code generated by
51525 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
51527 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
51528 !isORCandidate(N->getOperand(0)))
51533 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
51534 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
51535 N->getOperand(1).getOpcode() == X86ISD::CMP &&
51536 isNullConstant(N->getOperand(1).getOperand(1)) &&
51537 N->getOperand(1).getValueType().bitsGE(MVT::i32);
51540 SDNode *OR = N->getOperand(0).getNode();
51541 SDValue LHS = OR->getOperand(0);
51542 SDValue RHS = OR->getOperand(1);
51549 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
51550 LHS = OR->getOperand(0);
51551 RHS = OR->getOperand(1);
51575 LHS = OR->getOperand(0);
51576 RHS = OR->getOperand(1);
51578 if (RHS->getOpcode() == ISD::OR)
51586 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
51592 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
51594 SDValue NotOp = And0_L->getOperand(0);
51601 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
51602 EVT VT = And1_L->getValueType(0);
51613 /// "and-not" operation. This function is intended to be called from a
51616 // Note that masked-merge variants using XOR or ADD expressions are
51618 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
51619 SDValue N0 = Node->getOperand(0);
51620 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
51622 SDValue N1 = Node->getOperand(1);
51623 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
51627 SDValue N00 = N0->getOperand(0);
51628 SDValue N01 = N0->getOperand(1);
51629 SDValue N10 = N1->getOperand(0);
51630 SDValue N11 = N1->getOperand(1);
51653 // Look through a one-use zext.
51670 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51674 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51675 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51676 // This is a complicated way to get -1 or 0 from the carry flag:
51677 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51678 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51684 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51685 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51690 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51691 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
51693 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51704 // X + SETB Z --> adc X, 0
51705 // X - SETB Z --> sbb X, 0
51721 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51725 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51735 // X + SETAE --> sbb X, -1
51736 // X - SETAE --> adc X, -1
51743 // X + SETBE --> sbb X, -1
51744 // X - SETBE --> adc X, -1
51751 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51755 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51775 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51778 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
51780 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
51781 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
51782 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
51783 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
51792 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
51794 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
51795 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
51796 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
51797 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
51815 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
51816 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
51821 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
51822 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
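// Illustrative sketch, not from X86ISelLowering.cpp: the scalar value the
// neg/sbb idiom above materializes. NEG sets CF to (Z != 0), CMP Z, 1 sets CF
// to (Z == 0), and SBB of a register with itself yields -CF, i.e. 0 or
// all-ones. Helper name is invented.
#include <cstdint>
constexpr int32_t SbbSelfAfterNeg(uint32_t Z) {
  bool CF = (Z != 0);                  // carry flag after "neg Z"
  return 0 - static_cast<int32_t>(CF); // sbb %eax, %eax -> 0 or -1
}
static_assert(SbbSelfAfterNeg(0) == 0, "Z == 0 -> 0");
static_assert(SbbSelfAfterNeg(123) == -1, "Z != 0 -> -1");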
51832 bool IsSub = N->getOpcode() == ISD::SUB;
51833 SDValue X = N->getOperand(0);
51834 SDValue Y = N->getOperand(1);
51835 EVT VT = N->getValueType(0);
51865 bool N1COdd = N1C->getZExtValue() & 1;
51872 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
51889 SDValue N0 = N->getOperand(0);
51890 SDValue N1 = N->getOperand(1);
51891 EVT VT = N->getValueType(0);
51903 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51929 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51932 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51935 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51938 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51954 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
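// Illustrative sketch, not from X86ISelLowering.cpp: the OR-with-negated-setcc
// identity above, with SetCC being 0 or 1. Helper names are invented.
#include <cstdint>
constexpr uint64_t OrNegSetCC(bool SetCC, uint64_t C) {
  return (0ull - SetCC) | C; // SetCC ? all-ones : C
}
constexpr uint64_t LeaForm(bool SetCC, uint64_t C) {
  return static_cast<uint64_t>(!SetCC) * (C + 1) - 1;
}
static_assert(OrNegSetCC(true, 0x1234) == LeaForm(true, 0x1234), "SetCC == 1");
static_assert(OrNegSetCC(false, 0x1234) == LeaForm(false, 0x1234), "SetCC == 0");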
51964 uint64_t Val = CN->getZExtValue();
51979 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51980 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51981 // iff the upper elements of the non-shifted arg are zero.
52029 if (N->getOpcode() != ISD::DELETED_NODE)
52040 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52047 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52049 /// SETGT(X, -1)
52052 EVT ResultType = N->getValueType(0);
52056 SDValue N0 = N->getOperand(0);
52057 SDValue N1 = N->getOperand(1);
52079 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52082 // Create a greater-than comparison against -1.
52100 /// xor (sra X, elt_size(X)-1), -1
52102 /// pcmpgt X, -1
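// Illustrative sketch, not from X86ISelLowering.cpp: the two sign-bit tests
// above, written out for one 32-bit lane. Helper names are invented; the
// second form assumes two's-complement arithmetic right shift as on x86.
#include <cstdint>
constexpr bool LogicalSrlForm(int32_t X) {
  // xor(trunc(srl(X, 31)), 1) is 1 exactly when X > -1.
  return ((static_cast<uint32_t>(X) >> 31) ^ 1u) == static_cast<uint32_t>(X > -1);
}
constexpr bool ArithmeticSraForm(int32_t X) {
  // xor(sra(X, 31), -1) is all-ones exactly when X > -1 (pcmpgt X, -1).
  return ((X >> 31) ^ -1) == (X > -1 ? -1 : 0);
}
static_assert(LogicalSrlForm(5) && LogicalSrlForm(-5) && LogicalSrlForm(0), "srl form");
static_assert(ArithmeticSraForm(5) && ArithmeticSraForm(-5) && ArithmeticSraForm(0),
              "sra form");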
52108 EVT VT = N->getValueType(0);
52113 // clang-format off
52123 // clang-format on
52128 SDValue Shift = N->getOperand(0);
52129 SDValue Ones = N->getOperand(1);
52138 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52141 // Create a greater-than comparison against -1. We don't use the more obvious
52142 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52239 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52240 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52242 // clip to 0-255.
52256 // For 256-bit or smaller vectors, we require VLX.
52258 // If the result type is 256-bits or larger and we have disabled 512-bit
52271 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52334 EVT RegVT = Ld->getValueType(0);
52335 SDValue Ptr = Ld->getBasePtr();
52336 SDValue Chain = Ld->getChain();
52337 ISD::LoadExtType Ext = Ld->getExtensionType();
52339 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52362 for (SDNode *User : Chain->users()) {
52365 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52366 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52368 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
52369 User->getValueSizeInBits(0).getFixedValue() >
52371 EVT UserVT = User->getValueType(0);
52372 SDValue UserPtr = UserLd->getBasePtr();
52378 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52379 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52408 EVT RegVT = Ld->getValueType(0);
52409 EVT MemVT = Ld->getMemoryVT();
52413 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
52414 // into two 16-byte operations. Also split non-temporal aligned loads on
52415 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
52416 ISD::LoadExtType Ext = Ld->getExtensionType();
52420 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
52421 Ld->getAlign() >= Align(16)) ||
52423 *Ld->getMemOperand(), &Fast) &&
52430 SDValue Ptr1 = Ld->getBasePtr();
52436 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
52437 Ld->getOriginalAlign(),
52438 Ld->getMemOperand()->getFlags());
52439 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
52440 Ld->getPointerInfo().getWithOffset(HalfOffset),
52441 Ld->getOriginalAlign(),
52442 Ld->getMemOperand()->getFlags());
52450 // Bool vector load - attempt to cast to an integer, as we have good
52457 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52458 Ld->getPointerInfo(),
52459 Ld->getOriginalAlign(),
52460 Ld->getMemOperand()->getFlags());
52468 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52470 SDValue Ptr = Ld->getBasePtr();
52471 SDValue Chain = Ld->getChain();
52472 for (SDNode *User : Chain->users()) {
52475 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52476 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
52477 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
52478 !User->hasAnyUseOfValue(1) &&
52479 User->getValueSizeInBits(0).getFixedValue() >
52493 unsigned AddrSpace = Ld->getAddressSpace();
52497 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52499 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52500 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
52501 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
52502 Ld->getMemOperand()->getFlags());
52511 /// Otherwise, return -1.
52521 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52522 return -1;
52524 int TrueIndex = -1;
52525 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52527 const SDValue &Op = BV->getOperand(i);
52532 return -1;
52533 if (ConstNode->getAPIntValue().countr_one() >= 1) {
52536 return -1;
52551 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52557 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52559 Addr = MaskedOp->getBasePtr();
52567 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52572 /// If exactly one element of the mask is set for a non-extending masked load,
52574 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52581 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52594 EVT VT = ML->getValueType(0);
52604 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52605 ML->getPointerInfo().getWithOffset(Offset),
52606 Alignment, ML->getMemOperand()->getFlags());
52608 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52620 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52621 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52625 EVT VT = ML->getValueType(0);
52631 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52632 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52633 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52635 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52636 ML->getMemOperand());
52637 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52638 ML->getPassThru());
52644 // (for example, vblendvps -> vblendps).
52646 // Don't try this if the pass-through operand is already undefined. That would
52648 if (ML->getPassThru().isUndef())
52651 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52654 // The new masked load has an undef pass-through operand. The select uses the
52655 // original pass-through operand.
52657 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52658 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52659 ML->getAddressingMode(), ML->getExtensionType());
52660 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52661 ML->getPassThru());
52672 if (Mld->isExpandingLoad())
52675 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52686 // If the mask value has been legalized to a non-boolean vector, try to
52688 SDValue Mask = Mld->getMask();
52690 EVT VT = Mld->getValueType(0);
52694 if (N->getOpcode() != ISD::DELETED_NODE)
52701 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52702 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52703 Mld->getAddressingMode(), Mld->getExtensionType());
52709 /// If exactly one element of the mask is set for a non-truncating masked store,
52711 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52716 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52728 SDValue Value = MS->getValue();
52740 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52741 MS->getPointerInfo().getWithOffset(Offset),
52742 Alignment, MS->getMemOperand()->getFlags());
52749 if (Mst->isCompressingStore())
52752 EVT VT = Mst->getValue().getValueType();
52756 if (Mst->isTruncatingStore())
52762 // If the mask value has been legalized to a non-boolean vector, try to
52764 SDValue Mask = Mst->getMask();
52768 if (N->getOpcode() != ISD::DELETED_NODE)
52774 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52775 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52776 Mst->getMemoryVT(), Mst->getMemOperand(),
52777 Mst->getAddressingMode());
52780 SDValue Value = Mst->getValue();
52781 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52783 Mst->getMemoryVT())) {
52784 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52785 Mst->getBasePtr(), Mst->getOffset(), Mask,
52786 Mst->getMemoryVT(), Mst->getMemOperand(),
52787 Mst->getAddressingMode(), true);
52797 EVT StVT = St->getMemoryVT();
52799 SDValue StoredVal = St->getValue();
52810 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52811 St->getPointerInfo(), St->getOriginalAlign(),
52812 St->getMemOperand()->getFlags());
52816 // This will avoid a copy to k-register.
52823 return DAG.getStore(St->getChain(), dl, Val,
52824 St->getBasePtr(), St->getPointerInfo(),
52825 St->getOriginalAlign(),
52826 St->getMemOperand()->getFlags());
52837 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52838 St->getPointerInfo(), St->getOriginalAlign(),
52839 St->getMemOperand()->getFlags());
52846 // If it's a v64i1 store without 64-bit support, we need two stores.
52849 StoredVal->ops().slice(0, 32));
52852 StoredVal->ops().slice(32, 32));
52855 SDValue Ptr0 = St->getBasePtr();
52859 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52860 St->getOriginalAlign(),
52861 St->getMemOperand()->getFlags());
52863 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52864 St->getPointerInfo().getWithOffset(4),
52865 St->getOriginalAlign(),
52866 St->getMemOperand()->getFlags());
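// Minimal sketch of the v64i1 split above, assuming a little-endian 32-bit
// target without 64-bit GPRs (illustration, not the lowering itself): the 64
// mask bits are stored as two 32-bit halves at offsets 0 and 4.
#include <cstdint>
#include <cstring>
static void StoreMask64AsTwo32(uint8_t *Ptr, uint64_t MaskBits) {
  uint32_t Lo = static_cast<uint32_t>(MaskBits);
  uint32_t Hi = static_cast<uint32_t>(MaskBits >> 32);
  std::memcpy(Ptr, &Lo, sizeof(Lo));      // low 32 bits at offset 0
  std::memcpy(Ptr + 4, &Hi, sizeof(Hi));  // high 32 bits at offset 4
}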
52871 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52872 St->getPointerInfo(), St->getOriginalAlign(),
52873 St->getMemOperand()->getFlags());
52876 // Convert scalar fabs/fneg load-store to integer equivalents.
52893 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
52894 St->getPointerInfo(), St->getOriginalAlign(),
52895 St->getMemOperand()->getFlags());
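// Bit-level picture of the fabs/fneg store combine above (hypothetical
// helpers, illustration only): once the value is viewed as an integer, fabs
// is an AND that clears the sign bit and fneg is an XOR that flips it, so no
// FP instruction is needed on the store path.
#include <cstdint>
#include <cstring>
static uint32_t FabsAsIntF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits & 0x7fffffffu;  // clear the sign bit
}
static uint32_t FnegAsIntF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits ^ 0x80000000u;  // flip the sign bit
}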
52899 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52900 // Sandy Bridge, perform two 16-byte stores.
52904 *St->getMemOperand(), &Fast) &&
52913 // Split under-aligned vector non-temporal stores.
52914 if (St->isNonTemporal() && StVT == VT &&
52915 St->getAlign().value() < VT.getStoreSize()) {
52916 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52925 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52935 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52937 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52938 St->getValue().getOpcode() == ISD::TRUNCATE &&
52939 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52941 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52943 St->getValue().getOperand(0));
52944 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52945 MVT::v16i8, St->getMemOperand());
52949 if (!St->isTruncatingStore() &&
52955 return EmitTruncSStore(IsSigned, St->getChain(),
52956 dl, StoredVal.getOperand(0), St->getBasePtr(),
52957 VT, St->getMemOperand(), DAG);
52961 if (!St->isTruncatingStore()) {
52983 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52984 TruncVT, St->getMemOperand());
52993 if (St->isTruncatingStore() && VT.isVector()) {
52995 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52996 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52997 dl, Val, St->getBasePtr(),
52998 St->getMemoryVT(), St->getMemOperand(), DAG);
52999 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53001 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53002 dl, Val, St->getBasePtr(),
53003 St->getMemoryVT(), St->getMemOperand(), DAG);
53010 unsigned AddrSpace = St->getAddressSpace();
53014 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53016 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53018 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53019 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
53020 St->getAAInfo());
53027 Subtarget.hasCF() && St->isSimple()) {
53037 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53038 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53056 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53059 St->getMemOperand());
53062 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53067 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53079 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53080 cast<LoadSDNode>(St->getValue())->isSimple() &&
53081 St->getChain().hasOneUse() && St->isSimple()) {
53082 auto *Ld = cast<LoadSDNode>(St->getValue());
53088 if (!Ld->hasNUsesOfValue(1, 0))
53094 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53095 Ld->getBasePtr(), Ld->getMemOperand());
53099 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53100 St->getMemOperand());
53103 // This is similar to the above case, but here we handle a scalar 64-bit
53104 // integer store that is extracted from a vector on a 32-bit target.
53105 // If we have SSE2, then we can treat it like a floating-point double
53110 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53111 SDValue OldExtract = St->getOperand(1);
53118 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53119 St->getPointerInfo(), St->getOriginalAlign(),
53120 St->getMemOperand()->getFlags());
53131 SDValue StoredVal = N->getOperand(1);
53133 EVT MemVT = St->getMemoryVT();
53141 if (N->getOpcode() != ISD::DELETED_NODE)
53158 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53160 /// A horizontal-op B, for some already available A and B, and if so then LHS is
53178 // which is A horizontal-op B.
53220 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53277 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53278 // so we just repeat the inner loop if this is a 256-bit op.
53299 // Compute the post-shuffle mask index based on where the element
53303 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53305 // The low half of the 128-bit result must choose from A.
53306 // The high half of the 128-bit result must choose from B,
53322 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53330 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53333 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53334 llvm::any_of(NewRHS->users(), FoundHorizUser));
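// Reference semantics for the 128-bit horizontal add described above
// (hypothetical helper, illustration only): the low half takes adjacent pairs
// from A and the high half adjacent pairs from B, i.e.
// < a0+a1, a2+a3, b0+b1, b2+b3 >.
static void HorizontalAdd4f(const float A[4], const float B[4], float Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}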
53352 EVT VT = N->getValueType(0);
53353 unsigned Opcode = N->getOpcode();
53358 return N->hasOneUse() &&
53359 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53360 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53361 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53369 SDValue LHS = N->getOperand(0);
53370 SDValue RHS = N->getOperand(1);
53386 SDValue LHS = N->getOperand(0);
53387 SDValue RHS = N->getOperand(1);
53411 // <i32 -2147483648[float -0.000000e+00]> 0
53413 // <(load 4 from constant-pool)> t0, t29
53424 EVT VT = N->getValueType(0);
53425 SDValue LHS = N->getOperand(0);
53426 SDValue RHS = N->getOperand(1);
53428 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
53430 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
53432 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
53481 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53482 !AllowContract(N->getFlags()))
53485 EVT VT = N->getValueType(0);
53489 SDValue LHS = N->getOperand(0);
53490 SDValue RHS = N->getOperand(1);
53495 &HasNoSignedZero](SDValue N) -> bool {
53500 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53508 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
53509 HasNoSignedZero(Op0->getFlags())) ||
53510 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
53533 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53537 /// Do target-specific dag combines on floating-point adds/subs.
53551 EVT VT = N->getValueType(0);
53552 SDValue Src = N->getOperand(0);
53565 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53567 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53573 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53574 SDValue Src = N->getOperand(0);
53578 EVT VT = N->getValueType(0);
53615 // In most cases it's only worth pre-truncating if we're only facing the cost
53620 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
53658 // Only handle vXi16 types that are at least 128-bits unless they will be
53674 // Count leading sign/zero bits on both inputs - if there are enough then
53675 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53724 // adjacent pairs of 16-bit products, and saturates the result before
53725 // truncating to 16-bits.
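// Reference semantics of the PMADDUBSW operation this pattern is matched into
// (hypothetical helper, illustration only): unsigned bytes from the first
// source are multiplied by signed bytes from the second, adjacent products are
// summed, and the sum is saturated to a signed 16-bit result.
#include <algorithm>
#include <cstdint>
static int16_t PmaddubswPair(uint8_t A0, int8_t B0, uint8_t A1, int8_t B1) {
  int32_t Sum = int32_t(A0) * B0 + int32_t(A1) * B1;
  Sum = std::clamp<int32_t>(Sum, INT16_MIN, INT16_MAX);  // signed saturation
  return static_cast<int16_t>(Sum);
}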
53818 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53819 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53820 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53821 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53874 EVT VT = N->getValueType(0);
53875 SDValue Src = N->getOperand(0);
53878 // Attempt to pre-truncate inputs to arithmetic ops instead.
53912 EVT VT = N->getValueType(0);
53913 SDValue In = N->getOperand(0);
53931 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53938 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53939 if (N->getOpcode() == ISD::FNEG)
53940 return N->getOperand(0);
53943 if (Depth > SelectionDAG::MaxRecursionDepth)
53946 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53949 EVT VT = Op->getValueType(0);
53959 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53962 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53965 cast<ShuffleVectorSDNode>(Op)->getMask());
53970 // -V, INDEX).
53975 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54005 // Only allow bitcast from correctly-sized constant.
54021 // clang-format off
54035 // clang-format on
54041 // clang-format off
54059 // clang-format on
54066 // clang-format off
54076 // clang-format on
54083 /// Do target-specific dag combines on floating point negations.
54087 EVT OrigVT = N->getValueType(0);
54102 // use of a constant by performing (-0 - A*B) instead.
54105 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54125 unsigned Depth) const {
54127 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54135 SDNodeFlags Flags = Op.getNode()->getFlags();
54160 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54170 // Fill in the non-negated ops with the original values.
54179 ForCodeSize, Cost, Depth + 1))
54185 ForCodeSize, Cost, Depth);
54190 MVT VT = N->getSimpleValueType(0);
54201 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54202 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54204 switch (N->getOpcode()) {
54205 // clang-format off
54211 // clang-format on
54218 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54220 if (N->getOpcode() != ISD::XOR)
54223 SDValue LHS = N->getOperand(0);
54224 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54228 X86::CondCode(LHS->getConstantOperandVal(0)));
54230 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
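// Boolean identity behind the xor(setcc, 1) fold above (hypothetical helper,
// illustration only): flipping a SETCC result with xor 1 is the same as
// testing the inverted condition code.
static bool XorOneInvertsSetcc(int A, int B) {
  return ((A == B) ^ 1) == (A != B);  // holds for every A and B
}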
54235 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54240 EVT VT = N->getValueType(0);
54245 SDValue N0 = N->getOperand(0);
54246 SDValue N1 = N->getOperand(1);
54258 } else if (N->getOpcode() == ISD::SUB) {
54271 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54292 SDValue N0 = N->getOperand(0);
54293 SDValue N1 = N->getOperand(1);
54294 EVT VT = N->getValueType(0);
54308 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54311 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54314 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54317 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54330 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54336 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54347 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
54358 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54359 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
54362 N0.getOperand(0).getOpcode() == N->getOpcode()) {
54366 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
54383 SDValue N0 = N->getOperand(0);
54384 EVT VT = N->getValueType(0);
54386 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
54397 ReverseMask[I] = (NumElts - 1) - I;
54411 unsigned Opcode = N->getOpcode();
54412 SDValue N0 = N->getOperand(0);
54413 SDValue N1 = N->getOperand(1);
54414 EVT VT = N->getValueType(0);
54418 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
54435 EVT VT = N->getValueType(0);
54438 // TODO - Constant Folding.
54457 /// to be used as a replacement operand with operations (eg, bitwise-and) where
54472 SDValue N0 = N->getOperand(0);
54473 SDValue N1 = N->getOperand(1);
54474 EVT VT = N->getValueType(0);
54487 return C && C->getConstantFPValue()->isAllOnesValue();
54490 // fand (fxor X, -1), Y --> fandn X, Y
54494 // fand X, (fxor Y, -1) --> fandn Y, X
54501 /// Do target-specific dag combines on X86ISD::FAND nodes.
54504 // FAND(0.0, x) -> 0.0
54505 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54508 // FAND(x, 0.0) -> 0.0
54509 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54518 /// Do target-specific dag combines on X86ISD::FANDN nodes.
54521 // FANDN(0.0, x) -> x
54522 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54523 return N->getOperand(1);
54525 // FANDN(x, 0.0) -> 0.0
54526 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54532 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
54536 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54538 // F[X]OR(0.0, x) -> x
54539 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54540 return N->getOperand(1);
54542 // F[X]OR(x, 0.0) -> x
54543 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54544 return N->getOperand(0);
54552 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
54554 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54561 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
54564 switch (N->getOpcode()) {
54570 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54571 N->getOperand(0), N->getOperand(1));
54576 EVT VT = N->getValueType(0);
54588 SDValue Op0 = N->getOperand(0);
54589 SDValue Op1 = N->getOperand(1);
54591 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54595 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54596 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54598 // If one of the operands is known non-NaN use the native min/max instructions
54599 // with the non-NaN input as second operand.
54601 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54603 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54617 //            ----------------
54618 //     Num    |  Max  |  Op0 |
54619 // Op0        ----------------
54620 //     NaN    |  Op1  |  NaN |
54621 //            ----------------
54642 EVT VT = N->getValueType(0);
54650 SDValue In = N->getOperand(0);
54654 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54655 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54661 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54676 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
54677 EVT VT = N->getValueType(0);
54680 SDValue In = N->getOperand(IsStrict ? 1 : 0);
54684 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54693 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54694 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54698 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54710 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
54714 SDValue N0 = N->getOperand(0);
54715 SDValue N1 = N->getOperand(1);
54716 MVT VT = N->getSimpleValueType(0);
54722 // ANDNP(undef, x) -> 0
54723 // ANDNP(x, undef) -> 0
54727 // ANDNP(0, x) -> x
54731 // ANDNP(x, 0) -> 0
54735 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54745 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
54773 if (N0->hasOneUse()) {
54803 // We can't assume an undef src element gives an undef dst - the
54825 if (N->getOpcode() != ISD::DELETED_NODE)
54832 if (N1->hasOneUse()) {
54833 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
54838 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
54858 SDValue N1 = N->getOperand(1);
54864 if (N->getOpcode() != ISD::DELETED_NODE)
54874 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54875 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54877 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54881 if (N->getOpcode() != ISD::DELETED_NODE)
54888 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54893 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54894 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54897 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54914 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54916 EVT DstVT = N->getValueType(0);
54918 SDValue N0 = N->getOperand(0);
54919 SDValue N1 = N->getOperand(1);
54920 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54976 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54981 EVT VT = N->getValueType(0);
54982 SDValue N0 = N->getOperand(0);
54983 SDValue N1 = N->getOperand(1);
54984 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54988 // both SSE and AVX2 since there is no sign-extended shift right
54989 // operation on a vector with 64-bit elements.
54990 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55016 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55017 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55023 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55024 Ext->getOpcode() != ISD::ZERO_EXTEND)
55028 EVT VT = Ext->getValueType(0);
55032 SDValue Add = Ext->getOperand(0);
55038 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55039 bool NSW = Add->getFlags().hasNoSignedWrap();
55040 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55062 for (auto *User : Ext->users()) {
55063 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55072 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55073 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55077 // sign-extended.
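// Scalar illustration of the sext(add_nsw(x, C)) fold above (hypothetical
// helper): when the narrow add cannot overflow, which is what the nsw flag
// guarantees, extending after the add gives the same value as adding the
// extended operands.
#include <cstdint>
static bool SextOfNswAdd(int8_t X, int8_t C) {
  int Wide = int(X) + int(C);
  if (Wide < INT8_MIN || Wide > INT8_MAX)
    return true;  // the narrow add would wrap; not the case being folded
  return int32_t(int8_t(Wide)) == int32_t(X) + int32_t(C);
}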
55085 // operands and the result of CMOV is not used anywhere else - promote CMOV
55088 // (or more) pseudo-CMOVs only when they go one-after-another and
55092 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55093 // promotion is also good in terms of code-size.
55094 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55097 SDValue CMovN = Extend->getOperand(0);
55101 EVT TargetVT = Extend->getValueType(0);
55102 unsigned ExtendOpcode = Extend->getOpcode();
55145 SDValue N0 = N->getOperand(0);
55146 EVT VT = N->getValueType(0);
55169 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55181 if (N->getOpcode() == ISD::ZERO_EXTEND)
55190 SDValue N0 = N->getOperand(0);
55191 EVT VT = N->getValueType(0);
55194 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55197 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55198 N0->getOperand(1));
55220 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55251 return User->getOpcode() != ISD::FMA &&
55252 User->getOpcode() != ISD::STRICT_FMA;
55254 if (llvm::any_of(V->users(), IsNotFMA))
55260 for (const SDValue &Op : V->op_values()) {
55262 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55275 if (llvm::any_of(NV->users(), IsNotFMA))
55282 for (const SDValue &Op : V->op_values()) {
55284 if (Cst->isNegative())
55296 EVT VT = N->getValueType(0);
55298 bool IsStrict = N->isTargetOpcode()
55299 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55300 : N->isStrictFPOpcode();
55307 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55308 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55309 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55311 // If the operation allows fast-math and the target does not support FMA,
55313 SDNodeFlags Flags = N->getFlags();
55367 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
55369 // Propagate fast-math-flags to new FMA node.
55372 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
55374 {N->getOperand(0), A, B, C});
55376 if (N->getNumOperands() == 4)
55377 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
55382 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
55383 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
55387 EVT VT = N->getValueType(0);
55392 SDValue N2 = N->getOperand(2);
55398 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
55400 if (N->getNumOperands() == 4)
55401 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55402 NegN2, N->getOperand(3));
55403 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55411 SDValue N0 = N->getOperand(0);
55412 EVT VT = N->getValueType(0);
55414 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55416 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
55418 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
55419 N0->getOperand(1));
55439 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
55470 /// pre-promote its result type since vXi1 vectors don't get promoted
55489 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55490 const SDValue LHS = N->getOperand(0);
55491 const SDValue RHS = N->getOperand(1);
55492 EVT VT = N->getValueType(0);
55509 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55510 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
55512 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55527 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55528 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55530 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55545 // cmpeq(trunc(x),C) --> cmpeq(x,C)
55546 // cmpne(trunc(x),C) --> cmpne(x,C)
55562 // icmp eq Abs(X) C ->
55563 // (icmp eq A, C) | (icmp eq A, -C)
55564 // icmp ne Abs(X) C ->
55565 // (icmp ne A, C) & (icmp ne A, -C)
55571 const APInt &CInt = C->getAPIntValue();
55577 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
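// Scalar identity behind the |X| == C expansion above, for a non-negative C
// (hypothetical helper, illustration only; the combine handles the remaining
// cases explicitly):
#include <cstdint>
static bool AbsEqualsExpandsToTwoCompares(int32_t X, int32_t C) {
  if (C < 0)
    return true;  // not the case being illustrated
  // ISD::ABS-style absolute value: wraps for INT32_MIN instead of being UB.
  int32_t A = X < 0 ? int32_t(0u - uint32_t(X)) : X;
  return (A == C) == (X == C || X == -C);  // holds for all X
}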
55695 // -> `(icmp ult (add x, -C), 2)`
55699 // in worse codegen. So, undo the middle-end transform and go back to `(or
55722 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
55739 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
55740 (CC == ISD::SETUGE && (-CmpC) == 2)) {
55756 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55762 // X pred 0.0 --> X pred -X
55778 SDValue Src = N->getOperand(0);
55780 MVT VT = N->getSimpleValueType(0);
55800 // Look through int->fp bitcasts that don't change the element width.
55806 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55817 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55828 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55829 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55831 // Use KnownBits to determine if only a single bit is non-zero
55845 // vXi8 shifts - we only care about the signbit so can use PSLLW.
55861 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55862 if (N->isOnlyUserOf(Src.getNode())) {
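// Bit-level picture of the movmsk(not(x)) fold above (hypothetical helper,
// illustration only): complementing the vector flips every sign bit, so the
// collected mask is the complement of the original over the low lane-count
// bits.
#include <immintrin.h>
static bool MovmskOfNot(__m128 X) {
  __m128 NotX = _mm_xor_ps(X, _mm_castsi128_ps(_mm_set1_epi32(-1)));
  return _mm_movemask_ps(NotX) == (~_mm_movemask_ps(X) & 0xF);
}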
55895 MVT VT = N->getSimpleValueType(0);
55910 SDValue Mask = MemOp->getMask();
55917 if (N->getOpcode() != ISD::DELETED_NODE)
55932 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55933 Gather->getMask(), Base, Index, Scale } ;
55934 return DAG.getMaskedGather(Gather->getVTList(),
55935 Gather->getMemoryVT(), DL, Ops,
55936 Gather->getMemOperand(),
55937 Gather->getIndexType(),
55938 Gather->getExtensionType());
55941 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55942 Scatter->getMask(), Base, Index, Scale };
55943 return DAG.getMaskedScatter(Scatter->getVTList(),
55944 Scatter->getMemoryVT(), DL,
55945 Ops, Scatter->getMemOperand(),
55946 Scatter->getIndexType(),
55947 Scatter->isTruncatingStore());
55954 SDValue Index = GorS->getIndex();
55955 SDValue Base = GorS->getBasePtr();
55956 SDValue Scale = GorS->getScale();
55962 // Shrink constant indices if they are larger than 32-bits.
55970 if (BV->isConstant() && IndexWidth > 32 &&
55971 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55985 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56000 uint64_t ScaleAmt = Scale->getAsZExtVal();
56003 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
56004 // FIXME: Allow non-constant?
56007 APInt Adder = C->getAPIntValue() * ScaleAmt;
56018 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
56046 SDValue Mask = GorS->getMask();
56050 if (N->getOpcode() != ISD::DELETED_NODE)
56063 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56064 SDValue EFLAGS = N->getOperand(1);
56077 SDValue EFLAGS = N->getOperand(3);
56078 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56085 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56086 N->getOperand(1), Cond, Flags);
56095 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56099 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56106 EVT VT = N->getValueType(0);
56107 bool IsStrict = N->isStrictFPOpcode();
56109 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56116 // make the transformation for non-constant splats as well, but it's unclear
56121 if (!BV->isConstant())
56126 EVT IntVT = BV->getValueType(0);
56131 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56132 {N->getOperand(0), SDValue(BV, 0)});
56134 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56137 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
56148 /// If we are converting a value to floating-point, try to replace scalar
56155 SDValue Trunc = N->getOperand(0);
56171 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
56180 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
56185 bool IsStrict = N->isStrictFPOpcode();
56186 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56187 EVT VT = N->getValueType(0);
56193 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
56194 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
56196 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56197 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
56213 {N->getOperand(0), P});
56217 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
56218 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
56219 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
56229 {N->getOperand(0), P});
56236 SDNodeFlags Flags = N->getFlags();
56240 {N->getOperand(0), Op0});
56252 bool IsStrict = N->isStrictFPOpcode();
56257 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56258 EVT VT = N->getValueType(0);
56264 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
56265 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
56267 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56268 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
56284 {N->getOperand(0), P});
56288 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
56289 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
56290 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
56298 {N->getOperand(0), P});
56308 if (NumSignBits >= (BitWidth - 31)) {
56317 {N->getOperand(0), Trunc});
56325 { 0, 2, -1, -1 });
56328 {N->getOperand(0), Shuf});
56334 // a 32-bit target where SSE doesn't support i64->FP operations.
56348 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
56351 Subtarget.getTargetLowering()->BuildFILD(
56352 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
56353 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
56374 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
56375 EVT SrcVT = N->getOperand(0).getValueType();
56376 EVT DstVT = N->getValueType(0);
56384 N->getOperand(0), V2F32Value);
56398 for (const SDNode *User : Flags->users()) {
56400 switch (User->getOpcode()) {
56406 CC = (X86::CondCode)User->getConstantOperandVal(0);
56410 CC = (X86::CondCode)User->getConstantOperandVal(2);
56415 // clang-format off
56423 // clang-format on
56433 for (const SDNode *User : Flags->users()) {
56435 switch (User->getOpcode()) {
56449 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
56461 if (!isNullConstant(N->getOperand(1)))
56469 SDValue Op = N->getOperand(0);
56486 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
56501 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
56529 // Peek through any zero-extend if we're only testing for a zero result.
56547 // i32 truncated op to prevent partial-reg compares of promoted ops.
56607 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56611 SDValue LHS = N->getOperand(0);
56612 SDValue RHS = N->getOperand(1);
56614 bool IsSub = X86ISD::SUB == N->getOpcode();
56617 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
56622 if (!N->hasAnyUseOfValue(1)) {
56630 SDVTList VTs = DAG.getVTList(N->getValueType(0));
56635 if (GenericAddSub->hasOneUse() &&
56636 GenericAddSub->user_begin()->isOnlyUserOf(N))
56644 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56653 SDValue LHS = N->getOperand(0);
56654 SDValue RHS = N->getOperand(1);
56655 SDValue BorrowIn = N->getOperand(2);
56658 MVT VT = N->getSimpleValueType(0);
56663 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56666 !N->hasAnyUseOfValue(1))
56667 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56676 SDValue LHS = N->getOperand(0);
56677 SDValue RHS = N->getOperand(1);
56678 SDValue CarryIn = N->getOperand(2);
56684 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56690 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56695 EVT VT = N->getValueType(0);
56696 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56705 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56708 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56710 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56711 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
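// Arithmetic identity behind the ADC constant fold above (hypothetical
// helper, illustration only; note the combine also requires the node's
// carry-out to be unused):
#include <cstdint>
static bool AdcConstantsCommute(uint32_t C1, uint32_t C2, uint32_t CarryIn) {
  return C1 + C2 + CarryIn == 0u + (C1 + C2) + CarryIn;  // same low 32 bits
}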
56717 MVT VT = N->getSimpleValueType(0);
56722 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56724 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56725 !N->hasAnyUseOfValue(1))
56726 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56777 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56778 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56953 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56962 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57000 /// earlier folds that may be used to turn select-of-constants into logic hacks.
57004 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57005 // better because we eliminate 1-2 instructions. This transform is still
57008 // immediate asm operands (fit in 32-bits).
57021 SDValue Cmov = N->getOperand(0);
57022 SDValue OtherOp = N->getOperand(1);
57033 EVT VT = N->getValueType(0);
57040 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57044 all_of(N->users(), [&](SDNode *Use) {
57046 return MemNode && MemNode->getBasePtr().getNode() == N;
57048 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57059 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57069 EVT VT = N->getValueType(0);
57070 SDValue Op0 = N->getOperand(0);
57071 SDValue Op1 = N->getOperand(1);
57088 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57104 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
57124 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
57141 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
57142 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
57144 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
57145 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
57152 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
57153 // condition comes from the subtract node that produced -X. This matches the
57173 // Get the X and -X from the negate.
57189 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
57206 SDValue Op0 = N->getOperand(0);
57207 SDValue Op1 = N->getOperand(1);
57211 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
57213 EVT VT = N->getValueType(0);
57216 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
57221 APInt NewImm = Op0C->getAPIntValue() - 1;
57235 // ->
57237 if (N->getConstantOperandVal(3) != X86::COND_NE)
57240 SDValue Sub = N->getOperand(4);
57249 SmallVector<SDValue, 5> Ops(N->op_values());
57253 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
57254 cast<MemSDNode>(N)->getMemoryVT(),
57255 cast<MemSDNode>(N)->getMemOperand());
57261 EVT VT = N->getValueType(0);
57262 SDValue Op0 = N->getOperand(0);
57263 SDValue Op1 = N->getOperand(1);
57275 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
57278 Op1->hasOneUse()) {
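// Two's-complement identity behind the sub(C1, xor(X, C2)) rewrite above
// (hypothetical helper, illustration only): since -(X ^ C2) == (X ^ ~C2) + 1,
// the subtract can be re-expressed as an add of xor(X, ~C2) and C1 + 1.
#include <cstdint>
static bool SubOfXorBecomesAddOfXor(uint32_t X, uint32_t C1, uint32_t C2) {
  return C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1);  // holds for all inputs
}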
57293 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
57294 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
57296 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57297 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
57301 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
57303 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
57305 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57306 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
57322 unsigned Opcode = N->getOpcode();
57326 SDValue LHS = N->getOperand(0);
57327 SDValue RHS = N->getOperand(1);
57328 MVT VT = N->getSimpleValueType(0);
57338 // PCMPEQ(X,UNDEF) -> UNDEF
57339 // PCMPGT(X,UNDEF) -> 0
57340 // PCMPGT(UNDEF,X) -> 0
57417 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
57427 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
57436 // extract_subvector(broadcast(x))) -> broadcast(x)
57438 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
57446 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
57450 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
57469 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
57480 // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
57492 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
57543 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
57547 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
57789 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58038 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
58039 *FirstLd->getMemOperand(), &Fast) &&
58087 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
58096 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
58113 EVT VT = N->getValueType(0);
58114 EVT SrcVT = N->getOperand(0).getValueType();
58116 SmallVector<SDValue, 4> Ops(N->ops());
58125 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
58126 if (I == (E - 1)) {
58152 MVT OpVT = N->getSimpleValueType(0);
58157 SDValue Vec = N->getOperand(0);
58158 SDValue SubVec = N->getOperand(1);
58160 uint64_t IdxVal = N->getConstantOperandVal(2);
58197 Ins.getOperand(1), N->getOperand(2));
58206 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
58213 SubVec.getOperand(1), N->getOperand(2));
58275 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
58278 MemIntr->getMemoryVT(),
58279 MemIntr->getMemOperand());
58302 /// is a common pattern for AVX1 integer code because 256-bit selects may be
58303 /// legal, but there is almost no integer math/logic available for 256-bit.
58308 SDValue Sel = Ext->getOperand(0);
58315 // TODO: This can be extended to handle extraction to 256-bits.
58316 MVT VT = Ext->getSimpleValueType(0);
58324 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
58331 unsigned ExtIdx = Ext->getConstantOperandVal(1);
58360 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
58362 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
58370 if (!N->getValueType(0).isSimple())
58373 MVT VT = N->getSimpleValueType(0);
58374 SDValue InVec = N->getOperand(0);
58375 unsigned IdxVal = N->getConstantOperandVal(1);
58391 SDValue NotOp = V->getOperand(0);
58396 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
58399 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
58419 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
58421 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) -> EXTRACT_SUBVECTOR(V,C1+C2)
58438 InVec.getOperand(0), N->getOperand(1));
58439 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
58455 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
58624 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
58640 EVT VT = N->getValueType(0);
58641 SDValue Src = N->getOperand(0);
58672 if (Ld->getExtensionType() == Ext &&
58673 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
58710 // to remove XMM->GPR->XMM moves.
58720 for (SDNode *User : Src->users())
58721 if (User->getOpcode() == X86ISD::VBROADCAST &&
58722 Src == User->getOperand(0)) {
58725 User->getValueSizeInBits(0).getFixedValue();
58747 Amt->getZExtValue(), DAG);
58759 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
58778 SDValue LHS = N->getOperand(0);
58779 SDValue RHS = N->getOperand(1);
58784 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
58789 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
58803 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
58809 LHS.getOperand(0), { 0, -1, 1, -1 });
58811 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58813 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
58819 RHS.getOperand(0), { 0, -1, 1, -1 });
58821 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58830 MVT VT = N->getSimpleValueType(0);
58831 SDValue LHS = N->getOperand(0);
58832 SDValue RHS = N->getOperand(1);
58833 unsigned Opc = N->getOpcode();
58876 EVT VT = N->getValueType(0);
58877 SDValue In = N->getOperand(0);
58878 unsigned Opcode = N->getOpcode();
58887 if (Ld->isSimple()) {
58895 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
58896 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
58903 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
58908 // -> EXTEND_VECTOR_INREG(X).
58909 // TODO: Handle non-zero subvector indices.
58916 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
58943 EVT VT = N->getValueType(0);
58945 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
58949 // --> extract_subvector(kshiftr(X,C1+C2),0)
58950 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
58951 if (N->getOpcode() == X86ISD::KSHIFTR) {
58953 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
58954 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
58955 SDValue Src = N->getOperand(0).getOperand(0);
58956 uint64_t Amt = N->getConstantOperandVal(1) +
58957 N->getOperand(0).getConstantOperandVal(1);
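// Shift identity behind the kshiftr(kshiftr(X, C1), C2) fold above
// (hypothetical helper, illustration only; the combine also keeps C1 + C2
// within the mask width):
#include <cstdint>
static bool TwoRightShiftsCombine(uint16_t Mask, unsigned C1, unsigned C2) {
  return ((Mask >> C1) >> C2) == (Mask >> (C1 + C2));  // for C1 + C2 < 16
}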
58983 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
58986 if (N->getValueType(0) != MVT::f32 ||
58987 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
58992 N->getOperand(0).getOperand(0));
59003 EVT VT = N->getValueType(0);
59004 bool IsStrict = N->isStrictFPOpcode();
59005 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59068 {N->getOperand(0), Src});
59099 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
59100 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
59104 if (N->hasAnyUseOfValue(1))
59109 SDValue Ptr = MemIntrin->getBasePtr();
59110 SDValue Chain = MemIntrin->getChain();
59111 EVT VT = N->getSimpleValueType(0);
59112 EVT MemVT = MemIntrin->getMemoryVT();
59116 for (SDNode *User : Ptr->users())
59117 if (User != N && User->getOpcode() == N->getOpcode() &&
59118 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
59119 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
59120 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
59122 !User->hasAnyUseOfValue(1) &&
59123 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
59138 bool IsStrict = N->isStrictFPOpcode();
59139 EVT VT = N->getValueType(0);
59140 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59161 bool IsOp0Strict = Op0->isStrictFPOpcode();
59203 {N->getOperand(0), Src, Rnd});
59225 SDValue Src = N->getOperand(0);
59231 if (LN->isSimple()) {
59232 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
59233 LN->getBasePtr(),
59234 LN->getPointerInfo(),
59235 LN->getOriginalAlign(),
59236 LN->getMemOperand()->getFlags());
59247 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
59263 for (const SDValue &Arg : N->op_values()) {
59270 SDVTList VTs = N->getVTList();
59281 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
59284 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
59298 unsigned IntNo = N->getConstantOperandVal(0);
59301 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59312 unsigned IntNo = N->getConstantOperandVal(1);
59315 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59326 unsigned IntNo = N->getConstantOperandVal(1);
59329 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59338 switch (N->getOpcode()) {
59339 // clang-format off
59530 // clang-format on
59540 // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
59554 // TODO: Almost no 8-bit ops are desirable because they have no actual
59555 // size/speed advantages vs. 32-bit ops, but they do have a major
59558 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
59559 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
59560 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
59601 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
59603 // In case control-flow branch protection is enabled, we need to add
59621 EVT VT = LogicOp->getValueType(0);
59622 EVT OpVT = SETCC0->getOperand(0).getValueType();
59635 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
59647 // 8-bit multiply-by-constant can usually be expanded to something cheaper
59655 SDNode *User = *Op->user_begin();
59660 return Ld->getBasePtr() == St->getBasePtr();
59668 SDNode *User = *Op->user_begin();
59669 if (User->getOpcode() != ISD::ATOMIC_STORE)
59673 return Ld->getBasePtr() == St->getBasePtr();
59679 SDNode *User = *Op->user_begin();
59680 EVT VT = User->getValueType(0);
59681 return (User->getOpcode() == ISD::ZERO_EXTEND &&
59737 //===----------------------------------------------------------------------===//
59739 //===----------------------------------------------------------------------===//
59777 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
59779 const std::string &AsmStr = IA->getAsmString();
59781 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
59782 if (!Ty || Ty->getBitWidth() % 16 != 0)
59785 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
59808 // rorw $$8, ${0:w} --> llvm.bswap.i16
59809 if (CI->getType()->isIntegerTy(16) &&
59810 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59814 StringRef ConstraintsStr = IA->getConstraintString();
59822 if (CI->getType()->isIntegerTy(32) &&
59823 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59828 StringRef ConstraintsStr = IA->getConstraintString();
59835 if (CI->getType()->isIntegerTy(64)) {
59836 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
59840 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
59975 Type *Ty = CallOperandVal->getType();
59991 if (CallOperandVal->getType()->isIntegerTy())
59997 if (Ty->isFloatingPointTy())
60001 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60012 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60013 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
60014 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
60019 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60024 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60044 if (CallOperandVal->getType()->isIntegerTy())
60050 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
60054 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60055 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
60060 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60065 if (C->getZExtValue() <= 31)
60070 if (C->getZExtValue() <= 63)
60075 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
60080 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
60085 if (C->getZExtValue() <= 3)
60090 if (C->getZExtValue() <= 0xff)
60100 if ((C->getSExtValue() >= -0x80000000LL) &&
60101 (C->getSExtValue() <= 0x7fffffffLL))
60106 if (C->getZExtValue() <= 0xffffffff)
60148 // Extend to 32-bits
60166 if (C->getZExtValue() <= 31) {
60167 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60175 if (C->getZExtValue() <= 63) {
60176 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60184 if (isInt<8>(C->getSExtValue())) {
60185 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60193 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
60194 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
60195 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
60203 if (C->getZExtValue() <= 3) {
60204 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60212 if (C->getZExtValue() <= 255) {
60213 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60221 if (C->getZExtValue() <= 127) {
60222 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60229 // 32-bit signed value
60232 C->getSExtValue())) {
60234 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
60247 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
60248 BA->getValueType(0)));
60251 if (Op->getOpcode() == ISD::ADD &&
60252 isa<ConstantSDNode>(Op->getOperand(1))) {
60253 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
60254 Op = Op->getOperand(0);
60257 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
60258 GA->getValueType(0), Offset));
60263 // 32-bit unsigned value
60266 C->getZExtValue())) {
60267 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60279 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
60283 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
60284 : CST->getSExtValue();
60296 // If we are in non-pic codegen mode, we allow the address of a global (with
60302 Subtarget.classifyGlobalReference(GA->getGlobal())))
60389 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
60410 // 32-bit fallthrough
60704 // Map st(0) -> st(7) -> ST0
60714 return std::make_pair(X86::FP0 + Constraint[4] - '0',
60723 // flags -> EFLAGS
60727 // dirflag -> DF
60733 // fpsr -> FPSW
60741 // Make sure it isn't a register that requires 64-bit mode.
60744 TRI->getEncodingValue(Res.first) >= 8) {
60745 // Register requires REX prefix, but we're in 32-bit mode.
60751 TRI->getEncodingValue(Res.first) & 0x10) {
60757 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
60760 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
60785 // Model GCC's behavior here and select a fixed pair of 32-bit
60806 if (RC && RC->contains(DestReg))
60825 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
60827 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
60829 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
60862 // integer division, leaving the division as-is is a loss even in terms of
60875 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
60876 AFI->setIsSplitCSR(true);
60883 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
60888 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
60889 MachineBasicBlock::iterator MBBI = Entry->begin();
60897 Register NewVR = MRI->createVirtualRegister(RC);
60899 // FIXME: this currently does not emit CFI pseudo-instructions, it works
60900 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
60902 // CFI pseudo-instructions.
60904 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
60906 Entry->addLiveIn(*I);
60907 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
60910 // Insert the copy-back instructions right before the terminator.
60912 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
60913 TII->get(TargetOpcode::COPY), *I)
60926 assert(MBBI->isCall() && MBBI->getCFIType() &&
60932 switch (MBBI->getOpcode()) {
60939 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
60944 assert(MBBI->isCall() &&
60946 if (OrigCall->shouldUpdateAdditionalCallInfo())
60948 MBBI->setCFIType(MF, OrigCall->getCFIType());
60949 OrigCall->eraseFromParent();
60956 MachineOperand &Target = MBBI->getOperand(0);
60958 switch (MBBI->getOpcode()) {
60971 // 64-bit indirect thunk calls.
60981 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
60983 .addImm(MBBI->getCFIType())
60997 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61001 if (MF.getFunction().hasFnAttribute("probe-stack"))
61002 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
61003 "inline-asm";
61017 if (MF.getFunction().hasFnAttribute("probe-stack"))
61018 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
61023 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61037 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
61042 if (ML && ML->isInnermost() &&