1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to a X86 dag.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelDAGToDAG.h"
15 #include "X86.h"
16 #include "X86MachineFunctionInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineModuleInfo.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/Config/llvm-config.h"
23 #include "llvm/IR/ConstantRange.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cstdint>
35 
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "x86-isel"
39 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
40 
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42 
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44     cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45     cl::Hidden);
46 
47 static cl::opt<bool> EnablePromoteAnyextLoad(
48     "x86-promote-anyext-load", cl::init(true),
49     cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50 
51 extern cl::opt<bool> IndirectBranchTracking;
52 
53 //===----------------------------------------------------------------------===//
54 //                      Pattern Matcher Implementation
55 //===----------------------------------------------------------------------===//
56 
57 namespace {
58   /// This corresponds to X86AddressMode, but uses SDValues instead of register
59   /// numbers for the leaves of the matched tree.
60   struct X86ISelAddressMode {
61     enum {
62       RegBase,
63       FrameIndexBase
64     } BaseType = RegBase;
65 
66     // This is really a union, discriminated by BaseType!
67     SDValue Base_Reg;
68     int Base_FrameIndex = 0;
69 
70     unsigned Scale = 1;
71     SDValue IndexReg;
72     int32_t Disp = 0;
73     SDValue Segment;
74     const GlobalValue *GV = nullptr;
75     const Constant *CP = nullptr;
76     const BlockAddress *BlockAddr = nullptr;
77     const char *ES = nullptr;
78     MCSymbol *MCSym = nullptr;
79     int JT = -1;
80     Align Alignment;            // CP alignment.
81     unsigned char SymbolFlags = X86II::MO_NO_FLAG;  // X86II::MO_*
82     bool NegateIndex = false;
83 
84     X86ISelAddressMode() = default;
85 
86     bool hasSymbolicDisplacement() const {
87       return GV != nullptr || CP != nullptr || ES != nullptr ||
88              MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89     }
90 
91     bool hasBaseOrIndexReg() const {
92       return BaseType == FrameIndexBase ||
93              IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94     }
95 
96     /// Return true if this addressing mode is already RIP-relative.
97     bool isRIPRelative() const {
98       if (BaseType != RegBase) return false;
99       if (RegisterSDNode *RegNode =
100             dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101         return RegNode->getReg() == X86::RIP;
102       return false;
103     }
104 
105     void setBaseReg(SDValue Reg) {
106       BaseType = RegBase;
107       Base_Reg = Reg;
108     }
109 
110 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111     void dump(SelectionDAG *DAG = nullptr) {
112       dbgs() << "X86ISelAddressMode " << this << '\n';
113       dbgs() << "Base_Reg ";
114       if (Base_Reg.getNode())
115         Base_Reg.getNode()->dump(DAG);
116       else
117         dbgs() << "nul\n";
118       if (BaseType == FrameIndexBase)
119         dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120       dbgs() << " Scale " << Scale << '\n'
121              << "IndexReg ";
122       if (NegateIndex)
123         dbgs() << "negate ";
124       if (IndexReg.getNode())
125         IndexReg.getNode()->dump(DAG);
126       else
127         dbgs() << "nul\n";
128       dbgs() << " Disp " << Disp << '\n'
129              << "GV ";
130       if (GV)
131         GV->dump();
132       else
133         dbgs() << "nul";
134       dbgs() << " CP ";
135       if (CP)
136         CP->dump();
137       else
138         dbgs() << "nul";
139       dbgs() << '\n'
140              << "ES ";
141       if (ES)
142         dbgs() << ES;
143       else
144         dbgs() << "nul";
145       dbgs() << " MCSym ";
146       if (MCSym)
147         dbgs() << MCSym;
148       else
149         dbgs() << "nul";
150       dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151     }
152 #endif
153   };
154 }
155 
156 namespace {
157   //===--------------------------------------------------------------------===//
158   /// ISel - X86-specific code to select X86 machine instructions for
159   /// SelectionDAG operations.
160   ///
161   class X86DAGToDAGISel final : public SelectionDAGISel {
162     /// Keep a pointer to the X86Subtarget around so that we can
163     /// make the right decision when generating code for different targets.
164     const X86Subtarget *Subtarget;
165 
166     /// If true, selector should try to optimize for minimum code size.
167     bool OptForMinSize;
168 
169     /// Disable direct TLS access through segment registers.
170     bool IndirectTlsSegRefs;
171 
172   public:
173     X86DAGToDAGISel() = delete;
174 
175     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176         : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177           OptForMinSize(false), IndirectTlsSegRefs(false) {}
178 
179     bool runOnMachineFunction(MachineFunction &MF) override {
180       // Reset the subtarget each time through.
181       Subtarget = &MF.getSubtarget<X86Subtarget>();
182       IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183                              "indirect-tls-seg-refs");
184 
185       // OptFor[Min]Size are used in pattern predicates that isel is matching.
186       OptForMinSize = MF.getFunction().hasMinSize();
187       assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
188              "OptForMinSize implies OptForSize");
189       return SelectionDAGISel::runOnMachineFunction(MF);
190     }
191 
192     void emitFunctionEntryCode() override;
193 
194     bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
195 
196     void PreprocessISelDAG() override;
197     void PostprocessISelDAG() override;
198 
199 // Include the pieces autogenerated from the target description.
200 #include "X86GenDAGISel.inc"
201 
202   private:
203     void Select(SDNode *N) override;
204 
205     bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
206     bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
207                             bool AllowSegmentRegForX32 = false);
208     bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209     bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210     bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211     bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212     SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
213                                   unsigned Depth);
214     bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215                                  unsigned Depth);
216     bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
217                                        unsigned Depth);
218     bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
219     bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
220                     SDValue &Scale, SDValue &Index, SDValue &Disp,
221                     SDValue &Segment);
222     bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
223                           SDValue ScaleOp, SDValue &Base, SDValue &Scale,
224                           SDValue &Index, SDValue &Disp, SDValue &Segment);
225     bool selectMOV64Imm32(SDValue N, SDValue &Imm);
226     bool selectLEAAddr(SDValue N, SDValue &Base,
227                        SDValue &Scale, SDValue &Index, SDValue &Disp,
228                        SDValue &Segment);
229     bool selectLEA64_32Addr(SDValue N, SDValue &Base,
230                             SDValue &Scale, SDValue &Index, SDValue &Disp,
231                             SDValue &Segment);
232     bool selectTLSADDRAddr(SDValue N, SDValue &Base,
233                            SDValue &Scale, SDValue &Index, SDValue &Disp,
234                            SDValue &Segment);
235     bool selectRelocImm(SDValue N, SDValue &Op);
236 
237     bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
238                      SDValue &Base, SDValue &Scale,
239                      SDValue &Index, SDValue &Disp,
240                      SDValue &Segment);
241 
242     // Convenience method where P is also root.
243     bool tryFoldLoad(SDNode *P, SDValue N,
244                      SDValue &Base, SDValue &Scale,
245                      SDValue &Index, SDValue &Disp,
246                      SDValue &Segment) {
247       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
248     }
249 
250     bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
251                           SDValue &Base, SDValue &Scale,
252                           SDValue &Index, SDValue &Disp,
253                           SDValue &Segment);
254 
255     bool isProfitableToFormMaskedOp(SDNode *N) const;
256 
257     /// Implement addressing mode selection for inline asm expressions.
258     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
259                                       InlineAsm::ConstraintCode ConstraintID,
260                                       std::vector<SDValue> &OutOps) override;
261 
262     void emitSpecialCodeForMain();
263 
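    /// Build the five memory-reference operands (Base, Scale, Index, Disp,
    /// Segment) from a matched X86ISelAddressMode.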
264     inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
265                                    MVT VT, SDValue &Base, SDValue &Scale,
266                                    SDValue &Index, SDValue &Disp,
267                                    SDValue &Segment) {
268       if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
269         Base = CurDAG->getTargetFrameIndex(
270             AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
271       else if (AM.Base_Reg.getNode())
272         Base = AM.Base_Reg;
273       else
274         Base = CurDAG->getRegister(0, VT);
275 
276       Scale = getI8Imm(AM.Scale, DL);
277 
278 #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
279       // Negate the index if needed.
280       if (AM.NegateIndex) {
281         unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
282                                          : GET_ND_IF_ENABLED(X86::NEG32r);
283         SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
284                                                      AM.IndexReg), 0);
285         AM.IndexReg = Neg;
286       }
287 
288       if (AM.IndexReg.getNode())
289         Index = AM.IndexReg;
290       else
291         Index = CurDAG->getRegister(0, VT);
292 
293       // These are 32-bit even in 64-bit mode since RIP-relative offset
294       // is 32-bit.
295       if (AM.GV)
296         Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
297                                               MVT::i32, AM.Disp,
298                                               AM.SymbolFlags);
299       else if (AM.CP)
300         Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
301                                              AM.Disp, AM.SymbolFlags);
302       else if (AM.ES) {
303         assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
304         Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
305       } else if (AM.MCSym) {
306         assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
307         assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
308         Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
309       } else if (AM.JT != -1) {
310         assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
311         Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
312       } else if (AM.BlockAddr)
313         Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
314                                              AM.SymbolFlags);
315       else
316         Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
317 
318       if (AM.Segment.getNode())
319         Segment = AM.Segment;
320       else
321         Segment = CurDAG->getRegister(0, MVT::i16);
322     }
323     // Utility function to determine whether N is an AMX SDNode right after
324     // lowering but before ISEL.
325     // lowering but before ISEL.
326     bool isAMXSDNode(SDNode *N) const {
327       // Check if N is an AMX SDNode:
328       // 1. check specific opcode since these carry MVT::Untyped instead of
329       // x86amx_type;
330       // 2. check result type;
331       // 3. check operand type;
332       switch (N->getOpcode()) {
333       default:
334         break;
335       case X86::PT2RPNTLVWZ0V:
336       case X86::PT2RPNTLVWZ0T1V:
337       case X86::PT2RPNTLVWZ1V:
338       case X86::PT2RPNTLVWZ1T1V:
339       case X86::PT2RPNTLVWZ0RSV:
340       case X86::PT2RPNTLVWZ0RST1V:
341       case X86::PT2RPNTLVWZ1RSV:
342       case X86::PT2RPNTLVWZ1RST1V:
343         return true;
344       }
345       for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
346         if (N->getValueType(Idx) == MVT::x86amx)
347           return true;
348       }
349       for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
350         SDValue Op = N->getOperand(Idx);
351         if (Op.getValueType() == MVT::x86amx)
352           return true;
353       }
354       return false;
355     }
356 
357     // Utility function to determine whether we should avoid selecting
358     // immediate forms of instructions for better code size.
359     // At a high level, we'd like to avoid such instructions when
360     // the same constant is used several times within a basic block
361     // and could instead be kept in a register.
362     //
363     bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
364       uint32_t UseCount = 0;
365 
366       // Do not want to hoist if we're not optimizing for size.
367       // TODO: We'd like to remove this restriction.
368       // See the comment in X86InstrInfo.td for more info.
369       if (!CurDAG->shouldOptForSize())
370         return false;
371 
372       // Walk all the users of the immediate.
373       for (const SDNode *User : N->users()) {
374         if (UseCount >= 2)
375           break;
376 
377         // This user is already selected. Count it as a legitimate use and
378         // move on.
379         if (User->isMachineOpcode()) {
380           UseCount++;
381           continue;
382         }
383 
384         // We want to count stores of immediates as real uses.
385         if (User->getOpcode() == ISD::STORE &&
386             User->getOperand(1).getNode() == N) {
387           UseCount++;
388           continue;
389         }
390 
391         // We don't currently match users that have > 2 operands (except
392         // for stores, which are handled above).
393         // Those instructions won't match in ISEL, for now, and would
394         // be counted incorrectly.
395         // This may change in the future as we add additional instruction
396         // types.
397         if (User->getNumOperands() != 2)
398           continue;
399 
400         // If this is a sign-extended 8-bit integer immediate used in an ALU
401         // instruction, there is probably an opcode encoding to save space.
402         auto *C = dyn_cast<ConstantSDNode>(N);
403         if (C && isInt<8>(C->getSExtValue()))
404           continue;
405 
406         // Immediates that are used for offsets as part of stack
407         // manipulation should be left alone. These are typically
408         // used to indicate SP offsets for argument passing and
409         // will get pulled into stores/pushes (implicitly).
410         if (User->getOpcode() == X86ISD::ADD ||
411             User->getOpcode() == ISD::ADD    ||
412             User->getOpcode() == X86ISD::SUB ||
413             User->getOpcode() == ISD::SUB) {
414 
415           // Find the other operand of the add/sub.
416           SDValue OtherOp = User->getOperand(0);
417           if (OtherOp.getNode() == N)
418             OtherOp = User->getOperand(1);
419 
420           // Don't count if the other operand is SP.
421           RegisterSDNode *RegNode;
422           if (OtherOp->getOpcode() == ISD::CopyFromReg &&
423               (RegNode = dyn_cast_or_null<RegisterSDNode>(
424                  OtherOp->getOperand(1).getNode())))
425             if ((RegNode->getReg() == X86::ESP) ||
426                 (RegNode->getReg() == X86::RSP))
427               continue;
428         }
429 
430         // ... otherwise, count this and move on.
431         UseCount++;
432       }
433 
434       // If we have more than one use, recommend hoisting.
435       return (UseCount > 1);
436     }
437 
438     /// Return a target constant with the specified value of type i8.
439     inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
440       return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
441     }
442 
443     /// Return a target constant with the specified value, of type i32.
444     inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
445       return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
446     }
447 
448     /// Return a target constant with the specified value, of type i64.
449     inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
450       return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
451     }
452 
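    /// Compute the VEXTRACT immediate: which VecWidth-bit chunk of the source
    /// vector the subvector index of N (operand 1) falls into.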
453     SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
454                                         const SDLoc &DL) {
455       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
456       uint64_t Index = N->getConstantOperandVal(1);
457       MVT VecVT = N->getOperand(0).getSimpleValueType();
458       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
459     }
460 
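    /// Compute the VINSERT immediate: which VecWidth-bit chunk of the
    /// destination vector the insertion index of N (operand 2) falls into.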
461     SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
462                                       const SDLoc &DL) {
463       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
464       uint64_t Index = N->getConstantOperandVal(2);
465       MVT VecVT = N->getSimpleValueType(0);
466       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
467     }
468 
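    /// Compute the VPERM2X128 immediate that implements a 128-bit subvector
    /// insertion with the vector operands commuted (see the mapping below).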
469     SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
470                                                const SDLoc &DL) {
471       assert(VecWidth == 128 && "Unexpected vector width");
472       uint64_t Index = N->getConstantOperandVal(2);
473       MVT VecVT = N->getSimpleValueType(0);
474       uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
475       assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
476       // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
477       // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
478       return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
479     }
480 
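    /// Materialize the carry/borrow of an SBB or SETCC_CARRY node as a value:
    /// zero a register, copy the incoming flags into EFLAGS, and emit
    /// SBB reg,reg so the result is all-ones if the carry flag is set and
    /// zero otherwise.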
481     SDValue getSBBZero(SDNode *N) {
482       SDLoc dl(N);
483       MVT VT = N->getSimpleValueType(0);
484 
485       // Create zero.
486       SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
487       SDValue Zero =
488           SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
489       if (VT == MVT::i64) {
490         Zero = SDValue(
491             CurDAG->getMachineNode(
492                 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
493                 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
494                 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
495             0);
496       }
497 
498       // Copy flags to the EFLAGS register and glue it to next node.
499       unsigned Opcode = N->getOpcode();
500       assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
501              "Unexpected opcode for SBB materialization");
502       unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
503       SDValue EFLAGS =
504           CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
505                                N->getOperand(FlagOpIndex), SDValue());
506 
507       // Create a 64-bit instruction if the result is 64 bits; otherwise use the
508       // 32-bit version.
509       unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
510       MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
511       VTs = CurDAG->getVTList(SBBVT, MVT::i32);
512       return SDValue(
513           CurDAG->getMachineNode(Opc, dl, VTs,
514                                  {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
515           0);
516     }
517 
518     // Helper to detect unneeded AND instructions on shift amounts. Called
519     // from PatFrags in tablegen.
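    // For example, (shl X, (and Y, 31)) on a 32-bit shift: the hardware already
    // masks the shift amount to its low 5 bits, so an AND that preserves at
    // least those bits is redundant.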
520     bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
521       assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
522       const APInt &Val = N->getConstantOperandAPInt(1);
523 
524       if (Val.countr_one() >= Width)
525         return true;
526 
527       APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
528       return Mask.countr_one() >= Width;
529     }
530 
531     /// Return an SDNode that returns the value of the global base register.
532     /// Output instructions required to initialize the global base register,
533     /// if necessary.
534     SDNode *getGlobalBaseReg();
535 
536     /// Return a reference to the TargetMachine, casted to the target-specific
537     /// type.
538     const X86TargetMachine &getTargetMachine() const {
539       return static_cast<const X86TargetMachine &>(TM);
540     }
541 
542     /// Return a reference to the TargetInstrInfo, casted to the target-specific
543     /// type.
544     const X86InstrInfo *getInstrInfo() const {
545       return Subtarget->getInstrInfo();
546     }
547 
548     /// Return the condition code of the given SDNode.
549     X86::CondCode getCondFromNode(SDNode *N) const;
550 
551     /// Address-mode matching performs shift-of-and to and-of-shift
552     /// reassociation in order to expose more scaled addressing
553     /// opportunities.
554     bool ComplexPatternFuncMutatesDAG() const override {
555       return true;
556     }
557 
558     bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
559 
560     // Indicates we should prefer to use a non-temporal load for this load.
561     bool useNonTemporalLoad(LoadSDNode *N) const {
562       if (!N->isNonTemporal())
563         return false;
564 
565       unsigned StoreSize = N->getMemoryVT().getStoreSize();
566 
567       if (N->getAlign().value() < StoreSize)
568         return false;
569 
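      // Non-temporal vector loads are only available as MOVNTDQA (SSE4.1,
      // 16 bytes), VMOVNTDQA ymm (AVX2, 32 bytes) and VMOVNTDQA zmm (AVX512,
      // 64 bytes); there is no scalar non-temporal load.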
570       switch (StoreSize) {
571       default: llvm_unreachable("Unsupported store size");
572       case 4:
573       case 8:
574         return false;
575       case 16:
576         return Subtarget->hasSSE41();
577       case 32:
578         return Subtarget->hasAVX2();
579       case 64:
580         return Subtarget->hasAVX512();
581       }
582     }
583 
584     bool foldLoadStoreIntoMemOperand(SDNode *Node);
585     MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
586     bool matchBitExtract(SDNode *Node);
587     bool shrinkAndImmediate(SDNode *N);
588     bool isMaskZeroExtended(SDNode *N) const;
589     bool tryShiftAmountMod(SDNode *N);
590     bool tryShrinkShlLogicImm(SDNode *N);
591     bool tryVPTERNLOG(SDNode *N);
592     bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
593                         SDNode *ParentC, SDValue A, SDValue B, SDValue C,
594                         uint8_t Imm);
595     bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
596     bool tryMatchBitSelect(SDNode *N);
597 
598     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
599                                 const SDLoc &dl, MVT VT, SDNode *Node);
600     MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
601                                 const SDLoc &dl, MVT VT, SDNode *Node,
602                                 SDValue &InGlue);
603 
604     bool tryOptimizeRem8Extend(SDNode *N);
605 
606     bool onlyUsesZeroFlag(SDValue Flags) const;
607     bool hasNoSignFlagUses(SDValue Flags) const;
608     bool hasNoCarryFlagUses(SDValue Flags) const;
609   };
610 
611   class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
612   public:
613     static char ID;
614     explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
615                                    CodeGenOptLevel OptLevel)
616         : SelectionDAGISelLegacy(
617               ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
618   };
619 }
620 
621 char X86DAGToDAGISelLegacy::ID = 0;
622 
623 INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
624 
625 // Returns true if this masked compare can be implemented legally with this
626 // type.
627 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
628   unsigned Opcode = N->getOpcode();
629   if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
630       Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
631       Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
632     // We can get 256-bit 8 element types here without VLX being enabled. When
633     // this happens we will use 512-bit operations and the mask will not be
634     // zero extended.
635     EVT OpVT = N->getOperand(0).getValueType();
636     // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
637     // second operand.
638     if (Opcode == X86ISD::STRICT_CMPM)
639       OpVT = N->getOperand(1).getValueType();
640     if (OpVT.is256BitVector() || OpVT.is128BitVector())
641       return Subtarget->hasVLX();
642 
643     return true;
644   }
645   // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
646   if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
647       Opcode == X86ISD::FSETCCM_SAE)
648     return true;
649 
650   return false;
651 }
652 
653 // Returns true if we can assume the writer of the mask has zero extended it
654 // for us.
655 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
656   // If this is an AND, check if we have a compare on either side. As long as
657   // one side guarantees the mask is zero extended, the AND will preserve those
658   // zeros.
659   if (N->getOpcode() == ISD::AND)
660     return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
661            isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
662 
663   return isLegalMaskCompare(N, Subtarget);
664 }
665 
666 bool
667 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
668   if (OptLevel == CodeGenOptLevel::None)
669     return false;
670 
671   if (!N.hasOneUse())
672     return false;
673 
674   if (N.getOpcode() != ISD::LOAD)
675     return true;
676 
677   // Don't fold non-temporal loads if we have an instruction for them.
678   if (useNonTemporalLoad(cast<LoadSDNode>(N)))
679     return false;
680 
681   // If N is a load, do additional profitability checks.
682   if (U == Root) {
683     switch (U->getOpcode()) {
684     default: break;
685     case X86ISD::ADD:
686     case X86ISD::ADC:
687     case X86ISD::SUB:
688     case X86ISD::SBB:
689     case X86ISD::AND:
690     case X86ISD::XOR:
691     case X86ISD::OR:
692     case ISD::ADD:
693     case ISD::UADDO_CARRY:
694     case ISD::AND:
695     case ISD::OR:
696     case ISD::XOR: {
697       SDValue Op1 = U->getOperand(1);
698 
699       // If the other operand is an 8-bit immediate, we should fold the immediate
700       // instead. This reduces code size.
701       // e.g.
702       // movl 4(%esp), %eax
703       // addl $4, %eax
704       // vs.
705       // movl $4, %eax
706       // addl 4(%esp), %eax
707       // The former is 2 bytes shorter. In the case where the increment is 1,
708       // the saving can be 4 bytes (by using incl %eax).
709       if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
710         if (Imm->getAPIntValue().isSignedIntN(8))
711           return false;
712 
713         // If this is a 64-bit AND with an immediate that fits in 32-bits,
714         // prefer using the smaller and over folding the load. This is needed to
715         // make sure immediates created by shrinkAndImmediate are always folded.
716         // Ideally we would narrow the load during DAG combine and get the
717         // best of both worlds.
718         if (U->getOpcode() == ISD::AND &&
719             Imm->getAPIntValue().getBitWidth() == 64 &&
720             Imm->getAPIntValue().isIntN(32))
721           return false;
722 
723         // If this is really a zext_inreg that can be represented with a movzx
724         // instruction, prefer that.
725         // TODO: We could shrink the load and fold if it is non-volatile.
726         if (U->getOpcode() == ISD::AND &&
727             (Imm->getAPIntValue() == UINT8_MAX ||
728              Imm->getAPIntValue() == UINT16_MAX ||
729              Imm->getAPIntValue() == UINT32_MAX))
730           return false;
731 
732         // ADD/SUB can negate the immediate and use the opposite operation
733         // to fit 128 into a sign-extended 8-bit immediate.
734         if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
735             (-Imm->getAPIntValue()).isSignedIntN(8))
736           return false;
737 
738         if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
739             (-Imm->getAPIntValue()).isSignedIntN(8) &&
740             hasNoCarryFlagUses(SDValue(U, 1)))
741           return false;
742       }
743 
744       // If the other operand is a TLS address, we should fold it instead.
745       // This produces
746       // movl    %gs:0, %eax
747       // leal    i@NTPOFF(%eax), %eax
748       // instead of
749       // movl    $i@NTPOFF, %eax
750       // addl    %gs:0, %eax
751       // If the block also has an access to a second TLS address, this will save
752       // a load.
753       // FIXME: This is probably also true for non-TLS addresses.
754       if (Op1.getOpcode() == X86ISD::Wrapper) {
755         SDValue Val = Op1.getOperand(0);
756         if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
757           return false;
758       }
759 
760       // Don't fold load if this matches the BTS/BTR/BTC patterns.
761       // BTS: (or X, (shl 1, n))
762       // BTR: (and X, (rotl -2, n))
763       // BTC: (xor X, (shl 1, n))
764       if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
765         if (U->getOperand(0).getOpcode() == ISD::SHL &&
766             isOneConstant(U->getOperand(0).getOperand(0)))
767           return false;
768 
769         if (U->getOperand(1).getOpcode() == ISD::SHL &&
770             isOneConstant(U->getOperand(1).getOperand(0)))
771           return false;
772       }
773       if (U->getOpcode() == ISD::AND) {
774         SDValue U0 = U->getOperand(0);
775         SDValue U1 = U->getOperand(1);
776         if (U0.getOpcode() == ISD::ROTL) {
777           auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
778           if (C && C->getSExtValue() == -2)
779             return false;
780         }
781 
782         if (U1.getOpcode() == ISD::ROTL) {
783           auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
784           if (C && C->getSExtValue() == -2)
785             return false;
786         }
787       }
788 
789       break;
790     }
791     case ISD::SHL:
792     case ISD::SRA:
793     case ISD::SRL:
794       // Don't fold a load into a shift by immediate. The BMI2 instructions
795       // support folding a load, but not an immediate. The legacy instructions
796       // support folding an immediate, but can't fold a load. Folding an
797       // immediate is preferable to folding a load.
798       if (isa<ConstantSDNode>(U->getOperand(1)))
799         return false;
800 
801       break;
802     }
803   }
804   // Prevent folding a load if this can be implemented with an insert_subreg or
805   // Prevent folding a load if this can implemented with an insert_subreg or
806   // a move that implicitly zeroes.
807   if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
808       isNullConstant(Root->getOperand(2)) &&
809       (Root->getOperand(0).isUndef() ||
810        ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
811     return false;
812 
813   return true;
814 }
815 
816 // Indicates it is profitable to form an AVX512 masked operation. Returning
817 // false will favor a masked register-register move or vblendm, and the
818 // operation will be selected separately.
819 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
820   assert(
821       (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
822       "Unexpected opcode!");
823 
824   // If the operation has additional users, the operation will be duplicated.
825   // Check the use count to prevent that.
826   // FIXME: Are there cheap opcodes we might want to duplicate?
827   return N->getOperand(1).hasOneUse();
828 }
829 
830 /// Replace the original chain operand of the call with the
831 /// load's chain operand and move the load below the call's chain operand.
832 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
833                                SDValue Call, SDValue OrigChain) {
834   SmallVector<SDValue, 8> Ops;
835   SDValue Chain = OrigChain.getOperand(0);
836   if (Chain.getNode() == Load.getNode())
837     Ops.push_back(Load.getOperand(0));
838   else {
839     assert(Chain.getOpcode() == ISD::TokenFactor &&
840            "Unexpected chain operand");
841     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
842       if (Chain.getOperand(i).getNode() == Load.getNode())
843         Ops.push_back(Load.getOperand(0));
844       else
845         Ops.push_back(Chain.getOperand(i));
846     SDValue NewChain =
847       CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
848     Ops.clear();
849     Ops.push_back(NewChain);
850   }
851   Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
852   CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
853   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
854                              Load.getOperand(1), Load.getOperand(2));
855 
856   Ops.clear();
857   Ops.push_back(SDValue(Load.getNode(), 1));
858   Ops.append(Call->op_begin() + 1, Call->op_end());
859   CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
860 }
861 
862 /// Return true if the call address is a load and it can be
863 /// moved below CALLSEQ_START and the chains leading up to the call.
864 /// Return the CALLSEQ_START by reference as a second output.
865 /// In the case of a tail call, there isn't a callseq node between the call
866 /// chain and the load.
867 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
868   // The transformation is somewhat dangerous if the call's chain was glued to
869   // the call. After moveBelowOrigChain the load is moved between the call and
870   // the chain; this can create a cycle if the load is not folded. So it is
871   // *really* important that we are sure the load will be folded.
872   if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
873     return false;
874   auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
875   if (!LD ||
876       !LD->isSimple() ||
877       LD->getAddressingMode() != ISD::UNINDEXED ||
878       LD->getExtensionType() != ISD::NON_EXTLOAD)
879     return false;
880 
881   // Now let's find the callseq_start.
882   while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
883     if (!Chain.hasOneUse())
884       return false;
885     Chain = Chain.getOperand(0);
886   }
887 
888   if (!Chain.getNumOperands())
889     return false;
890   // Since we are not checking for AA here, conservatively abort if the chain
891   // writes to memory. It's not safe to move the callee (a load) across a store.
892   if (isa<MemSDNode>(Chain.getNode()) &&
893       cast<MemSDNode>(Chain.getNode())->writeMem())
894     return false;
895   if (Chain.getOperand(0).getNode() == Callee.getNode())
896     return true;
897   if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
898       Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
899       Callee.getValue(1).hasOneUse())
900     return true;
901   return false;
902 }
903 
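// Check whether a 64-bit immediate contains the ENDBR64 byte pattern, allowing
// extra legacy prefix bytes between the 0xF3 and the 0x0F1EFA part.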
904 static bool isEndbrImm64(uint64_t Imm) {
905 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
906 // e.g. 0xF3660F1EFA, 0xF3670F1EFA.
907   if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
908     return false;
909 
910   uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
911                                     0x65, 0x66, 0x67, 0xf0, 0xf2};
912   int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
913   while (i < 64) {
914     uint8_t Byte = (Imm >> i) & 0xFF;
915     if (Byte == 0xF3)
916       return true;
917     if (!llvm::is_contained(OptionalPrefixBytes, Byte))
918       return false;
919     i += 8;
920   }
921 
922   return false;
923 }
924 
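// Returns true for 512-bit vector types with 8- or 16-bit elements; these need
// AVX512BW and otherwise have to be emulated (see the broadcast cases below).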
925 static bool needBWI(MVT VT) {
926   return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
927 }
928 
929 void X86DAGToDAGISel::PreprocessISelDAG() {
930   bool MadeChange = false;
931   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
932        E = CurDAG->allnodes_end(); I != E; ) {
933     SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
934 
935     // This is for CET enhancement.
936     //
937     // ENDBR32 and ENDBR64 have specific opcodes:
938     // ENDBR32: F3 0F 1E FB
939     // ENDBR64: F3 0F 1E FA
940     // We want to ensure that attackers cannot find unintended ENDBR32/64
941     // opcode matches in the binary.
942     // Here's an example:
943     // If the compiler had to generate asm for the following code:
944     // a = 0xF30F1EFA
945     // it could, for example, generate:
946     // mov 0xF30F1EFA, dword ptr[a]
947     // In such a case, the binary would include a gadget that starts
948     // with a fake ENDBR64 opcode. Therefore, we split such generation
949     // into multiple operations so that it does not show up in the binary.
950     if (N->getOpcode() == ISD::Constant) {
951       MVT VT = N->getSimpleValueType(0);
952       int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
953       int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
954       if (Imm == EndbrImm || isEndbrImm64(Imm)) {
955         // Check that the cf-protection-branch is enabled.
956         Metadata *CFProtectionBranch =
957             MF->getFunction().getParent()->getModuleFlag(
958                 "cf-protection-branch");
959         if (CFProtectionBranch || IndirectBranchTracking) {
960           SDLoc dl(N);
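          // Materialize the complemented value as the immediate and recreate
          // the original with a NOT, so the ENDBR byte pattern never appears
          // literally in the encoding.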
961           SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
962           Complement = CurDAG->getNOT(dl, Complement, VT);
963           --I;
964           CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
965           ++I;
966           MadeChange = true;
967           continue;
968         }
969       }
970     }
971 
972     // If this is a target-specific AND node with no flag usages, turn it back
973     // into ISD::AND to enable test instruction matching.
974     if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
975       SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
976                                     N->getOperand(0), N->getOperand(1));
977       --I;
978       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
979       ++I;
980       MadeChange = true;
981       continue;
982     }
983 
984     // Convert vector increment or decrement to sub/add with an all-ones
985     // constant:
986     // add X, <1, 1...> --> sub X, <-1, -1...>
987     // sub X, <1, 1...> --> add X, <-1, -1...>
988     // The all-ones vector constant can be materialized using a pcmpeq
989     // instruction that is commonly recognized as an idiom (has no register
990     // dependency), so that's better/smaller than loading a splat 1 constant.
991     //
992     // But don't do this if it would inhibit a potentially profitable load
993     // folding opportunity for the other operand. That only occurs with the
994     // intersection of:
995     // (1) The other operand (op0) is load foldable.
996     // (2) The op is an add (otherwise, we are *creating* an add and can still
997     //     load fold the other op).
998     // (3) The target has AVX (otherwise, we have a destructive add and can't
999     //     load fold the other op without killing the constant op).
1000     // (4) The constant 1 vector has multiple uses (so it is profitable to load
1001     //     into a register anyway).
1002     auto mayPreventLoadFold = [&]() {
1003       return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1004              N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1005              !N->getOperand(1).hasOneUse();
1006     };
1007     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1008         N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1009       APInt SplatVal;
1010       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1011           SplatVal.isOne()) {
1012         SDLoc DL(N);
1013 
1014         MVT VT = N->getSimpleValueType(0);
1015         unsigned NumElts = VT.getSizeInBits() / 32;
1016         SDValue AllOnes =
1017             CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1018         AllOnes = CurDAG->getBitcast(VT, AllOnes);
1019 
1020         unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1021         SDValue Res =
1022             CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1023         --I;
1024         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1025         ++I;
1026         MadeChange = true;
1027         continue;
1028       }
1029     }
1030 
1031     switch (N->getOpcode()) {
1032     case X86ISD::VBROADCAST: {
1033       MVT VT = N->getSimpleValueType(0);
1034       // Emulate v32i16/v64i8 broadcast without BWI.
1035       if (!Subtarget->hasBWI() && needBWI(VT)) {
1036         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1037         SDLoc dl(N);
1038         SDValue NarrowBCast =
1039             CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1040         SDValue Res =
1041             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1042                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1043         unsigned Index = NarrowVT.getVectorMinNumElements();
1044         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1045                               CurDAG->getIntPtrConstant(Index, dl));
1046 
1047         --I;
1048         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1049         ++I;
1050         MadeChange = true;
1051         continue;
1052       }
1053 
1054       break;
1055     }
1056     case X86ISD::VBROADCAST_LOAD: {
1057       MVT VT = N->getSimpleValueType(0);
1058       // Emulate v32i16/v64i8 broadcast without BWI.
1059       if (!Subtarget->hasBWI() && needBWI(VT)) {
1060         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1061         auto *MemNode = cast<MemSDNode>(N);
1062         SDLoc dl(N);
1063         SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1064         SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1065         SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1066             X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1067             MemNode->getMemOperand());
1068         SDValue Res =
1069             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1070                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1071         unsigned Index = NarrowVT.getVectorMinNumElements();
1072         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1073                               CurDAG->getIntPtrConstant(Index, dl));
1074 
1075         --I;
1076         SDValue To[] = {Res, NarrowBCast.getValue(1)};
1077         CurDAG->ReplaceAllUsesWith(N, To);
1078         ++I;
1079         MadeChange = true;
1080         continue;
1081       }
1082 
1083       break;
1084     }
1085     case ISD::LOAD: {
1086       // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1087       // load, then just extract the lower subvector and avoid the second load.
1088       auto *Ld = cast<LoadSDNode>(N);
1089       MVT VT = N->getSimpleValueType(0);
1090       if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1091           !(VT.is128BitVector() || VT.is256BitVector()))
1092         break;
1093 
1094       MVT MaxVT = VT;
1095       SDNode *MaxLd = nullptr;
1096       SDValue Ptr = Ld->getBasePtr();
1097       SDValue Chain = Ld->getChain();
1098       for (SDNode *User : Ptr->users()) {
1099         auto *UserLd = dyn_cast<LoadSDNode>(User);
1100         MVT UserVT = User->getSimpleValueType(0);
1101         if (User != N && UserLd && ISD::isNormalLoad(User) &&
1102             UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1103             !User->hasAnyUseOfValue(1) &&
1104             (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1105             UserVT.getSizeInBits() > VT.getSizeInBits() &&
1106             (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1107           MaxLd = User;
1108           MaxVT = UserVT;
1109         }
1110       }
1111       if (MaxLd) {
1112         SDLoc dl(N);
1113         unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1114         MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1115         SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1116                                           SDValue(MaxLd, 0),
1117                                           CurDAG->getIntPtrConstant(0, dl));
1118         SDValue Res = CurDAG->getBitcast(VT, Extract);
1119 
1120         --I;
1121         SDValue To[] = {Res, SDValue(MaxLd, 1)};
1122         CurDAG->ReplaceAllUsesWith(N, To);
1123         ++I;
1124         MadeChange = true;
1125         continue;
1126       }
1127       break;
1128     }
1129     case ISD::VSELECT: {
1130       // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1131       EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1132       if (EleVT == MVT::i1)
1133         break;
1134 
1135       assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1136       assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1137              "We can't replace VSELECT with BLENDV in vXi16!");
1138       SDValue R;
1139       if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1140                                      EleVT.getSizeInBits()) {
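        // Ternlog immediate 0xCA computes (A & B) | (~A & C): select operand 1
        // where the all-ones/all-zeros condition (operand 0) is set, else
        // operand 2.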
1141         R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1142                             N->getOperand(0), N->getOperand(1), N->getOperand(2),
1143                             CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1144       } else {
1145         R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1146                             N->getOperand(0), N->getOperand(1),
1147                             N->getOperand(2));
1148       }
1149       --I;
1150       CurDAG->ReplaceAllUsesWith(N, R.getNode());
1151       ++I;
1152       MadeChange = true;
1153       continue;
1154     }
1155     case ISD::FP_ROUND:
1156     case ISD::STRICT_FP_ROUND:
1157     case ISD::FP_TO_SINT:
1158     case ISD::FP_TO_UINT:
1159     case ISD::STRICT_FP_TO_SINT:
1160     case ISD::STRICT_FP_TO_UINT: {
1161       // Replace vector fp_to_s/uint with their X86-specific equivalents so we
1162       // don't need 2 sets of patterns.
1163       if (!N->getSimpleValueType(0).isVector())
1164         break;
1165 
1166       unsigned NewOpc;
1167       switch (N->getOpcode()) {
1168       default: llvm_unreachable("Unexpected opcode!");
1169       case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
1170       case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
1171       case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1172       case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
1173       case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1174       case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
1175       }
1176       SDValue Res;
1177       if (N->isStrictFPOpcode())
1178         Res =
1179             CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1180                             {N->getOperand(0), N->getOperand(1)});
1181       else
1182         Res =
1183             CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1184                             N->getOperand(0));
1185       --I;
1186       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1187       ++I;
1188       MadeChange = true;
1189       continue;
1190     }
1191     case ISD::SHL:
1192     case ISD::SRA:
1193     case ISD::SRL: {
1194       // Replace vector shifts with their X86-specific equivalents so we don't
1195       // need 2 sets of patterns.
1196       if (!N->getValueType(0).isVector())
1197         break;
1198 
1199       unsigned NewOpc;
1200       switch (N->getOpcode()) {
1201       default: llvm_unreachable("Unexpected opcode!");
1202       case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1203       case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1204       case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1205       }
1206       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1207                                     N->getOperand(0), N->getOperand(1));
1208       --I;
1209       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1210       ++I;
1211       MadeChange = true;
1212       continue;
1213     }
1214     case ISD::ANY_EXTEND:
1215     case ISD::ANY_EXTEND_VECTOR_INREG: {
1216       // Replace vector any extend with the zero extend equivalents so we don't
1217       // need 2 sets of patterns. Ignore vXi1 extensions.
1218       if (!N->getValueType(0).isVector())
1219         break;
1220 
1221       unsigned NewOpc;
1222       if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1223         assert(N->getOpcode() == ISD::ANY_EXTEND &&
1224                "Unexpected opcode for mask vector!");
1225         NewOpc = ISD::SIGN_EXTEND;
1226       } else {
1227         NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1228                               ? ISD::ZERO_EXTEND
1229                               : ISD::ZERO_EXTEND_VECTOR_INREG;
1230       }
1231 
1232       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1233                                     N->getOperand(0));
1234       --I;
1235       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1236       ++I;
1237       MadeChange = true;
1238       continue;
1239     }
1240     case ISD::FCEIL:
1241     case ISD::STRICT_FCEIL:
1242     case ISD::FFLOOR:
1243     case ISD::STRICT_FFLOOR:
1244     case ISD::FTRUNC:
1245     case ISD::STRICT_FTRUNC:
1246     case ISD::FROUNDEVEN:
1247     case ISD::STRICT_FROUNDEVEN:
1248     case ISD::FNEARBYINT:
1249     case ISD::STRICT_FNEARBYINT:
1250     case ISD::FRINT:
1251     case ISD::STRICT_FRINT: {
1252       // Replace fp rounding ops with their X86-specific equivalents so we don't
1253       // need 2 sets of patterns.
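      // VRNDSCALE immediate encoding: bits[1:0] select the rounding mode,
      // bit 2 selects MXCSR rounding instead, and bit 3 suppresses precision
      // (inexact) exceptions.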
1254       unsigned Imm;
1255       switch (N->getOpcode()) {
1256       default: llvm_unreachable("Unexpected opcode!");
1257       case ISD::STRICT_FCEIL:
1258       case ISD::FCEIL:      Imm = 0xA; break;
1259       case ISD::STRICT_FFLOOR:
1260       case ISD::FFLOOR:     Imm = 0x9; break;
1261       case ISD::STRICT_FTRUNC:
1262       case ISD::FTRUNC:     Imm = 0xB; break;
1263       case ISD::STRICT_FROUNDEVEN:
1264       case ISD::FROUNDEVEN: Imm = 0x8; break;
1265       case ISD::STRICT_FNEARBYINT:
1266       case ISD::FNEARBYINT: Imm = 0xC; break;
1267       case ISD::STRICT_FRINT:
1268       case ISD::FRINT:      Imm = 0x4; break;
1269       }
1270       SDLoc dl(N);
1271       bool IsStrict = N->isStrictFPOpcode();
1272       SDValue Res;
1273       if (IsStrict)
1274         Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1275                               {N->getValueType(0), MVT::Other},
1276                               {N->getOperand(0), N->getOperand(1),
1277                                CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1278       else
1279         Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1280                               N->getOperand(0),
1281                               CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1282       --I;
1283       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1284       ++I;
1285       MadeChange = true;
1286       continue;
1287     }
1288     case X86ISD::FANDN:
1289     case X86ISD::FAND:
1290     case X86ISD::FOR:
1291     case X86ISD::FXOR: {
1292       // Widen scalar fp logic ops to vector to reduce isel patterns.
1293       // FIXME: Can we do this during lowering/combine?
1294       MVT VT = N->getSimpleValueType(0);
1295       if (VT.isVector() || VT == MVT::f128)
1296         break;
1297 
1298       MVT VecVT = VT == MVT::f64   ? MVT::v2f64
1299                   : VT == MVT::f32 ? MVT::v4f32
1300                                    : MVT::v8f16;
1301 
1302       SDLoc dl(N);
1303       SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1304                                     N->getOperand(0));
1305       SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1306                                     N->getOperand(1));
1307 
1308       SDValue Res;
1309       if (Subtarget->hasSSE2()) {
1310         EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1311         Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1312         Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1313         unsigned Opc;
1314         switch (N->getOpcode()) {
1315         default: llvm_unreachable("Unexpected opcode!");
1316         case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1317         case X86ISD::FAND:  Opc = ISD::AND;      break;
1318         case X86ISD::FOR:   Opc = ISD::OR;       break;
1319         case X86ISD::FXOR:  Opc = ISD::XOR;      break;
1320         }
1321         Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1322         Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1323       } else {
1324         Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1325       }
1326       Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1327                             CurDAG->getIntPtrConstant(0, dl));
1328       --I;
1329       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1330       ++I;
1331       MadeChange = true;
1332       continue;
1333     }
1334     }
1335 
1336     if (OptLevel != CodeGenOptLevel::None &&
1337         // Only do this when the target can fold the load into the call or
1338         // jmp.
1339         !Subtarget->useIndirectThunkCalls() &&
1340         ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1341          (N->getOpcode() == X86ISD::TC_RETURN &&
1342           (Subtarget->is64Bit() ||
1343            !getTargetMachine().isPositionIndependent())))) {
1344       /// Also try moving call address load from outside callseq_start to just
1345       /// before the call to allow it to be folded.
1346       ///
1347       ///     [Load chain]
1348       ///         ^
1349       ///         |
1350       ///       [Load]
1351       ///       ^    ^
1352       ///       |    |
1353       ///      /      \--
1354       ///     /          |
1355       ///[CALLSEQ_START] |
1356       ///     ^          |
1357       ///     |          |
1358       /// [LOAD/C2Reg]   |
1359       ///     |          |
1360       ///      \        /
1361       ///       \      /
1362       ///       [CALL]
1363       bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1364       SDValue Chain = N->getOperand(0);
1365       SDValue Load  = N->getOperand(1);
1366       if (!isCalleeLoad(Load, Chain, HasCallSeq))
1367         continue;
1368       moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1369       ++NumLoadMoved;
1370       MadeChange = true;
1371       continue;
1372     }
1373 
1374     // Lower fpround and fpextend nodes that target the FP stack to a store and
1375     // load through the stack.  This is a gross hack.  We would like to simply mark
1376     // these as being illegal, but when we do that, legalize produces these when
1377     // it expands calls, then expands these in the same legalize pass.  We would
1378     // like dag combine to be able to hack on these between the call expansion
1379     // and the node legalization.  As such this pass basically does "really
1380     // late" legalization of these inline with the X86 isel pass.
1381     // FIXME: This should only happen when not compiled with -O0.
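         // In effect, e.g. an f64->f32 FP_ROUND involving the FP stack is
         // rewritten below as a truncating store of the f64 value to an f32
         // stack temporary followed by a load of that slot, which isel can
         // then handle directly.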
1382     switch (N->getOpcode()) {
1383     default: continue;
1384     case ISD::FP_ROUND:
1385     case ISD::FP_EXTEND:
1386     {
1387       MVT SrcVT = N->getOperand(0).getSimpleValueType();
1388       MVT DstVT = N->getSimpleValueType(0);
1389 
1390       // If either the source or destination is a vector, no fp stack involved.
1391       if (SrcVT.isVector() || DstVT.isVector())
1392         continue;
1393 
1394       // If the source and destination are SSE registers, then this is a legal
1395       // conversion that should not be lowered.
1396       const X86TargetLowering *X86Lowering =
1397           static_cast<const X86TargetLowering *>(TLI);
1398       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1399       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1400       if (SrcIsSSE && DstIsSSE)
1401         continue;
1402 
1403       if (!SrcIsSSE && !DstIsSSE) {
1404         // If this is an FPStack extension, it is a noop.
1405         if (N->getOpcode() == ISD::FP_EXTEND)
1406           continue;
1407         // If this is a value-preserving FPStack truncation, it is a noop.
1408         if (N->getConstantOperandVal(1))
1409           continue;
1410       }
1411 
1412       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1413       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1414       // operations.  Based on this, decide what we want to do.
1415       MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1416       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1417       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1418       MachinePointerInfo MPI =
1419           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1420       SDLoc dl(N);
1421 
1422       // FIXME: optimize the case where the src/dest is a load or store?
1423 
1424       SDValue Store = CurDAG->getTruncStore(
1425           CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1426       SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1427                                           MemTmp, MPI, MemVT);
1428 
1429       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1430       // extload we created.  This will cause general havoc on the dag because
1431       // anything below the conversion could be folded into other existing nodes.
1432       // To avoid invalidating 'I', back it up to the convert node.
1433       --I;
1434       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1435       break;
1436     }
1437 
1438     // The sequence of events for lowering STRICT_FP versions of these nodes requires
1439     // dealing with the chain differently, as there is already a preexisting chain.
1440     case ISD::STRICT_FP_ROUND:
1441     case ISD::STRICT_FP_EXTEND:
1442     {
1443       MVT SrcVT = N->getOperand(1).getSimpleValueType();
1444       MVT DstVT = N->getSimpleValueType(0);
1445 
1446       // If either the source or destination is a vector, no fp stack involved.
1447       if (SrcVT.isVector() || DstVT.isVector())
1448         continue;
1449 
1450       // If the source and destination are SSE registers, then this is a legal
1451       // conversion that should not be lowered.
1452       const X86TargetLowering *X86Lowering =
1453           static_cast<const X86TargetLowering *>(TLI);
1454       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1455       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1456       if (SrcIsSSE && DstIsSSE)
1457         continue;
1458 
1459       if (!SrcIsSSE && !DstIsSSE) {
1460         // If this is an FPStack extension, it is a noop.
1461         if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1462           continue;
1463         // If this is a value-preserving FPStack truncation, it is a noop.
1464         if (N->getConstantOperandVal(2))
1465           continue;
1466       }
1467 
1468       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1469       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1470       // operations.  Based on this, decide what we want to do.
1471       MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1472       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1473       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1474       MachinePointerInfo MPI =
1475           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1476       SDLoc dl(N);
1477 
1478       // FIXME: optimize the case where the src/dest is a load or store?
1479 
1480       // Since the operation is StrictFP, use the preexisting chain.
1481       SDValue Store, Result;
1482       if (!SrcIsSSE) {
1483         SDVTList VTs = CurDAG->getVTList(MVT::Other);
1484         SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1485         Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1486                                             MPI, /*Align*/ std::nullopt,
1487                                             MachineMemOperand::MOStore);
1488         if (N->getFlags().hasNoFPExcept()) {
1489           SDNodeFlags Flags = Store->getFlags();
1490           Flags.setNoFPExcept(true);
1491           Store->setFlags(Flags);
1492         }
1493       } else {
1494         assert(SrcVT == MemVT && "Unexpected VT!");
1495         Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1496                                  MPI);
1497       }
1498 
1499       if (!DstIsSSE) {
1500         SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1501         SDValue Ops[] = {Store, MemTmp};
1502         Result = CurDAG->getMemIntrinsicNode(
1503             X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1504             /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1505         if (N->getFlags().hasNoFPExcept()) {
1506           SDNodeFlags Flags = Result->getFlags();
1507           Flags.setNoFPExcept(true);
1508           Result->setFlags(Flags);
1509         }
1510       } else {
1511         assert(DstVT == MemVT && "Unexpected VT!");
1512         Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1513       }
1514 
1515       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1516       // extload we created.  This will cause general havoc on the dag because
1517       // anything below the conversion could be folded into other existing nodes.
1518       // To avoid invalidating 'I', back it up to the convert node.
1519       --I;
1520       CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1521       break;
1522     }
1523     }
1524 
1525 
1526     // Now that we did that, the node is dead.  Increment the iterator to the
1527     // next node to process, then delete N.
1528     ++I;
1529     MadeChange = true;
1530   }
1531 
1532   // Remove any dead nodes that may have been left behind.
1533   if (MadeChange)
1534     CurDAG->RemoveDeadNodes();
1535 }
1536 
1537 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
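     // The 8-bit divrem lowering copies AH out with a *_NOREX extend; if that
     // result is then truncated to 8 bits and extended again, the second extend
     // is redundant (or, for an 8->64 sign extend, becomes a 32->64 extend).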
1538 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1539   unsigned Opc = N->getMachineOpcode();
1540   if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1541       Opc != X86::MOVSX64rr8)
1542     return false;
1543 
1544   SDValue N0 = N->getOperand(0);
1545 
1546   // We need to be extracting the low byte (sub_8bit) of an extend.
1547   if (!N0.isMachineOpcode() ||
1548       N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1549       N0.getConstantOperandVal(1) != X86::sub_8bit)
1550     return false;
1551 
1552   // We're looking for either a movsx or movzx to match the original opcode.
1553   unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1554                                                 : X86::MOVSX32rr8_NOREX;
1555   SDValue N00 = N0.getOperand(0);
1556   if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1557     return false;
1558 
1559   if (Opc == X86::MOVSX64rr8) {
1560     // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1561     // to 64.
1562     MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1563                                                    MVT::i64, N00);
1564     ReplaceUses(N, Extend);
1565   } else {
1566     // Ok we can drop this extend and just use the original extend.
1567     ReplaceUses(N, N00.getNode());
1568   }
1569 
1570   return true;
1571 }
1572 
1573 void X86DAGToDAGISel::PostprocessISelDAG() {
1574   // Skip peepholes at -O0.
1575   if (TM.getOptLevel() == CodeGenOptLevel::None)
1576     return;
1577 
1578   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1579 
1580   bool MadeChange = false;
1581   while (Position != CurDAG->allnodes_begin()) {
1582     SDNode *N = &*--Position;
1583     // Skip dead nodes and any non-machine opcodes.
1584     if (N->use_empty() || !N->isMachineOpcode())
1585       continue;
1586 
1587     if (tryOptimizeRem8Extend(N)) {
1588       MadeChange = true;
1589       continue;
1590     }
1591 
1592     unsigned Opc = N->getMachineOpcode();
1593     switch (Opc) {
1594     default:
1595       continue;
1596     // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1597     case X86::TEST8rr:
1598     case X86::TEST16rr:
1599     case X86::TEST32rr:
1600     case X86::TEST64rr:
1601     // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1602     case X86::CTEST8rr:
1603     case X86::CTEST16rr:
1604     case X86::CTEST32rr:
1605     case X86::CTEST64rr: {
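           // A (C)TEST comparing a register with itself, where that register is
           // produced by an AND whose only uses are these two operands, can test
           // the AND's operands directly; for the rm forms the load is folded
           // into a TESTmr/CTESTmr with memory and register operands swapped.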
1606       auto &Op0 = N->getOperand(0);
1607       if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1608           !Op0.isMachineOpcode())
1609         continue;
1610       SDValue And = N->getOperand(0);
1611 #define CASE_ND(OP)                                                            \
1612   case X86::OP:                                                                \
1613   case X86::OP##_ND:
1614       switch (And.getMachineOpcode()) {
1615       default:
1616         continue;
1617         CASE_ND(AND8rr)
1618         CASE_ND(AND16rr)
1619         CASE_ND(AND32rr)
1620         CASE_ND(AND64rr) {
1621           if (And->hasAnyUseOfValue(1))
1622             continue;
1623           SmallVector<SDValue> Ops(N->op_values());
1624           Ops[0] = And.getOperand(0);
1625           Ops[1] = And.getOperand(1);
1626           MachineSDNode *Test =
1627               CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1628           ReplaceUses(N, Test);
1629           MadeChange = true;
1630           continue;
1631         }
1632         CASE_ND(AND8rm)
1633         CASE_ND(AND16rm)
1634         CASE_ND(AND32rm)
1635         CASE_ND(AND64rm) {
1636           if (And->hasAnyUseOfValue(1))
1637             continue;
1638           unsigned NewOpc;
1639           bool IsCTESTCC = X86::isCTESTCC(Opc);
1640 #define FROM_TO(A, B)                                                          \
1641   CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B;                          \
1642   break;
1643           switch (And.getMachineOpcode()) {
1644             FROM_TO(AND8rm, TEST8mr);
1645             FROM_TO(AND16rm, TEST16mr);
1646             FROM_TO(AND32rm, TEST32mr);
1647             FROM_TO(AND64rm, TEST64mr);
1648           }
1649 #undef FROM_TO
1650 #undef CASE_ND
1651           // Need to swap the memory and register operand.
1652           SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1653                                       And.getOperand(3), And.getOperand(4),
1654                                       And.getOperand(5), And.getOperand(0)};
1655           // CC, Cflags.
1656           if (IsCTESTCC) {
1657             Ops.push_back(N->getOperand(2));
1658             Ops.push_back(N->getOperand(3));
1659           }
1660           // Chain of memory load
1661           Ops.push_back(And.getOperand(6));
1662           // Glue
1663           if (IsCTESTCC)
1664             Ops.push_back(N->getOperand(4));
1665 
1666           MachineSDNode *Test = CurDAG->getMachineNode(
1667               NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1668           CurDAG->setNodeMemRefs(
1669               Test, cast<MachineSDNode>(And.getNode())->memoperands());
1670           ReplaceUses(And.getValue(2), SDValue(Test, 1));
1671           ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1672           MadeChange = true;
1673           continue;
1674         }
1675       }
1676     }
1677     // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1678     // used. We're doing this late so we can prefer to fold the AND into masked
1679     // comparisons. Doing that can be better for the live range of the mask
1680     // register.
1681     case X86::KORTESTBkk:
1682     case X86::KORTESTWkk:
1683     case X86::KORTESTDkk:
1684     case X86::KORTESTQkk: {
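           // e.g. (KORTESTW k1, k1) with k1 = (KANDW a, b), used only for ZF,
           // becomes (KTESTW a, b): both set ZF iff (a & b) is all zeros.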
1685       SDValue Op0 = N->getOperand(0);
1686       if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1687           !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1688         continue;
1689 #define CASE(A)                                                                \
1690   case X86::A:                                                                 \
1691     break;
1692       switch (Op0.getMachineOpcode()) {
1693       default:
1694         continue;
1695         CASE(KANDBkk)
1696         CASE(KANDWkk)
1697         CASE(KANDDkk)
1698         CASE(KANDQkk)
1699       }
1700       unsigned NewOpc;
1701 #define FROM_TO(A, B)                                                          \
1702   case X86::A:                                                                 \
1703     NewOpc = X86::B;                                                           \
1704     break;
1705       switch (Opc) {
1706         FROM_TO(KORTESTBkk, KTESTBkk)
1707         FROM_TO(KORTESTWkk, KTESTWkk)
1708         FROM_TO(KORTESTDkk, KTESTDkk)
1709         FROM_TO(KORTESTQkk, KTESTQkk)
1710       }
1711       // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1712       // KAND instructions and KTEST use the same ISA feature.
1713       if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1714         continue;
1715 #undef FROM_TO
1716       MachineSDNode *KTest = CurDAG->getMachineNode(
1717           NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1718       ReplaceUses(N, KTest);
1719       MadeChange = true;
1720       continue;
1721     }
1722     // Attempt to remove vector moves that were inserted to zero upper bits.
1723     case TargetOpcode::SUBREG_TO_REG: {
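           // e.g. (SUBREG_TO_REG 0, (VMOVAPSrr x), sub_xmm) where x is itself
           // produced by a VEX/EVEX/XOP-encoded instruction: that instruction
           // already zeroed the upper bits, so the move can be dropped and x
           // used as the subregister value directly.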
1724       unsigned SubRegIdx = N->getConstantOperandVal(2);
1725       if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1726         continue;
1727 
1728       SDValue Move = N->getOperand(1);
1729       if (!Move.isMachineOpcode())
1730         continue;
1731 
1732       // Make sure it's one of the move opcodes we recognize.
1733       switch (Move.getMachineOpcode()) {
1734       default:
1735         continue;
1736         CASE(VMOVAPDrr)       CASE(VMOVUPDrr)
1737         CASE(VMOVAPSrr)       CASE(VMOVUPSrr)
1738         CASE(VMOVDQArr)       CASE(VMOVDQUrr)
1739         CASE(VMOVAPDYrr)      CASE(VMOVUPDYrr)
1740         CASE(VMOVAPSYrr)      CASE(VMOVUPSYrr)
1741         CASE(VMOVDQAYrr)      CASE(VMOVDQUYrr)
1742         CASE(VMOVAPDZ128rr)   CASE(VMOVUPDZ128rr)
1743         CASE(VMOVAPSZ128rr)   CASE(VMOVUPSZ128rr)
1744         CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1745         CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1746         CASE(VMOVAPDZ256rr)   CASE(VMOVUPDZ256rr)
1747         CASE(VMOVAPSZ256rr)   CASE(VMOVUPSZ256rr)
1748         CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1749         CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1750       }
1751 #undef CASE
1752 
1753     SDValue In = Move.getOperand(0);
1754     if (!In.isMachineOpcode() ||
1755         In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1756       continue;
1757 
1758     // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1759     // the SHA instructions, which use a legacy encoding.
1760     uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1761     if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1762         (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1763         (TSFlags & X86II::EncodingMask) != X86II::XOP)
1764       continue;
1765 
1766     // Producing instruction is another vector instruction. We can drop the
1767     // move.
1768     CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1769     MadeChange = true;
1770     }
1771     }
1772   }
1773 
1774   if (MadeChange)
1775     CurDAG->RemoveDeadNodes();
1776 }
1777 
1778 
1779 /// Emit any code that needs to be executed only in the main function.
1780 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1781   if (Subtarget->isTargetCygMing()) {
1782     TargetLowering::ArgListTy Args;
1783     auto &DL = CurDAG->getDataLayout();
1784 
1785     TargetLowering::CallLoweringInfo CLI(*CurDAG);
1786     CLI.setChain(CurDAG->getRoot())
1787         .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1788                    CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1789                    std::move(Args));
1790     const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1791     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1792     CurDAG->setRoot(Result.second);
1793   }
1794 }
1795 
1796 void X86DAGToDAGISel::emitFunctionEntryCode() {
1797   // If this is main, emit special code for main.
1798   const Function &F = MF->getFunction();
1799   if (F.hasExternalLinkage() && F.getName() == "main")
1800     emitSpecialCodeForMain();
1801 }
1802 
1803 static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1804   // We can run into an issue where a frame index or a register base
1805   // includes a displacement that, when added to the explicit displacement,
1806   // will overflow the displacement field. Assuming that the
1807   // displacement fits into a 31-bit integer  (which is only slightly more
1808   // aggressive than the current fundamental assumption that it fits into
1809   // a 32-bit integer), a 31-bit disp should always be safe.
1810   return isInt<31>(Val);
1811 }
1812 
1813 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1814                                             X86ISelAddressMode &AM) {
1815   // We may have already matched a displacement and the caller just added the
1816   // symbolic displacement. So we still need to do the checks even if Offset
1817   // is zero.
1818 
1819   int64_t Val = AM.Disp + Offset;
1820 
1821   // Cannot combine ExternalSymbol displacements with integer offsets.
1822   if (Val != 0 && (AM.ES || AM.MCSym))
1823     return true;
1824 
1825   CodeModel::Model M = TM.getCodeModel();
1826   if (Subtarget->is64Bit()) {
1827     if (Val != 0 &&
1828         !X86::isOffsetSuitableForCodeModel(Val, M,
1829                                            AM.hasSymbolicDisplacement()))
1830       return true;
1831     // In addition to the checks required for a register base, check that
1832     // we do not try to use an unsafe Disp with a frame index.
1833     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1834         !isDispSafeForFrameIndexOrRegBase(Val))
1835       return true;
1836     // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1837     // 64 bits. Instructions with 32-bit register addresses perform this zero
1838     // extension for us and we can safely ignore the high bits of Offset.
1839     // Instructions with only a 32-bit immediate address do not, though: they
1840     // sign extend instead. This means only the low 2GB of the address space
1841     // is directly addressable; we need indirect addressing for the high 2GB of
1842     // address space.
1843     // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1844     // implicit zero extension of instructions would cover up any problem.
1845     // However, we have asserts elsewhere that get triggered if we do, so keep
1846     // the checks for now.
1847     // TODO: We would actually be able to accept these, as well as the same
1848     // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1849     // to get an address size override to be emitted. However, this
1850     // pseudo-register is not part of any register class and therefore causes
1851     // MIR verification to fail.
1852     if (Subtarget->isTarget64BitILP32() &&
1853         !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1854         !AM.hasBaseOrIndexReg())
1855       return true;
1856   } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1857     // For 32-bit X86, make sure the displacement still isn't close to the
1858     // expressible limit.
1859     return true;
1860   AM.Disp = Val;
1861   return false;
1862 }
1863 
1864 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1865                                          bool AllowSegmentRegForX32) {
1866   SDValue Address = N->getOperand(1);
1867 
1868   // load gs:0 -> GS segment register.
1869   // load fs:0 -> FS segment register.
1870   //
1871   // This optimization is generally valid because the GNU TLS model defines that
1872   // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1873   // with 32-bit registers, as we get in ILP32 mode, those registers are first
1874   // zero-extended to 64 bits and then added to the base address, which gives
1875   // unwanted results when the register holds a negative value.
1876   // For more information see http://people.redhat.com/drepper/tls.pdf
1877   if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1878       !IndirectTlsSegRefs &&
1879       (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1880        Subtarget->isTargetFuchsia())) {
1881     if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1882       return true;
1883     switch (N->getPointerInfo().getAddrSpace()) {
1884     case X86AS::GS:
1885       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1886       return false;
1887     case X86AS::FS:
1888       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1889       return false;
1890       // Address space X86AS::SS is not handled here, because it is not used to
1891       // address TLS areas.
1892     }
1893   }
1894 
1895   return true;
1896 }
1897 
1898 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1899 /// mode. These wrap things that will resolve down into a symbol reference.
1900 /// If no match is possible, this returns true, otherwise it returns false.
1901 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1902   // If the addressing mode already has a symbol as the displacement, we can
1903   // never match another symbol.
1904   if (AM.hasSymbolicDisplacement())
1905     return true;
1906 
1907   bool IsRIPRelTLS = false;
1908   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1909   if (IsRIPRel) {
1910     SDValue Val = N.getOperand(0);
1911     if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1912       IsRIPRelTLS = true;
1913   }
1914 
1915   // We can't use an addressing mode in the 64-bit large code model.
1916   // Global TLS addressing is an exception. In the medium code model,
1917   // we can use a mode when RIP wrappers are present.
1918   // That signifies access to globals that are known to be "near",
1919   // such as the GOT itself.
1920   CodeModel::Model M = TM.getCodeModel();
1921   if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1922     return true;
1923 
1924   // Base and index reg must be 0 in order to use %rip as base.
1925   if (IsRIPRel && AM.hasBaseOrIndexReg())
1926     return true;
1927 
1928   // Make a local copy in case we can't do this fold.
1929   X86ISelAddressMode Backup = AM;
1930 
1931   int64_t Offset = 0;
1932   SDValue N0 = N.getOperand(0);
1933   if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1934     AM.GV = G->getGlobal();
1935     AM.SymbolFlags = G->getTargetFlags();
1936     Offset = G->getOffset();
1937   } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1938     AM.CP = CP->getConstVal();
1939     AM.Alignment = CP->getAlign();
1940     AM.SymbolFlags = CP->getTargetFlags();
1941     Offset = CP->getOffset();
1942   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1943     AM.ES = S->getSymbol();
1944     AM.SymbolFlags = S->getTargetFlags();
1945   } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1946     AM.MCSym = S->getMCSymbol();
1947   } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1948     AM.JT = J->getIndex();
1949     AM.SymbolFlags = J->getTargetFlags();
1950   } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1951     AM.BlockAddr = BA->getBlockAddress();
1952     AM.SymbolFlags = BA->getTargetFlags();
1953     Offset = BA->getOffset();
1954   } else
1955     llvm_unreachable("Unhandled symbol reference node.");
1956 
1957   // Can't use an addressing mode with large globals.
1958   if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1959       TM.isLargeGlobalValue(AM.GV)) {
1960     AM = Backup;
1961     return true;
1962   }
1963 
1964   if (foldOffsetIntoAddress(Offset, AM)) {
1965     AM = Backup;
1966     return true;
1967   }
1968 
1969   if (IsRIPRel)
1970     AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1971 
1972   // Commit the changes now that we know this fold is safe.
1973   return false;
1974 }
1975 
1976 /// Add the specified node to the specified addressing mode, returning true if
1977 /// it cannot be done. This just pattern matches for the addressing mode.
1978 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1979   if (matchAddressRecursively(N, AM, 0))
1980     return true;
1981 
1982   // Post-processing: Make a second attempt to fold a load, if we now know
1983   // that there will not be any other register. This is only performed for
1984   // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1985   // any foldable load the first time.
1986   if (Subtarget->isTarget64BitILP32() &&
1987       AM.BaseType == X86ISelAddressMode::RegBase &&
1988       AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1989     SDValue Save_Base_Reg = AM.Base_Reg;
1990     if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1991       AM.Base_Reg = SDValue();
1992       if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1993         AM.Base_Reg = Save_Base_Reg;
1994     }
1995   }
1996 
1997   // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1998   // a smaller encoding and avoids a scaled-index.
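       // (An index with no base register forces a SIB encoding with a 32-bit
       // zero displacement; reusing the register as the base avoids that.)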
1999   if (AM.Scale == 2 &&
2000       AM.BaseType == X86ISelAddressMode::RegBase &&
2001       AM.Base_Reg.getNode() == nullptr) {
2002     AM.Base_Reg = AM.IndexReg;
2003     AM.Scale = 1;
2004   }
2005 
2006   // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2007   // because it has a smaller encoding.
2008   if (TM.getCodeModel() != CodeModel::Large &&
2009       (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2010       AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2011       AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2012       AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2013     // However, when GV is a local function symbol and in the same section as
2014     // the current instruction, and AM.Disp is negative and near INT32_MIN,
2015     // referencing GV+Disp generates a relocation referencing the section symbol
2016     // with an even smaller offset, which might underflow. We should bail out if
2017     // the negative offset is too close to INT32_MIN. Actually, we are more
2018     // conservative here, using a smaller magic number also used by
2019     // isOffsetSuitableForCodeModel.
2020     if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2021       return true;
2022 
2023     AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2024   }
2025 
2026   return false;
2027 }
2028 
2029 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2030                                unsigned Depth) {
2031   // Add an artificial use to this node so that we can keep track of
2032   // it if it gets CSE'd with a different node.
2033   HandleSDNode Handle(N);
2034 
2035   X86ISelAddressMode Backup = AM;
2036   if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2037       !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2038     return false;
2039   AM = Backup;
2040 
2041   // Try again after commuting the operands.
2042   if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2043                                Depth + 1) &&
2044       !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2045     return false;
2046   AM = Backup;
2047 
2048   // If we couldn't fold both operands into the address at the same time,
2049   // see if we can just put each operand into a register and fold at least
2050   // the add.
2051   if (AM.BaseType == X86ISelAddressMode::RegBase &&
2052       !AM.Base_Reg.getNode() &&
2053       !AM.IndexReg.getNode()) {
2054     N = Handle.getValue();
2055     AM.Base_Reg = N.getOperand(0);
2056     AM.IndexReg = N.getOperand(1);
2057     AM.Scale = 1;
2058     return false;
2059   }
2060   N = Handle.getValue();
2061   return true;
2062 }
2063 
2064 // Insert a node into the DAG at least before the Pos node's position. This
2065 // will reposition the node as needed, and will assign it a node ID that is <=
2066 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2067 // IDs! The selection DAG must no longer depend on their uniqueness when this
2068 // is used.
2069 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2070   if (N->getNodeId() == -1 ||
2071       (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2072        SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2073     DAG.RepositionNode(Pos->getIterator(), N.getNode());
2074     // Mark Node as invalid for pruning as after this it may be a successor to a
2075     // selected node but otherwise be in the same position as Pos.
2076     // Conservatively mark it with the same -abs(Id) to ensure the node id
2077     // invariant is preserved.
2078     N->setNodeId(Pos->getNodeId());
2079     SelectionDAGISel::InvalidateNodeId(N.getNode());
2080   }
2081 }
2082 
2083 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2084 // safe. This allows us to convert the shift and and into an h-register
2085 // extract and a scaled index. Returns false if the simplification is
2086 // performed.
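     // For example, with C1 == 2, "(x >> 6) & 0x3fc" becomes
     // "((x >> 8) & 0xff) << 2", i.e. a byte extract used as an index with
     // scale 4.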
2087 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2088                                       uint64_t Mask,
2089                                       SDValue Shift, SDValue X,
2090                                       X86ISelAddressMode &AM) {
2091   if (Shift.getOpcode() != ISD::SRL ||
2092       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2093       !Shift.hasOneUse())
2094     return true;
2095 
2096   int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2097   if (ScaleLog <= 0 || ScaleLog >= 4 ||
2098       Mask != (0xffu << ScaleLog))
2099     return true;
2100 
2101   MVT XVT = X.getSimpleValueType();
2102   MVT VT = N.getSimpleValueType();
2103   SDLoc DL(N);
2104   SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2105   SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2106   SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2107   SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2108   SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2109   SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2110   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2111 
2112   // Insert the new nodes into the topological ordering. We must do this in
2113   // a valid topological ordering as nothing is going to go back and re-sort
2114   // these nodes. We continually insert before 'N' in sequence as this is
2115   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2116   // hierarchy left to express.
2117   insertDAGNode(DAG, N, Eight);
2118   insertDAGNode(DAG, N, NewMask);
2119   insertDAGNode(DAG, N, Srl);
2120   insertDAGNode(DAG, N, And);
2121   insertDAGNode(DAG, N, Ext);
2122   insertDAGNode(DAG, N, ShlCount);
2123   insertDAGNode(DAG, N, Shl);
2124   DAG.ReplaceAllUsesWith(N, Shl);
2125   DAG.RemoveDeadNode(N.getNode());
2126   AM.IndexReg = Ext;
2127   AM.Scale = (1 << ScaleLog);
2128   return false;
2129 }
2130 
2131 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2132 // allows us to fold the shift into this addressing mode. Returns false if the
2133 // transform succeeded.
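     // For example, "(x << 2) & 0x3fc" becomes "(x & 0xff) << 2", and the shl
     // is then absorbed as an index scale of 4.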
2134 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2135                                         X86ISelAddressMode &AM) {
2136   SDValue Shift = N.getOperand(0);
2137 
2138   // Use a signed mask so that shifting right will insert sign bits. These
2139   // bits will be removed when we shift the result left so it doesn't matter
2140   // what we use. This might allow a smaller immediate encoding.
2141   int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2142 
2143   // If we have an any_extend feeding the AND, look through it to see if there
2144   // is a shift behind it. But only if the AND doesn't use the extended bits.
2145   // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
2146   bool FoundAnyExtend = false;
2147   if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2148       Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2149       isUInt<32>(Mask)) {
2150     FoundAnyExtend = true;
2151     Shift = Shift.getOperand(0);
2152   }
2153 
2154   if (Shift.getOpcode() != ISD::SHL ||
2155       !isa<ConstantSDNode>(Shift.getOperand(1)))
2156     return true;
2157 
2158   SDValue X = Shift.getOperand(0);
2159 
2160   // Not likely to be profitable if either the AND or SHIFT node has more
2161   // than one use (unless all uses are for address computation). Besides,
2162   // isel mechanism requires their node ids to be reused.
2163   // the isel mechanism requires their node ids to be reused.
2164     return true;
2165 
2166   // Verify that the shift amount is something we can fold.
2167   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2168   if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2169     return true;
2170 
2171   MVT VT = N.getSimpleValueType();
2172   SDLoc DL(N);
2173   if (FoundAnyExtend) {
2174     SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2175     insertDAGNode(DAG, N, NewX);
2176     X = NewX;
2177   }
2178 
2179   SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2180   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2181   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2182 
2183   // Insert the new nodes into the topological ordering. We must do this in
2184   // a valid topological ordering as nothing is going to go back and re-sort
2185   // these nodes. We continually insert before 'N' in sequence as this is
2186   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2187   // hierarchy left to express.
2188   insertDAGNode(DAG, N, NewMask);
2189   insertDAGNode(DAG, N, NewAnd);
2190   insertDAGNode(DAG, N, NewShift);
2191   DAG.ReplaceAllUsesWith(N, NewShift);
2192   DAG.RemoveDeadNode(N.getNode());
2193 
2194   AM.Scale = 1 << ShiftAmt;
2195   AM.IndexReg = NewAnd;
2196   return false;
2197 }
2198 
2199 // Implement some heroics to detect shifts of masked values where the mask can
2200 // be replaced by extending the shift and undoing that in the addressing mode
2201 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2202 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2203 // the addressing mode. This results in code such as:
2204 //
2205 //   int f(short *y, int *lookup_table) {
2206 //     ...
2207 //     return *y + lookup_table[*y >> 11];
2208 //   }
2209 //
2210 // Turning into:
2211 //   movzwl (%rdi), %eax
2212 //   movl %eax, %ecx
2213 //   shrl $11, %ecx
2214 //   addl (%rsi,%rcx,4), %eax
2215 //
2216 // Instead of:
2217 //   movzwl (%rdi), %eax
2218 //   movl %eax, %ecx
2219 //   shrl $9, %ecx
2220 //   andl $124, %rcx
2221 //   addl (%rsi,%rcx), %eax
2222 //
2223 // Note that this function assumes the mask is provided as a mask *after* the
2224 // value is shifted. The input chain may or may not match that, but computing
2225 // such a mask is trivial.
2226 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2227                                     uint64_t Mask,
2228                                     SDValue Shift, SDValue X,
2229                                     X86ISelAddressMode &AM) {
2230   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2231       !isa<ConstantSDNode>(Shift.getOperand(1)))
2232     return true;
2233 
2234   // We need to ensure that the mask is a contiguous run of bits.
2235   unsigned MaskIdx, MaskLen;
2236   if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2237     return true;
2238   unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2239 
2240   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2241 
2242   // The amount of shift we're trying to fit into the addressing mode is taken
2243   // from the shifted mask index (number of trailing zeros of the mask).
2244   unsigned AMShiftAmt = MaskIdx;
2245 
2246   // There is nothing we can do here unless the mask is removing some bits.
2247   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2248   if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2249 
2250   // Scale the leading zero count down based on the actual size of the value.
2251   // Also scale it down based on the size of the shift.
2252   unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2253   if (MaskLZ < ScaleDown)
2254     return true;
2255   MaskLZ -= ScaleDown;
2256 
2257   // The final check is to ensure that any masked out high bits of X are
2258   // already known to be zero. Otherwise, the mask has a semantic impact
2259   // other than masking out a couple of low bits. Unfortunately, because of
2260   // the mask, zero extensions will be removed from operands in some cases.
2261   // This code works extra hard to look through extensions because we can
2262   // replace them with zero extensions cheaply if necessary.
2263   bool ReplacingAnyExtend = false;
2264   if (X.getOpcode() == ISD::ANY_EXTEND) {
2265     unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2266                           X.getOperand(0).getSimpleValueType().getSizeInBits();
2267     // Assume that we'll replace the any-extend with a zero-extend, and
2268     // narrow the search to the extended value.
2269     X = X.getOperand(0);
2270     MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2271     ReplacingAnyExtend = true;
2272   }
2273   APInt MaskedHighBits =
2274     APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2275   if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2276     return true;
2277 
2278   // We've identified a pattern that can be transformed into a single shift
2279   // and an addressing mode. Make it so.
2280   MVT VT = N.getSimpleValueType();
2281   if (ReplacingAnyExtend) {
2282     assert(X.getValueType() != VT);
2283     // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2284     SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2285     insertDAGNode(DAG, N, NewX);
2286     X = NewX;
2287   }
2288 
2289   MVT XVT = X.getSimpleValueType();
2290   SDLoc DL(N);
2291   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2292   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2293   SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2294   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2295   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2296 
2297   // Insert the new nodes into the topological ordering. We must do this in
2298   // a valid topological ordering as nothing is going to go back and re-sort
2299   // these nodes. We continually insert before 'N' in sequence as this is
2300   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2301   // hierarchy left to express.
2302   insertDAGNode(DAG, N, NewSRLAmt);
2303   insertDAGNode(DAG, N, NewSRL);
2304   insertDAGNode(DAG, N, NewExt);
2305   insertDAGNode(DAG, N, NewSHLAmt);
2306   insertDAGNode(DAG, N, NewSHL);
2307   DAG.ReplaceAllUsesWith(N, NewSHL);
2308   DAG.RemoveDeadNode(N.getNode());
2309 
2310   AM.Scale = 1 << AMShiftAmt;
2311   AM.IndexReg = NewExt;
2312   return false;
2313 }
2314 
2315 // Transform "(X >> SHIFT) & (MASK << C1)" to
2316 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2317 // matched to a BEXTR later. Returns false if the simplification is performed.
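     // For example, "(x >> 4) & (0x3f << 2)" becomes "((x >> 6) & 0x3f) << 2";
     // the srl+and pair can be matched to BEXTR and the shl folds into the
     // index scale.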
2318 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2319                                    uint64_t Mask,
2320                                    SDValue Shift, SDValue X,
2321                                    X86ISelAddressMode &AM,
2322                                    const X86Subtarget &Subtarget) {
2323   if (Shift.getOpcode() != ISD::SRL ||
2324       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2325       !Shift.hasOneUse() || !N.hasOneUse())
2326     return true;
2327 
2328   // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2329   if (!Subtarget.hasTBM() &&
2330       !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2331     return true;
2332 
2333   // We need to ensure that mask is a continuous run of bits.
2334   // We need to ensure that the mask is a contiguous run of bits.
2335   if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2336     return true;
2337 
2338   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2339 
2340   // The amount of shift we're trying to fit into the addressing mode is taken
2341   // from the shifted mask index (number of trailing zeros of the mask).
2342   unsigned AMShiftAmt = MaskIdx;
2343 
2344   // There is nothing we can do here unless the mask is removing some bits.
2345   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2346   if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2347 
2348   MVT XVT = X.getSimpleValueType();
2349   MVT VT = N.getSimpleValueType();
2350   SDLoc DL(N);
2351   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2352   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2353   SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2354   SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2355   SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2356   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2357   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2358 
2359   // Insert the new nodes into the topological ordering. We must do this in
2360   // a valid topological ordering as nothing is going to go back and re-sort
2361   // these nodes. We continually insert before 'N' in sequence as this is
2362   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2363   // hierarchy left to express.
2364   insertDAGNode(DAG, N, NewSRLAmt);
2365   insertDAGNode(DAG, N, NewSRL);
2366   insertDAGNode(DAG, N, NewMask);
2367   insertDAGNode(DAG, N, NewAnd);
2368   insertDAGNode(DAG, N, NewExt);
2369   insertDAGNode(DAG, N, NewSHLAmt);
2370   insertDAGNode(DAG, N, NewSHL);
2371   DAG.ReplaceAllUsesWith(N, NewSHL);
2372   DAG.RemoveDeadNode(N.getNode());
2373 
2374   AM.Scale = 1 << AMShiftAmt;
2375   AM.IndexReg = NewExt;
2376   return false;
2377 }
2378 
2379 // Attempt to peek further into a scaled index register, collecting additional
2380 // extensions / offsets / etc. Returns \p N if we can't peek any further.
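     // For example, with an existing scale of 4, an index of add(x, 3) is
     // peeled back to an index of x with 12 added to the displacement.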
2381 SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2382                                                X86ISelAddressMode &AM,
2383                                                unsigned Depth) {
2384   assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2385   assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2386          "Illegal index scale");
2387 
2388   // Limit recursion.
2389   if (Depth >= SelectionDAG::MaxRecursionDepth)
2390     return N;
2391 
2392   EVT VT = N.getValueType();
2393   unsigned Opc = N.getOpcode();
2394 
2395   // index: add(x,c) -> index: x, disp + c
2396   if (CurDAG->isBaseWithConstantOffset(N)) {
2397     auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2398     uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2399     if (!foldOffsetIntoAddress(Offset, AM))
2400       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2401   }
2402 
2403   // index: add(x,x) -> index: x, scale * 2
2404   if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2405     if (AM.Scale <= 4) {
2406       AM.Scale *= 2;
2407       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2408     }
2409   }
2410 
2411   // index: shl(x,i) -> index: x, scale * (1 << i)
2412   if (Opc == X86ISD::VSHLI) {
2413     uint64_t ShiftAmt = N.getConstantOperandVal(1);
2414     uint64_t ScaleAmt = 1ULL << ShiftAmt;
2415     if ((AM.Scale * ScaleAmt) <= 8) {
2416       AM.Scale *= ScaleAmt;
2417       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2418     }
2419   }
2420 
2421   // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2422   // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2423   if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2424     SDValue Src = N.getOperand(0);
2425     if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2426         Src.hasOneUse()) {
2427       if (CurDAG->isBaseWithConstantOffset(Src)) {
2428         SDValue AddSrc = Src.getOperand(0);
2429         auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2430         int64_t Offset = AddVal->getSExtValue();
2431         if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2432           SDLoc DL(N);
2433           SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2434           SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2435           SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2436           insertDAGNode(*CurDAG, N, ExtSrc);
2437           insertDAGNode(*CurDAG, N, ExtVal);
2438           insertDAGNode(*CurDAG, N, ExtAdd);
2439           CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2440           CurDAG->RemoveDeadNode(N.getNode());
2441           return ExtSrc;
2442         }
2443       }
2444     }
2445   }
2446 
2447   // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2448   // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2449   // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2450   if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2451     SDValue Src = N.getOperand(0);
2452     unsigned SrcOpc = Src.getOpcode();
2453     if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2454          CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2455         Src.hasOneUse()) {
2456       if (CurDAG->isBaseWithConstantOffset(Src)) {
2457         SDValue AddSrc = Src.getOperand(0);
2458         uint64_t Offset = Src.getConstantOperandVal(1);
2459         if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2460           SDLoc DL(N);
2461           SDValue Res;
2462           // If we're also scaling, see if we can use that as well.
2463           if (AddSrc.getOpcode() == ISD::SHL &&
2464               isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2465             SDValue ShVal = AddSrc.getOperand(0);
2466             uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2467             APInt HiBits =
2468                 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2469             uint64_t ScaleAmt = 1ULL << ShAmt;
2470             if ((AM.Scale * ScaleAmt) <= 8 &&
2471                 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2472                  CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2473               AM.Scale *= ScaleAmt;
2474               SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2475               SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2476                                                  AddSrc.getOperand(1));
2477               insertDAGNode(*CurDAG, N, ExtShVal);
2478               insertDAGNode(*CurDAG, N, ExtShift);
2479               AddSrc = ExtShift;
2480               Res = ExtShVal;
2481             }
2482           }
2483           SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2484           SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2485           SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2486           insertDAGNode(*CurDAG, N, ExtSrc);
2487           insertDAGNode(*CurDAG, N, ExtVal);
2488           insertDAGNode(*CurDAG, N, ExtAdd);
2489           CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2490           CurDAG->RemoveDeadNode(N.getNode());
2491           return Res ? Res : ExtSrc;
2492         }
2493       }
2494     }
2495   }
2496 
2497   // TODO: Handle extensions, shifted masks etc.
2498   return N;
2499 }
2500 
2501 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2502                                               unsigned Depth) {
2503   SDLoc dl(N);
2504   LLVM_DEBUG({
2505     dbgs() << "MatchAddress: ";
2506     AM.dump(CurDAG);
2507   });
2508   // Limit recursion.
2509   if (Depth >= SelectionDAG::MaxRecursionDepth)
2510     return matchAddressBase(N, AM);
2511 
2512   // If this is already a %rip relative address, we can only merge immediates
2513   // into it.  Instead of handling this in every case, we handle it here.
2514   // RIP relative addressing: %rip + 32-bit displacement!
2515   if (AM.isRIPRelative()) {
2516     // FIXME: JumpTable and ExternalSymbol addresses currently don't like
2517     // displacements.  It isn't very important, but this should be fixed for
2518     // consistency.
2519     if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2520       return true;
2521 
2522     if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2523       if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2524         return false;
2525     return true;
2526   }
2527 
2528   switch (N.getOpcode()) {
2529   default: break;
2530   case ISD::LOCAL_RECOVER: {
2531     if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2532       if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2533         // Use the symbol and don't prefix it.
2534         AM.MCSym = ESNode->getMCSymbol();
2535         return false;
2536       }
2537     break;
2538   }
2539   case ISD::Constant: {
2540     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2541     if (!foldOffsetIntoAddress(Val, AM))
2542       return false;
2543     break;
2544   }
2545 
2546   case X86ISD::Wrapper:
2547   case X86ISD::WrapperRIP:
2548     if (!matchWrapper(N, AM))
2549       return false;
2550     break;
2551 
2552   case ISD::LOAD:
2553     if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2554       return false;
2555     break;
2556 
2557   case ISD::FrameIndex:
2558     if (AM.BaseType == X86ISelAddressMode::RegBase &&
2559         AM.Base_Reg.getNode() == nullptr &&
2560         (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2561       AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2562       AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2563       return false;
2564     }
2565     break;
2566 
2567   case ISD::SHL:
2568     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2569       break;
2570 
2571     if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2572       unsigned Val = CN->getZExtValue();
2573       // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2574       // that the base operand remains free for further matching. If
2575       // the base doesn't end up getting used, a post-processing step
2576       // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2577       if (Val == 1 || Val == 2 || Val == 3) {
2578         SDValue ShVal = N.getOperand(0);
2579         AM.Scale = 1 << Val;
2580         AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2581         return false;
2582       }
2583     }
2584     break;
2585 
2586   case ISD::SRL: {
2587     // Scale must not be used already.
2588     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2589 
2590     // We only handle up to 64-bit values here as those are what matter for
2591     // addressing mode optimizations.
2592     assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2593            "Unexpected value size!");
2594 
2595     SDValue And = N.getOperand(0);
2596     if (And.getOpcode() != ISD::AND) break;
2597     SDValue X = And.getOperand(0);
2598 
2599     // The mask used for the transform is expected to be post-shift, but we
2600     // found the shift first so just apply the shift to the mask before passing
2601     // it down.
2602     if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2603         !isa<ConstantSDNode>(And.getOperand(1)))
2604       break;
2605     uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2606 
2607     // Try to fold the mask and shift into the scale, and return false if we
2608     // succeed.
2609     if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2610       return false;
2611     break;
2612   }
2613 
2614   case ISD::SMUL_LOHI:
2615   case ISD::UMUL_LOHI:
2616     // A mul_lohi where we need the low part can be folded as a plain multiply.
2617     if (N.getResNo() != 0) break;
2618     [[fallthrough]];
2619   case ISD::MUL:
2620   case X86ISD::MUL_IMM:
2621     // X*[3,5,9] -> X+X*[2,4,8]
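         // e.g. x*5 is matched as base = x, index = x, scale = 4, which a
         // single LEA can compute.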
2622     if (AM.BaseType == X86ISelAddressMode::RegBase &&
2623         AM.Base_Reg.getNode() == nullptr &&
2624         AM.IndexReg.getNode() == nullptr) {
2625       if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2626         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2627             CN->getZExtValue() == 9) {
2628           AM.Scale = unsigned(CN->getZExtValue())-1;
2629 
2630           SDValue MulVal = N.getOperand(0);
2631           SDValue Reg;
2632 
2633           // Okay, we know that we have a scale by now.  However, if the scaled
2634           // value is an add of something and a constant, we can fold the
2635           // constant into the disp field here.
2636           if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2637               isa<ConstantSDNode>(MulVal.getOperand(1))) {
2638             Reg = MulVal.getOperand(0);
2639             auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2640             uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2641             if (foldOffsetIntoAddress(Disp, AM))
2642               Reg = N.getOperand(0);
2643           } else {
2644             Reg = N.getOperand(0);
2645           }
2646 
2647           AM.IndexReg = AM.Base_Reg = Reg;
2648           return false;
2649         }
2650     }
2651     break;
2652 
2653   case ISD::SUB: {
2654     // Given A-B, if A can be completely folded into the address (leaving the
2655     // index field unused), use -B as the index.
2656     // This is a win if A has multiple parts that can be folded into
2657     // the address. Also, this saves a mov if the base register has
2658     // other uses, since it avoids a two-address sub instruction; however,
2659     // it costs an additional mov if the index register has other uses.
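         // Illustrative example: for N = (GV + 16) - %b, the symbol and the
         // displacement fold into the address and %b becomes the index with
         // NegateIndex set; the negation itself is emitted later.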
2660 
2661     // Add an artificial use to this node so that we can keep track of
2662     // it if it gets CSE'd with a different node.
2663     HandleSDNode Handle(N);
2664 
2665     // Test if the LHS of the sub can be folded.
2666     X86ISelAddressMode Backup = AM;
2667     if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2668       N = Handle.getValue();
2669       AM = Backup;
2670       break;
2671     }
2672     N = Handle.getValue();
2673     // Test if the index field is free for use.
2674     if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2675       AM = Backup;
2676       break;
2677     }
2678 
2679     int Cost = 0;
2680     SDValue RHS = N.getOperand(1);
2681     // If the RHS involves a register with multiple uses, this
2682     // transformation incurs an extra mov, due to the neg instruction
2683     // clobbering its operand.
2684     if (!RHS.getNode()->hasOneUse() ||
2685         RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2686         RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2687         RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2688         (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2689          RHS.getOperand(0).getValueType() == MVT::i32))
2690       ++Cost;
2691     // If the base is a register with multiple uses, this
2692     // transformation may save a mov.
2693     if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2694          !AM.Base_Reg.getNode()->hasOneUse()) ||
2695         AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2696       --Cost;
2697     // If the folded LHS was interesting, this transformation saves
2698     // address arithmetic.
2699     if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2700         ((AM.Disp != 0) && (Backup.Disp == 0)) +
2701         (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2702       --Cost;
2703     // If it doesn't look like it may be an overall win, don't do it.
2704     if (Cost >= 0) {
2705       AM = Backup;
2706       break;
2707     }
2708 
2709     // Ok, the transformation is legal and appears profitable. Go for it.
2710     // Negation will be emitted later to avoid creating dangling nodes if this
2711     // was an unprofitable LEA.
2712     AM.IndexReg = RHS;
2713     AM.NegateIndex = true;
2714     AM.Scale = 1;
2715     return false;
2716   }
2717 
2718   case ISD::OR:
2719   case ISD::XOR:
2720     // See if we can treat the OR/XOR node as an ADD node.
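         // For example, (or %x, 4) behaves exactly like (add %x, 4) when bit 2
         // of %x is known to be zero, so the constant can still fold into the
         // displacement field.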
2721     if (!CurDAG->isADDLike(N))
2722       break;
2723     [[fallthrough]];
2724   case ISD::ADD:
2725     if (!matchAdd(N, AM, Depth))
2726       return false;
2727     break;
2728 
2729   case ISD::AND: {
2730     // Perform some heroic transforms on an and of a constant-count shift
2731     // with a constant to enable use of the scaled offset field.
2732 
2733     // Scale must not be used already.
2734     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2735 
2736     // We only handle up to 64-bit values here as those are what matter for
2737     // addressing mode optimizations.
2738     assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2739            "Unexpected value size!");
2740 
2741     if (!isa<ConstantSDNode>(N.getOperand(1)))
2742       break;
2743 
2744     if (N.getOperand(0).getOpcode() == ISD::SRL) {
2745       SDValue Shift = N.getOperand(0);
2746       SDValue X = Shift.getOperand(0);
2747 
2748       uint64_t Mask = N.getConstantOperandVal(1);
2749 
2750       // Try to fold the mask and shift into an extract and scale.
2751       if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2752         return false;
2753 
2754       // Try to fold the mask and shift directly into the scale.
2755       if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2756         return false;
2757 
2758       // Try to fold the mask and shift into BEXTR and scale.
2759       if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2760         return false;
2761     }
2762 
2763     // Try to swap the mask and shift to place shifts which can be done as
2764     // a scale on the outside of the mask.
2765     if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2766       return false;
2767 
2768     break;
2769   }
2770   case ISD::ZERO_EXTEND: {
2771     // Try to widen a zexted shift left to the same size as its use, so we can
2772     // match the shift as a scale factor.
2773     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2774       break;
2775 
2776     SDValue Src = N.getOperand(0);
2777 
2778     // See if we can match a zext(addlike(x,c)).
2779     // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2780     if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2781       if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2782         if (Index != N) {
2783           AM.IndexReg = Index;
2784           return false;
2785         }
2786 
2787     // Peek through mask: zext(and(shl(x,c1),c2))
2788     APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2789     if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2790       if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2791         Mask = MaskC->getAPIntValue();
2792         Src = Src.getOperand(0);
2793       }
2794 
2795     if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2796       // Give up if the shift is not a valid scale factor [1,2,3].
2797       SDValue ShlSrc = Src.getOperand(0);
2798       SDValue ShlAmt = Src.getOperand(1);
2799       auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2800       if (!ShAmtC)
2801         break;
2802       unsigned ShAmtV = ShAmtC->getZExtValue();
2803       if (ShAmtV > 3)
2804         break;
2805 
2806       // The narrow shift must only shift out zero bits (it must be 'nuw').
2807       // That makes it safe to widen to the destination type.
2808       APInt HighZeros =
2809           APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2810       if (!Src->getFlags().hasNoUnsignedWrap() &&
2811           !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2812         break;
2813 
2814       // zext (shl nuw i8 %x, C1) to i32
2815       // --> shl (zext i8 %x to i32), (zext C1)
2816       // zext (and (shl nuw i8 %x, C1), C2) to i32
2817       // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2818       MVT SrcVT = ShlSrc.getSimpleValueType();
2819       MVT VT = N.getSimpleValueType();
2820       SDLoc DL(N);
2821 
2822       SDValue Res = ShlSrc;
2823       if (!Mask.isAllOnes()) {
2824         Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2825         insertDAGNode(*CurDAG, N, Res);
2826         Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2827         insertDAGNode(*CurDAG, N, Res);
2828       }
2829       SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2830       insertDAGNode(*CurDAG, N, Zext);
2831       SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2832       insertDAGNode(*CurDAG, N, NewShl);
2833       CurDAG->ReplaceAllUsesWith(N, NewShl);
2834       CurDAG->RemoveDeadNode(N.getNode());
2835 
2836       // Convert the shift to scale factor.
2837       AM.Scale = 1 << ShAmtV;
2838       // If matchIndexRecursively is not called here, Zext may be replaced
2839       // by other nodes but still be passed to a builder method later,
2840       // leaving a stale reference.
2841       AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2842       return false;
2843     }
2844 
2845     if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2846       // Try to fold the mask and shift into an extract and scale.
2847       if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2848                                      Src.getOperand(0), AM))
2849         return false;
2850 
2851       // Try to fold the mask and shift directly into the scale.
2852       if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2853                                    Src.getOperand(0), AM))
2854         return false;
2855 
2856       // Try to fold the mask and shift into BEXTR and scale.
2857       if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2858                                   Src.getOperand(0), AM, *Subtarget))
2859         return false;
2860     }
2861 
2862     break;
2863   }
2864   }
2865 
2866   return matchAddressBase(N, AM);
2867 }
2868 
2869 /// Helper for MatchAddress. Add the specified node to the
2870 /// specified addressing mode without any further recursion.
2871 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2872   // Is the base register already occupied?
2873   if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2874     // If so, check to see if the scale index register is set.
2875     if (!AM.IndexReg.getNode()) {
2876       AM.IndexReg = N;
2877       AM.Scale = 1;
2878       return false;
2879     }
2880 
2881     // Otherwise, we cannot select it.
2882     return true;
2883   }
2884 
2885   // Default, generate it as a register.
2886   AM.BaseType = X86ISelAddressMode::RegBase;
2887   AM.Base_Reg = N;
2888   return false;
2889 }
2890 
2891 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2892                                                     X86ISelAddressMode &AM,
2893                                                     unsigned Depth) {
2894   SDLoc dl(N);
2895   LLVM_DEBUG({
2896     dbgs() << "MatchVectorAddress: ";
2897     AM.dump(CurDAG);
2898   });
2899   // Limit recursion.
2900   if (Depth >= SelectionDAG::MaxRecursionDepth)
2901     return matchAddressBase(N, AM);
2902 
2903   // TODO: Support other operations.
2904   switch (N.getOpcode()) {
2905   case ISD::Constant: {
2906     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2907     if (!foldOffsetIntoAddress(Val, AM))
2908       return false;
2909     break;
2910   }
2911   case X86ISD::Wrapper:
2912     if (!matchWrapper(N, AM))
2913       return false;
2914     break;
2915   case ISD::ADD: {
2916     // Add an artificial use to this node so that we can keep track of
2917     // it if it gets CSE'd with a different node.
2918     HandleSDNode Handle(N);
2919 
2920     X86ISelAddressMode Backup = AM;
2921     if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2922         !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2923                                        Depth + 1))
2924       return false;
2925     AM = Backup;
2926 
2927     // Try again after commuting the operands.
2928     if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2929                                        Depth + 1) &&
2930         !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2931                                        Depth + 1))
2932       return false;
2933     AM = Backup;
2934 
2935     N = Handle.getValue();
2936     break;
2937   }
2938   }
2939 
2940   return matchAddressBase(N, AM);
2941 }
2942 
2943 /// Helper for selectVectorAddr. Handles things that can be folded into a
2944 /// gather/scatter address. The index register and scale should have already
2945 /// been handled.
2946 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2947   return matchVectorAddressRecursively(N, AM, 0);
2948 }
2949 
2950 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2951                                        SDValue IndexOp, SDValue ScaleOp,
2952                                        SDValue &Base, SDValue &Scale,
2953                                        SDValue &Index, SDValue &Disp,
2954                                        SDValue &Segment) {
2955   X86ISelAddressMode AM;
2956   AM.Scale = ScaleOp->getAsZExtVal();
2957 
2958   // Attempt to match index patterns, as long as we're not relying on implicit
2959   // sign-extension, which is performed BEFORE scale.
2960   if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2961     AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2962   else
2963     AM.IndexReg = IndexOp;
2964 
2965   unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2966   if (AddrSpace == X86AS::GS)
2967     AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2968   if (AddrSpace == X86AS::FS)
2969     AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2970   if (AddrSpace == X86AS::SS)
2971     AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2972 
2973   SDLoc DL(BasePtr);
2974   MVT VT = BasePtr.getSimpleValueType();
2975 
2976   // Try to match into the base and displacement fields.
2977   if (matchVectorAddress(BasePtr, AM))
2978     return false;
2979 
2980   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2981   return true;
2982 }
2983 
2984 /// Returns true if it is able to pattern match an addressing mode.
2985 /// It returns the operands which make up the maximal addressing mode it can
2986 /// match by reference.
2987 ///
2988 /// Parent is the parent node of the addr operand that is being matched.  It
2989 /// is always a load, store, atomic node, or null.  It is only null when
2990 /// checking memory operands for inline asm nodes.
2991 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2992                                  SDValue &Scale, SDValue &Index,
2993                                  SDValue &Disp, SDValue &Segment) {
2994   X86ISelAddressMode AM;
2995 
2996   if (Parent &&
2997       // These opcodes are all the nodes that have an "addr:$ptr" operand
2998       // that are not a MemSDNode, and thus don't have proper addrspace info.
2999       Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3000       Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3001       Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3002       Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3003       Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3004       Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3005       Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3006     unsigned AddrSpace =
3007       cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3008     if (AddrSpace == X86AS::GS)
3009       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3010     if (AddrSpace == X86AS::FS)
3011       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3012     if (AddrSpace == X86AS::SS)
3013       AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3014   }
3015 
3016   // Save the DL and VT before calling matchAddress, it can invalidate N.
3017   SDLoc DL(N);
3018   MVT VT = N.getSimpleValueType();
3019 
3020   if (matchAddress(N, AM))
3021     return false;
3022 
3023   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3024   return true;
3025 }
3026 
3027 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3028   // Cannot use 32 bit constants to reference objects in kernel/large code
3029   // model.
3030   if (TM.getCodeModel() == CodeModel::Kernel ||
3031       TM.getCodeModel() == CodeModel::Large)
3032     return false;
3033 
3034   // In static codegen with small code model, we can get the address of a label
3035   // into a register with 'movl'.
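       // Illustrative example: under the small code model "movl $sym, %eax"
       // materializes the 32-bit symbol address, and the write to %eax is
       // implicitly zero-extended to 64 bits.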
3036   if (N->getOpcode() != X86ISD::Wrapper)
3037     return false;
3038 
3039   N = N.getOperand(0);
3040 
3041   // At least GNU as does not accept 'movl' for TPOFF relocations.
3042   // FIXME: We could use 'movl' when we know we are targeting MC.
3043   if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3044     return false;
3045 
3046   Imm = N;
3047   // Small/medium code model can reference non-TargetGlobalAddress objects with
3048   // 32 bit constants.
3049   if (N->getOpcode() != ISD::TargetGlobalAddress) {
3050     return TM.getCodeModel() == CodeModel::Small ||
3051            TM.getCodeModel() == CodeModel::Medium;
3052   }
3053 
3054   const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3055   if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3056     return CR->getUnsignedMax().ult(1ull << 32);
3057 
3058   return !TM.isLargeGlobalValue(GV);
3059 }
3060 
3061 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3062                                          SDValue &Scale, SDValue &Index,
3063                                          SDValue &Disp, SDValue &Segment) {
3064   // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3065   SDLoc DL(N);
3066 
3067   if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3068     return false;
3069 
3070   auto *RN = dyn_cast<RegisterSDNode>(Base);
3071   if (RN && RN->getReg() == 0)
3072     Base = CurDAG->getRegister(0, MVT::i64);
3073   else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3074     // Base could already be %rip, particularly in the x32 ABI.
3075     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3076                                                      MVT::i64), 0);
3077     Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3078                                          Base);
3079   }
3080 
3081   RN = dyn_cast<RegisterSDNode>(Index);
3082   if (RN && RN->getReg() == 0)
3083     Index = CurDAG->getRegister(0, MVT::i64);
3084   else {
3085     assert(Index.getValueType() == MVT::i32 &&
3086            "Expect to be extending 32-bit registers for use in LEA");
3087     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3088                                                      MVT::i64), 0);
3089     Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3090                                           Index);
3091   }
3092 
3093   return true;
3094 }
3095 
3096 /// Calls SelectAddr and determines if the maximal addressing
3097 /// mode it matches can be cost effectively emitted as an LEA instruction.
3098 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3099                                     SDValue &Base, SDValue &Scale,
3100                                     SDValue &Index, SDValue &Disp,
3101                                     SDValue &Segment) {
3102   X86ISelAddressMode AM;
3103 
3104   // Save the DL and VT before calling matchAddress, it can invalidate N.
3105   SDLoc DL(N);
3106   MVT VT = N.getSimpleValueType();
3107 
3108   // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3109   // segments.
3110   SDValue Copy = AM.Segment;
3111   SDValue T = CurDAG->getRegister(0, MVT::i32);
3112   AM.Segment = T;
3113   if (matchAddress(N, AM))
3114     return false;
3115   assert(T == AM.Segment);
3116   AM.Segment = Copy;
3117 
3118   unsigned Complexity = 0;
3119   if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3120     Complexity = 1;
3121   else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3122     Complexity = 4;
3123 
3124   if (AM.IndexReg.getNode())
3125     Complexity++;
3126 
3127   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
3128   // use a simple shift.
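       // Illustrative check: a bare (,%reg,2) contributes a Complexity of 2
       // (index + scale), which the "Complexity <= 2" test below rejects.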
3129   if (AM.Scale > 1)
3130     Complexity++;
3131 
3132   // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3133   // to a LEA. This is determined with some experimentation but is by no means
3134   // optimal (especially for code size consideration). LEA is nice because of
3135   // its three-address nature. Tweak the cost function again when we can run
3136   // convertToThreeAddress() at register allocation time.
3137   if (AM.hasSymbolicDisplacement()) {
3138     // For X86-64, always use LEA to materialize RIP-relative addresses.
3139     if (Subtarget->is64Bit())
3140       Complexity = 4;
3141     else
3142       Complexity += 2;
3143   }
3144 
3145   // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3146   // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3147   // duplicating flag-producing instructions later in the pipeline.
3148   if (N.getOpcode() == ISD::ADD) {
3149     auto isMathWithFlags = [](SDValue V) {
3150       switch (V.getOpcode()) {
3151       case X86ISD::ADD:
3152       case X86ISD::SUB:
3153       case X86ISD::ADC:
3154       case X86ISD::SBB:
3155       case X86ISD::SMUL:
3156       case X86ISD::UMUL:
3157       /* TODO: These opcodes can be added safely, but we may want to justify
3158                their inclusion for different reasons (better for reg-alloc).
3159       case X86ISD::OR:
3160       case X86ISD::XOR:
3161       case X86ISD::AND:
3162       */
3163         // Value 1 is the flag output of the node - verify it's not dead.
3164         return !SDValue(V.getNode(), 1).use_empty();
3165       default:
3166         return false;
3167       }
3168     };
3169     // TODO: We might want to factor in whether there's a load folding
3170     // opportunity for the math op that disappears with LEA.
3171     if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3172       Complexity++;
3173   }
3174 
3175   if (AM.Disp)
3176     Complexity++;
3177 
3178   // If it isn't worth using an LEA, reject it.
3179   if (Complexity <= 2)
3180     return false;
3181 
3182   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3183   return true;
3184 }
3185 
3186 /// This is only run on TargetGlobalTLSAddress nodes.
3187 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3188                                         SDValue &Scale, SDValue &Index,
3189                                         SDValue &Disp, SDValue &Segment) {
3190   assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3191          N.getOpcode() == ISD::TargetExternalSymbol);
3192 
3193   X86ISelAddressMode AM;
3194   if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3195     AM.GV = GA->getGlobal();
3196     AM.Disp += GA->getOffset();
3197     AM.SymbolFlags = GA->getTargetFlags();
3198   } else {
3199     auto *SA = cast<ExternalSymbolSDNode>(N);
3200     AM.ES = SA->getSymbol();
3201     AM.SymbolFlags = SA->getTargetFlags();
3202   }
3203 
3204   if (Subtarget->is32Bit()) {
3205     AM.Scale = 1;
3206     AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3207   }
3208 
3209   MVT VT = N.getSimpleValueType();
3210   getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3211   return true;
3212 }
3213 
3214 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3215   // Keep track of the original value type and whether this value was
3216   // truncated. If we see a truncation from pointer type to VT that truncates
3217   // bits that are known to be zero, we can use a narrow reference.
3218   EVT VT = N.getValueType();
3219   bool WasTruncated = false;
3220   if (N.getOpcode() == ISD::TRUNCATE) {
3221     WasTruncated = true;
3222     N = N.getOperand(0);
3223   }
3224 
3225   if (N.getOpcode() != X86ISD::Wrapper)
3226     return false;
3227 
3228   // We can only use non-GlobalValues as immediates if they were not truncated,
3229   // as we do not have any range information. If we have a GlobalValue and the
3230   // address was not truncated, we can select it as an operand directly.
3231   unsigned Opc = N.getOperand(0)->getOpcode();
3232   if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3233     Op = N.getOperand(0);
3234     // We can only select the operand directly if we didn't have to look past a
3235     // truncate.
3236     return !WasTruncated;
3237   }
3238 
3239   // Check that the global's range fits into VT.
3240   auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3241   std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3242   if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3243     return false;
3244 
3245   // Okay, we can use a narrow reference.
3246   Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3247                                       GA->getOffset(), GA->getTargetFlags());
3248   return true;
3249 }
3250 
3251 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3252                                   SDValue &Base, SDValue &Scale,
3253                                   SDValue &Index, SDValue &Disp,
3254                                   SDValue &Segment) {
3255   assert(Root && P && "Unknown root/parent nodes");
3256   if (!ISD::isNON_EXTLoad(N.getNode()) ||
3257       !IsProfitableToFold(N, P, Root) ||
3258       !IsLegalToFold(N, P, Root, OptLevel))
3259     return false;
3260 
3261   return selectAddr(N.getNode(),
3262                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3263 }
3264 
3265 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3266                                        SDValue &Base, SDValue &Scale,
3267                                        SDValue &Index, SDValue &Disp,
3268                                        SDValue &Segment) {
3269   assert(Root && P && "Unknown root/parent nodes");
3270   if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3271       !IsProfitableToFold(N, P, Root) ||
3272       !IsLegalToFold(N, P, Root, OptLevel))
3273     return false;
3274 
3275   return selectAddr(N.getNode(),
3276                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3277 }
3278 
3279 /// Return an SDNode that returns the value of the global base register.
3280 /// Output instructions required to initialize the global base register,
3281 /// if necessary.
3282 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3283   unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3284   auto &DL = MF->getDataLayout();
3285   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3286 }
3287 
3288 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3289   if (N->getOpcode() == ISD::TRUNCATE)
3290     N = N->getOperand(0).getNode();
3291   if (N->getOpcode() != X86ISD::Wrapper)
3292     return false;
3293 
3294   auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3295   if (!GA)
3296     return false;
3297 
3298   auto *GV = GA->getGlobal();
3299   std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3300   if (CR)
3301     return CR->getSignedMin().sge(-1ull << Width) &&
3302            CR->getSignedMax().slt(1ull << Width);
3303   // In the kernel code model, globals are in the negative 2GB of the address
3304   // space, so globals can be a sign extended 32-bit immediate.
3305   // In other code models, small globals are in the low 2GB of the address
3306   // space, so sign extending them is equivalent to zero extending them.
3307   return Width == 32 && !TM.isLargeGlobalValue(GV);
3308 }
3309 
3310 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3311   assert(N->isMachineOpcode() && "Unexpected node");
3312   unsigned Opc = N->getMachineOpcode();
3313   const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3314   int CondNo = X86::getCondSrcNoFromDesc(MCID);
3315   if (CondNo < 0)
3316     return X86::COND_INVALID;
3317 
3318   return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3319 }
3320 
3321 /// Test whether the given X86ISD::CMP node has any users that use a flag
3322 /// other than ZF.
3323 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3324   // Examine each user of the node.
3325   for (SDUse &Use : Flags->uses()) {
3326     // Only check things that use the flags.
3327     if (Use.getResNo() != Flags.getResNo())
3328       continue;
3329     SDNode *User = Use.getUser();
3330     // Only examine CopyToReg uses that copy to EFLAGS.
3331     if (User->getOpcode() != ISD::CopyToReg ||
3332         cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3333       return false;
3334     // Examine each user of the CopyToReg use.
3335     for (SDUse &FlagUse : User->uses()) {
3336       // Only examine the Flag result.
3337       if (FlagUse.getResNo() != 1)
3338         continue;
3339       // Anything unusual: assume conservatively.
3340       if (!FlagUse.getUser()->isMachineOpcode())
3341         return false;
3342       // Examine the condition code of the user.
3343       X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3344 
3345       switch (CC) {
3346       // Comparisons which only use the zero flag.
3347       case X86::COND_E: case X86::COND_NE:
3348         continue;
3349       // Anything else: assume conservatively.
3350       default:
3351         return false;
3352       }
3353     }
3354   }
3355   return true;
3356 }
3357 
3358 /// Test whether the given X86ISD::CMP node has any uses which require the SF
3359 /// flag to be accurate.
3360 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3361   // Examine each user of the node.
3362   for (SDUse &Use : Flags->uses()) {
3363     // Only check things that use the flags.
3364     if (Use.getResNo() != Flags.getResNo())
3365       continue;
3366     SDNode *User = Use.getUser();
3367     // Only examine CopyToReg uses that copy to EFLAGS.
3368     if (User->getOpcode() != ISD::CopyToReg ||
3369         cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3370       return false;
3371     // Examine each user of the CopyToReg use.
3372     for (SDUse &FlagUse : User->uses()) {
3373       // Only examine the Flag result.
3374       if (FlagUse.getResNo() != 1)
3375         continue;
3376       // Anything unusual: assume conservatively.
3377       if (!FlagUse.getUser()->isMachineOpcode())
3378         return false;
3379       // Examine the condition code of the user.
3380       X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3381 
3382       switch (CC) {
3383       // Comparisons which don't examine the SF flag.
3384       case X86::COND_A: case X86::COND_AE:
3385       case X86::COND_B: case X86::COND_BE:
3386       case X86::COND_E: case X86::COND_NE:
3387       case X86::COND_O: case X86::COND_NO:
3388       case X86::COND_P: case X86::COND_NP:
3389         continue;
3390       // Anything else: assume conservatively.
3391       default:
3392         return false;
3393       }
3394     }
3395   }
3396   return true;
3397 }
3398 
3399 static bool mayUseCarryFlag(X86::CondCode CC) {
3400   switch (CC) {
3401   // Comparisons which don't examine the CF flag.
3402   case X86::COND_O: case X86::COND_NO:
3403   case X86::COND_E: case X86::COND_NE:
3404   case X86::COND_S: case X86::COND_NS:
3405   case X86::COND_P: case X86::COND_NP:
3406   case X86::COND_L: case X86::COND_GE:
3407   case X86::COND_G: case X86::COND_LE:
3408     return false;
3409   // Anything else: assume conservatively.
3410   default:
3411     return true;
3412   }
3413 }
3414 
3415 /// Test whether the given node which sets flags has any uses which require the
3416 /// CF flag to be accurate.
3417 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3418   // Examine each user of the node.
3419   for (SDUse &Use : Flags->uses()) {
3420     // Only check things that use the flags.
3421     if (Use.getResNo() != Flags.getResNo())
3422       continue;
3423 
3424     SDNode *User = Use.getUser();
3425     unsigned UserOpc = User->getOpcode();
3426 
3427     if (UserOpc == ISD::CopyToReg) {
3428       // Only examine CopyToReg uses that copy to EFLAGS.
3429       if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3430         return false;
3431       // Examine each user of the CopyToReg use.
3432       for (SDUse &FlagUse : User->uses()) {
3433         // Only examine the Flag result.
3434         if (FlagUse.getResNo() != 1)
3435           continue;
3436         // Anything unusual: assume conservatively.
3437         if (!FlagUse.getUser()->isMachineOpcode())
3438           return false;
3439         // Examine the condition code of the user.
3440         X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3441 
3442         if (mayUseCarryFlag(CC))
3443           return false;
3444       }
3445 
3446       // This CopyToReg is ok. Move on to the next user.
3447       continue;
3448     }
3449 
3450     // This might be an unselected node. So look for the pre-isel opcodes that
3451     // use flags.
3452     unsigned CCOpNo;
3453     switch (UserOpc) {
3454     default:
3455       // Something unusual. Be conservative.
3456       return false;
3457     case X86ISD::SETCC:       CCOpNo = 0; break;
3458     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3459     case X86ISD::CMOV:        CCOpNo = 2; break;
3460     case X86ISD::BRCOND:      CCOpNo = 2; break;
3461     }
3462 
3463     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3464     if (mayUseCarryFlag(CC))
3465       return false;
3466   }
3467   return true;
3468 }
3469 
3470 /// Check whether or not the chain ending in StoreNode is suitable for turning
3471 /// a {load; op; store} sequence into a single op through memory.
3472 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3473                                         SDValue StoredVal, SelectionDAG *CurDAG,
3474                                         unsigned LoadOpNo,
3475                                         LoadSDNode *&LoadNode,
3476                                         SDValue &InputChain) {
3477   // Is the stored value result 0 of the operation?
3478   if (StoredVal.getResNo() != 0) return false;
3479 
3480   // Are there uses of the operation other than the store?
3481   if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3482 
3483   // Is the store non-extending and non-indexed?
3484   if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3485     return false;
3486 
3487   SDValue Load = StoredVal->getOperand(LoadOpNo);
3488   // Is the stored value a non-extending and non-indexed load?
3489   if (!ISD::isNormalLoad(Load.getNode())) return false;
3490 
3491   // Return LoadNode by reference.
3492   LoadNode = cast<LoadSDNode>(Load);
3493 
3494   // Is store the only read of the loaded value?
3495   if (!Load.hasOneUse())
3496     return false;
3497 
3498   // Is the address of the store the same as the load?
3499   if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3500       LoadNode->getOffset() != StoreNode->getOffset())
3501     return false;
3502 
3503   bool FoundLoad = false;
3504   SmallVector<SDValue, 4> ChainOps;
3505   SmallVector<const SDNode *, 4> LoopWorklist;
3506   SmallPtrSet<const SDNode *, 16> Visited;
3507   const unsigned int Max = 1024;
3508 
3509   //  Visualization of Load-Op-Store fusion:
3510   // -------------------------
3511   // Legend:
3512   //    *-lines = Chain operand dependencies.
3513   //    |-lines = Normal operand dependencies.
3514   //    Dependencies flow down and right. n-suffix references multiple nodes.
3515   //
3516   //        C                        Xn  C
3517   //        *                         *  *
3518   //        *                          * *
3519   //  Xn  A-LD    Yn                    TF         Yn
3520   //   *    * \   |                       *        |
3521   //    *   *  \  |                        *       |
3522   //     *  *   \ |             =>       A--LD_OP_ST
3523   //      * *    \|                                 \
3524   //       TF    OP                                  \
3525   //         *   | \                                  Zn
3526   //          *  |  \
3527   //         A-ST    Zn
3528   //
3529 
3530   // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3531   //                                      #2: Yn -> LD
3532   //                                      #3: ST -> Zn
3533 
3534   // Ensure the transform is safe by checking for the dual
3535   // dependencies to make sure we do not induce a loop.
3536 
3537   // As LD is a predecessor to both OP and ST we can do this by checking:
3538   //  a). if LD is a predecessor to a member of Xn or Yn.
3539   //  b). if a Zn is a predecessor to ST.
3540 
3541   // However, (b) can only occur through being a chain predecessor to
3542   // ST, which is the same as Zn being a member or predecessor of Xn,
3543   // which is a subset of LD being a predecessor of Xn. So it's
3544   // subsumed by check (a).
3545 
3546   SDValue Chain = StoreNode->getChain();
3547 
3548   // Gather X elements in ChainOps.
3549   if (Chain == Load.getValue(1)) {
3550     FoundLoad = true;
3551     ChainOps.push_back(Load.getOperand(0));
3552   } else if (Chain.getOpcode() == ISD::TokenFactor) {
3553     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3554       SDValue Op = Chain.getOperand(i);
3555       if (Op == Load.getValue(1)) {
3556         FoundLoad = true;
3557         // Drop Load, but keep its chain. No cycle check necessary.
3558         ChainOps.push_back(Load.getOperand(0));
3559         continue;
3560       }
3561       LoopWorklist.push_back(Op.getNode());
3562       ChainOps.push_back(Op);
3563     }
3564   }
3565 
3566   if (!FoundLoad)
3567     return false;
3568 
3569   // Worklist is currently Xn. Add Yn to worklist.
3570   for (SDValue Op : StoredVal->ops())
3571     if (Op.getNode() != LoadNode)
3572       LoopWorklist.push_back(Op.getNode());
3573 
3574   // Check (a) if Load is a predecessor to Xn + Yn
3575   if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3576                                    true))
3577     return false;
3578 
3579   InputChain =
3580       CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3581   return true;
3582 }
3583 
3584 // Change a chain of {load; op; store} of the same value into a simple op
3585 // through memory of that value, if the uses of the modified value and its
3586 // address are suitable.
3587 //
3588 // The tablegen pattern memory operand pattern is currently not able to match
3589 // the case where the EFLAGS on the original operation are used.
3590 //
3591 // To move this to tablegen, we'll need to improve tablegen to allow flags to
3592 // be transferred from a node in the pattern to the result node, probably with
3593 // a new keyword. For example, we have this
3594 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3595 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3596 // but maybe need something like this
3597 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3598 //  [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3599 //   (transferrable EFLAGS)]>;
3600 //
3601 // Until then, we manually fold these and instruction select the operation
3602 // here.
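     // Illustrative example: a chain of the form
     //   %v = load i32, ptr %p
     //   %a = add i32 %v, 1
     //   store i32 %a, ptr %p
     // can be selected as a single read-modify-write instruction such as
     // "incl (%p)" (or "addl $1, (%p)" when inc/dec is slow on the target).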
3603 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3604   auto *StoreNode = cast<StoreSDNode>(Node);
3605   SDValue StoredVal = StoreNode->getOperand(1);
3606   unsigned Opc = StoredVal->getOpcode();
3607 
3608   // Before we try to select anything, make sure this is a memory operand size
3609   // and opcode we can handle. Note that this must match the code below that
3610   // actually lowers the opcodes.
3611   EVT MemVT = StoreNode->getMemoryVT();
3612   if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3613       MemVT != MVT::i8)
3614     return false;
3615 
3616   bool IsCommutable = false;
3617   bool IsNegate = false;
3618   switch (Opc) {
3619   default:
3620     return false;
3621   case X86ISD::SUB:
3622     IsNegate = isNullConstant(StoredVal.getOperand(0));
3623     break;
3624   case X86ISD::SBB:
3625     break;
3626   case X86ISD::ADD:
3627   case X86ISD::ADC:
3628   case X86ISD::AND:
3629   case X86ISD::OR:
3630   case X86ISD::XOR:
3631     IsCommutable = true;
3632     break;
3633   }
3634 
3635   unsigned LoadOpNo = IsNegate ? 1 : 0;
3636   LoadSDNode *LoadNode = nullptr;
3637   SDValue InputChain;
3638   if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3639                                    LoadNode, InputChain)) {
3640     if (!IsCommutable)
3641       return false;
3642 
3643     // This operation is commutable, try the other operand.
3644     LoadOpNo = 1;
3645     if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3646                                      LoadNode, InputChain))
3647       return false;
3648   }
3649 
3650   SDValue Base, Scale, Index, Disp, Segment;
3651   if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3652                   Segment))
3653     return false;
3654 
3655   auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3656                           unsigned Opc8) {
3657     switch (MemVT.getSimpleVT().SimpleTy) {
3658     case MVT::i64:
3659       return Opc64;
3660     case MVT::i32:
3661       return Opc32;
3662     case MVT::i16:
3663       return Opc16;
3664     case MVT::i8:
3665       return Opc8;
3666     default:
3667       llvm_unreachable("Invalid size!");
3668     }
3669   };
3670 
3671   MachineSDNode *Result;
3672   switch (Opc) {
3673   case X86ISD::SUB:
3674     // Handle negate.
3675     if (IsNegate) {
3676       unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3677                                      X86::NEG8m);
3678       const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3679       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3680                                       MVT::Other, Ops);
3681       break;
3682     }
3683    [[fallthrough]];
3684   case X86ISD::ADD:
3685     // Try to match inc/dec.
3686     if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3687       bool IsOne = isOneConstant(StoredVal.getOperand(1));
3688       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3689       // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
3690       if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3691         unsigned NewOpc =
3692           ((Opc == X86ISD::ADD) == IsOne)
3693               ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3694               : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3695         const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3696         Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3697                                         MVT::Other, Ops);
3698         break;
3699       }
3700     }
3701     [[fallthrough]];
3702   case X86ISD::ADC:
3703   case X86ISD::SBB:
3704   case X86ISD::AND:
3705   case X86ISD::OR:
3706   case X86ISD::XOR: {
3707     auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3708       switch (Opc) {
3709       case X86ISD::ADD:
3710         return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3711                             X86::ADD8mr);
3712       case X86ISD::ADC:
3713         return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3714                             X86::ADC8mr);
3715       case X86ISD::SUB:
3716         return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3717                             X86::SUB8mr);
3718       case X86ISD::SBB:
3719         return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3720                             X86::SBB8mr);
3721       case X86ISD::AND:
3722         return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3723                             X86::AND8mr);
3724       case X86ISD::OR:
3725         return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3726       case X86ISD::XOR:
3727         return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3728                             X86::XOR8mr);
3729       default:
3730         llvm_unreachable("Invalid opcode!");
3731       }
3732     };
3733     auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3734       switch (Opc) {
3735       case X86ISD::ADD:
3736         return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3737                             X86::ADD8mi);
3738       case X86ISD::ADC:
3739         return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3740                             X86::ADC8mi);
3741       case X86ISD::SUB:
3742         return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3743                             X86::SUB8mi);
3744       case X86ISD::SBB:
3745         return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3746                             X86::SBB8mi);
3747       case X86ISD::AND:
3748         return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3749                             X86::AND8mi);
3750       case X86ISD::OR:
3751         return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3752                             X86::OR8mi);
3753       case X86ISD::XOR:
3754         return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3755                             X86::XOR8mi);
3756       default:
3757         llvm_unreachable("Invalid opcode!");
3758       }
3759     };
3760 
3761     unsigned NewOpc = SelectRegOpcode(Opc);
3762     SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3763 
3764     // See if the operand is a constant that we can fold into an immediate
3765     // operand.
3766     if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3767       int64_t OperandV = OperandC->getSExtValue();
3768 
3769       // Check if we can shrink the operand enough to fit in an immediate (or
3770       // fit into a smaller immediate) by negating it and switching the
3771       // operation.
3772       if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3773           ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3774            (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3775             isInt<32>(-OperandV))) &&
3776           hasNoCarryFlagUses(StoredVal.getValue(1))) {
3777         OperandV = -OperandV;
3778         Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3779       }
3780 
3781       if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3782         Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3783         NewOpc = SelectImmOpcode(Opc);
3784       }
3785     }
3786 
3787     if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3788       SDValue CopyTo =
3789           CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3790                                StoredVal.getOperand(2), SDValue());
3791 
3792       const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
3793                              Segment, Operand, CopyTo, CopyTo.getValue(1)};
3794       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3795                                       Ops);
3796     } else {
3797       const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
3798                              Segment, Operand, InputChain};
3799       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3800                                       Ops);
3801     }
3802     break;
3803   }
3804   default:
3805     llvm_unreachable("Invalid opcode!");
3806   }
3807 
3808   MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3809                                  LoadNode->getMemOperand()};
3810   CurDAG->setNodeMemRefs(Result, MemOps);
3811 
3812   // Update Load Chain uses as well.
3813   ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3814   ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3815   ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3816   CurDAG->RemoveDeadNode(Node);
3817   return true;
3818 }
3819 
3820 // See if this is an  X & Mask  that we can match to BEXTR/BZHI.
3821 // Where Mask is one of the following patterns:
3822 //   a) x &  (1 << nbits) - 1
3823 //   b) x & ~(-1 << nbits)
3824 //   c) x &  (-1 >> (32 - y))
3825 //   d) x << (32 - y) >> (32 - y)
3826 //   e) (1 << nbits) - 1
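     // Illustrative mapping: with BMI2, pattern a) "x & ((1 << nbits) - 1)"
     // keeps the low nbits of x and selects to BZHI; with only BMI1, the same
     // extraction is expressed as BEXTR with a packed start/length control.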
3827 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3828   assert(
3829       (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3830        Node->getOpcode() == ISD::SRL) &&
3831       "Should be either an and-mask, or right-shift after clearing high bits.");
3832 
3833   // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3834   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3835     return false;
3836 
3837   MVT NVT = Node->getSimpleValueType(0);
3838 
3839   // Only supported for 32 and 64 bits.
3840   if (NVT != MVT::i32 && NVT != MVT::i64)
3841     return false;
3842 
3843   SDValue NBits;
3844   bool NegateNBits;
3845 
3846   // If we have BMI2's BZHI, we are ok with multi-use patterns.
3847   // Else, if we only have BMI1's BEXTR, we require one-use.
3848   const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3849   auto checkUses = [AllowExtraUsesByDefault](
3850                        SDValue Op, unsigned NUses,
3851                        std::optional<bool> AllowExtraUses) {
3852     return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3853            Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3854   };
3855   auto checkOneUse = [checkUses](SDValue Op,
3856                                  std::optional<bool> AllowExtraUses =
3857                                      std::nullopt) {
3858     return checkUses(Op, 1, AllowExtraUses);
3859   };
3860   auto checkTwoUse = [checkUses](SDValue Op,
3861                                  std::optional<bool> AllowExtraUses =
3862                                      std::nullopt) {
3863     return checkUses(Op, 2, AllowExtraUses);
3864   };
3865 
3866   auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3867     if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3868       assert(V.getSimpleValueType() == MVT::i32 &&
3869              V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3870              "Expected i64 -> i32 truncation");
3871       V = V.getOperand(0);
3872     }
3873     return V;
3874   };
3875 
3876   // a) x & ((1 << nbits) + (-1))
3877   auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3878                         &NegateNBits](SDValue Mask) -> bool {
3879     // Match `add`. Must only have one use!
3880     if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3881       return false;
3882     // We should be adding all-ones constant (i.e. subtracting one.)
3883     if (!isAllOnesConstant(Mask->getOperand(1)))
3884       return false;
3885     // Match `1 << nbits`. Might be truncated. Must only have one use!
3886     SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3887     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3888       return false;
3889     if (!isOneConstant(M0->getOperand(0)))
3890       return false;
3891     NBits = M0->getOperand(1);
3892     NegateNBits = false;
3893     return true;
3894   };
3895 
3896   auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3897     V = peekThroughOneUseTruncation(V);
3898     return CurDAG->MaskedValueIsAllOnes(
3899         V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3900                                 NVT.getSizeInBits()));
3901   };
3902 
3903   // b) x & ~(-1 << nbits)
3904   auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3905                         &NBits, &NegateNBits](SDValue Mask) -> bool {
3906     // Match `~()`. Must only have one use!
3907     if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3908       return false;
3909     // The -1 only has to be all-ones for the final Node's NVT.
3910     if (!isAllOnes(Mask->getOperand(1)))
3911       return false;
3912     // Match `-1 << nbits`. Might be truncated. Must only have one use!
3913     SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3914     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3915       return false;
3916     // The -1 only has to be all-ones for the final Node's NVT.
3917     if (!isAllOnes(M0->getOperand(0)))
3918       return false;
3919     NBits = M0->getOperand(1);
3920     NegateNBits = false;
3921     return true;
3922   };
3923 
3924   // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3925   // or leave the shift amount as-is, but then we'll have to negate it.
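       // Illustrative example: for an i32 pattern "x & (-1 >> (32 - y))" the
       // subtraction is matched, so NBits becomes y and NegateNBits stays
       // false; for a plain "x & (-1 >> z)", NBits is z and NegateNBits is set.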
3926   auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3927                                                      unsigned Bitwidth) {
3928     NBits = ShiftAmt;
3929     NegateNBits = true;
3930     // Skip over a truncate of the shift amount, if any.
3931     if (NBits.getOpcode() == ISD::TRUNCATE)
3932       NBits = NBits.getOperand(0);
3933     // Try to match the shift amount as (bitwidth - y). It should go away, too.
3934     // If it doesn't match, that's fine, we'll just negate it ourselves.
3935     if (NBits.getOpcode() != ISD::SUB)
3936       return;
3937     auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3938     if (!V0 || V0->getZExtValue() != Bitwidth)
3939       return;
3940     NBits = NBits.getOperand(1);
3941     NegateNBits = false;
3942   };
3943 
3944   // c) x &  (-1 >> z)  but then we'll have to subtract z from bitwidth
3945   //   or
3946   // c) x &  (-1 >> (32 - y))
3947   auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3948                         canonicalizeShiftAmt](SDValue Mask) -> bool {
3949     // The mask itself may be truncated.
3950     Mask = peekThroughOneUseTruncation(Mask);
3951     unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3952     // Match `l>>`. Must only have one use!
3953     if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3954       return false;
3955     // We should be shifting truly all-ones constant.
3956     if (!isAllOnesConstant(Mask.getOperand(0)))
3957       return false;
3958     SDValue M1 = Mask.getOperand(1);
3959     // The shift amount should not be used externally.
3960     if (!checkOneUse(M1))
3961       return false;
3962     canonicalizeShiftAmt(M1, Bitwidth);
3963     // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3964     // is no extra use of the mask. Clearly, there was one since we are here.
3965     // But at the same time, if we need to negate the shift amount,
3966     // then we don't want the mask to stick around, else it's unprofitable.
3967     return !NegateNBits;
3968   };
3969 
3970   SDValue X;
3971 
3972   // d) x << z >> z  but then we'll have to subtract z from bitwidth
3973   //   or
3974   // d) x << (32 - y) >> (32 - y)
3975   auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3976                         AllowExtraUsesByDefault, &NegateNBits,
3977                         &X](SDNode *Node) -> bool {
3978     if (Node->getOpcode() != ISD::SRL)
3979       return false;
3980     SDValue N0 = Node->getOperand(0);
3981     if (N0->getOpcode() != ISD::SHL)
3982       return false;
3983     unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3984     SDValue N1 = Node->getOperand(1);
3985     SDValue N01 = N0->getOperand(1);
3986     // Both of the shifts must be by the exact same value.
3987     if (N1 != N01)
3988       return false;
3989     canonicalizeShiftAmt(N1, Bitwidth);
3990     // There should not be any external uses of the inner shift / shift amount.
3991     // Note that while we are generally okay with external uses given BMI2,
3992     // iff we need to negate the shift amount, we are not okay with extra uses.
3993     const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3994     if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3995       return false;
3996     X = N0->getOperand(0);
3997     return true;
3998   };
3999 
4000   auto matchLowBitMask = [matchPatternA, matchPatternB,
4001                           matchPatternC](SDValue Mask) -> bool {
4002     return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4003   };
4004 
4005   if (Node->getOpcode() == ISD::AND) {
4006     X = Node->getOperand(0);
4007     SDValue Mask = Node->getOperand(1);
4008 
4009     if (matchLowBitMask(Mask)) {
4010       // Great.
4011     } else {
4012       std::swap(X, Mask);
4013       if (!matchLowBitMask(Mask))
4014         return false;
4015     }
4016   } else if (matchLowBitMask(SDValue(Node, 0))) {
4017     X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4018   } else if (!matchPatternD(Node))
4019     return false;
4020 
4021   // If we need to negate the shift amount, require BMI2 BZHI support.
4022   // It's just too unprofitable for BMI1 BEXTR.
4023   if (NegateNBits && !Subtarget->hasBMI2())
4024     return false;
4025 
4026   SDLoc DL(Node);
4027 
4028   // Truncate the shift amount.
4029   NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4030   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4031 
4032   // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4033   // All the other bits are undefined, we do not care about them.
4034   SDValue ImplDef = SDValue(
4035       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4036   insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4037 
4038   SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4039   insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4040   NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4041                                          MVT::i32, ImplDef, NBits, SRIdxVal),
4042                   0);
4043   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4044 
4045   // We might have matched the amount of high bits to be cleared,
4046   // but we want the amount of low bits to be kept, so negate it then.
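  // E.g. (illustrative): on a 32-bit value, clearing the top z bits keeps the
  // low (32 - z) bits, so the count handed to BZHI must become (32 - z).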
4047   if (NegateNBits) {
4048     SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4049     insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4050 
4051     NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4052     insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4053   }
4054 
4055   if (Subtarget->hasBMI2()) {
4056     // Great, just emit the BZHI.
4057     if (NVT != MVT::i32) {
4058       // But have to place the bit count into the wide-enough register first.
4059       NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4060       insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4061     }
4062 
4063     SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4064     ReplaceNode(Node, Extract.getNode());
4065     SelectCode(Extract.getNode());
4066     return true;
4067   }
4068 
4069   // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
4070   // *logically* shifted (potentially with a one-use trunc in between),
4071   // and the truncation was the only use of the shift,
4072   // and if so look past one-use truncation.
4073   {
4074     SDValue RealX = peekThroughOneUseTruncation(X);
4075     // FIXME: only if the shift is one-use?
4076     if (RealX != X && RealX.getOpcode() == ISD::SRL)
4077       X = RealX;
4078   }
4079 
4080   MVT XVT = X.getSimpleValueType();
4081 
4082   // Else, emitting BEXTR requires one more step.
4083   // The 'control' of BEXTR has the pattern of:
4084   // [15...8 bit][ 7...0 bit] location
4085   // [ bit count][     shift] name
4086   // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11 (shift 1, keep 2 bits)
4087 
4088   // Shift NBits left by 8 bits, thus producing 'control'.
4089   // This makes the low 8 bits zero.
4090   SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4091   insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4092   SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4093   insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4094 
4095   // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4096   // FIXME: only if the shift is one-use?
4097   if (X.getOpcode() == ISD::SRL) {
4098     SDValue ShiftAmt = X.getOperand(1);
4099     X = X.getOperand(0);
4100 
4101     assert(ShiftAmt.getValueType() == MVT::i8 &&
4102            "Expected shift amount to be i8");
4103 
4104     // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4105     // We could zext to i16 in some form, but we intentionally don't do that.
4106     SDValue OrigShiftAmt = ShiftAmt;
4107     ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4108     insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4109 
4110     // And now 'or' these low 8 bits of shift amount into the 'control'.
4111     Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4112     insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4113   }
4114 
4115   // But have to place the 'control' into the wide-enough register first.
4116   if (XVT != MVT::i32) {
4117     Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4118     insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4119   }
4120 
4121   // And finally, form the BEXTR itself.
4122   SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4123 
4124   // The 'X' was originally truncated. Do that now.
4125   if (XVT != NVT) {
4126     insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4127     Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4128   }
4129 
4130   ReplaceNode(Node, Extract.getNode());
4131   SelectCode(Extract.getNode());
4132 
4133   return true;
4134 }
4135 
4136 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4137 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4138   MVT NVT = Node->getSimpleValueType(0);
4139   SDLoc dl(Node);
4140 
4141   SDValue N0 = Node->getOperand(0);
4142   SDValue N1 = Node->getOperand(1);
4143 
4144   // If we have TBM we can use an immediate for the control. If we have BMI
4145   // we should only do this if the BEXTR instruction is implemented well.
4146   // Otherwise moving the control into a register makes this more costly.
4147   // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4148   // hoisting the move immediate would make it worthwhile with a less optimal
4149   // BEXTR?
4150   bool PreferBEXTR =
4151       Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4152   if (!PreferBEXTR && !Subtarget->hasBMI2())
4153     return nullptr;
4154 
4155   // Must have a shift right.
4156   if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4157     return nullptr;
4158 
4159   // Shift can't have additional users.
4160   if (!N0->hasOneUse())
4161     return nullptr;
4162 
4163   // Only supported for 32 and 64 bits.
4164   if (NVT != MVT::i32 && NVT != MVT::i64)
4165     return nullptr;
4166 
4167   // Shift amount and RHS of and must be constant.
4168   auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4169   auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4170   if (!MaskCst || !ShiftCst)
4171     return nullptr;
4172 
4173   // And RHS must be a mask.
4174   uint64_t Mask = MaskCst->getZExtValue();
4175   if (!isMask_64(Mask))
4176     return nullptr;
4177 
4178   uint64_t Shift = ShiftCst->getZExtValue();
4179   uint64_t MaskSize = llvm::popcount(Mask);
4180 
4181   // Don't interfere with something that can be handled by extracting AH.
4182   // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4183   if (Shift == 8 && MaskSize == 8)
4184     return nullptr;
4185 
4186   // Make sure we are only using bits that were in the original value, not
4187   // shifted in.
4188   if (Shift + MaskSize > NVT.getSizeInBits())
4189     return nullptr;
4190 
4191   // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4192   // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4193   // does not fit into 32 bits. Load folding is not a sufficient reason.
4194   if (!PreferBEXTR && MaskSize <= 32)
4195     return nullptr;
4196 
4197   SDValue Control;
4198   unsigned ROpc, MOpc;
4199 
4200 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4201   if (!PreferBEXTR) {
4202     assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4203     // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4204     // Let's perform the mask first, and apply shift later. Note that we need to
4205     // widen the mask to account for the fact that we'll apply shift afterwards!
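    // Illustrative example (values chosen for exposition): for (x >> 4) & 0xff
    // we have Shift == 4 and MaskSize == 8, so BZHI keeps the low 12 bits of x
    // and the SHR by 4 emitted further below leaves exactly the 8 bits BEXTR
    // would have extracted.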
4206     Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4207     ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4208                            : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4209     MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4210                            : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4211     unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4212     Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4213   } else {
4214     // The 'control' of BEXTR has the pattern of:
4215     // [15...8 bit][ 7...0 bit] location
4216     // [ bit count][     shift] name
4217     // I.e. 0b00000010'00000001 means  (x >> 0b1) & 0b11 (shift 1, keep 2 bits)
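    // E.g. (illustrative): Shift == 4 and MaskSize == 8 give the control
    // 0x0804, i.e. extract 8 bits starting at bit 4: (x >> 4) & 0xff.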
4218     Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4219     if (Subtarget->hasTBM()) {
4220       ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4221       MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4222     } else {
4223       assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4224       // BMI requires the immediate to be placed in a register.
4225       ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4226                              : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4227       MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4228                              : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4229       unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4230       Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4231     }
4232   }
4233 
4234   MachineSDNode *NewNode;
4235   SDValue Input = N0->getOperand(0);
4236   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4237   if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4238     SDValue Ops[] = {
4239         Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4240     SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4241     NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4242     // Update the chain.
4243     ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4244     // Record the mem-refs
4245     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4246   } else {
4247     NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4248   }
4249 
4250   if (!PreferBEXTR) {
4251     // We still need to apply the shift.
4252     SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4253     unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4254                                       : GET_ND_IF_ENABLED(X86::SHR32ri);
4255     NewNode =
4256         CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4257   }
4258 
4259   return NewNode;
4260 }
4261 
4262 // Emit a PCMPISTR(I/M) instruction.
4263 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4264                                              bool MayFoldLoad, const SDLoc &dl,
4265                                              MVT VT, SDNode *Node) {
4266   SDValue N0 = Node->getOperand(0);
4267   SDValue N1 = Node->getOperand(1);
4268   SDValue Imm = Node->getOperand(2);
4269   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4270   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4271 
4272   // Try to fold a load. No need to check alignment.
4273   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4274   if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4275     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4276                       N1.getOperand(0) };
4277     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4278     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4279     // Update the chain.
4280     ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4281     // Record the mem-refs
4282     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4283     return CNode;
4284   }
4285 
4286   SDValue Ops[] = { N0, N1, Imm };
4287   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4288   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4289   return CNode;
4290 }
4291 
4292 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4293 // to emit a second instruction after this one. This is needed since we have two
4294 // copyToReg nodes glued before this and we need to continue that glue through.
4295 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4296                                              bool MayFoldLoad, const SDLoc &dl,
4297                                              MVT VT, SDNode *Node,
4298                                              SDValue &InGlue) {
4299   SDValue N0 = Node->getOperand(0);
4300   SDValue N2 = Node->getOperand(2);
4301   SDValue Imm = Node->getOperand(4);
4302   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4303   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4304 
4305   // Try to fold a load. No need to check alignment.
4306   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4307   if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4308     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4309                       N2.getOperand(0), InGlue };
4310     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4311     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4312     InGlue = SDValue(CNode, 3);
4313     // Update the chain.
4314     ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4315     // Record the mem-refs
4316     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4317     return CNode;
4318   }
4319 
4320   SDValue Ops[] = { N0, N2, Imm, InGlue };
4321   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4322   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4323   InGlue = SDValue(CNode, 2);
4324   return CNode;
4325 }
4326 
4327 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4328   EVT VT = N->getValueType(0);
4329 
4330   // Only handle scalar shifts.
4331   if (VT.isVector())
4332     return false;
4333 
4334   // Narrower shifts only mask to 5 bits in hardware.
4335   unsigned Size = VT == MVT::i64 ? 64 : 32;
4336 
4337   SDValue OrigShiftAmt = N->getOperand(1);
4338   SDValue ShiftAmt = OrigShiftAmt;
4339   SDLoc DL(N);
4340 
4341   // Skip over a truncate of the shift amount.
4342   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4343     ShiftAmt = ShiftAmt->getOperand(0);
4344 
4345   // This function is called after X86DAGToDAGISel::matchBitExtract(),
4346   // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4347 
4348   SDValue NewShiftAmt;
4349   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4350       ShiftAmt->getOpcode() == ISD::XOR) {
4351     SDValue Add0 = ShiftAmt->getOperand(0);
4352     SDValue Add1 = ShiftAmt->getOperand(1);
4353     auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4354     auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4355     // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4356     // to avoid the ADD/SUB/XOR.
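    // E.g. (illustrative): a 64-bit shift by (x + 64) behaves exactly like a
    // shift by x, because the hardware masks the shift amount to 6 bits.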
4357     if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4358       NewShiftAmt = Add0;
4359 
4360     } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4361                ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4362                 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4363       // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4364       // we can replace it with a NOT. In the XOR case it may save some code
4365       // size, in the SUB case it also may save a move.
4366       assert(Add0C == nullptr || Add1C == nullptr);
4367 
4368       // We can only do N-X, not X-N
4369       if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4370         return false;
4371 
4372       EVT OpVT = ShiftAmt.getValueType();
4373 
4374       SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4375       NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4376                                     Add0C == nullptr ? Add0 : Add1, AllOnes);
4377       insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4378       insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4379       // If we are shifting by N-X where N == 0 mod Size, then just shift by
4380       // -X to generate a NEG instead of a SUB of a constant.
4381     } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4382                Add0C->getZExtValue() != 0) {
4383       EVT SubVT = ShiftAmt.getValueType();
4384       SDValue X;
4385       if (Add0C->getZExtValue() % Size == 0)
4386         X = Add1;
4387       else if (ShiftAmt.hasOneUse() && Size == 64 &&
4388                Add0C->getZExtValue() % 32 == 0) {
4389         // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4390         // This is mainly beneficial if we already compute (x+n*32).
4391         if (Add1.getOpcode() == ISD::TRUNCATE) {
4392           Add1 = Add1.getOperand(0);
4393           SubVT = Add1.getValueType();
4394         }
4395         if (Add0.getValueType() != SubVT) {
4396           Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4397           insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4398         }
4399 
4400         X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4401         insertDAGNode(*CurDAG, OrigShiftAmt, X);
4402       } else
4403         return false;
4404       // Insert a negate op.
4405       // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4406       // that uses it that's not a shift.
4407       SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4408       SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4409       NewShiftAmt = Neg;
4410 
4411       // Insert these operands into a valid topological order so they can
4412       // get selected independently.
4413       insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4414       insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4415     } else
4416       return false;
4417   } else
4418     return false;
4419 
4420   if (NewShiftAmt.getValueType() != MVT::i8) {
4421     // Need to truncate the shift amount.
4422     NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4423     // Add to a correct topological ordering.
4424     insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4425   }
4426 
4427   // Insert a new mask to keep the shift amount legal. This should be removed
4428   // by isel patterns.
4429   NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4430                                 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4431   // Place in a correct topological ordering.
4432   insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4433 
4434   SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4435                                                    NewShiftAmt);
4436   if (UpdatedNode != N) {
4437     // If we found an existing node, we should replace ourselves with that node
4438     // and wait for it to be selected after its other users.
4439     ReplaceNode(N, UpdatedNode);
4440     return true;
4441   }
4442 
4443   // If the original shift amount is now dead, delete it so that we don't run
4444   // it through isel.
4445   if (OrigShiftAmt.getNode()->use_empty())
4446     CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4447 
4448   // Now that we've optimized the shift amount, defer to normal isel to get
4449   // load folding and legacy vs BMI2 selection without repeating it here.
4450   SelectCode(N);
4451   return true;
4452 }
4453 
4454 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4455   MVT NVT = N->getSimpleValueType(0);
4456   unsigned Opcode = N->getOpcode();
4457   SDLoc dl(N);
4458 
4459   // For operations of the form (x << C1) op C2, check if we can use a smaller
4460   // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
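  // E.g. (illustrative): (x << 8) | 0x1f00 becomes ((x | 0x1f) << 8); the
  // constant 0x1f fits in a sign-extended imm8 while 0x1f00 does not.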
4461   SDValue Shift = N->getOperand(0);
4462   SDValue N1 = N->getOperand(1);
4463 
4464   auto *Cst = dyn_cast<ConstantSDNode>(N1);
4465   if (!Cst)
4466     return false;
4467 
4468   int64_t Val = Cst->getSExtValue();
4469 
4470   // If we have an any_extend feeding the AND, look through it to see if there
4471   // is a shift behind it. But only if the AND doesn't use the extended bits.
4472   // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4473   bool FoundAnyExtend = false;
4474   if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4475       Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4476       isUInt<32>(Val)) {
4477     FoundAnyExtend = true;
4478     Shift = Shift.getOperand(0);
4479   }
4480 
4481   if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4482     return false;
4483 
4484   // i8 is unshrinkable, i16 should be promoted to i32.
4485   if (NVT != MVT::i32 && NVT != MVT::i64)
4486     return false;
4487 
4488   auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4489   if (!ShlCst)
4490     return false;
4491 
4492   uint64_t ShAmt = ShlCst->getZExtValue();
4493 
4494   // Make sure that we don't change the operation by removing bits.
4495   // This only matters for OR and XOR, AND is unaffected.
4496   uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4497   if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4498     return false;
4499 
4500   // Check the minimum bitwidth for the new constant.
4501   // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4502   auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4503     if (Opcode == ISD::AND) {
4504       // AND32ri is the same as AND64ri32 with zext imm.
4505       // Try this before sign extended immediates below.
4506       ShiftedVal = (uint64_t)Val >> ShAmt;
4507       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4508         return true;
4509       // Also swap order when the AND can become MOVZX.
4510       if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4511         return true;
4512     }
4513     ShiftedVal = Val >> ShAmt;
4514     if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4515         (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4516       return true;
4517     if (Opcode != ISD::AND) {
4518       // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4519       ShiftedVal = (uint64_t)Val >> ShAmt;
4520       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4521         return true;
4522     }
4523     return false;
4524   };
4525 
4526   int64_t ShiftedVal;
4527   if (!CanShrinkImmediate(ShiftedVal))
4528     return false;
4529 
4530   // Ok, we can reorder to get a smaller immediate.
4531 
4532   // But it's possible the original immediate allowed an AND to become MOVZX.
4533   // Do this late to defer the MaskedValueIsZero call for as long as
4534   // possible.
4535   if (Opcode == ISD::AND) {
4536     // Find the smallest zext this could possibly be.
4537     unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4538     ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4539 
4540     // Figure out which bits need to be zero to achieve that mask.
4541     APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4542                                             ZExtWidth);
4543     NeededMask &= ~Cst->getAPIntValue();
4544 
4545     if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4546       return false;
4547   }
4548 
4549   SDValue X = Shift.getOperand(0);
4550   if (FoundAnyExtend) {
4551     SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4552     insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4553     X = NewX;
4554   }
4555 
4556   SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4557   insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4558   SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4559   insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4560   SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4561                                    Shift.getOperand(1));
4562   ReplaceNode(N, NewSHL.getNode());
4563   SelectCode(NewSHL.getNode());
4564   return true;
4565 }
4566 
4567 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4568                                      SDNode *ParentB, SDNode *ParentC,
4569                                      SDValue A, SDValue B, SDValue C,
4570                                      uint8_t Imm) {
4571   assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4572          C.isOperandOf(ParentC) && "Incorrect parent node");
4573 
4574   auto tryFoldLoadOrBCast =
4575       [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4576              SDValue &Index, SDValue &Disp, SDValue &Segment) {
4577         if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4578           return true;
4579 
4580         // Not a load, check for broadcast which may be behind a bitcast.
4581         if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4582           P = L.getNode();
4583           L = L.getOperand(0);
4584         }
4585 
4586         if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4587           return false;
4588 
4589         // Only 32 and 64 bit broadcasts are supported.
4590         auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4591         unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4592         if (Size != 32 && Size != 64)
4593           return false;
4594 
4595         return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4596       };
4597 
4598   bool FoldedLoad = false;
4599   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4600   if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4601     FoldedLoad = true;
4602   } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4603                                 Tmp4)) {
4604     FoldedLoad = true;
4605     std::swap(A, C);
4606     // Swap bits 1/4 and 3/6.
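    // (Illustrative reasoning: the ternlog immediate bit index is
    // 4*a + 2*b + c, so exchanging operands A and C swaps the indices where
    // a != c, namely 1<->4 and 3<->6, while indices 0, 2, 5, 7 (mask 0xa5)
    // stay put.)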
4607     uint8_t OldImm = Imm;
4608     Imm = OldImm & 0xa5;
4609     if (OldImm & 0x02) Imm |= 0x10;
4610     if (OldImm & 0x10) Imm |= 0x02;
4611     if (OldImm & 0x08) Imm |= 0x40;
4612     if (OldImm & 0x40) Imm |= 0x08;
4613   } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4614                                 Tmp4)) {
4615     FoldedLoad = true;
4616     std::swap(B, C);
4617     // Swap bits 1/2 and 5/6.
4618     uint8_t OldImm = Imm;
4619     Imm = OldImm & 0x99;
4620     if (OldImm & 0x02) Imm |= 0x04;
4621     if (OldImm & 0x04) Imm |= 0x02;
4622     if (OldImm & 0x20) Imm |= 0x40;
4623     if (OldImm & 0x40) Imm |= 0x20;
4624   }
4625 
4626   SDLoc DL(Root);
4627 
4628   SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4629 
4630   MVT NVT = Root->getSimpleValueType(0);
4631 
4632   MachineSDNode *MNode;
4633   if (FoldedLoad) {
4634     SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4635 
4636     unsigned Opc;
4637     if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4638       auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4639       unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4640       assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4641 
4642       bool UseD = EltSize == 32;
4643       if (NVT.is128BitVector())
4644         Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4645       else if (NVT.is256BitVector())
4646         Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4647       else if (NVT.is512BitVector())
4648         Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4649       else
4650         llvm_unreachable("Unexpected vector size!");
4651     } else {
4652       bool UseD = NVT.getVectorElementType() == MVT::i32;
4653       if (NVT.is128BitVector())
4654         Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4655       else if (NVT.is256BitVector())
4656         Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4657       else if (NVT.is512BitVector())
4658         Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4659       else
4660         llvm_unreachable("Unexpected vector size!");
4661     }
4662 
4663     SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4664     MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4665 
4666     // Update the chain.
4667     ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4668     // Record the mem-refs
4669     CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4670   } else {
4671     bool UseD = NVT.getVectorElementType() == MVT::i32;
4672     unsigned Opc;
4673     if (NVT.is128BitVector())
4674       Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4675     else if (NVT.is256BitVector())
4676       Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4677     else if (NVT.is512BitVector())
4678       Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4679     else
4680       llvm_unreachable("Unexpected vector size!");
4681 
4682     MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4683   }
4684 
4685   ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4686   CurDAG->RemoveDeadNode(Root);
4687   return true;
4688 }
4689 
4690 // Try to match two logic ops to a VPTERNLOG.
4691 // FIXME: Handle more complex patterns that use an operand more than once?
4692 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4693   MVT NVT = N->getSimpleValueType(0);
4694 
4695   // Make sure we support VPTERNLOG.
4696   if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4697       NVT.getVectorElementType() == MVT::i1)
4698     return false;
4699 
4700   // We need VLX for 128/256-bit.
4701   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4702     return false;
4703 
4704   SDValue N0 = N->getOperand(0);
4705   SDValue N1 = N->getOperand(1);
4706 
4707   auto getFoldableLogicOp = [](SDValue Op) {
4708     // Peek through single use bitcast.
4709     if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4710       Op = Op.getOperand(0);
4711 
4712     if (!Op.hasOneUse())
4713       return SDValue();
4714 
4715     unsigned Opc = Op.getOpcode();
4716     if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4717         Opc == X86ISD::ANDNP)
4718       return Op;
4719 
4720     return SDValue();
4721   };
4722 
4723   SDValue A, FoldableOp;
4724   if ((FoldableOp = getFoldableLogicOp(N1))) {
4725     A = N0;
4726   } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4727     A = N1;
4728   } else
4729     return false;
4730 
4731   SDValue B = FoldableOp.getOperand(0);
4732   SDValue C = FoldableOp.getOperand(1);
4733   SDNode *ParentA = N;
4734   SDNode *ParentB = FoldableOp.getNode();
4735   SDNode *ParentC = FoldableOp.getNode();
4736 
4737   // We can build the appropriate control immediate by performing the logic
4738   // operation we're matching using these constants for A, B, and C.
4739   uint8_t TernlogMagicA = 0xf0;
4740   uint8_t TernlogMagicB = 0xcc;
4741   uint8_t TernlogMagicC = 0xaa;
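  // E.g. (illustrative): matching (A & B) | C would evaluate
  // (0xf0 & 0xcc) | 0xaa == 0xc0 | 0xaa == 0xea as the ternlog immediate.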
4742 
4743   // Some of the inputs may be inverted, peek through them and invert the
4744   // magic values accordingly.
4745   // TODO: There may be a bitcast before the xor that we should peek through.
4746   auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4747     if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4748         ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4749       Magic = ~Magic;
4750       Parent = Op.getNode();
4751       Op = Op.getOperand(0);
4752     }
4753   };
4754 
4755   PeekThroughNot(A, ParentA, TernlogMagicA);
4756   PeekThroughNot(B, ParentB, TernlogMagicB);
4757   PeekThroughNot(C, ParentC, TernlogMagicC);
4758 
4759   uint8_t Imm;
4760   switch (FoldableOp.getOpcode()) {
4761   default: llvm_unreachable("Unexpected opcode!");
4762   case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
4763   case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
4764   case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
4765   case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4766   }
4767 
4768   switch (N->getOpcode()) {
4769   default: llvm_unreachable("Unexpected opcode!");
4770   case X86ISD::ANDNP:
4771     if (A == N0)
4772       Imm &= ~TernlogMagicA;
4773     else
4774       Imm = ~(Imm) & TernlogMagicA;
4775     break;
4776   case ISD::AND: Imm &= TernlogMagicA; break;
4777   case ISD::OR:  Imm |= TernlogMagicA; break;
4778   case ISD::XOR: Imm ^= TernlogMagicA; break;
4779   }
4780 
4781   return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4782 }
4783 
4784 /// If the high bits of an 'and' operand are known zero, try setting the
4785 /// high bits of an 'and' constant operand to produce a smaller encoding by
4786 /// creating a small, sign-extended negative immediate rather than a large
4787 /// positive one. This reverses a transform in SimplifyDemandedBits that
4788 /// shrinks mask constants by clearing bits. There is also a possibility that
4789 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4790 /// case, just replace the 'and'. Return 'true' if the node is replaced.
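/// E.g. (illustrative values): if the top 4 bits of the other operand are
/// known zero, the 32-bit mask 0x0ffffff0 can be widened to 0xfffffff0 (-16),
/// which encodes as a sign-extended imm8 instead of an imm32.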
4791 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4792   // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4793   // have immediate operands.
4794   MVT VT = And->getSimpleValueType(0);
4795   if (VT != MVT::i32 && VT != MVT::i64)
4796     return false;
4797 
4798   auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4799   if (!And1C)
4800     return false;
4801 
4802   // Bail out if the mask constant is already negative. It can't shrink any more.
4803   // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4804   // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4805   // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4806   // are negative too.
4807   APInt MaskVal = And1C->getAPIntValue();
4808   unsigned MaskLZ = MaskVal.countl_zero();
4809   if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4810     return false;
4811 
4812   // Don't extend into the upper 32 bits of a 64 bit mask.
4813   if (VT == MVT::i64 && MaskLZ >= 32) {
4814     MaskLZ -= 32;
4815     MaskVal = MaskVal.trunc(32);
4816   }
4817 
4818   SDValue And0 = And->getOperand(0);
4819   APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4820   APInt NegMaskVal = MaskVal | HighZeros;
4821 
4822   // If a negative constant would not allow a smaller encoding, there's no need
4823   // to continue. Only change the constant when we know it's a win.
4824   unsigned MinWidth = NegMaskVal.getSignificantBits();
4825   if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4826     return false;
4827 
4828   // Extend masks if we truncated above.
4829   if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4830     NegMaskVal = NegMaskVal.zext(64);
4831     HighZeros = HighZeros.zext(64);
4832   }
4833 
4834   // The variable operand must be all zeros in the top bits to allow using the
4835   // new, negative constant as the mask.
4836   // TODO: Handle constant folding?
4837   KnownBits Known0 = CurDAG->computeKnownBits(And0);
4838   if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4839     return false;
4840 
4841   // Check if the mask is -1. In that case, this is an unnecessary instruction
4842   // that escaped earlier analysis.
4843   if (NegMaskVal.isAllOnes()) {
4844     ReplaceNode(And, And0.getNode());
4845     return true;
4846   }
4847 
4848   // A negative mask allows a smaller encoding. Create a new 'and' node.
4849   SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4850   insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4851   SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4852   ReplaceNode(And, NewAnd.getNode());
4853   SelectCode(NewAnd.getNode());
4854   return true;
4855 }
4856 
4857 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4858                               bool FoldedBCast, bool Masked) {
4859 #define VPTESTM_CASE(VT, SUFFIX) \
4860 case MVT::VT: \
4861   if (Masked) \
4862     return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4863   return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4864 
4865 
4866 #define VPTESTM_BROADCAST_CASES(SUFFIX) \
4867 default: llvm_unreachable("Unexpected VT!"); \
4868 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4869 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4870 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4871 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4872 VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4873 VPTESTM_CASE(v8i64, QZ##SUFFIX)
4874 
4875 #define VPTESTM_FULL_CASES(SUFFIX) \
4876 VPTESTM_BROADCAST_CASES(SUFFIX) \
4877 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4878 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4879 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4880 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4881 VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4882 VPTESTM_CASE(v32i16, WZ##SUFFIX)
4883 
4884   if (FoldedBCast) {
4885     switch (TestVT.SimpleTy) {
4886     VPTESTM_BROADCAST_CASES(rmb)
4887     }
4888   }
4889 
4890   if (FoldedLoad) {
4891     switch (TestVT.SimpleTy) {
4892     VPTESTM_FULL_CASES(rm)
4893     }
4894   }
4895 
4896   switch (TestVT.SimpleTy) {
4897   VPTESTM_FULL_CASES(rr)
4898   }
4899 
4900 #undef VPTESTM_FULL_CASES
4901 #undef VPTESTM_BROADCAST_CASES
4902 #undef VPTESTM_CASE
4903 }
4904 
4905 // Try to create a VPTESTM instruction. If InMask is not null, it will be used
4906 // to form a masked operation.
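// (For reference: VPTESTM sets result mask bit i when (Src1[i] & Src2[i]) is
// nonzero and VPTESTNM when it is zero, which is why an EQ/NE compare of an
// AND against all-zeros maps onto these instructions.)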
4907 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4908                                  SDValue InMask) {
4909   assert(Subtarget->hasAVX512() && "Expected AVX512!");
4910   assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4911          "Unexpected VT!");
4912 
4913   // Look for equal and not equal compares.
4914   ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4915   if (CC != ISD::SETEQ && CC != ISD::SETNE)
4916     return false;
4917 
4918   SDValue SetccOp0 = Setcc.getOperand(0);
4919   SDValue SetccOp1 = Setcc.getOperand(1);
4920 
4921   // Canonicalize the all zero vector to the RHS.
4922   if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4923     std::swap(SetccOp0, SetccOp1);
4924 
4925   // See if we're comparing against zero.
4926   if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4927     return false;
4928 
4929   SDValue N0 = SetccOp0;
4930 
4931   MVT CmpVT = N0.getSimpleValueType();
4932   MVT CmpSVT = CmpVT.getVectorElementType();
4933 
4934   // Start with both operands the same. We'll try to refine this.
4935   SDValue Src0 = N0;
4936   SDValue Src1 = N0;
4937 
4938   {
4939     // Look through single use bitcasts.
4940     SDValue N0Temp = N0;
4941     if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4942       N0Temp = N0.getOperand(0);
4943 
4944     // Look for single use AND.
4945     if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4946       Src0 = N0Temp.getOperand(0);
4947       Src1 = N0Temp.getOperand(1);
4948     }
4949   }
4950 
4951   // Without VLX we need to widen the operation.
4952   bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4953 
4954   auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4955                                 SDValue &Base, SDValue &Scale, SDValue &Index,
4956                                 SDValue &Disp, SDValue &Segment) {
4957     // If we need to widen, we can't fold the load.
4958     if (!Widen)
4959       if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4960         return true;
4961 
4962     // If we didn't fold a load, try to match broadcast. No widening limitation
4963     // for this. But only 32 and 64 bit types are supported.
4964     if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4965       return false;
4966 
4967     // Look through single use bitcasts.
4968     if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4969       P = L.getNode();
4970       L = L.getOperand(0);
4971     }
4972 
4973     if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4974       return false;
4975 
4976     auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4977     if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4978       return false;
4979 
4980     return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4981   };
4982 
4983   // We can only fold loads if the sources are unique.
4984   bool CanFoldLoads = Src0 != Src1;
4985 
4986   bool FoldedLoad = false;
4987   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4988   if (CanFoldLoads) {
4989     FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4990                                     Tmp3, Tmp4);
4991     if (!FoldedLoad) {
4992       // And is commutative.
4993       FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4994                                       Tmp2, Tmp3, Tmp4);
4995       if (FoldedLoad)
4996         std::swap(Src0, Src1);
4997     }
4998   }
4999 
5000   bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5001 
5002   bool IsMasked = InMask.getNode() != nullptr;
5003 
5004   SDLoc dl(Root);
5005 
5006   MVT ResVT = Setcc.getSimpleValueType();
5007   MVT MaskVT = ResVT;
5008   if (Widen) {
5009     // Widen the inputs using insert_subreg or copy_to_regclass.
5010     unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5011     unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5012     unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5013     CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5014     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5015     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5016                                                      CmpVT), 0);
5017     Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5018 
5019     if (!FoldedBCast)
5020       Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5021 
5022     if (IsMasked) {
5023       // Widen the mask.
5024       unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5025       SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5026       InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5027                                               dl, MaskVT, InMask, RC), 0);
5028     }
5029   }
5030 
5031   bool IsTestN = CC == ISD::SETEQ;
5032   unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5033                                IsMasked);
5034 
5035   MachineSDNode *CNode;
5036   if (FoldedLoad) {
5037     SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5038 
5039     if (IsMasked) {
5040       SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5041                         Src1.getOperand(0) };
5042       CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5043     } else {
5044       SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5045                         Src1.getOperand(0) };
5046       CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5047     }
5048 
5049     // Update the chain.
5050     ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5051     // Record the mem-refs
5052     CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5053   } else {
5054     if (IsMasked)
5055       CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5056     else
5057       CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5058   }
5059 
5060   // If we widened, we need to shrink the mask VT.
5061   if (Widen) {
5062     unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5063     SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5064     CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5065                                    dl, ResVT, SDValue(CNode, 0), RC);
5066   }
5067 
5068   ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5069   CurDAG->RemoveDeadNode(Root);
5070   return true;
5071 }
5072 
5073 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5074 // into vpternlog.
5075 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5076   assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5077 
5078   MVT NVT = N->getSimpleValueType(0);
5079 
5080   // Make sure we support VPTERNLOG.
5081   if (!NVT.isVector() || !Subtarget->hasAVX512())
5082     return false;
5083 
5084   // We need VLX for 128/256-bit.
5085   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5086     return false;
5087 
5088   SDValue N0 = N->getOperand(0);
5089   SDValue N1 = N->getOperand(1);
5090 
5091   // Canonicalize AND to LHS.
5092   if (N1.getOpcode() == ISD::AND)
5093     std::swap(N0, N1);
5094 
5095   if (N0.getOpcode() != ISD::AND ||
5096       N1.getOpcode() != X86ISD::ANDNP ||
5097       !N0.hasOneUse() || !N1.hasOneUse())
5098     return false;
5099 
5100   // ANDN is not commutable, use it to pick down A and C.
5101   SDValue A = N1.getOperand(0);
5102   SDValue C = N1.getOperand(1);
5103 
5104   // AND is commutable, if one operand matches A, the other operand is B.
5105   // Otherwise this isn't a match.
5106   SDValue B;
5107   if (N0.getOperand(0) == A)
5108     B = N0.getOperand(1);
5109   else if (N0.getOperand(1) == A)
5110     B = N0.getOperand(0);
5111   else
5112     return false;
5113 
5114   SDLoc dl(N);
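  // Illustrative derivation of 0xCA: with the usual magic constants A == 0xf0,
  // B == 0xcc, C == 0xaa, the bitselect (A & B) | (~A & C) evaluates to
  // (0xf0 & 0xcc) | (0x0f & 0xaa) == 0xc0 | 0x0a == 0xca.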
5115   SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5116   SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5117   ReplaceNode(N, Ternlog.getNode());
5118 
5119   return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5120                         Ternlog.getNode(), A, B, C, 0xCA);
5121 }
5122 
5123 void X86DAGToDAGISel::Select(SDNode *Node) {
5124   MVT NVT = Node->getSimpleValueType(0);
5125   unsigned Opcode = Node->getOpcode();
5126   SDLoc dl(Node);
5127 
5128   if (Node->isMachineOpcode()) {
5129     LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5130     Node->setNodeId(-1);
5131     return;   // Already selected.
5132   }
5133 
5134   switch (Opcode) {
5135   default: break;
5136   case ISD::INTRINSIC_W_CHAIN: {
5137     unsigned IntNo = Node->getConstantOperandVal(1);
5138     switch (IntNo) {
5139     default: break;
5140     case Intrinsic::x86_encodekey128:
5141     case Intrinsic::x86_encodekey256: {
5142       if (!Subtarget->hasKL())
5143         break;
5144 
5145       unsigned Opcode;
5146       switch (IntNo) {
5147       default: llvm_unreachable("Impossible intrinsic");
5148       case Intrinsic::x86_encodekey128:
5149         Opcode = X86::ENCODEKEY128;
5150         break;
5151       case Intrinsic::x86_encodekey256:
5152         Opcode = X86::ENCODEKEY256;
5153         break;
5154       }
5155 
5156       SDValue Chain = Node->getOperand(0);
5157       Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5158                                    SDValue());
5159       if (Opcode == X86::ENCODEKEY256)
5160         Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5161                                      Chain.getValue(1));
5162 
5163       MachineSDNode *Res = CurDAG->getMachineNode(
5164           Opcode, dl, Node->getVTList(),
5165           {Node->getOperand(2), Chain, Chain.getValue(1)});
5166       ReplaceNode(Node, Res);
5167       return;
5168     }
5169     case Intrinsic::x86_tileloaddrs64_internal:
5170     case Intrinsic::x86_tileloaddrst164_internal:
5171       if (!Subtarget->hasAMXMOVRS())
5172         break;
5173       [[fallthrough]];
5174     case Intrinsic::x86_tileloadd64_internal:
5175     case Intrinsic::x86_tileloaddt164_internal: {
5176       if (!Subtarget->hasAMXTILE())
5177         break;
5178       auto *MFI =
5179           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5180       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5181       unsigned Opc;
5182       switch (IntNo) {
5183       default:
5184         llvm_unreachable("Unexpected intrinsic!");
5185       case Intrinsic::x86_tileloaddrs64_internal:
5186         Opc = X86::PTILELOADDRSV;
5187         break;
5188       case Intrinsic::x86_tileloaddrst164_internal:
5189         Opc = X86::PTILELOADDRST1V;
5190         break;
5191       case Intrinsic::x86_tileloadd64_internal:
5192         Opc = X86::PTILELOADDV;
5193         break;
5194       case Intrinsic::x86_tileloaddt164_internal:
5195         Opc = X86::PTILELOADDT1V;
5196         break;
5197       }
5198       // _tile_loadd_internal(row, col, buf, STRIDE)
5199       SDValue Base = Node->getOperand(4);
5200       SDValue Scale = getI8Imm(1, dl);
5201       SDValue Index = Node->getOperand(5);
5202       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5203       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5204       SDValue Chain = Node->getOperand(0);
5205       MachineSDNode *CNode;
5206       SDValue Ops[] = {Node->getOperand(2),
5207                        Node->getOperand(3),
5208                        Base,
5209                        Scale,
5210                        Index,
5211                        Disp,
5212                        Segment,
5213                        Chain};
5214       CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5215       ReplaceNode(Node, CNode);
5216       return;
5217     }
5218     }
5219     break;
5220   }
5221   case ISD::INTRINSIC_VOID: {
5222     unsigned IntNo = Node->getConstantOperandVal(1);
5223     switch (IntNo) {
5224     default: break;
5225     case Intrinsic::x86_sse3_monitor:
5226     case Intrinsic::x86_monitorx:
5227     case Intrinsic::x86_clzero: {
5228       bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5229 
5230       unsigned Opc = 0;
5231       switch (IntNo) {
5232       default: llvm_unreachable("Unexpected intrinsic!");
5233       case Intrinsic::x86_sse3_monitor:
5234         if (!Subtarget->hasSSE3())
5235           break;
5236         Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5237         break;
5238       case Intrinsic::x86_monitorx:
5239         if (!Subtarget->hasMWAITX())
5240           break;
5241         Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5242         break;
5243       case Intrinsic::x86_clzero:
5244         if (!Subtarget->hasCLZERO())
5245           break;
5246         Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5247         break;
5248       }
5249 
5250       if (Opc) {
5251         unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5252         SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5253                                              Node->getOperand(2), SDValue());
5254         SDValue InGlue = Chain.getValue(1);
5255 
5256         if (IntNo == Intrinsic::x86_sse3_monitor ||
5257             IntNo == Intrinsic::x86_monitorx) {
5258           // Copy the other two operands to ECX and EDX.
5259           Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5260                                        InGlue);
5261           InGlue = Chain.getValue(1);
5262           Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5263                                        InGlue);
5264           InGlue = Chain.getValue(1);
5265         }
5266 
5267         MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5268                                                       { Chain, InGlue});
5269         ReplaceNode(Node, CNode);
5270         return;
5271       }
5272 
5273       break;
5274     }
5275     case Intrinsic::x86_tilestored64_internal: {
5276       auto *MFI =
5277           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5278       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5279       unsigned Opc = X86::PTILESTOREDV;
5280       // _tile_stored_internal(row, col, buf, STRIDE, c)
5281       SDValue Base = Node->getOperand(4);
5282       SDValue Scale = getI8Imm(1, dl);
5283       SDValue Index = Node->getOperand(5);
5284       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5285       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5286       SDValue Chain = Node->getOperand(0);
5287       MachineSDNode *CNode;
5288       SDValue Ops[] = {Node->getOperand(2),
5289                        Node->getOperand(3),
5290                        Base,
5291                        Scale,
5292                        Index,
5293                        Disp,
5294                        Segment,
5295                        Node->getOperand(6),
5296                        Chain};
5297       CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5298       ReplaceNode(Node, CNode);
5299       return;
5300     }
5301     case Intrinsic::x86_tileloaddrs64:
5302     case Intrinsic::x86_tileloaddrst164:
5303       if (!Subtarget->hasAMXMOVRS())
5304         break;
5305       [[fallthrough]];
5306     case Intrinsic::x86_tileloadd64:
5307     case Intrinsic::x86_tileloaddt164:
5308     case Intrinsic::x86_tilestored64: {
5309       if (!Subtarget->hasAMXTILE())
5310         break;
5311       auto *MFI =
5312           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5313       MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5314       unsigned Opc;
5315       switch (IntNo) {
5316       default: llvm_unreachable("Unexpected intrinsic!");
5317       case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
5318       case Intrinsic::x86_tileloaddrs64:
5319         Opc = X86::PTILELOADDRS;
5320         break;
5321       case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5322       case Intrinsic::x86_tileloaddrst164:
5323         Opc = X86::PTILELOADDRST1;
5324         break;
5325       case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
5326       }
5327       // FIXME: Match displacement and scale.
5328       unsigned TIndex = Node->getConstantOperandVal(2);
5329       SDValue TReg = getI8Imm(TIndex, dl);
5330       SDValue Base = Node->getOperand(3);
5331       SDValue Scale = getI8Imm(1, dl);
5332       SDValue Index = Node->getOperand(4);
5333       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5334       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5335       SDValue Chain = Node->getOperand(0);
5336       MachineSDNode *CNode;
5337       if (Opc == X86::PTILESTORED) {
5338         SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5339         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5340       } else {
5341         SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5342         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5343       }
5344       ReplaceNode(Node, CNode);
5345       return;
5346     }
5347     case Intrinsic::x86_t2rpntlvwz0rs:
5348     case Intrinsic::x86_t2rpntlvwz0rst1:
5349     case Intrinsic::x86_t2rpntlvwz1rs:
5350     case Intrinsic::x86_t2rpntlvwz1rst1:
5351       if (!Subtarget->hasAMXMOVRS())
5352         break;
5353       [[fallthrough]];
5354     case Intrinsic::x86_t2rpntlvwz0:
5355     case Intrinsic::x86_t2rpntlvwz0t1:
5356     case Intrinsic::x86_t2rpntlvwz1:
5357     case Intrinsic::x86_t2rpntlvwz1t1: {
5358       if (!Subtarget->hasAMXTRANSPOSE())
5359         break;
5360       auto *MFI =
5361           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5362       MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5363       unsigned Opc;
5364       switch (IntNo) {
5365       default:
5366         llvm_unreachable("Unexpected intrinsic!");
5367       case Intrinsic::x86_t2rpntlvwz0:
5368         Opc = X86::PT2RPNTLVWZ0;
5369         break;
5370       case Intrinsic::x86_t2rpntlvwz0t1:
5371         Opc = X86::PT2RPNTLVWZ0T1;
5372         break;
5373       case Intrinsic::x86_t2rpntlvwz1:
5374         Opc = X86::PT2RPNTLVWZ1;
5375         break;
5376       case Intrinsic::x86_t2rpntlvwz1t1:
5377         Opc = X86::PT2RPNTLVWZ1T1;
5378         break;
5379       case Intrinsic::x86_t2rpntlvwz0rs:
5380         Opc = X86::PT2RPNTLVWZ0RS;
5381         break;
5382       case Intrinsic::x86_t2rpntlvwz0rst1:
5383         Opc = X86::PT2RPNTLVWZ0RST1;
5384         break;
5385       case Intrinsic::x86_t2rpntlvwz1rs:
5386         Opc = X86::PT2RPNTLVWZ1RS;
5387         break;
5388       case Intrinsic::x86_t2rpntlvwz1rst1:
5389         Opc = X86::PT2RPNTLVWZ1RST1;
5390         break;
5391       }
5392       // FIXME: Match displacement and scale.
5393       unsigned TIndex = Node->getConstantOperandVal(2);
5394       SDValue TReg = getI8Imm(TIndex, dl);
5395       SDValue Base = Node->getOperand(3);
5396       SDValue Scale = getI8Imm(1, dl);
5397       SDValue Index = Node->getOperand(4);
5398       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5399       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5400       SDValue Chain = Node->getOperand(0);
5401       SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5402       MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5403       ReplaceNode(Node, CNode);
5404       return;
5405     }
5406     }
5407     break;
5408   }
5409   case ISD::BRIND:
5410   case X86ISD::NT_BRIND: {
5411     if (Subtarget->isTargetNaCl())
5412       // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
5413       // leave the instruction alone.
5414       break;
5415     if (Subtarget->isTarget64BitILP32()) {
5416       // Converts a 32-bit register to a 64-bit, zero-extended version of
5417       // it. This is needed because x86-64 can do many things, but jmp %r32
5418       // ain't one of them.
5419       SDValue Target = Node->getOperand(1);
5420       assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5421       SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5422       SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5423                                       Node->getOperand(0), ZextTarget);
5424       ReplaceNode(Node, Brind.getNode());
5425       SelectCode(ZextTarget.getNode());
5426       SelectCode(Brind.getNode());
5427       return;
5428     }
5429     break;
5430   }
5431   case X86ISD::GlobalBaseReg:
5432     ReplaceNode(Node, getGlobalBaseReg());
5433     return;
5434 
5435   case ISD::BITCAST:
5436     // Just drop all 128/256/512-bit bitcasts.
5437     if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5438         NVT == MVT::f128) {
5439       ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5440       CurDAG->RemoveDeadNode(Node);
5441       return;
5442     }
5443     break;
5444 
5445   case ISD::SRL:
5446     if (matchBitExtract(Node))
5447       return;
5448     [[fallthrough]];
5449   case ISD::SRA:
5450   case ISD::SHL:
5451     if (tryShiftAmountMod(Node))
5452       return;
5453     break;
5454 
5455   case X86ISD::VPTERNLOG: {
5456     uint8_t Imm = Node->getConstantOperandVal(3);
5457     if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5458                        Node->getOperand(1), Node->getOperand(2), Imm))
5459       return;
5460     break;
5461   }
5462 
5463   case X86ISD::ANDNP:
5464     if (tryVPTERNLOG(Node))
5465       return;
5466     break;
5467 
5468   case ISD::AND:
5469     if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5470       // Try to form a masked VPTESTM. Operands can be in either order.
5471       SDValue N0 = Node->getOperand(0);
5472       SDValue N1 = Node->getOperand(1);
5473       if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5474           tryVPTESTM(Node, N0, N1))
5475         return;
5476       if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5477           tryVPTESTM(Node, N1, N0))
5478         return;
5479     }
5480 
5481     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5482       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5483       CurDAG->RemoveDeadNode(Node);
5484       return;
5485     }
5486     if (matchBitExtract(Node))
5487       return;
5488     if (AndImmShrink && shrinkAndImmediate(Node))
5489       return;
5490 
5491     [[fallthrough]];
5492   case ISD::OR:
5493   case ISD::XOR:
5494     if (tryShrinkShlLogicImm(Node))
5495       return;
5496     if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5497       return;
5498     if (tryVPTERNLOG(Node))
5499       return;
5500 
5501     [[fallthrough]];
5502   case ISD::ADD:
5503     if (Opcode == ISD::ADD && matchBitExtract(Node))
5504       return;
5505     [[fallthrough]];
5506   case ISD::SUB: {
5507     // Try to avoid folding immediates with multiple uses for optsize.
5508     // This code tries to select to register form directly to avoid going
5509     // through the isel table which might fold the immediate. We can't change
5510     // the add/sub/and/or/xor-with-immediate patterns in the
5511     // tablegen files to check immediate use count without making the patterns
5512     // unavailable to the fast-isel table.
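         // For example, under optsize, if the same 32-bit immediate feeds
         // several adds, the register form lets the constant be materialized
         // once instead of encoding an imm32 at every use.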
5513     if (!CurDAG->shouldOptForSize())
5514       break;
5515 
5516     // Only handle i8/i16/i32/i64.
5517     if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5518       break;
5519 
5520     SDValue N0 = Node->getOperand(0);
5521     SDValue N1 = Node->getOperand(1);
5522 
5523     auto *Cst = dyn_cast<ConstantSDNode>(N1);
5524     if (!Cst)
5525       break;
5526 
5527     int64_t Val = Cst->getSExtValue();
5528 
5529     // Make sure it's an immediate that is considered foldable.
5530     // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5531     if (!isInt<8>(Val) && !isInt<32>(Val))
5532       break;
5533 
5534     // If this can match to INC/DEC, let it go.
5535     if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5536       break;
5537 
5538     // Check if we should avoid folding this immediate.
5539     if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5540       break;
5541 
5542     // We should not fold the immediate. So we need a register form instead.
5543     unsigned ROpc, MOpc;
5544     switch (NVT.SimpleTy) {
5545     default: llvm_unreachable("Unexpected VT!");
5546     case MVT::i8:
5547       switch (Opcode) {
5548       default: llvm_unreachable("Unexpected opcode!");
5549       case ISD::ADD:
5550         ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5551         MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5552         break;
5553       case ISD::SUB:
5554         ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5555         MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5556         break;
5557       case ISD::AND:
5558         ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5559         MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5560         break;
5561       case ISD::OR:
5562         ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5563         MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5564         break;
5565       case ISD::XOR:
5566         ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5567         MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5568         break;
5569       }
5570       break;
5571     case MVT::i16:
5572       switch (Opcode) {
5573       default: llvm_unreachable("Unexpected opcode!");
5574       case ISD::ADD:
5575         ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5576         MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5577         break;
5578       case ISD::SUB:
5579         ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5580         MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5581         break;
5582       case ISD::AND:
5583         ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5584         MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5585         break;
5586       case ISD::OR:
5587         ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5588         MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5589         break;
5590       case ISD::XOR:
5591         ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5592         MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5593         break;
5594       }
5595       break;
5596     case MVT::i32:
5597       switch (Opcode) {
5598       default: llvm_unreachable("Unexpected opcode!");
5599       case ISD::ADD:
5600         ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5601         MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5602         break;
5603       case ISD::SUB:
5604         ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5605         MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5606         break;
5607       case ISD::AND:
5608         ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5609         MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5610         break;
5611       case ISD::OR:
5612         ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5613         MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5614         break;
5615       case ISD::XOR:
5616         ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5617         MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5618         break;
5619       }
5620       break;
5621     case MVT::i64:
5622       switch (Opcode) {
5623       default: llvm_unreachable("Unexpected opcode!");
5624       case ISD::ADD:
5625         ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5626         MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5627         break;
5628       case ISD::SUB:
5629         ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5630         MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5631         break;
5632       case ISD::AND:
5633         ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5634         MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5635         break;
5636       case ISD::OR:
5637         ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5638         MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5639         break;
5640       case ISD::XOR:
5641         ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5642         MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5643         break;
5644       }
5645       break;
5646     }
5647 
5648     // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5649 
5650     // If this is not a subtract, we can still try to fold a load.
5651     if (Opcode != ISD::SUB) {
5652       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5653       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5654         SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5655         SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5656         MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5657         // Update the chain.
5658         ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5659         // Record the mem-refs
5660         CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5661         ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5662         CurDAG->RemoveDeadNode(Node);
5663         return;
5664       }
5665     }
5666 
5667     CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5668     return;
5669   }
5670 
5671   case X86ISD::SMUL:
5672     // i16/i32/i64 are handled with isel patterns.
5673     if (NVT != MVT::i8)
5674       break;
5675     [[fallthrough]];
5676   case X86ISD::UMUL: {
5677     SDValue N0 = Node->getOperand(0);
5678     SDValue N1 = Node->getOperand(1);
5679 
5680     unsigned LoReg, ROpc, MOpc;
5681     switch (NVT.SimpleTy) {
5682     default: llvm_unreachable("Unsupported VT!");
5683     case MVT::i8:
5684       LoReg = X86::AL;
5685       ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5686       MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5687       break;
5688     case MVT::i16:
5689       LoReg = X86::AX;
5690       ROpc = X86::MUL16r;
5691       MOpc = X86::MUL16m;
5692       break;
5693     case MVT::i32:
5694       LoReg = X86::EAX;
5695       ROpc = X86::MUL32r;
5696       MOpc = X86::MUL32m;
5697       break;
5698     case MVT::i64:
5699       LoReg = X86::RAX;
5700       ROpc = X86::MUL64r;
5701       MOpc = X86::MUL64m;
5702       break;
5703     }
5704 
5705     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5706     bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5707     // Multiply is commutative.
5708     if (!FoldedLoad) {
5709       FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5710       if (FoldedLoad)
5711         std::swap(N0, N1);
5712     }
5713 
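         // One-operand MUL/IMUL multiplies by the accumulator (AL/AX/EAX/RAX),
         // so copy N0 into LoReg and glue it to the multiply; N1 becomes the
         // explicit register or memory operand.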
5714     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5715                                           N0, SDValue()).getValue(1);
5716 
5717     MachineSDNode *CNode;
5718     if (FoldedLoad) {
5719       // i16/i32/i64 use an instruction that produces a low and high result even
5720       // though only the low result is used.
5721       SDVTList VTs;
5722       if (NVT == MVT::i8)
5723         VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5724       else
5725         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5726 
5727       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5728                         InGlue };
5729       CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5730 
5731       // Update the chain.
5732       ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5733       // Record the mem-refs
5734       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5735     } else {
5736       // i16/i32/i64 use an instruction that produces a low and high result even
5737       // though only the low result is used.
5738       SDVTList VTs;
5739       if (NVT == MVT::i8)
5740         VTs = CurDAG->getVTList(NVT, MVT::i32);
5741       else
5742         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5743 
5744       CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5745     }
5746 
5747     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5748     ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5749     CurDAG->RemoveDeadNode(Node);
5750     return;
5751   }
5752 
5753   case ISD::SMUL_LOHI:
5754   case ISD::UMUL_LOHI: {
5755     SDValue N0 = Node->getOperand(0);
5756     SDValue N1 = Node->getOperand(1);
5757 
5758     unsigned Opc, MOpc;
5759     unsigned LoReg, HiReg;
5760     bool IsSigned = Opcode == ISD::SMUL_LOHI;
5761     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5762     bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
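         // MULX (BMI2) reads one source implicitly from EDX/RDX, writes two
         // explicit destinations and does not touch EFLAGS; the MULX*H opcodes
         // are used when only the high half is live. This is also why LoReg is
         // remapped to EDX/RDX below when MULX is selected.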
5763     switch (NVT.SimpleTy) {
5764     default: llvm_unreachable("Unsupported VT!");
5765     case MVT::i32:
5766       Opc = UseMULXHi  ? X86::MULX32Hrr
5767             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5768             : IsSigned ? X86::IMUL32r
5769                        : X86::MUL32r;
5770       MOpc = UseMULXHi  ? X86::MULX32Hrm
5771              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5772              : IsSigned ? X86::IMUL32m
5773                         : X86::MUL32m;
5774       LoReg = UseMULX ? X86::EDX : X86::EAX;
5775       HiReg = X86::EDX;
5776       break;
5777     case MVT::i64:
5778       Opc = UseMULXHi  ? X86::MULX64Hrr
5779             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5780             : IsSigned ? X86::IMUL64r
5781                        : X86::MUL64r;
5782       MOpc = UseMULXHi  ? X86::MULX64Hrm
5783              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5784              : IsSigned ? X86::IMUL64m
5785                         : X86::MUL64m;
5786       LoReg = UseMULX ? X86::RDX : X86::RAX;
5787       HiReg = X86::RDX;
5788       break;
5789     }
5790 
5791     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5792     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5793     // Multiply is commutative.
5794     if (!foldedLoad) {
5795       foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5796       if (foldedLoad)
5797         std::swap(N0, N1);
5798     }
5799 
5800     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5801                                           N0, SDValue()).getValue(1);
5802     SDValue ResHi, ResLo;
5803     if (foldedLoad) {
5804       SDValue Chain;
5805       MachineSDNode *CNode = nullptr;
5806       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5807                         InGlue };
5808       if (UseMULXHi) {
5809         SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5810         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5811         ResHi = SDValue(CNode, 0);
5812         Chain = SDValue(CNode, 1);
5813       } else if (UseMULX) {
5814         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5815         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5816         ResHi = SDValue(CNode, 0);
5817         ResLo = SDValue(CNode, 1);
5818         Chain = SDValue(CNode, 2);
5819       } else {
5820         SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5821         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5822         Chain = SDValue(CNode, 0);
5823         InGlue = SDValue(CNode, 1);
5824       }
5825 
5826       // Update the chain.
5827       ReplaceUses(N1.getValue(1), Chain);
5828       // Record the mem-refs
5829       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5830     } else {
5831       SDValue Ops[] = { N1, InGlue };
5832       if (UseMULXHi) {
5833         SDVTList VTs = CurDAG->getVTList(NVT);
5834         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5835         ResHi = SDValue(CNode, 0);
5836       } else if (UseMULX) {
5837         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5838         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5839         ResHi = SDValue(CNode, 0);
5840         ResLo = SDValue(CNode, 1);
5841       } else {
5842         SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5843         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5844         InGlue = SDValue(CNode, 0);
5845       }
5846     }
5847 
5848     // Copy the low half of the result, if it is needed.
5849     if (!SDValue(Node, 0).use_empty()) {
5850       if (!ResLo) {
5851         assert(LoReg && "Register for low half is not defined!");
5852         ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5853                                        NVT, InGlue);
5854         InGlue = ResLo.getValue(2);
5855       }
5856       ReplaceUses(SDValue(Node, 0), ResLo);
5857       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5858                  dbgs() << '\n');
5859     }
5860     // Copy the high half of the result, if it is needed.
5861     if (!SDValue(Node, 1).use_empty()) {
5862       if (!ResHi) {
5863         assert(HiReg && "Register for high half is not defined!");
5864         ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5865                                        NVT, InGlue);
5866         InGlue = ResHi.getValue(2);
5867       }
5868       ReplaceUses(SDValue(Node, 1), ResHi);
5869       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5870                  dbgs() << '\n');
5871     }
5872 
5873     CurDAG->RemoveDeadNode(Node);
5874     return;
5875   }
5876 
5877   case ISD::SDIVREM:
5878   case ISD::UDIVREM: {
5879     SDValue N0 = Node->getOperand(0);
5880     SDValue N1 = Node->getOperand(1);
5881 
5882     unsigned ROpc, MOpc;
5883     bool isSigned = Opcode == ISD::SDIVREM;
5884     if (!isSigned) {
5885       switch (NVT.SimpleTy) {
5886       default: llvm_unreachable("Unsupported VT!");
5887       case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
5888       case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5889       case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5890       case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5891       }
5892     } else {
5893       switch (NVT.SimpleTy) {
5894       default: llvm_unreachable("Unsupported VT!");
5895       case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
5896       case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5897       case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5898       case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5899       }
5900     }
5901 
5902     unsigned LoReg, HiReg, ClrReg;
5903     unsigned SExtOpcode;
5904     switch (NVT.SimpleTy) {
5905     default: llvm_unreachable("Unsupported VT!");
5906     case MVT::i8:
5907       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
5908       SExtOpcode = 0; // Not used.
5909       break;
5910     case MVT::i16:
5911       LoReg = X86::AX;  HiReg = X86::DX;
5912       ClrReg = X86::DX;
5913       SExtOpcode = X86::CWD;
5914       break;
5915     case MVT::i32:
5916       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5917       SExtOpcode = X86::CDQ;
5918       break;
5919     case MVT::i64:
5920       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5921       SExtOpcode = X86::CQO;
5922       break;
5923     }
5924 
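         // DIV/IDIV divide the double-width dividend held in AH:AL, DX:AX,
         // EDX:EAX or RDX:RAX by the operand, leaving the quotient in the low
         // register and the remainder in the high one, so N0 is first widened
         // into that register pair below.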
5925     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5926     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5927     bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5928 
5929     SDValue InGlue;
5930     if (NVT == MVT::i8) {
5931       // Special case for div8, just use a move with zero extension to AX to
5932       // clear the upper 8 bits (AH).
5933       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5934       MachineSDNode *Move;
5935       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5936         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5937         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5938                                                     : X86::MOVZX16rm8;
5939         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5940         Chain = SDValue(Move, 1);
5941         ReplaceUses(N0.getValue(1), Chain);
5942         // Record the mem-refs
5943         CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5944       } else {
5945         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5946                                                     : X86::MOVZX16rr8;
5947         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5948         Chain = CurDAG->getEntryNode();
5949       }
5950       Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5951                                     SDValue());
5952       InGlue = Chain.getValue(1);
5953     } else {
5954       InGlue =
5955         CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5956                              LoReg, N0, SDValue()).getValue(1);
5957       if (isSigned && !signBitIsZero) {
5958         // Sign extend the low part into the high part.
5959         InGlue =
5960           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5961       } else {
5962         // Zero out the high part, effectively zero extending the input.
5963         SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5964         SDValue ClrNode =
5965             SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5966         switch (NVT.SimpleTy) {
5967         case MVT::i16:
5968           ClrNode =
5969               SDValue(CurDAG->getMachineNode(
5970                           TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5971                           CurDAG->getTargetConstant(X86::sub_16bit, dl,
5972                                                     MVT::i32)),
5973                       0);
5974           break;
5975         case MVT::i32:
5976           break;
5977         case MVT::i64:
5978           ClrNode =
5979               SDValue(CurDAG->getMachineNode(
5980                           TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5981                           CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5982                           CurDAG->getTargetConstant(X86::sub_32bit, dl,
5983                                                     MVT::i32)),
5984                       0);
5985           break;
5986         default:
5987           llvm_unreachable("Unexpected division source");
5988         }
5989 
5990         InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5991                                       ClrNode, InGlue).getValue(1);
5992       }
5993     }
5994 
5995     if (foldedLoad) {
5996       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5997                         InGlue };
5998       MachineSDNode *CNode =
5999         CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
6000       InGlue = SDValue(CNode, 1);
6001       // Update the chain.
6002       ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
6003       // Record the mem-refs
6004       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6005     } else {
6006       InGlue =
6007         SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6008     }
6009 
6010     // Prevent use of AH in a REX instruction by explicitly copying it to
6011     // an ABCD_L register.
6012     //
6013     // The current assumption of the register allocator is that isel
6014     // won't generate explicit references to the GR8_ABCD_H registers. If
6015     // the allocator and/or the backend get enhanced to be more robust in
6016     // that regard, this can be, and should be, removed.
6017     if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6018       SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6019       unsigned AHExtOpcode =
6020           isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6021 
6022       SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6023                                              MVT::Glue, AHCopy, InGlue);
6024       SDValue Result(RNode, 0);
6025       InGlue = SDValue(RNode, 1);
6026 
6027       Result =
6028           CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6029 
6030       ReplaceUses(SDValue(Node, 1), Result);
6031       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6032                  dbgs() << '\n');
6033     }
6034     // Copy the division (low) result, if it is needed.
6035     if (!SDValue(Node, 0).use_empty()) {
6036       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6037                                                 LoReg, NVT, InGlue);
6038       InGlue = Result.getValue(2);
6039       ReplaceUses(SDValue(Node, 0), Result);
6040       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6041                  dbgs() << '\n');
6042     }
6043     // Copy the remainder (high) result, if it is needed.
6044     if (!SDValue(Node, 1).use_empty()) {
6045       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6046                                               HiReg, NVT, InGlue);
6047       InGlue = Result.getValue(2);
6048       ReplaceUses(SDValue(Node, 1), Result);
6049       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6050                  dbgs() << '\n');
6051     }
6052     CurDAG->RemoveDeadNode(Node);
6053     return;
6054   }
6055 
6056   case X86ISD::FCMP:
6057   case X86ISD::STRICT_FCMP:
6058   case X86ISD::STRICT_FCMPS: {
6059     bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6060                        Node->getOpcode() == X86ISD::STRICT_FCMPS;
6061     SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6062     SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6063 
6064     // Save the original VT of the compare.
6065     MVT CmpVT = N0.getSimpleValueType();
6066 
6067     // Floating point needs special handling if we don't have FCOMI.
6068     if (Subtarget->canUseCMOV())
6069       break;
6070 
6071     bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6072 
6073     unsigned Opc;
6074     switch (CmpVT.SimpleTy) {
6075     default: llvm_unreachable("Unexpected type!");
6076     case MVT::f32:
6077       Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6078       break;
6079     case MVT::f64:
6080       Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6081       break;
6082     case MVT::f80:
6083       Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6084       break;
6085     }
6086 
6087     SDValue Chain =
6088         IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6089     SDValue Glue;
6090     if (IsStrictCmp) {
6091       SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6092       Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6093       Glue = Chain.getValue(1);
6094     } else {
6095       Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6096     }
6097 
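         // Classic pre-FCOMI sequence: the x87 condition codes C0/C2/C3 live in
         // bits 8-14 of the FPU status word, so store the status word to AX,
         // move its high byte into AH, and let SAHF transfer C0/C2/C3 into
         // CF/PF/ZF.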
6098     // Move FPSW to AX.
6099     SDValue FNSTSW =
6100         SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6101 
6102     // Extract upper 8-bits of AX.
6103     SDValue Extract =
6104         CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6105 
6106     // Move AH into flags.
6107     // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6108     assert(Subtarget->canUseLAHFSAHF() &&
6109            "Target doesn't support SAHF or FCOMI?");
6110     SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6111     Chain = AH;
6112     SDValue SAHF = SDValue(
6113         CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6114 
6115     if (IsStrictCmp)
6116       ReplaceUses(SDValue(Node, 1), Chain);
6117 
6118     ReplaceUses(SDValue(Node, 0), SAHF);
6119     CurDAG->RemoveDeadNode(Node);
6120     return;
6121   }
6122 
6123   case X86ISD::CMP: {
6124     SDValue N0 = Node->getOperand(0);
6125     SDValue N1 = Node->getOperand(1);
6126 
6127     // Optimizations for TEST compares.
6128     if (!isNullConstant(N1))
6129       break;
6130 
6131     // Save the original VT of the compare.
6132     MVT CmpVT = N0.getSimpleValueType();
6133 
6134     // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6135     // by a test instruction. The test should be removed later by
6136     // analyzeCompare if we are using only the zero flag.
6137     // TODO: Should we check the users and use the BEXTR flags directly?
6138     if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6139       if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6140         unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6141                                              : X86::TEST32rr;
6142         SDValue BEXTR = SDValue(NewNode, 0);
6143         NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6144         ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6145         CurDAG->RemoveDeadNode(Node);
6146         return;
6147       }
6148     }
6149 
6150     // We can peek through truncates, but we need to be careful below.
6151     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6152       N0 = N0.getOperand(0);
6153 
6154     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6155     // use a smaller encoding.
6156     // Look past the truncate if CMP is the only use of it.
6157     if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6158         N0.getValueType() != MVT::i8) {
6159       auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6160       if (!MaskC)
6161         break;
6162 
6163       // We may have looked through a truncate so mask off any bits that
6164       // shouldn't be part of the compare.
6165       uint64_t Mask = MaskC->getZExtValue();
6166       Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6167 
6168       // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6169       // for masks like 0xFF000000 or 0x00FFFFFF when we only care about the
6170       // zero flag.
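           // For example, testing x & 0xFFFF00000000 for zero can become a
           // 'shrq $32' of x followed by a 'testw' on the 16-bit subregister,
           // avoiding a movabsq of the mask.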
6171       if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6172           onlyUsesZeroFlag(SDValue(Node, 0))) {
6173         unsigned ShiftOpcode = ISD::DELETED_NODE;
6174         unsigned ShiftAmt;
6175         unsigned SubRegIdx;
6176         MVT SubRegVT;
6177         unsigned TestOpcode;
6178         unsigned LeadingZeros = llvm::countl_zero(Mask);
6179         unsigned TrailingZeros = llvm::countr_zero(Mask);
6180 
6181         // With leading/trailing zeros, the transform is profitable if we can
6182         // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6183         // incurring any extra register moves.
6184         bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6185         if (LeadingZeros == 0 && SavesBytes) {
6186           // If the mask covers the most significant bit, then we can replace
6187           // TEST+AND with a SHR and check eflags.
6188           // This emits a redundant TEST which is subsequently eliminated.
6189           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6190           ShiftAmt = TrailingZeros;
6191           SubRegIdx = 0;
6192           TestOpcode = X86::TEST64rr;
6193         } else if (TrailingZeros == 0 && SavesBytes) {
6194           // If the mask covers the least significant bit, then we can replace
6195           // TEST+AND with a SHL and check eflags.
6196           // This emits a redundant TEST which is subsequently eliminated.
6197           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6198           ShiftAmt = LeadingZeros;
6199           SubRegIdx = 0;
6200           TestOpcode = X86::TEST64rr;
6201         } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6202           // If the shifted mask extends into the high half and is 8/16/32 bits
6203           // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6204           unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6205           if (PopCount == 8) {
6206             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6207             ShiftAmt = TrailingZeros;
6208             SubRegIdx = X86::sub_8bit;
6209             SubRegVT = MVT::i8;
6210             TestOpcode = X86::TEST8rr;
6211           } else if (PopCount == 16) {
6212             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6213             ShiftAmt = TrailingZeros;
6214             SubRegIdx = X86::sub_16bit;
6215             SubRegVT = MVT::i16;
6216             TestOpcode = X86::TEST16rr;
6217           } else if (PopCount == 32) {
6218             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6219             ShiftAmt = TrailingZeros;
6220             SubRegIdx = X86::sub_32bit;
6221             SubRegVT = MVT::i32;
6222             TestOpcode = X86::TEST32rr;
6223           }
6224         }
6225         if (ShiftOpcode != ISD::DELETED_NODE) {
6226           SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6227           SDValue Shift = SDValue(
6228               CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6229                                      N0.getOperand(0), ShiftC),
6230               0);
6231           if (SubRegIdx != 0) {
6232             Shift =
6233                 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6234           }
6235           MachineSDNode *Test =
6236               CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6237           ReplaceNode(Node, Test);
6238           return;
6239         }
6240       }
6241 
6242       MVT VT;
6243       int SubRegOp;
6244       unsigned ROpc, MOpc;
6245 
6246       // For each of these checks we need to be careful if the sign flag is
6247       // being used. It is only safe to use the sign flag in two cases:
6248       // either the sign bit in the shrunken mask is zero or the final test
6249       // size is equal to the original compare size.
6250 
6251       if (isUInt<8>(Mask) &&
6252           (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6253            hasNoSignFlagUses(SDValue(Node, 0)))) {
6254         // For example, convert "testl %eax, $8" to "testb %al, $8"
6255         VT = MVT::i8;
6256         SubRegOp = X86::sub_8bit;
6257         ROpc = X86::TEST8ri;
6258         MOpc = X86::TEST8mi;
6259       } else if (OptForMinSize && isUInt<16>(Mask) &&
6260                  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6261                   hasNoSignFlagUses(SDValue(Node, 0)))) {
6262         // For example, "testl %eax, $32776" to "testw %ax, $32776".
6263         // NOTE: We only want to form TESTW instructions if optimizing for
6264         // min size. Otherwise we only save one byte and possibly get a length
6265         // changing prefix penalty in the decoders.
6266         VT = MVT::i16;
6267         SubRegOp = X86::sub_16bit;
6268         ROpc = X86::TEST16ri;
6269         MOpc = X86::TEST16mi;
6270       } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6271                  ((!(Mask & 0x80000000) &&
6272                    // Without minsize 16-bit Cmps can get here so we need to
6273                    // be sure we calculate the correct sign flag if needed.
6274                    (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6275                   CmpVT == MVT::i32 ||
6276                   hasNoSignFlagUses(SDValue(Node, 0)))) {
6277         // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6278         // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6279         // Otherwise, we find ourselves in a position where we have to do
6280         // promotion. If previous passes did not promote the and, we assume
6281         // they had a good reason not to and do not promote here.
6282         VT = MVT::i32;
6283         SubRegOp = X86::sub_32bit;
6284         ROpc = X86::TEST32ri;
6285         MOpc = X86::TEST32mi;
6286       } else {
6287         // No eligible transformation was found.
6288         break;
6289       }
6290 
6291       SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6292       SDValue Reg = N0.getOperand(0);
6293 
6294       // Emit a testl or testw.
6295       MachineSDNode *NewNode;
6296       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6297       if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6298         if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6299           if (!LoadN->isSimple()) {
6300             unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6301             if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6302                 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6303                 (MOpc == X86::TEST32mi && NumVolBits != 32))
6304               break;
6305           }
6306         }
6307         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6308                           Reg.getOperand(0) };
6309         NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6310         // Update the chain.
6311         ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6312         // Record the mem-refs
6313         CurDAG->setNodeMemRefs(NewNode,
6314                                {cast<LoadSDNode>(Reg)->getMemOperand()});
6315       } else {
6316         // Extract the subregister if necessary.
6317         if (N0.getValueType() != VT)
6318           Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6319 
6320         NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6321       }
6322       // Replace CMP with TEST.
6323       ReplaceNode(Node, NewNode);
6324       return;
6325     }
6326     break;
6327   }
6328   case X86ISD::PCMPISTR: {
6329     if (!Subtarget->hasSSE42())
6330       break;
6331 
6332     bool NeedIndex = !SDValue(Node, 0).use_empty();
6333     bool NeedMask = !SDValue(Node, 1).use_empty();
6334     // We can't fold a load if we are going to make two instructions.
6335     bool MayFoldLoad = !NeedIndex || !NeedMask;
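         // PCMPISTRI leaves its index in ECX and PCMPISTRM leaves its mask in
         // XMM0, so two instructions are needed when both results are live; in
         // that case the memory operand is not folded, to avoid duplicating the
         // load.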
6336 
6337     MachineSDNode *CNode;
6338     if (NeedMask) {
6339       unsigned ROpc =
6340           Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6341       unsigned MOpc =
6342           Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6343       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6344       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6345     }
6346     if (NeedIndex || !NeedMask) {
6347       unsigned ROpc =
6348           Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6349       unsigned MOpc =
6350           Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6351       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6352       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6353     }
6354 
6355     // Connect the flag usage to the last instruction created.
6356     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6357     CurDAG->RemoveDeadNode(Node);
6358     return;
6359   }
6360   case X86ISD::PCMPESTR: {
6361     if (!Subtarget->hasSSE42())
6362       break;
6363 
6364     // Copy the two implicit register inputs.
6365     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6366                                           Node->getOperand(1),
6367                                           SDValue()).getValue(1);
6368     InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6369                                   Node->getOperand(3), InGlue).getValue(1);
6370 
6371     bool NeedIndex = !SDValue(Node, 0).use_empty();
6372     bool NeedMask = !SDValue(Node, 1).use_empty();
6373     // We can't fold a load if we are going to make two instructions.
6374     bool MayFoldLoad = !NeedIndex || !NeedMask;
6375 
6376     MachineSDNode *CNode;
6377     if (NeedMask) {
6378       unsigned ROpc =
6379           Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6380       unsigned MOpc =
6381           Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6382       CNode =
6383           emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6384       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6385     }
6386     if (NeedIndex || !NeedMask) {
6387       unsigned ROpc =
6388           Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6389       unsigned MOpc =
6390           Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6391       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6392       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6393     }
6394     // Connect the flag usage to the last instruction created.
6395     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6396     CurDAG->RemoveDeadNode(Node);
6397     return;
6398   }
6399 
6400   case ISD::SETCC: {
6401     if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6402       return;
6403 
6404     break;
6405   }
6406 
6407   case ISD::STORE:
6408     if (foldLoadStoreIntoMemOperand(Node))
6409       return;
6410     break;
6411 
6412   case X86ISD::SETCC_CARRY: {
6413     MVT VT = Node->getSimpleValueType(0);
6414     SDValue Result;
6415     if (Subtarget->hasSBBDepBreaking()) {
6416       // We have to do this manually because tblgen will put the eflags copy in
6417       // the wrong place if we use an extract_subreg in the pattern.
6418       // Copy flags to the EFLAGS register and glue it to next node.
6419       SDValue EFLAGS =
6420           CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6421                                Node->getOperand(1), SDValue());
6422 
6423       // Create a 64-bit instruction if the result is 64-bits otherwise use the
6424       // 32-bit version.
6425       unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6426       MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
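           // The SETB_C pseudos later expand to 'sbb reg, reg', which yields 0
           // or all-ones from the carry flag in a single instruction.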
6427       Result = SDValue(
6428           CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6429           0);
6430     } else {
6431       // The target does not recognize sbb with the same reg operand as a
6432       // no-source idiom, so we explicitly zero the input values.
6433       Result = getSBBZero(Node);
6434     }
6435 
6436     // For less than 32-bits we need to extract from the 32-bit node.
6437     if (VT == MVT::i8 || VT == MVT::i16) {
6438       int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6439       Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6440     }
6441 
6442     ReplaceUses(SDValue(Node, 0), Result);
6443     CurDAG->RemoveDeadNode(Node);
6444     return;
6445   }
6446   case X86ISD::SBB: {
6447     if (isNullConstant(Node->getOperand(0)) &&
6448         isNullConstant(Node->getOperand(1))) {
6449       SDValue Result = getSBBZero(Node);
6450 
6451       // Replace the flag use.
6452       ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6453 
6454       // Replace the result use.
6455       if (!SDValue(Node, 0).use_empty()) {
6456         // For less than 32-bits we need to extract from the 32-bit node.
6457         MVT VT = Node->getSimpleValueType(0);
6458         if (VT == MVT::i8 || VT == MVT::i16) {
6459           int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6460           Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6461         }
6462         ReplaceUses(SDValue(Node, 0), Result);
6463       }
6464 
6465       CurDAG->RemoveDeadNode(Node);
6466       return;
6467     }
6468     break;
6469   }
6470   case X86ISD::MGATHER: {
6471     auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6472     SDValue IndexOp = Mgt->getIndex();
6473     SDValue Mask = Mgt->getMask();
6474     MVT IndexVT = IndexOp.getSimpleValueType();
6475     MVT ValueVT = Node->getSimpleValueType(0);
6476     MVT MaskVT = Mask.getSimpleValueType();
6477 
6478     // This is just to prevent crashes if the nodes are malformed somehow. We're
6479     // otherwise only doing loose type checking in here, based on what a type
6480     // constraint would say, just like table-based isel.
6481     if (!ValueVT.isVector() || !MaskVT.isVector())
6482       break;
6483 
6484     unsigned NumElts = ValueVT.getVectorNumElements();
6485     MVT ValueSVT = ValueVT.getVectorElementType();
6486 
6487     bool IsFP = ValueSVT.isFloatingPoint();
6488     unsigned EltSize = ValueSVT.getSizeInBits();
6489 
6490     unsigned Opc = 0;
6491     bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
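         // AVX-512 gathers take a k-register (vXi1) mask; the AVX/AVX2 forms
         // take a vector mask as wide as the data and also write the (cleared)
         // mask back, hence the extra MaskVT result in the VT list below.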
6492     if (AVX512Gather) {
6493       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6494         Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6495       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6496         Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6497       else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6498         Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6499       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6500         Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6501       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6502         Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6503       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6504         Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6505       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6506         Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6507       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6508         Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6509       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6510         Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6511       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6512         Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6513       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6514         Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6515       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6516         Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6517     } else {
6518       assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6519              "Unexpected mask VT!");
6520       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6521         Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6522       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6523         Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6524       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6525         Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6526       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6527         Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6528       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6529         Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6530       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6531         Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6532       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6533         Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6534       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6535         Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6536     }
6537 
6538     if (!Opc)
6539       break;
6540 
6541     SDValue Base, Scale, Index, Disp, Segment;
6542     if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6543                           Base, Scale, Index, Disp, Segment))
6544       break;
6545 
6546     SDValue PassThru = Mgt->getPassThru();
6547     SDValue Chain = Mgt->getChain();
6548     // Gather instructions have a mask output not in the ISD node.
6549     SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6550 
6551     MachineSDNode *NewNode;
6552     if (AVX512Gather) {
6553       SDValue Ops[] = {PassThru, Mask, Base,    Scale,
6554                        Index,    Disp, Segment, Chain};
6555       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6556     } else {
6557       SDValue Ops[] = {PassThru, Base,    Scale, Index,
6558                        Disp,     Segment, Mask,  Chain};
6559       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6560     }
6561     CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6562     ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6563     ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6564     CurDAG->RemoveDeadNode(Node);
6565     return;
6566   }
6567   case X86ISD::MSCATTER: {
6568     auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6569     SDValue Value = Sc->getValue();
6570     SDValue IndexOp = Sc->getIndex();
6571     MVT IndexVT = IndexOp.getSimpleValueType();
6572     MVT ValueVT = Value.getSimpleValueType();
6573 
6574     // This is just to prevent crashes if the nodes are malformed somehow. We're
6575     // otherwise only doing loose type checking in here, based on what a type
6576     // constraint would say, just like table-based isel.
6577     if (!ValueVT.isVector())
6578       break;
6579 
6580     unsigned NumElts = ValueVT.getVectorNumElements();
6581     MVT ValueSVT = ValueVT.getVectorElementType();
6582 
6583     bool IsFP = ValueSVT.isFloatingPoint();
6584     unsigned EltSize = ValueSVT.getSizeInBits();
6585 
6586     unsigned Opc;
6587     if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6588       Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6589     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6590       Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6591     else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6592       Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6593     else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6594       Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6595     else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6596       Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6597     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6598       Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6599     else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6600       Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6601     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6602       Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6603     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6604       Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6605     else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6606       Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6607     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6608       Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6609     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6610       Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6611     else
6612       break;
6613 
6614     SDValue Base, Scale, Index, Disp, Segment;
6615     if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6616                           Base, Scale, Index, Disp, Segment))
6617       break;
6618 
6619     SDValue Mask = Sc->getMask();
6620     SDValue Chain = Sc->getChain();
6621     // Scatter instructions have a mask output not in the ISD node.
6622     SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6623     SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6624 
6625     MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6626     CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6627     ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6628     CurDAG->RemoveDeadNode(Node);
6629     return;
6630   }
6631   case ISD::PREALLOCATED_SETUP: {
6632     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6633     auto CallId = MFI->getPreallocatedIdForCallSite(
6634         cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6635     SDValue Chain = Node->getOperand(0);
6636     SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6637     MachineSDNode *New = CurDAG->getMachineNode(
6638         TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6639     ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6640     CurDAG->RemoveDeadNode(Node);
6641     return;
6642   }
6643   case ISD::PREALLOCATED_ARG: {
6644     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6645     auto CallId = MFI->getPreallocatedIdForCallSite(
6646         cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6647     SDValue Chain = Node->getOperand(0);
6648     SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6649     SDValue ArgIndex = Node->getOperand(2);
6650     SDValue Ops[3];
6651     Ops[0] = CallIdValue;
6652     Ops[1] = ArgIndex;
6653     Ops[2] = Chain;
6654     MachineSDNode *New = CurDAG->getMachineNode(
6655         TargetOpcode::PREALLOCATED_ARG, dl,
6656         CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6657                           MVT::Other),
6658         Ops);
6659     ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6660     ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6661     CurDAG->RemoveDeadNode(Node);
6662     return;
6663   }
6664   case X86ISD::AESENCWIDE128KL:
6665   case X86ISD::AESDECWIDE128KL:
6666   case X86ISD::AESENCWIDE256KL:
6667   case X86ISD::AESDECWIDE256KL: {
6668     if (!Subtarget->hasWIDEKL())
6669       break;
6670 
6671     unsigned Opcode;
6672     switch (Node->getOpcode()) {
6673     default:
6674       llvm_unreachable("Unexpected opcode!");
6675     case X86ISD::AESENCWIDE128KL:
6676       Opcode = X86::AESENCWIDE128KL;
6677       break;
6678     case X86ISD::AESDECWIDE128KL:
6679       Opcode = X86::AESDECWIDE128KL;
6680       break;
6681     case X86ISD::AESENCWIDE256KL:
6682       Opcode = X86::AESENCWIDE256KL;
6683       break;
6684     case X86ISD::AESDECWIDE256KL:
6685       Opcode = X86::AESDECWIDE256KL;
6686       break;
6687     }
6688 
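         // The wide Key Locker instructions process eight 128-bit blocks in
         // place in XMM0..XMM7 using the key handle addressed by the memory
         // operand, so operands 2-9 are copied into those fixed registers and
         // glued to the instruction.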
6689     SDValue Chain = Node->getOperand(0);
6690     SDValue Addr = Node->getOperand(1);
6691 
6692     SDValue Base, Scale, Index, Disp, Segment;
6693     if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6694       break;
6695 
6696     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6697                                  SDValue());
6698     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6699                                  Chain.getValue(1));
6700     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6701                                  Chain.getValue(1));
6702     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6703                                  Chain.getValue(1));
6704     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6705                                  Chain.getValue(1));
6706     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6707                                  Chain.getValue(1));
6708     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6709                                  Chain.getValue(1));
6710     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6711                                  Chain.getValue(1));
6712 
6713     MachineSDNode *Res = CurDAG->getMachineNode(
6714         Opcode, dl, Node->getVTList(),
6715         {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6716     CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6717     ReplaceNode(Node, Res);
6718     return;
6719   }
6720   }
6721 
6722   SelectCode(Node);
6723 }
6724 
6725 bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6726     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6727     std::vector<SDValue> &OutOps) {
6728   SDValue Op0, Op1, Op2, Op3, Op4;
6729   switch (ConstraintID) {
6730   default:
6731     llvm_unreachable("Unexpected asm memory constraint");
6732   case InlineAsm::ConstraintCode::o: // offsetable        ??
6733   case InlineAsm::ConstraintCode::v: // not offsetable    ??
6734   case InlineAsm::ConstraintCode::m: // memory
6735   case InlineAsm::ConstraintCode::X:
6736   case InlineAsm::ConstraintCode::p: // address
6737     if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6738       return true;
6739     break;
6740   }
6741 
6742   OutOps.push_back(Op0);
6743   OutOps.push_back(Op1);
6744   OutOps.push_back(Op2);
6745   OutOps.push_back(Op3);
6746   OutOps.push_back(Op4);
6747   return false;
6748 }
6749 
6750 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6751     : SelectionDAGISelPass(
6752           std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6753 
6754 /// This pass converts a legalized DAG into a X86-specific DAG,
6755 /// ready for instruction scheduling.
6756 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6757                                      CodeGenOptLevel OptLevel) {
6758   return new X86DAGToDAGISelLegacy(TM, OptLevel);
6759 }
6760