xref: /llvm-project/llvm/lib/Target/X86/X86FastISel.cpp (revision dfe43bd1ca46c59399b7cbbf81b09256232e27f9)
1 //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the X86-specific support for the FastISel class. Much
10 // of the target-specific code is generated by tablegen in the file
11 // X86GenFastISel.inc, which is #included here.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86.h"
16 #include "X86CallingConv.h"
17 #include "X86InstrBuilder.h"
18 #include "X86InstrInfo.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86RegisterInfo.h"
21 #include "X86Subtarget.h"
22 #include "X86TargetMachine.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/CodeGen/FastISel.h"
25 #include "llvm/CodeGen/FunctionLoweringInfo.h"
26 #include "llvm/CodeGen/MachineConstantPool.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DebugInfo.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/GetElementPtrTypeIterator.h"
33 #include "llvm/IR/GlobalVariable.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/IntrinsicsX86.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/MC/MCAsmInfo.h"
39 #include "llvm/MC/MCSymbol.h"
40 #include "llvm/Support/ErrorHandling.h"
41 #include "llvm/Target/TargetOptions.h"
42 using namespace llvm;
43 
44 namespace {
45 
46 class X86FastISel final : public FastISel {
47   /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
48   /// make the right decision when generating code for different targets.
49   const X86Subtarget *Subtarget;
50 
51 public:
52   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
53                        const TargetLibraryInfo *libInfo)
54       : FastISel(funcInfo, libInfo) {
55     Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
56   }
57 
58   bool fastSelectInstruction(const Instruction *I) override;
59 
60   /// The specified machine instr operand is a vreg, and that
61   /// vreg is being provided by the specified load instruction.  If possible,
62   /// try to fold the load as an operand to the instruction, returning true on
63   /// success.
64   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
65                            const LoadInst *LI) override;
66 
67   bool fastLowerArguments() override;
68   bool fastLowerCall(CallLoweringInfo &CLI) override;
69   bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
70 
71 #include "X86GenFastISel.inc"
72 
73 private:
74   bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
75                           const DebugLoc &DL);
76 
77   bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
78                        unsigned &ResultReg, unsigned Alignment = 1);
79 
80   bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
81                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
82   bool X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
83                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
84 
85   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
86                          unsigned &ResultReg);
87 
88   bool X86SelectAddress(const Value *V, X86AddressMode &AM);
89   bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
90 
91   bool X86SelectLoad(const Instruction *I);
92 
93   bool X86SelectStore(const Instruction *I);
94 
95   bool X86SelectRet(const Instruction *I);
96 
97   bool X86SelectCmp(const Instruction *I);
98 
99   bool X86SelectZExt(const Instruction *I);
100 
101   bool X86SelectSExt(const Instruction *I);
102 
103   bool X86SelectBranch(const Instruction *I);
104 
105   bool X86SelectShift(const Instruction *I);
106 
107   bool X86SelectDivRem(const Instruction *I);
108 
109   bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
110 
111   bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
112 
113   bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
114 
115   bool X86SelectSelect(const Instruction *I);
116 
117   bool X86SelectTrunc(const Instruction *I);
118 
119   bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
120                                const TargetRegisterClass *RC);
121 
122   bool X86SelectFPExt(const Instruction *I);
123   bool X86SelectFPTrunc(const Instruction *I);
124   bool X86SelectSIToFP(const Instruction *I);
125   bool X86SelectUIToFP(const Instruction *I);
126   bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
127 
128   const X86InstrInfo *getInstrInfo() const {
129     return Subtarget->getInstrInfo();
130   }
131   const X86TargetMachine *getTargetMachine() const {
132     return static_cast<const X86TargetMachine *>(&TM);
133   }
134 
135   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
136 
137   unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
138   unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
139   unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
140   unsigned fastMaterializeConstant(const Constant *C) override;
141 
142   unsigned fastMaterializeAlloca(const AllocaInst *C) override;
143 
144   unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
145 
146   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
147   /// computed in an SSE register, not on the X87 floating point stack.
148   bool isScalarFPTypeInSSEReg(EVT VT) const {
149     return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
150            (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
151   }
152 
153   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
154 
155   bool IsMemcpySmall(uint64_t Len);
156 
157   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
158                           X86AddressMode SrcAM, uint64_t Len);
159 
160   bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
161                             const Value *Cond);
162 
163   const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
164                                             X86AddressMode &AM);
165 
166   unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
167                              const TargetRegisterClass *RC, unsigned Op0,
168                              unsigned Op1, unsigned Op2, unsigned Op3);
169 };
170 
171 } // end anonymous namespace.
172 
173 static std::pair<unsigned, bool>
174 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
175   unsigned CC;
176   bool NeedSwap = false;
177 
178   // SSE Condition code mapping:
179   //  0 - EQ
180   //  1 - LT
181   //  2 - LE
182   //  3 - UNORD
183   //  4 - NEQ
184   //  5 - NLT
185   //  6 - NLE
186   //  7 - ORD
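      //  8 - EQ_UQ  (extended predicate; encodable only by the AVX VCMP forms)
      // 12 - NEQ_OQ (extended predicate; encodable only by the AVX VCMP forms)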
187   switch (Predicate) {
188   default: llvm_unreachable("Unexpected predicate");
189   case CmpInst::FCMP_OEQ: CC = 0;          break;
190   case CmpInst::FCMP_OGT: NeedSwap = true; [[fallthrough]];
191   case CmpInst::FCMP_OLT: CC = 1;          break;
192   case CmpInst::FCMP_OGE: NeedSwap = true; [[fallthrough]];
193   case CmpInst::FCMP_OLE: CC = 2;          break;
194   case CmpInst::FCMP_UNO: CC = 3;          break;
195   case CmpInst::FCMP_UNE: CC = 4;          break;
196   case CmpInst::FCMP_ULE: NeedSwap = true; [[fallthrough]];
197   case CmpInst::FCMP_UGE: CC = 5;          break;
198   case CmpInst::FCMP_ULT: NeedSwap = true; [[fallthrough]];
199   case CmpInst::FCMP_UGT: CC = 6;          break;
200   case CmpInst::FCMP_ORD: CC = 7;          break;
201   case CmpInst::FCMP_UEQ: CC = 8;          break;
202   case CmpInst::FCMP_ONE: CC = 12;         break;
203   }
204 
205   return std::make_pair(CC, NeedSwap);
206 }
207 
208 /// Adds a complex addressing mode to the given machine instr builder.
209 /// Note that this will constrain the index register.  If it is not possible to
210 /// constrain the given index register, then a new one will be created.  The
211 /// IndexReg field of the addressing mode will be updated to match in this case.
212 const MachineInstrBuilder &
213 X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
214                             X86AddressMode &AM) {
215   // First constrain the index register.  It needs to be a GR64_NOSP.
216   AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
217                                          MIB->getNumOperands() +
218                                          X86::AddrIndexReg);
219   return ::addFullAddress(MIB, AM);
220 }
221 
222 /// Check if it is possible to fold the condition from the XALU intrinsic
223 /// into the user. The condition code will only be updated on success.
224 bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
225                                        const Value *Cond) {
226   if (!isa<ExtractValueInst>(Cond))
227     return false;
228 
229   const auto *EV = cast<ExtractValueInst>(Cond);
230   if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
231     return false;
232 
233   const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
234   MVT RetVT;
235   const Function *Callee = II->getCalledFunction();
236   Type *RetTy =
237     cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
238   if (!isTypeLegal(RetTy, RetVT))
239     return false;
240 
241   if (RetVT != MVT::i32 && RetVT != MVT::i64)
242     return false;
243 
244   X86::CondCode TmpCC;
245   switch (II->getIntrinsicID()) {
246   default: return false;
247   case Intrinsic::sadd_with_overflow:
248   case Intrinsic::ssub_with_overflow:
249   case Intrinsic::smul_with_overflow:
250   case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
251   case Intrinsic::uadd_with_overflow:
252   case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
253   }
254 
255   // Check if both instructions are in the same basic block.
256   if (II->getParent() != I->getParent())
257     return false;
258 
259   // Make sure nothing is in the way between the two instructions.
260   BasicBlock::const_iterator Start(I);
261   BasicBlock::const_iterator End(II);
262   for (auto Itr = std::prev(Start); Itr != End; --Itr) {
263     // We only expect extractvalue instructions between the intrinsic and the
264     // instruction to be selected.
265     if (!isa<ExtractValueInst>(Itr))
266       return false;
267 
268     // Check that the extractvalue operand comes from the intrinsic.
269     const auto *EVI = cast<ExtractValueInst>(Itr);
270     if (EVI->getAggregateOperand() != II)
271       return false;
272   }
273 
274   // Make sure no potentially eflags clobbering phi moves can be inserted in
275   // between.
276   auto HasPhis = [](const BasicBlock *Succ) { return !Succ->phis().empty(); };
277   if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
278     return false;
279 
280   // Make sure there are no potentially eflags clobbering constant
281   // materializations in between.
282   if (llvm::any_of(I->operands(), [](Value *V) { return isa<Constant>(V); }))
283     return false;
284 
285   CC = TmpCC;
286   return true;
287 }
288 
289 bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
290   EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
291   if (evt == MVT::Other || !evt.isSimple())
292     // Unhandled type. Halt "fast" selection and bail.
293     return false;
294 
295   VT = evt.getSimpleVT();
296   // For now, require SSE/SSE2 for performing floating-point operations,
297   // since x87 requires additional work.
298   if (VT == MVT::f64 && !Subtarget->hasSSE2())
299     return false;
300   if (VT == MVT::f32 && !Subtarget->hasSSE1())
301     return false;
302   // Similarly, no f80 support yet.
303   if (VT == MVT::f80)
304     return false;
305   // We only handle legal types. For example, on x86-32 the instruction
306   // selector contains all of the 64-bit instructions from x86-64,
307   // under the assumption that i64 won't be used if the target doesn't
308   // support it.
309   return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
310 }
311 
312 /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT
313 /// from the address described by AM. Return true and the result register by
314 /// reference (in ResultReg) if it is possible.
315 bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
316                                   MachineMemOperand *MMO, unsigned &ResultReg,
317                                   unsigned Alignment) {
318   bool HasSSE1 = Subtarget->hasSSE1();
319   bool HasSSE2 = Subtarget->hasSSE2();
320   bool HasSSE41 = Subtarget->hasSSE41();
321   bool HasAVX = Subtarget->hasAVX();
322   bool HasAVX2 = Subtarget->hasAVX2();
323   bool HasAVX512 = Subtarget->hasAVX512();
324   bool HasVLX = Subtarget->hasVLX();
325   bool IsNonTemporal = MMO && MMO->isNonTemporal();
326 
327   // Treat i1 loads the same as i8 loads. Masking will be done when storing.
328   if (VT == MVT::i1)
329     VT = MVT::i8;
330 
331   // Get opcode and regclass of the output for the given load instruction.
332   unsigned Opc = 0;
333   switch (VT.SimpleTy) {
334   default: return false;
335   case MVT::i8:
336     Opc = X86::MOV8rm;
337     break;
338   case MVT::i16:
339     Opc = X86::MOV16rm;
340     break;
341   case MVT::i32:
342     Opc = X86::MOV32rm;
343     break;
344   case MVT::i64:
345     // Must be in x86-64 mode.
346     Opc = X86::MOV64rm;
347     break;
348   case MVT::f32:
349     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
350           : HasAVX  ? X86::VMOVSSrm_alt
351           : HasSSE1 ? X86::MOVSSrm_alt
352                     : X86::LD_Fp32m;
353     break;
354   case MVT::f64:
355     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
356           : HasAVX  ? X86::VMOVSDrm_alt
357           : HasSSE2 ? X86::MOVSDrm_alt
358                     : X86::LD_Fp64m;
359     break;
360   case MVT::f80:
361     // No f80 support yet.
362     return false;
363   case MVT::v4f32:
364     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
365       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
366             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
367     else if (Alignment >= 16)
368       Opc = HasVLX ? X86::VMOVAPSZ128rm :
369             HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
370     else
371       Opc = HasVLX ? X86::VMOVUPSZ128rm :
372             HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
373     break;
374   case MVT::v2f64:
375     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
376       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
377             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
378     else if (Alignment >= 16)
379       Opc = HasVLX ? X86::VMOVAPDZ128rm :
380             HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
381     else
382       Opc = HasVLX ? X86::VMOVUPDZ128rm :
383             HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
384     break;
385   case MVT::v4i32:
386   case MVT::v2i64:
387   case MVT::v8i16:
388   case MVT::v16i8:
389     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
390       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
391             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
392     else if (Alignment >= 16)
393       Opc = HasVLX ? X86::VMOVDQA64Z128rm :
394             HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
395     else
396       Opc = HasVLX ? X86::VMOVDQU64Z128rm :
397             HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
398     break;
399   case MVT::v8f32:
400     assert(HasAVX);
401     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
402       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
403     else if (IsNonTemporal && Alignment >= 16)
404       return false; // Force split for X86::VMOVNTDQArm
405     else if (Alignment >= 32)
406       Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
407     else
408       Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
409     break;
410   case MVT::v4f64:
411     assert(HasAVX);
412     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
413       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
414     else if (IsNonTemporal && Alignment >= 16)
415       return false; // Force split for X86::VMOVNTDQArm
416     else if (Alignment >= 32)
417       Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
418     else
419       Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
420     break;
421   case MVT::v8i32:
422   case MVT::v4i64:
423   case MVT::v16i16:
424   case MVT::v32i8:
425     assert(HasAVX);
426     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
427       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
428     else if (IsNonTemporal && Alignment >= 16)
429       return false; // Force split for X86::VMOVNTDQArm
430     else if (Alignment >= 32)
431       Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
432     else
433       Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
434     break;
435   case MVT::v16f32:
436     assert(HasAVX512);
437     if (IsNonTemporal && Alignment >= 64)
438       Opc = X86::VMOVNTDQAZrm;
439     else
440       Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
441     break;
442   case MVT::v8f64:
443     assert(HasAVX512);
444     if (IsNonTemporal && Alignment >= 64)
445       Opc = X86::VMOVNTDQAZrm;
446     else
447       Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
448     break;
449   case MVT::v8i64:
450   case MVT::v16i32:
451   case MVT::v32i16:
452   case MVT::v64i8:
453     assert(HasAVX512);
454     // Note: There are a lot more choices based on type with AVX-512, but
455     // there's really no advantage when the load isn't masked.
456     if (IsNonTemporal && Alignment >= 64)
457       Opc = X86::VMOVNTDQAZrm;
458     else
459       Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
460     break;
461   }
462 
463   const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
464 
465   ResultReg = createResultReg(RC);
466   MachineInstrBuilder MIB =
467     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
468   addFullAddress(MIB, AM);
469   if (MMO)
470     MIB->addMemOperand(*FuncInfo.MF, MMO);
471   return true;
472 }
473 
474 /// X86FastEmitStore - Emit a machine instruction to store the value in ValReg,
475 /// of type VT, to the address described by AM (a base register or frame index,
476 /// an optional scaled index register, a displacement, and an optional global
477 /// value). Return true if it is possible.
478 bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
479                                    MachineMemOperand *MMO, bool Aligned) {
480   bool HasSSE1 = Subtarget->hasSSE1();
481   bool HasSSE2 = Subtarget->hasSSE2();
482   bool HasSSE4A = Subtarget->hasSSE4A();
483   bool HasAVX = Subtarget->hasAVX();
484   bool HasAVX512 = Subtarget->hasAVX512();
485   bool HasVLX = Subtarget->hasVLX();
486   bool IsNonTemporal = MMO && MMO->isNonTemporal();
487 
488   // Get opcode and regclass of the output for the given store instruction.
489   unsigned Opc = 0;
490   switch (VT.getSimpleVT().SimpleTy) {
491   case MVT::f80: // No f80 support yet.
492   default: return false;
493   case MVT::i1: {
494     // Mask out all but lowest bit.
495     Register AndResult = createResultReg(&X86::GR8RegClass);
496     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
497             TII.get(X86::AND8ri), AndResult)
498       .addReg(ValReg).addImm(1);
499     ValReg = AndResult;
500     [[fallthrough]]; // handle i1 as i8.
501   }
502   case MVT::i8:  Opc = X86::MOV8mr;  break;
503   case MVT::i16: Opc = X86::MOV16mr; break;
504   case MVT::i32:
505     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
506     break;
507   case MVT::i64:
508     // Must be in x86-64 mode.
509     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
510     break;
511   case MVT::f32:
512     if (HasSSE1) {
513       if (IsNonTemporal && HasSSE4A)
514         Opc = X86::MOVNTSS;
515       else
516         Opc = HasAVX512 ? X86::VMOVSSZmr :
517               HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
518     } else
519       Opc = X86::ST_Fp32m;
520     break;
521   case MVT::f64:
522     if (HasSSE2) {
523       if (IsNonTemporal && HasSSE4A)
524         Opc = X86::MOVNTSD;
525       else
526         Opc = HasAVX512 ? X86::VMOVSDZmr :
527               HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
528     } else
529       Opc = X86::ST_Fp64m;
530     break;
531   case MVT::x86mmx:
532     Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
533     break;
534   case MVT::v4f32:
535     if (Aligned) {
536       if (IsNonTemporal)
537         Opc = HasVLX ? X86::VMOVNTPSZ128mr :
538               HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
539       else
540         Opc = HasVLX ? X86::VMOVAPSZ128mr :
541               HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
542     } else
543       Opc = HasVLX ? X86::VMOVUPSZ128mr :
544             HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
545     break;
546   case MVT::v2f64:
547     if (Aligned) {
548       if (IsNonTemporal)
549         Opc = HasVLX ? X86::VMOVNTPDZ128mr :
550               HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
551       else
552         Opc = HasVLX ? X86::VMOVAPDZ128mr :
553               HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
554     } else
555       Opc = HasVLX ? X86::VMOVUPDZ128mr :
556             HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
557     break;
558   case MVT::v4i32:
559   case MVT::v2i64:
560   case MVT::v8i16:
561   case MVT::v16i8:
562     if (Aligned) {
563       if (IsNonTemporal)
564         Opc = HasVLX ? X86::VMOVNTDQZ128mr :
565               HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
566       else
567         Opc = HasVLX ? X86::VMOVDQA64Z128mr :
568               HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
569     } else
570       Opc = HasVLX ? X86::VMOVDQU64Z128mr :
571             HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
572     break;
573   case MVT::v8f32:
574     assert(HasAVX);
575     if (Aligned) {
576       if (IsNonTemporal)
577         Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
578       else
579         Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
580     } else
581       Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
582     break;
583   case MVT::v4f64:
584     assert(HasAVX);
585     if (Aligned) {
586       if (IsNonTemporal)
587         Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
588       else
589         Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
590     } else
591       Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
592     break;
593   case MVT::v8i32:
594   case MVT::v4i64:
595   case MVT::v16i16:
596   case MVT::v32i8:
597     assert(HasAVX);
598     if (Aligned) {
599       if (IsNonTemporal)
600         Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
601       else
602         Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
603     } else
604       Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
605     break;
606   case MVT::v16f32:
607     assert(HasAVX512);
608     if (Aligned)
609       Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
610     else
611       Opc = X86::VMOVUPSZmr;
612     break;
613   case MVT::v8f64:
614     assert(HasAVX512);
615     if (Aligned) {
616       Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
617     } else
618       Opc = X86::VMOVUPDZmr;
619     break;
620   case MVT::v8i64:
621   case MVT::v16i32:
622   case MVT::v32i16:
623   case MVT::v64i8:
624     assert(HasAVX512);
625     // Note: There are a lot more choices based on type with AVX-512, but
626     // there's really no advantage when the store isn't masked.
627     if (Aligned)
628       Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
629     else
630       Opc = X86::VMOVDQU64Zmr;
631     break;
632   }
633 
634   const MCInstrDesc &Desc = TII.get(Opc);
635   // Some of the instructions in the previous switch use FR128 instead
636   // of FR32 for ValReg. Make sure the register we feed the instruction
637   // matches its register class constraints.
638   // Note: It is fine to copy from FR32 to FR128; these are the same
639   // registers behind the scenes, which is why this did not trigger
640   // any bugs before.
641   ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
642   MachineInstrBuilder MIB =
643       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, Desc);
644   addFullAddress(MIB, AM).addReg(ValReg);
645   if (MMO)
646     MIB->addMemOperand(*FuncInfo.MF, MMO);
647 
648   return true;
649 }
650 
651 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
652                                    X86AddressMode &AM,
653                                    MachineMemOperand *MMO, bool Aligned) {
654   // Handle 'null' like i32/i64 0.
655   if (isa<ConstantPointerNull>(Val))
656     Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
657 
658   // If this is a store of a simple constant, fold the constant into the store.
659   if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
660     unsigned Opc = 0;
661     bool Signed = true;
662     switch (VT.getSimpleVT().SimpleTy) {
663     default: break;
664     case MVT::i1:
665       Signed = false;
666       [[fallthrough]]; // Handle as i8.
667     case MVT::i8:  Opc = X86::MOV8mi;  break;
668     case MVT::i16: Opc = X86::MOV16mi; break;
669     case MVT::i32: Opc = X86::MOV32mi; break;
670     case MVT::i64:
671       // Must be a 32-bit sign extended value.
672       if (isInt<32>(CI->getSExtValue()))
673         Opc = X86::MOV64mi32;
674       break;
675     }
676 
677     if (Opc) {
678       MachineInstrBuilder MIB =
679         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc));
680       addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
681                                             : CI->getZExtValue());
682       if (MMO)
683         MIB->addMemOperand(*FuncInfo.MF, MMO);
684       return true;
685     }
686   }
687 
688   Register ValReg = getRegForValue(Val);
689   if (ValReg == 0)
690     return false;
691 
692   return X86FastEmitStore(VT, ValReg, AM, MMO, Aligned);
693 }
694 
695 /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
696 /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
697 /// ISD::SIGN_EXTEND).
698 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
699                                     unsigned Src, EVT SrcVT,
700                                     unsigned &ResultReg) {
701   unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
702   if (RR == 0)
703     return false;
704 
705   ResultReg = RR;
706   return true;
707 }
708 
709 bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
710   // Handle constant address.
711   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
712     // Can't handle alternate code models yet.
713     if (TM.getCodeModel() != CodeModel::Small &&
714         TM.getCodeModel() != CodeModel::Medium)
715       return false;
716 
717     // Can't handle large objects yet.
718     if (TM.isLargeGlobalValue(GV))
719       return false;
720 
721     // Can't handle TLS yet.
722     if (GV->isThreadLocal())
723       return false;
724 
725     // Can't handle !absolute_symbol references yet.
726     if (GV->isAbsoluteSymbolRef())
727       return false;
728 
729     // RIP-relative addresses can't have additional register operands, so if
730     // we've already folded stuff into the addressing mode, just force the
731     // global value into its own register, which we can use as the basereg.
732     if (!Subtarget->isPICStyleRIPRel() ||
733         (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
734       // Okay, we've committed to selecting this global. Set up the address.
735       AM.GV = GV;
736 
737       // Allow the subtarget to classify the global.
738       unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
739 
740       // If this reference is relative to the pic base, set it now.
741       if (isGlobalRelativeToPICBase(GVFlags)) {
742         // FIXME: How do we know Base.Reg is free??
743         AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
744       }
745 
746       // Unless the ABI requires an extra load, return a direct reference to
747       // the global.
748       if (!isGlobalStubReference(GVFlags)) {
749         if (Subtarget->isPICStyleRIPRel()) {
750           // Use rip-relative addressing if we can.  Above we verified that the
751           // base and index registers are unused.
752           assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
753           AM.Base.Reg = X86::RIP;
754         }
755         AM.GVOpFlags = GVFlags;
756         return true;
757       }
758 
759       // Ok, we need to do a load from a stub.  If we've already loaded from
760   // this stub, reuse the loaded pointer; otherwise emit the load now.
761       DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
762       Register LoadReg;
763       if (I != LocalValueMap.end() && I->second) {
764         LoadReg = I->second;
765       } else {
766         // Issue load from stub.
767         unsigned Opc = 0;
768         const TargetRegisterClass *RC = nullptr;
769         X86AddressMode StubAM;
770         StubAM.Base.Reg = AM.Base.Reg;
771         StubAM.GV = GV;
772         StubAM.GVOpFlags = GVFlags;
773 
774         // Prepare for inserting code in the local-value area.
775         SavePoint SaveInsertPt = enterLocalValueArea();
776 
777         if (TLI.getPointerTy(DL) == MVT::i64) {
778           Opc = X86::MOV64rm;
779           RC  = &X86::GR64RegClass;
780         } else {
781           Opc = X86::MOV32rm;
782           RC  = &X86::GR32RegClass;
783         }
784 
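            // Both RIP-relative PIC and GOTPCREL references require the stub
            // load itself to be RIP-relative.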
785         if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL ||
786             GVFlags == X86II::MO_GOTPCREL_NORELAX)
787           StubAM.Base.Reg = X86::RIP;
788 
789         LoadReg = createResultReg(RC);
790         MachineInstrBuilder LoadMI =
791           BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), LoadReg);
792         addFullAddress(LoadMI, StubAM);
793 
794         // Ok, back to normal mode.
795         leaveLocalValueArea(SaveInsertPt);
796 
797         // Prevent loading GV stub multiple times in same MBB.
798         LocalValueMap[V] = LoadReg;
799       }
800 
801       // Now construct the final address. Note that the Disp, Scale,
802       // and Index values may already be set here.
803       AM.Base.Reg = LoadReg;
804       AM.GV = nullptr;
805       return true;
806     }
807   }
808 
809   // If all else fails, try to materialize the value in a register.
810   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
811     if (AM.Base.Reg == 0) {
812       AM.Base.Reg = getRegForValue(V);
813       return AM.Base.Reg != 0;
814     }
815     if (AM.IndexReg == 0) {
816       assert(AM.Scale == 1 && "Scale with no index!");
817       AM.IndexReg = getRegForValue(V);
818       return AM.IndexReg != 0;
819     }
820   }
821 
822   return false;
823 }
824 
825 /// X86SelectAddress - Attempt to fill in an address from the given value.
826 ///
827 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
828   SmallVector<const Value *, 32> GEPs;
829 redo_gep:
830   const User *U = nullptr;
831   unsigned Opcode = Instruction::UserOp1;
832   if (const Instruction *I = dyn_cast<Instruction>(V)) {
833     // Don't walk into other basic blocks; it's possible we haven't
834     // visited them yet, so the instructions may not yet be assigned
835     // virtual registers.
836     if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
837         FuncInfo.getMBB(I->getParent()) == FuncInfo.MBB) {
838       Opcode = I->getOpcode();
839       U = I;
840     }
841   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
842     Opcode = C->getOpcode();
843     U = C;
844   }
845 
846   if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
847     if (Ty->getAddressSpace() > 255)
848       // Fast instruction selection doesn't support the special
849       // address spaces.
850       return false;
851 
852   switch (Opcode) {
853   default: break;
854   case Instruction::BitCast:
855     // Look past bitcasts.
856     return X86SelectAddress(U->getOperand(0), AM);
857 
858   case Instruction::IntToPtr:
859     // Look past no-op inttoptrs.
860     if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
861         TLI.getPointerTy(DL))
862       return X86SelectAddress(U->getOperand(0), AM);
863     break;
864 
865   case Instruction::PtrToInt:
866     // Look past no-op ptrtoints.
867     if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
868       return X86SelectAddress(U->getOperand(0), AM);
869     break;
870 
871   case Instruction::Alloca: {
872     // Do static allocas.
873     const AllocaInst *A = cast<AllocaInst>(V);
874     DenseMap<const AllocaInst *, int>::iterator SI =
875       FuncInfo.StaticAllocaMap.find(A);
876     if (SI != FuncInfo.StaticAllocaMap.end()) {
877       AM.BaseType = X86AddressMode::FrameIndexBase;
878       AM.Base.FrameIndex = SI->second;
879       return true;
880     }
881     break;
882   }
883 
884   case Instruction::Add: {
885     // Adds of constants are common and easy enough.
886     if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
887       uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
888       // They have to fit in the 32-bit signed displacement field though.
889       if (isInt<32>(Disp)) {
890         AM.Disp = (uint32_t)Disp;
891         return X86SelectAddress(U->getOperand(0), AM);
892       }
893     }
894     break;
895   }
896 
897   case Instruction::GetElementPtr: {
898     X86AddressMode SavedAM = AM;
899 
900     // Pattern-match simple GEPs.
901     uint64_t Disp = (int32_t)AM.Disp;
902     unsigned IndexReg = AM.IndexReg;
903     unsigned Scale = AM.Scale;
904     MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT();
905 
906     gep_type_iterator GTI = gep_type_begin(U);
907     // Iterate through the indices, folding what we can. Constants can be
908     // folded, and one dynamic index can be handled, if the scale is supported.
909     for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
910          i != e; ++i, ++GTI) {
911       const Value *Op = *i;
912       if (StructType *STy = GTI.getStructTypeOrNull()) {
913         const StructLayout *SL = DL.getStructLayout(STy);
914         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
915         continue;
916       }
917 
918       // An array/variable index is always of the form i*S where S is the
919       // constant scale size.  See if we can push the scale into immediates.
920       uint64_t S = GTI.getSequentialElementStride(DL);
921       for (;;) {
922         if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
923           // Constant-offset addressing.
924           Disp += CI->getSExtValue() * S;
925           break;
926         }
927         if (canFoldAddIntoGEP(U, Op)) {
928           // A compatible add with a constant operand. Fold the constant.
929           ConstantInt *CI =
930             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
931           Disp += CI->getSExtValue() * S;
932           // Iterate on the other operand.
933           Op = cast<AddOperator>(Op)->getOperand(0);
934           continue;
935         }
936         if (IndexReg == 0 &&
937             (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
938             (S == 1 || S == 2 || S == 4 || S == 8)) {
939           // Scaled-index addressing.
940           Scale = S;
941           IndexReg = getRegForGEPIndex(PtrVT, Op);
942           if (IndexReg == 0)
943             return false;
944           break;
945         }
946         // Unsupported.
947         goto unsupported_gep;
948       }
949     }
950 
951     // Check for displacement overflow.
952     if (!isInt<32>(Disp))
953       break;
954 
955     AM.IndexReg = IndexReg;
956     AM.Scale = Scale;
957     AM.Disp = (uint32_t)Disp;
958     GEPs.push_back(V);
959 
960     if (const GetElementPtrInst *GEP =
961           dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
962       // Ok, the GEP indices were covered by constant-offset and scaled-index
963       // addressing. Update the address state and move on to examining the base.
964       V = GEP;
965       goto redo_gep;
966     } else if (X86SelectAddress(U->getOperand(0), AM)) {
967       return true;
968     }
969 
970     // If we couldn't merge the gep value into this addr mode, revert back to
971     // our address and just match the value instead of completely failing.
972     AM = SavedAM;
973 
974     for (const Value *I : reverse(GEPs))
975       if (handleConstantAddresses(I, AM))
976         return true;
977 
978     return false;
979   unsupported_gep:
980     // Ok, the GEP indices weren't all covered.
981     break;
982   }
983   }
984 
985   return handleConstantAddresses(V, AM);
986 }
987 
988 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
989 ///
990 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
991   const User *U = nullptr;
992   unsigned Opcode = Instruction::UserOp1;
993   const Instruction *I = dyn_cast<Instruction>(V);
994   // Record if the value is defined in the same basic block.
995   //
996   // This information is crucial to know whether or not folding an
997   // operand is valid.
998   // Indeed, FastISel generates or reuses a virtual register for all
999   // operands of all instructions it selects. Obviously, the definition and
1000   // its uses must use the same virtual register otherwise the produced
1001   // code is incorrect.
1002   // Before instruction selection, FunctionLoweringInfo::set sets the virtual
1003   // registers for values that are alive across basic blocks. This ensures
1004   // that the values are assigned consistently across basic blocks, even
1005   // if different instruction selection mechanisms are used (e.g., a mix of
1006   // SDISel and FastISel).
1007   // For values local to a basic block, the instruction selection process
1008   // generates these virtual registers with whatever method is appropriate
1009   // for its needs. In particular, FastISel and SDISel do not share the way
1010   // local virtual registers are set.
1011   // Therefore, it is impossible (or at least unsafe) to share values
1012   // between basic blocks unless they use the same instruction selection
1013   // method, which is not guaranteed for X86.
1014   // Moreover, things like hasOneUse cannot be used accurately if we
1015   // allow references to values across basic blocks when they were not
1016   // alive across basic blocks to begin with.
1017   bool InMBB = true;
1018   if (I) {
1019     Opcode = I->getOpcode();
1020     U = I;
1021     InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
1022   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
1023     Opcode = C->getOpcode();
1024     U = C;
1025   }
1026 
1027   switch (Opcode) {
1028   default: break;
1029   case Instruction::BitCast:
1030     // Look past bitcasts if the operand is in the same BB.
1031     if (InMBB)
1032       return X86SelectCallAddress(U->getOperand(0), AM);
1033     break;
1034 
1035   case Instruction::IntToPtr:
1036     // Look past no-op inttoptrs if the operand is in the same BB.
1037     if (InMBB &&
1038         TLI.getValueType(DL, U->getOperand(0)->getType()) ==
1039             TLI.getPointerTy(DL))
1040       return X86SelectCallAddress(U->getOperand(0), AM);
1041     break;
1042 
1043   case Instruction::PtrToInt:
1044     // Look past no-op ptrtoints if the operand is in the same BB.
1045     if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
1046       return X86SelectCallAddress(U->getOperand(0), AM);
1047     break;
1048   }
1049 
1050   // Handle constant address.
1051   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
1052     // Can't handle alternate code models yet.
1053     if (TM.getCodeModel() != CodeModel::Small &&
1054         TM.getCodeModel() != CodeModel::Medium)
1055       return false;
1056 
1057     // RIP-relative addresses can't have additional register operands.
1058     if (Subtarget->isPICStyleRIPRel() &&
1059         (AM.Base.Reg != 0 || AM.IndexReg != 0))
1060       return false;
1061 
1062     // Can't handle TLS.
1063     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
1064       if (GVar->isThreadLocal())
1065         return false;
1066 
1067     // Okay, we've committed to selecting this global. Set up the basic address.
1068     AM.GV = GV;
1069 
1070     // Return a direct reference to the global. Fastisel can handle calls to
1071     // functions that require loads, such as dllimport and nonlazybind
1072     // functions.
1073     if (Subtarget->isPICStyleRIPRel()) {
1074       // Use rip-relative addressing if we can.  Above we verified that the
1075       // base and index registers are unused.
1076       assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
1077       AM.Base.Reg = X86::RIP;
1078     } else {
1079       AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
1080     }
1081 
1082     return true;
1083   }
1084 
1085   // If all else fails, try to materialize the value in a register.
1086   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
1087     auto GetCallRegForValue = [this](const Value *V) {
1088       Register Reg = getRegForValue(V);
1089 
1090       // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
1091       if (Reg && Subtarget->isTarget64BitILP32()) {
1092         Register CopyReg = createResultReg(&X86::GR32RegClass);
1093         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32rr),
1094                 CopyReg)
1095             .addReg(Reg);
1096 
1097         Register ExtReg = createResultReg(&X86::GR64RegClass);
1098         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1099                 TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
1100             .addImm(0)
1101             .addReg(CopyReg)
1102             .addImm(X86::sub_32bit);
1103         Reg = ExtReg;
1104       }
1105 
1106       return Reg;
1107     };
1108 
1109     if (AM.Base.Reg == 0) {
1110       AM.Base.Reg = GetCallRegForValue(V);
1111       return AM.Base.Reg != 0;
1112     }
1113     if (AM.IndexReg == 0) {
1114       assert(AM.Scale == 1 && "Scale with no index!");
1115       AM.IndexReg = GetCallRegForValue(V);
1116       return AM.IndexReg != 0;
1117     }
1118   }
1119 
1120   return false;
1121 }
1122 
1123 
1124 /// X86SelectStore - Select and emit code to implement store instructions.
1125 bool X86FastISel::X86SelectStore(const Instruction *I) {
1126   // Atomic stores need special handling.
1127   const StoreInst *S = cast<StoreInst>(I);
1128 
1129   if (S->isAtomic())
1130     return false;
1131 
1132   const Value *PtrV = I->getOperand(1);
1133   if (TLI.supportSwiftError()) {
1134     // Swifterror values can come from either a function parameter with
1135     // swifterror attribute or an alloca with swifterror attribute.
1136     if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
1137       if (Arg->hasSwiftErrorAttr())
1138         return false;
1139     }
1140 
1141     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
1142       if (Alloca->isSwiftError())
1143         return false;
1144     }
1145   }
1146 
1147   const Value *Val = S->getValueOperand();
1148   const Value *Ptr = S->getPointerOperand();
1149 
1150   MVT VT;
1151   if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
1152     return false;
1153 
1154   Align Alignment = S->getAlign();
1155   Align ABIAlignment = DL.getABITypeAlign(Val->getType());
1156   bool Aligned = Alignment >= ABIAlignment;
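       // Only stores that are at least ABI-aligned may use the aligned vector
       // move opcodes in X86FastEmitStore.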
1157 
1158   X86AddressMode AM;
1159   if (!X86SelectAddress(Ptr, AM))
1160     return false;
1161 
1162   return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
1163 }
1164 
1165 /// X86SelectRet - Select and emit code to implement ret instructions.
1166 bool X86FastISel::X86SelectRet(const Instruction *I) {
1167   const ReturnInst *Ret = cast<ReturnInst>(I);
1168   const Function &F = *I->getParent()->getParent();
1169   const X86MachineFunctionInfo *X86MFInfo =
1170       FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
1171 
1172   if (!FuncInfo.CanLowerReturn)
1173     return false;
1174 
1175   if (TLI.supportSwiftError() &&
1176       F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
1177     return false;
1178 
1179   if (TLI.supportSplitCSR(FuncInfo.MF))
1180     return false;
1181 
1182   CallingConv::ID CC = F.getCallingConv();
1183   if (CC != CallingConv::C &&
1184       CC != CallingConv::Fast &&
1185       CC != CallingConv::Tail &&
1186       CC != CallingConv::SwiftTail &&
1187       CC != CallingConv::X86_FastCall &&
1188       CC != CallingConv::X86_StdCall &&
1189       CC != CallingConv::X86_ThisCall &&
1190       CC != CallingConv::X86_64_SysV &&
1191       CC != CallingConv::Win64)
1192     return false;
1193 
1194   // Don't handle popping bytes if they don't fit the ret's immediate.
1195   if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
1196     return false;
1197 
1198   // fastcc with -tailcallopt is intended to provide a guaranteed
1199   // tail call optimization. Fastisel doesn't know how to do that.
1200   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
1201       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
1202     return false;
1203 
1204   // Let SDISel handle vararg functions.
1205   if (F.isVarArg())
1206     return false;
1207 
1208   // Build a list of return value registers.
1209   SmallVector<unsigned, 4> RetRegs;
1210 
1211   if (Ret->getNumOperands() > 0) {
1212     SmallVector<ISD::OutputArg, 4> Outs;
1213     GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
1214 
1215     // Analyze operands of the call, assigning locations to each operand.
1216     SmallVector<CCValAssign, 16> ValLocs;
1217     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
1218     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1219 
1220     const Value *RV = Ret->getOperand(0);
1221     Register Reg = getRegForValue(RV);
1222     if (Reg == 0)
1223       return false;
1224 
1225     // Only handle a single return value for now.
1226     if (ValLocs.size() != 1)
1227       return false;
1228 
1229     CCValAssign &VA = ValLocs[0];
1230 
1231     // Don't bother handling odd stuff for now.
1232     if (VA.getLocInfo() != CCValAssign::Full)
1233       return false;
1234     // Only handle register returns for now.
1235     if (!VA.isRegLoc())
1236       return false;
1237 
1238     // The calling-convention tables for x87 returns don't tell
1239     // the whole story.
1240     if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
1241       return false;
1242 
1243     unsigned SrcReg = Reg + VA.getValNo();
1244     EVT SrcVT = TLI.getValueType(DL, RV->getType());
1245     EVT DstVT = VA.getValVT();
1246     // Special handling for extended integers.
1247     if (SrcVT != DstVT) {
1248       if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
1249         return false;
1250 
1251       if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
1252         return false;
1253 
1254       if (SrcVT == MVT::i1) {
1255         if (Outs[0].Flags.isSExt())
1256           return false;
1257         SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg);
1258         SrcVT = MVT::i8;
1259       }
1260       if (SrcVT != DstVT) {
1261         unsigned Op =
1262             Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
1263         SrcReg =
1264             fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg);
1265       }
1266     }
1267 
1268     // Make the copy.
1269     Register DstReg = VA.getLocReg();
1270     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
1271     // Avoid a cross-class copy. This is very unlikely.
1272     if (!SrcRC->contains(DstReg))
1273       return false;
1274     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1275             TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
1276 
1277     // Add register to return instruction.
1278     RetRegs.push_back(VA.getLocReg());
1279   }
1280 
1281   // Swift calling convention does not require we copy the sret argument
1282   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
1283 
1284   // All x86 ABIs require that for returning structs by value we copy
1285   // the sret argument into %rax/%eax (depending on ABI) for the return.
1286   // We saved the argument into a virtual register in the entry block,
1287   // so now we copy the value out and into %rax/%eax.
1288   if (F.hasStructRetAttr() && CC != CallingConv::Swift &&
1289       CC != CallingConv::SwiftTail) {
1290     Register Reg = X86MFInfo->getSRetReturnReg();
1291     assert(Reg &&
1292            "SRetReturnReg should have been set in LowerFormalArguments()!");
1293     unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
1294     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1295             TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
1296     RetRegs.push_back(RetReg);
1297   }
1298 
1299   // Now emit the RET.
1300   MachineInstrBuilder MIB;
1301   if (X86MFInfo->getBytesToPopOnReturn()) {
1302     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1303                   TII.get(Subtarget->is64Bit() ? X86::RETI64 : X86::RETI32))
1304               .addImm(X86MFInfo->getBytesToPopOnReturn());
1305   } else {
1306     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1307                   TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
1308   }
1309   for (unsigned Reg : RetRegs)
1310     MIB.addReg(Reg, RegState::Implicit);
1311   return true;
1312 }
1313 
1314 /// X86SelectLoad - Select and emit code to implement load instructions.
1315 ///
1316 bool X86FastISel::X86SelectLoad(const Instruction *I) {
1317   const LoadInst *LI = cast<LoadInst>(I);
1318 
1319   // Atomic loads need special handling.
1320   if (LI->isAtomic())
1321     return false;
1322 
1323   const Value *SV = I->getOperand(0);
1324   if (TLI.supportSwiftError()) {
1325     // Swifterror values can come from either a function parameter with
1326     // swifterror attribute or an alloca with swifterror attribute.
1327     if (const Argument *Arg = dyn_cast<Argument>(SV)) {
1328       if (Arg->hasSwiftErrorAttr())
1329         return false;
1330     }
1331 
1332     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
1333       if (Alloca->isSwiftError())
1334         return false;
1335     }
1336   }
1337 
1338   MVT VT;
1339   if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
1340     return false;
1341 
1342   const Value *Ptr = LI->getPointerOperand();
1343 
1344   X86AddressMode AM;
1345   if (!X86SelectAddress(Ptr, AM))
1346     return false;
1347 
1348   unsigned ResultReg = 0;
1349   if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
1350                        LI->getAlign().value()))
1351     return false;
1352 
1353   updateValueMap(I, ResultReg);
1354   return true;
1355 }
1356 
1357 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
1358   bool HasAVX512 = Subtarget->hasAVX512();
1359   bool HasAVX = Subtarget->hasAVX();
1360   bool HasSSE1 = Subtarget->hasSSE1();
1361   bool HasSSE2 = Subtarget->hasSSE2();
1362 
1363   switch (VT.getSimpleVT().SimpleTy) {
1364   default:       return 0;
1365   case MVT::i8:  return X86::CMP8rr;
1366   case MVT::i16: return X86::CMP16rr;
1367   case MVT::i32: return X86::CMP32rr;
1368   case MVT::i64: return X86::CMP64rr;
1369   case MVT::f32:
1370     return HasAVX512 ? X86::VUCOMISSZrr
1371            : HasAVX  ? X86::VUCOMISSrr
1372            : HasSSE1 ? X86::UCOMISSrr
1373                      : 0;
1374   case MVT::f64:
1375     return HasAVX512 ? X86::VUCOMISDZrr
1376            : HasAVX  ? X86::VUCOMISDrr
1377            : HasSSE2 ? X86::UCOMISDrr
1378                      : 0;
1379   }
1380 }
1381 
1382 /// If the RHS of the comparison is the constant RHSC, return an opcode that
1383 /// can fold the immediate into the compare (e.g. CMP32ri); otherwise return 0.
1384 static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
1385   switch (VT.getSimpleVT().SimpleTy) {
1386   // Otherwise, we can't fold the immediate into this comparison.
1387   default:
1388     return 0;
1389   case MVT::i8:
1390     return X86::CMP8ri;
1391   case MVT::i16:
1392     return X86::CMP16ri;
1393   case MVT::i32:
1394     return X86::CMP32ri;
1395   case MVT::i64:
1396     // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
1397     // field.
1398     return isInt<32>(RHSC->getSExtValue()) ? X86::CMP64ri32 : 0;
1399   }
1400 }
1401 
1402 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
1403                                      const DebugLoc &CurMIMD) {
1404   Register Op0Reg = getRegForValue(Op0);
1405   if (Op0Reg == 0) return false;
1406 
1407   // Handle 'null' like i32/i64 0.
1408   if (isa<ConstantPointerNull>(Op1))
1409     Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
1410 
1411   // We have two options: compare with register or immediate.  If the RHS of
1412   // the compare is an immediate that we can fold into this compare, use
1413   // CMPri, otherwise use CMPrr.
1414   if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
1415     if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
1416       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareImmOpc))
1417         .addReg(Op0Reg)
1418         .addImm(Op1C->getSExtValue());
1419       return true;
1420     }
1421   }
1422 
1423   unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
1424   if (CompareOpc == 0) return false;
1425 
1426   Register Op1Reg = getRegForValue(Op1);
1427   if (Op1Reg == 0) return false;
1428   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareOpc))
1429     .addReg(Op0Reg)
1430     .addReg(Op1Reg);
1431 
1432   return true;
1433 }
1434 
1435 bool X86FastISel::X86SelectCmp(const Instruction *I) {
1436   const CmpInst *CI = cast<CmpInst>(I);
1437 
1438   MVT VT;
1439   if (!isTypeLegal(I->getOperand(0)->getType(), VT))
1440     return false;
1441 
1442   // The code below only works for scalars.
1443   if (VT.isVector())
1444     return false;
1445 
1446   // Try to optimize or fold the cmp.
1447   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1448   unsigned ResultReg = 0;
1449   switch (Predicate) {
1450   default: break;
1451   case CmpInst::FCMP_FALSE: {
1452     ResultReg = createResultReg(&X86::GR32RegClass);
1453     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32r0),
1454             ResultReg);
1455     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, X86::sub_8bit);
1456     if (!ResultReg)
1457       return false;
1458     break;
1459   }
1460   case CmpInst::FCMP_TRUE: {
1461     ResultReg = createResultReg(&X86::GR8RegClass);
1462     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
1463             ResultReg).addImm(1);
1464     break;
1465   }
1466   }
1467 
1468   if (ResultReg) {
1469     updateValueMap(I, ResultReg);
1470     return true;
1471   }
1472 
1473   const Value *LHS = CI->getOperand(0);
1474   const Value *RHS = CI->getOperand(1);
1475 
1476   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
1477   // We don't have to materialize a zero constant for this case and can just use
1478   // %x again on the RHS.
1479   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1480     const auto *RHSC = dyn_cast<ConstantFP>(RHS);
1481     if (RHSC && RHSC->isNullValue())
1482       RHS = LHS;
1483   }
1484 
1485   // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
1486   static const uint16_t SETFOpcTable[2][3] = {
1487     { X86::COND_E,  X86::COND_NP, X86::AND8rr },
1488     { X86::COND_NE, X86::COND_P,  X86::OR8rr  }
1489   };
1490   const uint16_t *SETFOpc = nullptr;
1491   switch (Predicate) {
1492   default: break;
1493   case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
1494   case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
1495   }
1496 
1497   ResultReg = createResultReg(&X86::GR8RegClass);
1498   if (SETFOpc) {
1499     if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1500       return false;
1501 
1502     Register FlagReg1 = createResultReg(&X86::GR8RegClass);
1503     Register FlagReg2 = createResultReg(&X86::GR8RegClass);
1504     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1505             FlagReg1).addImm(SETFOpc[0]);
1506     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1507             FlagReg2).addImm(SETFOpc[1]);
1508     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(SETFOpc[2]),
1509             ResultReg).addReg(FlagReg1).addReg(FlagReg2);
1510     updateValueMap(I, ResultReg);
1511     return true;
1512   }
1513 
1514   X86::CondCode CC;
1515   bool SwapArgs;
1516   std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1517   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1518 
1519   if (SwapArgs)
1520     std::swap(LHS, RHS);
1521 
1522   // Emit a compare of LHS/RHS.
1523   if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1524     return false;
1525 
1526   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1527           ResultReg).addImm(CC);
1528   updateValueMap(I, ResultReg);
1529   return true;
1530 }
1531 
1532 bool X86FastISel::X86SelectZExt(const Instruction *I) {
1533   EVT DstVT = TLI.getValueType(DL, I->getType());
1534   if (!TLI.isTypeLegal(DstVT))
1535     return false;
1536 
1537   Register ResultReg = getRegForValue(I->getOperand(0));
1538   if (ResultReg == 0)
1539     return false;
1540 
1541   // Handle zero-extension from i1 to i8, which is common.
1542   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1543   if (SrcVT == MVT::i1) {
1544     // Set the high bits to zero.
1545     ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1546     SrcVT = MVT::i8;
1547 
1548     if (ResultReg == 0)
1549       return false;
1550   }
1551 
1552   if (DstVT == MVT::i64) {
1553     // Handle extension to 64-bits via sub-register shenanigans.
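         //
         // On x86-64 a 32-bit operation implicitly zeroes bits 63:32 of its
         // destination, so zero-extending into a GR32 and wrapping the result
         // in SUBREG_TO_REG is sufficient (illustratively, "movzbl %al, %eax"
         // already clears the upper half of %rax).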
1554     unsigned MovInst;
1555 
1556     switch (SrcVT.SimpleTy) {
1557     case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
1558     case MVT::i16: MovInst = X86::MOVZX32rr16; break;
1559     case MVT::i32: MovInst = X86::MOV32rr;     break;
1560     default: llvm_unreachable("Unexpected zext to i64 source type");
1561     }
1562 
1563     Register Result32 = createResultReg(&X86::GR32RegClass);
1564     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(MovInst), Result32)
1565       .addReg(ResultReg);
1566 
1567     ResultReg = createResultReg(&X86::GR64RegClass);
1568     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::SUBREG_TO_REG),
1569             ResultReg)
1570       .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
1571   } else if (DstVT == MVT::i16) {
1572     // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
1573     // extend to 32-bits and then extract down to 16-bits.
1574     Register Result32 = createResultReg(&X86::GR32RegClass);
1575     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVZX32rr8),
1576             Result32).addReg(ResultReg);
1577 
1578     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1579   } else if (DstVT != MVT::i8) {
1580     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
1581                            ResultReg);
1582     if (ResultReg == 0)
1583       return false;
1584   }
1585 
1586   updateValueMap(I, ResultReg);
1587   return true;
1588 }
1589 
1590 bool X86FastISel::X86SelectSExt(const Instruction *I) {
1591   EVT DstVT = TLI.getValueType(DL, I->getType());
1592   if (!TLI.isTypeLegal(DstVT))
1593     return false;
1594 
1595   Register ResultReg = getRegForValue(I->getOperand(0));
1596   if (ResultReg == 0)
1597     return false;
1598 
1599   // Handle sign-extension from i1 to i8.
1600   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1601   if (SrcVT == MVT::i1) {
1602     // Set the high bits to zero.
1603     Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1604     if (ZExtReg == 0)
1605       return false;
1606 
1607     // Negate the result to make an 8-bit sign extended value.
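         // (Illustrative: NEG8r turns 0x01 into 0xFF, i.e. -1, and leaves 0x00
         // unchanged.)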
1608     ResultReg = createResultReg(&X86::GR8RegClass);
1609     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::NEG8r),
1610             ResultReg).addReg(ZExtReg);
1611 
1612     SrcVT = MVT::i8;
1613   }
1614 
1615   if (DstVT == MVT::i16) {
1616     // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
1617     // extend to 32-bits and then extract down to 16-bits.
1618     Register Result32 = createResultReg(&X86::GR32RegClass);
1619     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVSX32rr8),
1620             Result32).addReg(ResultReg);
1621 
1622     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1623   } else if (DstVT != MVT::i8) {
1624     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
1625                            ResultReg);
1626     if (ResultReg == 0)
1627       return false;
1628   }
1629 
1630   updateValueMap(I, ResultReg);
1631   return true;
1632 }
1633 
1634 bool X86FastISel::X86SelectBranch(const Instruction *I) {
1635   // Unconditional branches are selected by tablegen-generated code.
1636   // Handle a conditional branch.
1637   const BranchInst *BI = cast<BranchInst>(I);
1638   MachineBasicBlock *TrueMBB = FuncInfo.getMBB(BI->getSuccessor(0));
1639   MachineBasicBlock *FalseMBB = FuncInfo.getMBB(BI->getSuccessor(1));
1640 
1641   // Fold the common case of a conditional branch with a comparison
1642   // in the same block (values defined in other blocks may not have
1643   // initialized registers).
1644   X86::CondCode CC;
1645   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
1646     if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
1647       EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1648 
1649       // Try to optimize or fold the cmp.
1650       CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1651       switch (Predicate) {
1652       default: break;
1653       case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, MIMD.getDL()); return true;
1654       case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, MIMD.getDL()); return true;
1655       }
1656 
1657       const Value *CmpLHS = CI->getOperand(0);
1658       const Value *CmpRHS = CI->getOperand(1);
1659 
1660       // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
1661       // 0.0.
1662       // We don't have to materialize a zero constant for this case and can just
1663       // use %x again on the RHS.
1664       if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1665         const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1666         if (CmpRHSC && CmpRHSC->isNullValue())
1667           CmpRHS = CmpLHS;
1668       }
1669 
1670       // Try to take advantage of fallthrough opportunities.
1671       if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1672         std::swap(TrueMBB, FalseMBB);
1673         Predicate = CmpInst::getInversePredicate(Predicate);
1674       }
1675 
1676       // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
1677       // code check. Instead two branch instructions are required to check all
1678       // the flags. First we change the predicate to a supported condition code,
1679       // which will be the first branch. Later on we will emit the second
1680       // branch.
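           //
           // Illustratively, "br (fcmp une ...)" ends up roughly as:
           //   ucomiss %xmm1, %xmm0
           //   jne     TrueMBB        ; FCMP_ONE maps to COND_NE
           //   jp      TrueMBB        ; second branch for the unordered case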
1681       bool NeedExtraBranch = false;
1682       switch (Predicate) {
1683       default: break;
1684       case CmpInst::FCMP_OEQ:
1685         std::swap(TrueMBB, FalseMBB);
1686         [[fallthrough]];
1687       case CmpInst::FCMP_UNE:
1688         NeedExtraBranch = true;
1689         Predicate = CmpInst::FCMP_ONE;
1690         break;
1691       }
1692 
1693       bool SwapArgs;
1694       std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1695       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1696 
1697       if (SwapArgs)
1698         std::swap(CmpLHS, CmpRHS);
1699 
1700       // Emit a compare of the LHS and RHS, setting the flags.
1701       if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
1702         return false;
1703 
1704       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1705         .addMBB(TrueMBB).addImm(CC);
1706 
1707       // X86 requires a second branch to handle UNE (and OEQ, which is mapped
1708       // to UNE above).
1709       if (NeedExtraBranch) {
1710         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1711           .addMBB(TrueMBB).addImm(X86::COND_P);
1712       }
1713 
1714       finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1715       return true;
1716     }
1717   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
1718     // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
1719     // typically happen for _Bool and C++ bools.
1720     MVT SourceVT;
1721     if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
1722         isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
1723       unsigned TestOpc = 0;
1724       switch (SourceVT.SimpleTy) {
1725       default: break;
1726       case MVT::i8:  TestOpc = X86::TEST8ri; break;
1727       case MVT::i16: TestOpc = X86::TEST16ri; break;
1728       case MVT::i32: TestOpc = X86::TEST32ri; break;
1729       case MVT::i64: TestOpc = X86::TEST64ri32; break;
1730       }
1731       if (TestOpc) {
1732         Register OpReg = getRegForValue(TI->getOperand(0));
1733         if (OpReg == 0) return false;
1734 
1735         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TestOpc))
1736           .addReg(OpReg).addImm(1);
1737 
1738         unsigned JmpCond = X86::COND_NE;
1739         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1740           std::swap(TrueMBB, FalseMBB);
1741           JmpCond = X86::COND_E;
1742         }
1743 
1744         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1745           .addMBB(TrueMBB).addImm(JmpCond);
1746 
1747         finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1748         return true;
1749       }
1750     }
1751   } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
1752     // Fake-request the condition; otherwise the intrinsic might be completely
1753     // optimized away.
1754     Register TmpReg = getRegForValue(BI->getCondition());
1755     if (TmpReg == 0)
1756       return false;
1757 
1758     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1759       .addMBB(TrueMBB).addImm(CC);
1760     finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1761     return true;
1762   }
1763 
1764   // Otherwise do a clumsy setcc and re-test it.
1765   // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
1766   // in an explicit cast, so make sure to handle that correctly.
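       // Illustratively this emits:
       //   testb $1, %cond
       //   jne   TrueMBB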
1767   Register OpReg = getRegForValue(BI->getCondition());
1768   if (OpReg == 0) return false;
1769 
1770   // In case OpReg is a K register, COPY to a GPR
1771   if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
1772     unsigned KOpReg = OpReg;
1773     OpReg = createResultReg(&X86::GR32RegClass);
1774     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1775             TII.get(TargetOpcode::COPY), OpReg)
1776         .addReg(KOpReg);
1777     OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, X86::sub_8bit);
1778   }
1779   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
1780       .addReg(OpReg)
1781       .addImm(1);
1782   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1783     .addMBB(TrueMBB).addImm(X86::COND_NE);
1784   finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1785   return true;
1786 }
1787 
1788 bool X86FastISel::X86SelectShift(const Instruction *I) {
1789   unsigned CReg = 0, OpReg = 0;
1790   const TargetRegisterClass *RC = nullptr;
1791   if (I->getType()->isIntegerTy(8)) {
1792     CReg = X86::CL;
1793     RC = &X86::GR8RegClass;
1794     switch (I->getOpcode()) {
1795     case Instruction::LShr: OpReg = X86::SHR8rCL; break;
1796     case Instruction::AShr: OpReg = X86::SAR8rCL; break;
1797     case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
1798     default: return false;
1799     }
1800   } else if (I->getType()->isIntegerTy(16)) {
1801     CReg = X86::CX;
1802     RC = &X86::GR16RegClass;
1803     switch (I->getOpcode()) {
1804     default: llvm_unreachable("Unexpected shift opcode");
1805     case Instruction::LShr: OpReg = X86::SHR16rCL; break;
1806     case Instruction::AShr: OpReg = X86::SAR16rCL; break;
1807     case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
1808     }
1809   } else if (I->getType()->isIntegerTy(32)) {
1810     CReg = X86::ECX;
1811     RC = &X86::GR32RegClass;
1812     switch (I->getOpcode()) {
1813     default: llvm_unreachable("Unexpected shift opcode");
1814     case Instruction::LShr: OpReg = X86::SHR32rCL; break;
1815     case Instruction::AShr: OpReg = X86::SAR32rCL; break;
1816     case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
1817     }
1818   } else if (I->getType()->isIntegerTy(64)) {
1819     CReg = X86::RCX;
1820     RC = &X86::GR64RegClass;
1821     switch (I->getOpcode()) {
1822     default: llvm_unreachable("Unexpected shift opcode");
1823     case Instruction::LShr: OpReg = X86::SHR64rCL; break;
1824     case Instruction::AShr: OpReg = X86::SAR64rCL; break;
1825     case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
1826     }
1827   } else {
1828     return false;
1829   }
1830 
1831   MVT VT;
1832   if (!isTypeLegal(I->getType(), VT))
1833     return false;
1834 
1835   Register Op0Reg = getRegForValue(I->getOperand(0));
1836   if (Op0Reg == 0) return false;
1837 
1838   Register Op1Reg = getRegForValue(I->getOperand(1));
1839   if (Op1Reg == 0) return false;
1840   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
1841           CReg).addReg(Op1Reg);
1842 
1843   // The shift instruction uses X86::CL. If we defined a super-register
1844   // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
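       // Illustratively, for a 32-bit shift: copy the amount into ECX, KILL it
       // down to CL, then emit "shll %cl, %value".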
1845   if (CReg != X86::CL)
1846     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1847             TII.get(TargetOpcode::KILL), X86::CL)
1848       .addReg(CReg, RegState::Kill);
1849 
1850   Register ResultReg = createResultReg(RC);
1851   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(OpReg), ResultReg)
1852     .addReg(Op0Reg);
1853   updateValueMap(I, ResultReg);
1854   return true;
1855 }
1856 
1857 bool X86FastISel::X86SelectDivRem(const Instruction *I) {
1858   const static unsigned NumTypes = 4; // i8, i16, i32, i64
1859   const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
1860   const static bool S = true;  // IsSigned
1861   const static bool U = false; // !IsSigned
1862   const static unsigned Copy = TargetOpcode::COPY;
1863   // For the X86 DIV/IDIV instruction, in most cases the dividend
1864   // (numerator) must be in a specific register pair highreg:lowreg,
1865   // producing the quotient in lowreg and the remainder in highreg.
1866   // For most data types, to set up the instruction, the dividend is
1867   // copied into lowreg, and lowreg is sign-extended or zero-extended
1868   // into highreg.  The exception is i8, where the dividend is defined
1869   // as a single register rather than a register pair, and we
1870   // therefore directly sign-extend or zero-extend the dividend into
1871   // lowreg, instead of copying, and ignore the highreg.
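       //
       // For example, a 32-bit sdiv is emitted roughly as (illustrative):
       //   movl  %op0, %eax
       //   cltd                      ; CDQ: sign-extend EAX into EDX
       //   idivl %op1                ; quotient -> EAX, remainder -> EDX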
1872   const static struct DivRemEntry {
1873     // The following portion depends only on the data type.
1874     const TargetRegisterClass *RC;
1875     unsigned LowInReg;  // low part of the register pair
1876     unsigned HighInReg; // high part of the register pair
1877     // The following portion depends on both the data type and the operation.
1878     struct DivRemResult {
1879       unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
1880       unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
1881                                 // highreg, or copying a zero into highreg.
1882       unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
1883                                 // zero/sign-extending into lowreg for i8.
1884       unsigned DivRemResultReg; // Register containing the desired result.
1885       bool IsOpSigned;          // Whether to use signed or unsigned form.
1886     } ResultTable[NumOps];
1887   } OpTable[NumTypes] = {
1888     { &X86::GR8RegClass,  X86::AX,  0, {
1889         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
1890         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
1891         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
1892         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
1893       }
1894     }, // i8
1895     { &X86::GR16RegClass, X86::AX,  X86::DX, {
1896         { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
1897         { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
1898         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
1899         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
1900       }
1901     }, // i16
1902     { &X86::GR32RegClass, X86::EAX, X86::EDX, {
1903         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
1904         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
1905         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
1906         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
1907       }
1908     }, // i32
1909     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
1910         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
1911         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
1912         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
1913         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
1914       }
1915     }, // i64
1916   };
1917 
1918   MVT VT;
1919   if (!isTypeLegal(I->getType(), VT))
1920     return false;
1921 
1922   unsigned TypeIndex, OpIndex;
1923   switch (VT.SimpleTy) {
1924   default: return false;
1925   case MVT::i8:  TypeIndex = 0; break;
1926   case MVT::i16: TypeIndex = 1; break;
1927   case MVT::i32: TypeIndex = 2; break;
1928   case MVT::i64: TypeIndex = 3;
1929     if (!Subtarget->is64Bit())
1930       return false;
1931     break;
1932   }
1933 
1934   switch (I->getOpcode()) {
1935   default: llvm_unreachable("Unexpected div/rem opcode");
1936   case Instruction::SDiv: OpIndex = 0; break;
1937   case Instruction::SRem: OpIndex = 1; break;
1938   case Instruction::UDiv: OpIndex = 2; break;
1939   case Instruction::URem: OpIndex = 3; break;
1940   }
1941 
1942   const DivRemEntry &TypeEntry = OpTable[TypeIndex];
1943   const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
1944   Register Op0Reg = getRegForValue(I->getOperand(0));
1945   if (Op0Reg == 0)
1946     return false;
1947   Register Op1Reg = getRegForValue(I->getOperand(1));
1948   if (Op1Reg == 0)
1949     return false;
1950 
1951   // Move op0 into low-order input register.
1952   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1953           TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
1954   // Zero-extend or sign-extend into high-order input register.
1955   if (OpEntry.OpSignExtend) {
1956     if (OpEntry.IsOpSigned)
1957       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1958               TII.get(OpEntry.OpSignExtend));
1959     else {
1960       Register Zero32 = createResultReg(&X86::GR32RegClass);
1961       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1962               TII.get(X86::MOV32r0), Zero32);
1963 
1964       // Copy the zero into the appropriate sub/super/identical physical
1965       // register. Unfortunately the operations needed are not uniform enough
1966       // to fit neatly into the table above.
1967       if (VT == MVT::i16) {
1968         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1969                 TII.get(Copy), TypeEntry.HighInReg)
1970           .addReg(Zero32, 0, X86::sub_16bit);
1971       } else if (VT == MVT::i32) {
1972         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1973                 TII.get(Copy), TypeEntry.HighInReg)
1974             .addReg(Zero32);
1975       } else if (VT == MVT::i64) {
1976         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1977                 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
1978             .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
1979       }
1980     }
1981   }
1982   // Generate the DIV/IDIV instruction.
1983   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1984           TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
1985   // For i8 remainder, we can't reference AH directly, as we'll end
1986   // up with bogus copies like %r9b = COPY %ah. Reference AX
1987   // instead to prevent AH references in a REX-prefixed instruction.
1988   //
1989   // The current assumption of the fast register allocator is that isel
1990   // won't generate explicit references to the GR8_NOREX registers. If
1991   // the allocator and/or the backend get enhanced to be more robust in
1992   // that regard, this can be, and should be, removed.
1993   unsigned ResultReg = 0;
1994   if ((I->getOpcode() == Instruction::SRem ||
1995        I->getOpcode() == Instruction::URem) &&
1996       OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
1997     Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
1998     Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
1999     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2000             TII.get(Copy), SourceSuperReg).addReg(X86::AX);
2001 
2002     // Shift AX right by 8 bits instead of using AH.
2003     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SHR16ri),
2004             ResultSuperReg).addReg(SourceSuperReg).addImm(8);
2005 
2006     // Now reference the 8-bit subreg of the result.
2007     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
2008                                            X86::sub_8bit);
2009   }
2010   // Copy the result out of the physreg if we haven't already.
2011   if (!ResultReg) {
2012     ResultReg = createResultReg(TypeEntry.RC);
2013     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Copy), ResultReg)
2014         .addReg(OpEntry.DivRemResultReg);
2015   }
2016   updateValueMap(I, ResultReg);
2017 
2018   return true;
2019 }
2020 
2021 /// Emit a conditional move instruction (if they are supported) to lower
2022 /// the select.
2023 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
2024   // Check if the subtarget supports these instructions.
2025   if (!Subtarget->canUseCMOV())
2026     return false;
2027 
2028   // FIXME: Add support for i8.
2029   if (RetVT < MVT::i16 || RetVT > MVT::i64)
2030     return false;
2031 
2032   const Value *Cond = I->getOperand(0);
2033   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2034   bool NeedTest = true;
2035   X86::CondCode CC = X86::COND_NE;
2036 
2037   // Optimize conditions coming from a compare if both instructions are in the
2038   // same basic block (values defined in other basic blocks may not have
2039   // initialized registers).
2040   const auto *CI = dyn_cast<CmpInst>(Cond);
2041   if (CI && (CI->getParent() == I->getParent())) {
2042     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2043 
2044     // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
2045     static const uint16_t SETFOpcTable[2][3] = {
2046       { X86::COND_NP, X86::COND_E,  X86::TEST8rr },
2047       { X86::COND_P,  X86::COND_NE, X86::OR8rr   }
2048     };
2049     const uint16_t *SETFOpc = nullptr;
2050     switch (Predicate) {
2051     default: break;
2052     case CmpInst::FCMP_OEQ:
2053       SETFOpc = &SETFOpcTable[0][0];
2054       Predicate = CmpInst::ICMP_NE;
2055       break;
2056     case CmpInst::FCMP_UNE:
2057       SETFOpc = &SETFOpcTable[1][0];
2058       Predicate = CmpInst::ICMP_NE;
2059       break;
2060     }
2061 
2062     bool NeedSwap;
2063     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
2064     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
2065 
2066     const Value *CmpLHS = CI->getOperand(0);
2067     const Value *CmpRHS = CI->getOperand(1);
2068     if (NeedSwap)
2069       std::swap(CmpLHS, CmpRHS);
2070 
2071     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2072     // Emit a compare of the LHS and RHS, setting the flags.
2073     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2074       return false;
2075 
2076     if (SETFOpc) {
2077       Register FlagReg1 = createResultReg(&X86::GR8RegClass);
2078       Register FlagReg2 = createResultReg(&X86::GR8RegClass);
2079       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2080               FlagReg1).addImm(SETFOpc[0]);
2081       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2082               FlagReg2).addImm(SETFOpc[1]);
2083       auto const &II = TII.get(SETFOpc[2]);
2084       if (II.getNumDefs()) {
2085         Register TmpReg = createResultReg(&X86::GR8RegClass);
2086         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, TmpReg)
2087           .addReg(FlagReg2).addReg(FlagReg1);
2088       } else {
2089         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
2090           .addReg(FlagReg2).addReg(FlagReg1);
2091       }
2092     }
2093     NeedTest = false;
2094   } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
2095     // Fake-request the condition; otherwise the intrinsic might be completely
2096     // optimized away.
2097     Register TmpReg = getRegForValue(Cond);
2098     if (TmpReg == 0)
2099       return false;
2100 
2101     NeedTest = false;
2102   }
2103 
2104   if (NeedTest) {
2105     // Selects operate on i1; however, CondReg is 8 bits wide and may contain
2106     // garbage. Only the least significant bit is guaranteed to be accurate,
2107     // so reading more than the LSB may yield a non-zero value even though
2108     // the LSB itself is zero. Therefore we have to truncate CondReg to i1
2109     // for the select. This is achieved by performing a TEST against 1.
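         // (Illustratively: "testb $1, %cond"; the CMOV emitted below then uses
         // COND_NE, i.e. cmovne.)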
2110     Register CondReg = getRegForValue(Cond);
2111     if (CondReg == 0)
2112       return false;
2113 
2114     // In case OpReg is a K register, COPY to a GPR
2115     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2116       unsigned KCondReg = CondReg;
2117       CondReg = createResultReg(&X86::GR32RegClass);
2118       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2119               TII.get(TargetOpcode::COPY), CondReg)
2120           .addReg(KCondReg);
2121       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2122     }
2123     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2124         .addReg(CondReg)
2125         .addImm(1);
2126   }
2127 
2128   const Value *LHS = I->getOperand(1);
2129   const Value *RHS = I->getOperand(2);
2130 
2131   Register RHSReg = getRegForValue(RHS);
2132   Register LHSReg = getRegForValue(LHS);
2133   if (!LHSReg || !RHSReg)
2134     return false;
2135 
2136   const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
2137   unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
2138                                     Subtarget->hasNDD());
2139   Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2140   updateValueMap(I, ResultReg);
2141   return true;
2142 }
2143 
2144 /// Emit SSE or AVX instructions to lower the select.
2145 ///
2146 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
2147 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
2148 /// SSE instructions are available. If AVX is available, try to use a VBLENDV.
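     ///
     /// Illustratively, on the plain SSE path the compare produces an
     /// all-ones/all-zeros mask, ANDPS keeps the true value, ANDNPS keeps the
     /// false value, and ORPS merges the two.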
2149 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
2150   // Optimize conditions coming from a compare if both instructions are in the
2151   // same basic block (values defined in other basic blocks may not have
2152   // initialized registers).
2153   const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
2154   if (!CI || (CI->getParent() != I->getParent()))
2155     return false;
2156 
2157   if (I->getType() != CI->getOperand(0)->getType() ||
2158       !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
2159         (Subtarget->hasSSE2() && RetVT == MVT::f64)))
2160     return false;
2161 
2162   const Value *CmpLHS = CI->getOperand(0);
2163   const Value *CmpRHS = CI->getOperand(1);
2164   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2165 
2166   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
2167   // We don't have to materialize a zero constant for this case and can just use
2168   // %x again on the RHS.
2169   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
2170     const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
2171     if (CmpRHSC && CmpRHSC->isNullValue())
2172       CmpRHS = CmpLHS;
2173   }
2174 
2175   unsigned CC;
2176   bool NeedSwap;
2177   std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
2178   if (CC > 7 && !Subtarget->hasAVX())
2179     return false;
2180 
2181   if (NeedSwap)
2182     std::swap(CmpLHS, CmpRHS);
2183 
2184   const Value *LHS = I->getOperand(1);
2185   const Value *RHS = I->getOperand(2);
2186 
2187   Register LHSReg = getRegForValue(LHS);
2188   Register RHSReg = getRegForValue(RHS);
2189   Register CmpLHSReg = getRegForValue(CmpLHS);
2190   Register CmpRHSReg = getRegForValue(CmpRHS);
2191   if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
2192     return false;
2193 
2194   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2195   unsigned ResultReg;
2196 
2197   if (Subtarget->hasAVX512()) {
2198     // If we have AVX512 we can use a mask compare and masked movss/sd.
2199     const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
2200     const TargetRegisterClass *VK1 = &X86::VK1RegClass;
2201 
2202     unsigned CmpOpcode =
2203       (RetVT == MVT::f32) ? X86::VCMPSSZrri : X86::VCMPSDZrri;
2204     Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpRHSReg,
2205                                        CC);
2206 
2207     // Need an IMPLICIT_DEF for the input that is used to generate the upper
2208     // bits of the result register since it's not based on any of the inputs.
2209     Register ImplicitDefReg = createResultReg(VR128X);
2210     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2211             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2212 
2213     // Place RHSReg in the passthru of the masked movss/sd operation and put
2214     // LHSReg in the input. The mask input comes from the compare.
2215     unsigned MovOpcode =
2216       (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
2217     unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, CmpReg,
2218                                         ImplicitDefReg, LHSReg);
2219 
2220     ResultReg = createResultReg(RC);
2221     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2222             TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
2223 
2224   } else if (Subtarget->hasAVX()) {
2225     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2226 
2227     // If we have AVX, create 1 blendv instead of 3 logic instructions.
2228     // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
2229     // uses XMM0 as the selection register. That may need just as many
2230     // instructions as the AND/ANDN/OR sequence due to register moves, so
2231     // don't bother.
2232     unsigned CmpOpcode =
2233       (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
2234     unsigned BlendOpcode =
2235       (RetVT == MVT::f32) ? X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;
2236 
2237     Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
2238                                        CC);
2239     Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, LHSReg,
2240                                           CmpReg);
2241     ResultReg = createResultReg(RC);
2242     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2243             TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
2244   } else {
2245     // Choose the SSE instruction sequence based on data type (float or double).
2246     static const uint16_t OpcTable[2][4] = {
2247       { X86::CMPSSrri,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
2248       { X86::CMPSDrri,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
2249     };
2250 
2251     const uint16_t *Opc = nullptr;
2252     switch (RetVT.SimpleTy) {
2253     default: return false;
2254     case MVT::f32: Opc = &OpcTable[0][0]; break;
2255     case MVT::f64: Opc = &OpcTable[1][0]; break;
2256     }
2257 
2258     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2259     Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpRHSReg, CC);
2260     Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, LHSReg);
2261     Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, RHSReg);
2262     Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, AndReg);
2263     ResultReg = createResultReg(RC);
2264     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2265             TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
2266   }
2267   updateValueMap(I, ResultReg);
2268   return true;
2269 }
2270 
2271 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
2272   // These are pseudo CMOV instructions that will later be expanded into
2273   // control flow.
2274   unsigned Opc;
2275   switch (RetVT.SimpleTy) {
2276   default: return false;
2277   case MVT::i8:  Opc = X86::CMOV_GR8;   break;
2278   case MVT::i16: Opc = X86::CMOV_GR16;  break;
2279   case MVT::i32: Opc = X86::CMOV_GR32;  break;
2280   case MVT::f16:
2281     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
2282   case MVT::f32:
2283     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
2284   case MVT::f64:
2285     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
2286   }
2287 
2288   const Value *Cond = I->getOperand(0);
2289   X86::CondCode CC = X86::COND_NE;
2290 
2291   // Optimize conditions coming from a compare if both instructions are in the
2292   // same basic block (values defined in other basic blocks may not have
2293   // initialized registers).
2294   const auto *CI = dyn_cast<CmpInst>(Cond);
2295   if (CI && (CI->getParent() == I->getParent())) {
2296     bool NeedSwap;
2297     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
2298     if (CC > X86::LAST_VALID_COND)
2299       return false;
2300 
2301     const Value *CmpLHS = CI->getOperand(0);
2302     const Value *CmpRHS = CI->getOperand(1);
2303 
2304     if (NeedSwap)
2305       std::swap(CmpLHS, CmpRHS);
2306 
2307     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2308     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2309       return false;
2310   } else {
2311     Register CondReg = getRegForValue(Cond);
2312     if (CondReg == 0)
2313       return false;
2314 
2315     // In case OpReg is a K register, COPY to a GPR
2316     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2317       unsigned KCondReg = CondReg;
2318       CondReg = createResultReg(&X86::GR32RegClass);
2319       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2320               TII.get(TargetOpcode::COPY), CondReg)
2321           .addReg(KCondReg);
2322       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2323     }
2324     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2325         .addReg(CondReg)
2326         .addImm(1);
2327   }
2328 
2329   const Value *LHS = I->getOperand(1);
2330   const Value *RHS = I->getOperand(2);
2331 
2332   Register LHSReg = getRegForValue(LHS);
2333   Register RHSReg = getRegForValue(RHS);
2334   if (!LHSReg || !RHSReg)
2335     return false;
2336 
2337   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2338 
2339   Register ResultReg =
2340     fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2341   updateValueMap(I, ResultReg);
2342   return true;
2343 }
2344 
2345 bool X86FastISel::X86SelectSelect(const Instruction *I) {
2346   MVT RetVT;
2347   if (!isTypeLegal(I->getType(), RetVT))
2348     return false;
2349 
2350   // Check if we can fold the select.
2351   if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
2352     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2353     const Value *Opnd = nullptr;
2354     switch (Predicate) {
2355     default:                              break;
2356     case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
2357     case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
2358     }
2359     // No need for a select anymore - this is an unconditional move.
2360     if (Opnd) {
2361       Register OpReg = getRegForValue(Opnd);
2362       if (OpReg == 0)
2363         return false;
2364       const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2365       Register ResultReg = createResultReg(RC);
2366       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2367               TII.get(TargetOpcode::COPY), ResultReg)
2368         .addReg(OpReg);
2369       updateValueMap(I, ResultReg);
2370       return true;
2371     }
2372   }
2373 
2374   // First try to use real conditional move instructions.
2375   if (X86FastEmitCMoveSelect(RetVT, I))
2376     return true;
2377 
2378   // Try to use a sequence of SSE instructions to simulate a conditional move.
2379   if (X86FastEmitSSESelect(RetVT, I))
2380     return true;
2381 
2382   // Fall-back to pseudo conditional move instructions, which will be later
2383   // converted to control-flow.
2384   if (X86FastEmitPseudoSelect(RetVT, I))
2385     return true;
2386 
2387   return false;
2388 }
2389 
2390 // Common code for X86SelectSIToFP and X86SelectUIToFP.
2391 bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
2392   // The target-independent selection algorithm in FastISel already knows how
2393   // to select a SINT_TO_FP if the target is SSE but not AVX.
2394   // Early exit if the subtarget doesn't have AVX.
2395   // Unsigned conversion requires AVX512.
2396   bool HasAVX512 = Subtarget->hasAVX512();
2397   if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
2398     return false;
2399 
2400   // TODO: We could sign extend narrower types.
2401   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2402   if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
2403     return false;
2404 
2405   // Select integer to float/double conversion.
2406   Register OpReg = getRegForValue(I->getOperand(0));
2407   if (OpReg == 0)
2408     return false;
2409 
2410   unsigned Opcode;
2411 
2412   static const uint16_t SCvtOpc[2][2][2] = {
2413     { { X86::VCVTSI2SSrr,  X86::VCVTSI642SSrr },
2414       { X86::VCVTSI2SDrr,  X86::VCVTSI642SDrr } },
2415     { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
2416       { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
2417   };
2418   static const uint16_t UCvtOpc[2][2] = {
2419     { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
2420     { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
2421   };
2422   bool Is64Bit = SrcVT == MVT::i64;
2423 
2424   if (I->getType()->isDoubleTy()) {
2425     // s/uitofp int -> double
2426     Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
2427   } else if (I->getType()->isFloatTy()) {
2428     // s/uitofp int -> float
2429     Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
2430   } else
2431     return false;
2432 
2433   MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
2434   const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
2435   Register ImplicitDefReg = createResultReg(RC);
2436   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2437           TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2438   Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, OpReg);
2439   updateValueMap(I, ResultReg);
2440   return true;
2441 }
2442 
2443 bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
2444   return X86SelectIntToFP(I, /*IsSigned*/true);
2445 }
2446 
2447 bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
2448   return X86SelectIntToFP(I, /*IsSigned*/false);
2449 }
2450 
2451 // Helper method used by X86SelectFPExt and X86SelectFPTrunc.
2452 bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
2453                                           unsigned TargetOpc,
2454                                           const TargetRegisterClass *RC) {
2455   assert((I->getOpcode() == Instruction::FPExt ||
2456           I->getOpcode() == Instruction::FPTrunc) &&
2457          "Instruction must be an FPExt or FPTrunc!");
2458   bool HasAVX = Subtarget->hasAVX();
2459 
2460   Register OpReg = getRegForValue(I->getOperand(0));
2461   if (OpReg == 0)
2462     return false;
2463 
2464   unsigned ImplicitDefReg;
2465   if (HasAVX) {
2466     ImplicitDefReg = createResultReg(RC);
2467     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2468             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2469 
2470   }
2471 
2472   Register ResultReg = createResultReg(RC);
2473   MachineInstrBuilder MIB;
2474   MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpc),
2475                 ResultReg);
2476 
2477   if (HasAVX)
2478     MIB.addReg(ImplicitDefReg);
2479 
2480   MIB.addReg(OpReg);
2481   updateValueMap(I, ResultReg);
2482   return true;
2483 }
2484 
2485 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
2486   if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
2487       I->getOperand(0)->getType()->isFloatTy()) {
2488     bool HasAVX512 = Subtarget->hasAVX512();
2489     // fpext from float to double.
2490     unsigned Opc =
2491         HasAVX512 ? X86::VCVTSS2SDZrr
2492                   : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
2493     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
2494   }
2495 
2496   return false;
2497 }
2498 
2499 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
2500   if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
2501       I->getOperand(0)->getType()->isDoubleTy()) {
2502     bool HasAVX512 = Subtarget->hasAVX512();
2503     // fptrunc from double to float.
2504     unsigned Opc =
2505         HasAVX512 ? X86::VCVTSD2SSZrr
2506                   : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
2507     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
2508   }
2509 
2510   return false;
2511 }
2512 
2513 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
2514   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2515   EVT DstVT = TLI.getValueType(DL, I->getType());
2516 
2517   // This code only handles truncation to byte.
2518   if (DstVT != MVT::i8 && DstVT != MVT::i1)
2519     return false;
2520   if (!TLI.isTypeLegal(SrcVT))
2521     return false;
2522 
2523   Register InputReg = getRegForValue(I->getOperand(0));
2524   if (!InputReg)
2525     // Unhandled operand.  Halt "fast" selection and bail.
2526     return false;
2527 
2528   if (SrcVT == MVT::i8) {
2529     // Truncate from i8 to i1; no code needed.
2530     updateValueMap(I, InputReg);
2531     return true;
2532   }
2533 
2534   // Issue an extract_subreg.
2535   Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg,
2536                                                   X86::sub_8bit);
2537   if (!ResultReg)
2538     return false;
2539 
2540   updateValueMap(I, ResultReg);
2541   return true;
2542 }
2543 
2544 bool X86FastISel::IsMemcpySmall(uint64_t Len) {
2545   return Len <= (Subtarget->is64Bit() ? 32 : 16);
2546 }
2547 
2548 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
2549                                      X86AddressMode SrcAM, uint64_t Len) {
2550 
2551   // Make sure we don't bloat code by inlining very large memcpy's.
2552   if (!IsMemcpySmall(Len))
2553     return false;
2554 
2555   bool i64Legal = Subtarget->is64Bit();
2556 
2557   // We don't care about alignment here since we just emit integer accesses.
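       // For example, a 13-byte copy on x86-64 is lowered to an i64, an i32 and
       // an i8 load/store pair (illustrative).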
2558   while (Len) {
2559     MVT VT;
2560     if (Len >= 8 && i64Legal)
2561       VT = MVT::i64;
2562     else if (Len >= 4)
2563       VT = MVT::i32;
2564     else if (Len >= 2)
2565       VT = MVT::i16;
2566     else
2567       VT = MVT::i8;
2568 
2569     unsigned Reg;
2570     bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
2571     RV &= X86FastEmitStore(VT, Reg, DestAM);
2572     assert(RV && "Failed to emit load or store??");
2573     (void)RV;
2574 
2575     unsigned Size = VT.getSizeInBits()/8;
2576     Len -= Size;
2577     DestAM.Disp += Size;
2578     SrcAM.Disp += Size;
2579   }
2580 
2581   return true;
2582 }
2583 
2584 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
2585   // FIXME: Handle more intrinsics.
2586   switch (II->getIntrinsicID()) {
2587   default: return false;
2588   case Intrinsic::convert_from_fp16:
2589   case Intrinsic::convert_to_fp16: {
2590     if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
2591       return false;
2592 
2593     const Value *Op = II->getArgOperand(0);
2594     Register InputReg = getRegForValue(Op);
2595     if (InputReg == 0)
2596       return false;
2597 
2598     // F16C only allows converting from float to half and from half to float.
2599     bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
2600     if (IsFloatToHalf) {
2601       if (!Op->getType()->isFloatTy())
2602         return false;
2603     } else {
2604       if (!II->getType()->isFloatTy())
2605         return false;
2606     }
2607 
2608     unsigned ResultReg = 0;
2609     const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
2610     if (IsFloatToHalf) {
2611       // 'InputReg' is implicitly promoted from register class FR32 to
2612       // register class VR128 by method 'constrainOperandRegClass' which is
2613       // directly called by 'fastEmitInst_ri'.
2614       // Instruction VCVTPS2PHrr takes an extra immediate operand which is
2615       // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
2616       // It's consistent with the other FP instructions, which are usually
2617       // controlled by MXCSR.
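           //
           // Illustratively, the float->half path is roughly:
           //   vcvtps2ph $4, %xmm0, %xmm0   ; round according to MXCSR.RC
           //   vmovd     %xmm0, %eax        ; move the low 32 bits to a GPR
           // followed by extracting the low 16 bits as the result.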
2618       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
2619                                          : X86::VCVTPS2PHrr;
2620       InputReg = fastEmitInst_ri(Opc, RC, InputReg, 4);
2621 
2622       // Move the lower 32 bits of InputReg into a new register of class GR32.
2623       Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
2624                                    : X86::VMOVPDI2DIrr;
2625       ResultReg = createResultReg(&X86::GR32RegClass);
2626       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
2627           .addReg(InputReg, RegState::Kill);
2628 
2629       // The result value is in the lower 16-bits of ResultReg.
2630       unsigned RegIdx = X86::sub_16bit;
2631       ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, RegIdx);
2632     } else {
2633       assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
2634       // Explicitly zero-extend the input to 32-bit.
2635       InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg);
2636 
2637       // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
2638       InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
2639                             InputReg);
2640 
2641       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
2642                                          : X86::VCVTPH2PSrr;
2643       InputReg = fastEmitInst_r(Opc, RC, InputReg);
2644 
2645       // The result value is in the lower 32 bits of InputReg.
2646       // Emit an explicit copy from register class VR128 to register class FR32.
2647       ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
2648       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2649               TII.get(TargetOpcode::COPY), ResultReg)
2650           .addReg(InputReg, RegState::Kill);
2651     }
2652 
2653     updateValueMap(II, ResultReg);
2654     return true;
2655   }
2656   case Intrinsic::frameaddress: {
2657     MachineFunction *MF = FuncInfo.MF;
2658     if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
2659       return false;
2660 
2661     Type *RetTy = II->getCalledFunction()->getReturnType();
2662 
2663     MVT VT;
2664     if (!isTypeLegal(RetTy, VT))
2665       return false;
2666 
2667     unsigned Opc;
2668     const TargetRegisterClass *RC = nullptr;
2669 
2670     switch (VT.SimpleTy) {
2671     default: llvm_unreachable("Invalid result type for frameaddress.");
2672     case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
2673     case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
2674     }
2675 
2676     // This needs to be set before we call getPtrSizedFrameRegister, otherwise
2677     // we get the wrong frame register.
2678     MachineFrameInfo &MFI = MF->getFrameInfo();
2679     MFI.setFrameAddressIsTaken(true);
2680 
2681     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2682     unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
2683     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
2684             (FrameReg == X86::EBP && VT == MVT::i32)) &&
2685            "Invalid Frame Register!");
2686 
2687     // Always make a copy of the frame register to a vreg first, so that we
2688     // never directly reference the frame register (the TwoAddressInstruction-
2689     // Pass doesn't like that).
2690     Register SrcReg = createResultReg(RC);
2691     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2692             TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
2693 
2694     // Now recursively load from the frame address.
2695     // movq (%rbp), %rax
2696     // movq (%rax), %rax
2697     // movq (%rax), %rax
2698     // ...
2699     unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
2700     while (Depth--) {
2701       Register DestReg = createResultReg(RC);
2702       addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2703                            TII.get(Opc), DestReg), SrcReg);
2704       SrcReg = DestReg;
2705     }
2706 
2707     updateValueMap(II, SrcReg);
2708     return true;
2709   }
2710   case Intrinsic::memcpy: {
2711     const MemCpyInst *MCI = cast<MemCpyInst>(II);
2712     // Don't handle volatile or variable length memcpys.
2713     if (MCI->isVolatile())
2714       return false;
2715 
2716     if (isa<ConstantInt>(MCI->getLength())) {
2717       // Small memcpy's are common enough that we want to do them
2718       // without a call if possible.
2719       uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
2720       if (IsMemcpySmall(Len)) {
2721         X86AddressMode DestAM, SrcAM;
2722         if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
2723             !X86SelectAddress(MCI->getRawSource(), SrcAM))
2724           return false;
2725         TryEmitSmallMemcpy(DestAM, SrcAM, Len);
2726         return true;
2727       }
2728     }
2729 
2730     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2731     if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
2732       return false;
2733 
2734     if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
2735       return false;
2736 
2737     return lowerCallTo(II, "memcpy", II->arg_size() - 1);
2738   }
2739   case Intrinsic::memset: {
2740     const MemSetInst *MSI = cast<MemSetInst>(II);
2741 
2742     if (MSI->isVolatile())
2743       return false;
2744 
2745     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2746     if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
2747       return false;
2748 
2749     if (MSI->getDestAddressSpace() > 255)
2750       return false;
2751 
2752     return lowerCallTo(II, "memset", II->arg_size() - 1);
2753   }
2754   case Intrinsic::stackprotector: {
2755     // Emit code to store the stack guard onto the stack.
2756     EVT PtrTy = TLI.getPointerTy(DL);
2757 
2758     const Value *Op1 = II->getArgOperand(0); // The guard's value.
2759     const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
2760 
2761     MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
2762 
2763     // Grab the frame index.
2764     X86AddressMode AM;
2765     if (!X86SelectAddress(Slot, AM)) return false;
2766     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
2767     return true;
2768   }
2769   case Intrinsic::dbg_declare: {
2770     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
2771     X86AddressMode AM;
2772     assert(DI->getAddress() && "Null address should be checked earlier!");
2773     if (!X86SelectAddress(DI->getAddress(), AM))
2774       return false;
2775     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
2776     assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&
2777            "Expected inlined-at fields to agree");
2778     addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II), AM)
2779         .addImm(0)
2780         .addMetadata(DI->getVariable())
2781         .addMetadata(DI->getExpression());
2782     return true;
2783   }
2784   case Intrinsic::trap: {
2785     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TRAP));
2786     return true;
2787   }
2788   case Intrinsic::sqrt: {
2789     if (!Subtarget->hasSSE1())
2790       return false;
2791 
2792     Type *RetTy = II->getCalledFunction()->getReturnType();
2793 
2794     MVT VT;
2795     if (!isTypeLegal(RetTy, VT))
2796       return false;
2797 
2798     // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
2799     // is not generated by FastISel yet.
2800     // FIXME: Update this code once tablegen can handle it.
2801     static const uint16_t SqrtOpc[3][2] = {
2802       { X86::SQRTSSr,   X86::SQRTSDr },
2803       { X86::VSQRTSSr,  X86::VSQRTSDr },
2804       { X86::VSQRTSSZr, X86::VSQRTSDZr },
2805     };
2806     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
2807                         Subtarget->hasAVX()    ? 1 :
2808                                                  0;
2809     unsigned Opc;
2810     switch (VT.SimpleTy) {
2811     default: return false;
2812     case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
2813     case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
2814     }
2815 
2816     const Value *SrcVal = II->getArgOperand(0);
2817     Register SrcReg = getRegForValue(SrcVal);
2818 
2819     if (SrcReg == 0)
2820       return false;
2821 
2822     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
2823     unsigned ImplicitDefReg = 0;
2824     if (AVXLevel > 0) {
2825       ImplicitDefReg = createResultReg(RC);
2826       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2827               TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2828     }
2829 
2830     Register ResultReg = createResultReg(RC);
2831     MachineInstrBuilder MIB;
2832     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
2833                   ResultReg);
2834 
2835     if (ImplicitDefReg)
2836       MIB.addReg(ImplicitDefReg);
2837 
2838     MIB.addReg(SrcReg);
2839 
2840     updateValueMap(II, ResultReg);
2841     return true;
2842   }
2843   case Intrinsic::sadd_with_overflow:
2844   case Intrinsic::uadd_with_overflow:
2845   case Intrinsic::ssub_with_overflow:
2846   case Intrinsic::usub_with_overflow:
2847   case Intrinsic::smul_with_overflow:
2848   case Intrinsic::umul_with_overflow: {
2849     // This implements the basic lowering of the arithmetic-with-overflow ("xalu")
2850     // intrinsics into an add/sub/mul followed by either a seto or setb.
2851     const Function *Callee = II->getCalledFunction();
2852     auto *Ty = cast<StructType>(Callee->getReturnType());
2853     Type *RetTy = Ty->getTypeAtIndex(0U);
2854     assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
2855            Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
2856            "Overflow value expected to be an i1");
2857 
2858     MVT VT;
2859     if (!isTypeLegal(RetTy, VT))
2860       return false;
2861 
2862     if (VT < MVT::i8 || VT > MVT::i64)
2863       return false;
2864 
2865     const Value *LHS = II->getArgOperand(0);
2866     const Value *RHS = II->getArgOperand(1);
2867 
2868     // Canonicalize immediate to the RHS.
2869     if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
2870       std::swap(LHS, RHS);
2871 
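         // Map each intrinsic to the node that performs the arithmetic and the
         // condition that detects overflow: signed ops and both multiplies test
         // the overflow flag (COND_O -> seto), unsigned add/sub test the carry
         // flag (COND_B -> setb). E.g. llvm.sadd.with.overflow.i32 lowers to an
         // ADD32rr followed by SETO, and llvm.uadd.with.overflow.i32 to ADD32rr
         // followed by SETB.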
2872     unsigned BaseOpc, CondCode;
2873     switch (II->getIntrinsicID()) {
2874     default: llvm_unreachable("Unexpected intrinsic!");
2875     case Intrinsic::sadd_with_overflow:
2876       BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
2877     case Intrinsic::uadd_with_overflow:
2878       BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
2879     case Intrinsic::ssub_with_overflow:
2880       BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
2881     case Intrinsic::usub_with_overflow:
2882       BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
2883     case Intrinsic::smul_with_overflow:
2884       BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
2885     case Intrinsic::umul_with_overflow:
2886       BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
2887     }
2888 
2889     Register LHSReg = getRegForValue(LHS);
2890     if (LHSReg == 0)
2891       return false;
2892 
2893     unsigned ResultReg = 0;
2894     // Check if we have an immediate version.
2895     if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
2896       static const uint16_t Opc[2][4] = {
2897         { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
2898         { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
2899       };
2900 
2901       if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
2902           CondCode == X86::COND_O) {
2903         // We can use INC/DEC.
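             // (INC/DEC update OF but leave CF untouched, so they are only
             // usable when the overflow check reads COND_O, i.e. for the
             // signed intrinsics.)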
2904         ResultReg = createResultReg(TLI.getRegClassFor(VT));
2905         bool IsDec = BaseOpc == ISD::SUB;
2906         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2907                 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
2908           .addReg(LHSReg);
2909       } else
2910         ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, CI->getZExtValue());
2911     }
2912 
2913     unsigned RHSReg;
2914     if (!ResultReg) {
2915       RHSReg = getRegForValue(RHS);
2916       if (RHSReg == 0)
2917         return false;
2918       ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, RHSReg);
2919     }
2920 
2921     // FastISel doesn't have a pattern for all of the X86::MUL*r and X86::IMUL*r
2922     // instructions, so emit them manually.
2923     if (BaseOpc == X86ISD::UMUL && !ResultReg) {
2924       static const uint16_t MULOpc[] =
2925         { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
2926       static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
2927       // First copy the first operand into AL/AX/EAX/RAX (depending on the width),
2928       // which is the implicit input to the X86::MUL*r instruction.
2929       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2930               TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
2931         .addReg(LHSReg);
2932       ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
2933                                  TLI.getRegClassFor(VT), RHSReg);
2934     } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
2935       static const uint16_t MULOpc[] =
2936         { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
2937       if (VT == MVT::i8) {
2938         // Copy the first operand into AL, which is an implicit input to the
2939         // X86::IMUL8r instruction.
2940         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2941                TII.get(TargetOpcode::COPY), X86::AL)
2942           .addReg(LHSReg);
2943         ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg);
2944       } else
2945         ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
2946                                     TLI.getRegClassFor(VT), LHSReg, RHSReg);
2947     }
2948 
2949     if (!ResultReg)
2950       return false;
2951 
2952     // Assign to a GPR since the overflow return value is lowered to a SETcc.
2953     Register ResultReg2 = createResultReg(&X86::GR8RegClass);
2954     assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
2955     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2956             ResultReg2).addImm(CondCode);
2957 
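         // The intrinsic returns a {result, overflow} pair; map both struct
         // elements at once, relying on the arithmetic result and the SETcc
         // result living in consecutive virtual registers (asserted above).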
2958     updateValueMap(II, ResultReg, 2);
2959     return true;
2960   }
2961   case Intrinsic::x86_sse_cvttss2si:
2962   case Intrinsic::x86_sse_cvttss2si64:
2963   case Intrinsic::x86_sse2_cvttsd2si:
2964   case Intrinsic::x86_sse2_cvttsd2si64: {
2965     bool IsInputDouble;
2966     switch (II->getIntrinsicID()) {
2967     default: llvm_unreachable("Unexpected intrinsic.");
2968     case Intrinsic::x86_sse_cvttss2si:
2969     case Intrinsic::x86_sse_cvttss2si64:
2970       if (!Subtarget->hasSSE1())
2971         return false;
2972       IsInputDouble = false;
2973       break;
2974     case Intrinsic::x86_sse2_cvttsd2si:
2975     case Intrinsic::x86_sse2_cvttsd2si64:
2976       if (!Subtarget->hasSSE2())
2977         return false;
2978       IsInputDouble = true;
2979       break;
2980     }
2981 
2982     Type *RetTy = II->getCalledFunction()->getReturnType();
2983     MVT VT;
2984     if (!isTypeLegal(RetTy, VT))
2985       return false;
2986 
2987     static const uint16_t CvtOpc[3][2][2] = {
2988       { { X86::CVTTSS2SIrr,   X86::CVTTSS2SI64rr },
2989         { X86::CVTTSD2SIrr,   X86::CVTTSD2SI64rr } },
2990       { { X86::VCVTTSS2SIrr,  X86::VCVTTSS2SI64rr },
2991         { X86::VCVTTSD2SIrr,  X86::VCVTTSD2SI64rr } },
2992       { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
2993         { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
2994     };
2995     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
2996                         Subtarget->hasAVX()    ? 1 :
2997                                                  0;
2998     unsigned Opc;
2999     switch (VT.SimpleTy) {
3000     default: llvm_unreachable("Unexpected result type.");
3001     case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
3002     case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
3003     }
3004 
3005     // Check if we can fold insertelement instructions into the convert.
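         // The conversion only reads element 0 of its vector operand, so walk
         // any insertelement chain feeding it: an insert into lane 0 supplies
         // the scalar directly, and inserts into other lanes can be skipped.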
3006     const Value *Op = II->getArgOperand(0);
3007     while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
3008       const Value *Index = IE->getOperand(2);
3009       if (!isa<ConstantInt>(Index))
3010         break;
3011       unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
3012 
3013       if (Idx == 0) {
3014         Op = IE->getOperand(1);
3015         break;
3016       }
3017       Op = IE->getOperand(0);
3018     }
3019 
3020     Register Reg = getRegForValue(Op);
3021     if (Reg == 0)
3022       return false;
3023 
3024     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3025     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
3026       .addReg(Reg);
3027 
3028     updateValueMap(II, ResultReg);
3029     return true;
3030   }
3031   case Intrinsic::x86_sse42_crc32_32_8:
3032   case Intrinsic::x86_sse42_crc32_32_16:
3033   case Intrinsic::x86_sse42_crc32_32_32:
3034   case Intrinsic::x86_sse42_crc32_64_64: {
3035     if (!Subtarget->hasCRC32())
3036       return false;
3037 
3038     Type *RetTy = II->getCalledFunction()->getReturnType();
3039 
3040     MVT VT;
3041     if (!isTypeLegal(RetTy, VT))
3042       return false;
3043 
3044     unsigned Opc;
3045     const TargetRegisterClass *RC = nullptr;
3046 
3047     switch (II->getIntrinsicID()) {
3048     default:
3049       llvm_unreachable("Unexpected intrinsic.");
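     // When the subtarget has APX extended GPRs, pick the EVEX-promoted form of
     // each CRC32 instruction so the extra registers (R16-R31) can be encoded;
     // otherwise use the legacy encoding.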
3050 #define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC
3051     case Intrinsic::x86_sse42_crc32_32_8:
3052       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);
3053       RC = &X86::GR32RegClass;
3054       break;
3055     case Intrinsic::x86_sse42_crc32_32_16:
3056       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);
3057       RC = &X86::GR32RegClass;
3058       break;
3059     case Intrinsic::x86_sse42_crc32_32_32:
3060       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);
3061       RC = &X86::GR32RegClass;
3062       break;
3063     case Intrinsic::x86_sse42_crc32_64_64:
3064       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);
3065       RC = &X86::GR64RegClass;
3066       break;
3067 #undef GET_EGPR_IF_ENABLED
3068     }
3069 
3070     const Value *LHS = II->getArgOperand(0);
3071     const Value *RHS = II->getArgOperand(1);
3072 
3073     Register LHSReg = getRegForValue(LHS);
3074     Register RHSReg = getRegForValue(RHS);
3075     if (!LHSReg || !RHSReg)
3076       return false;
3077 
3078     Register ResultReg = fastEmitInst_rr(Opc, RC, LHSReg, RHSReg);
3079     if (!ResultReg)
3080       return false;
3081 
3082     updateValueMap(II, ResultReg);
3083     return true;
3084   }
3085   }
3086 }
3087 
3088 bool X86FastISel::fastLowerArguments() {
3089   if (!FuncInfo.CanLowerReturn)
3090     return false;
3091 
3092   const Function *F = FuncInfo.Fn;
3093   if (F->isVarArg())
3094     return false;
3095 
3096   CallingConv::ID CC = F->getCallingConv();
3097   if (CC != CallingConv::C)
3098     return false;
3099 
3100   if (Subtarget->isCallingConvWin64(CC))
3101     return false;
3102 
3103   if (!Subtarget->is64Bit())
3104     return false;
3105 
3106   if (Subtarget->useSoftFloat())
3107     return false;
3108 
3109   // Only handle simple cases: up to 6 i32/i64 GPR and 8 f32/f64 XMM scalar arguments.
3110   unsigned GPRCnt = 0;
3111   unsigned FPRCnt = 0;
3112   for (auto const &Arg : F->args()) {
3113     if (Arg.hasAttribute(Attribute::ByVal) ||
3114         Arg.hasAttribute(Attribute::InReg) ||
3115         Arg.hasAttribute(Attribute::StructRet) ||
3116         Arg.hasAttribute(Attribute::SwiftSelf) ||
3117         Arg.hasAttribute(Attribute::SwiftAsync) ||
3118         Arg.hasAttribute(Attribute::SwiftError) ||
3119         Arg.hasAttribute(Attribute::Nest))
3120       return false;
3121 
3122     Type *ArgTy = Arg.getType();
3123     if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
3124       return false;
3125 
3126     EVT ArgVT = TLI.getValueType(DL, ArgTy);
3127     if (!ArgVT.isSimple()) return false;
3128     switch (ArgVT.getSimpleVT().SimpleTy) {
3129     default: return false;
3130     case MVT::i32:
3131     case MVT::i64:
3132       ++GPRCnt;
3133       break;
3134     case MVT::f32:
3135     case MVT::f64:
3136       if (!Subtarget->hasSSE1())
3137         return false;
3138       ++FPRCnt;
3139       break;
3140     }
3141 
3142     if (GPRCnt > 6)
3143       return false;
3144 
3145     if (FPRCnt > 8)
3146       return false;
3147   }
3148 
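       // Integer and XMM argument registers for the x86-64 SysV calling
       // convention (Win64 and 32-bit targets were rejected above).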
3149   static const MCPhysReg GPR32ArgRegs[] = {
3150     X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
3151   };
3152   static const MCPhysReg GPR64ArgRegs[] = {
3153     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
3154   };
3155   static const MCPhysReg XMMArgRegs[] = {
3156     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3157     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3158   };
3159 
3160   unsigned GPRIdx = 0;
3161   unsigned FPRIdx = 0;
3162   for (auto const &Arg : F->args()) {
3163     MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
3164     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
3165     unsigned SrcReg;
3166     switch (VT.SimpleTy) {
3167     default: llvm_unreachable("Unexpected value type.");
3168     case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
3169     case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
3170     case MVT::f32: [[fallthrough]];
3171     case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
3172     }
3173     Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
3174     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
3175     // Without this, EmitLiveInCopies may eliminate the livein if its only
3176     // use is a bitcast (which isn't turned into an instruction).
3177     Register ResultReg = createResultReg(RC);
3178     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3179             TII.get(TargetOpcode::COPY), ResultReg)
3180       .addReg(DstReg, getKillRegState(true));
3181     updateValueMap(&Arg, ResultReg);
3182   }
3183   return true;
3184 }
3185 
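     // Compute how many bytes the callee pops on return to clean up the hidden
     // sret pointer: 4 on 32-bit targets where the callee is responsible for it
     // (e.g. a cdecl function returning a struct via sret ends in 'ret $4'),
     // and 0 on 64-bit targets, MSVC runtimes, MCU, and the fast/GHC/HiPE/tail
     // conventions.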
3186 static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
3187                                                   CallingConv::ID CC,
3188                                                   const CallBase *CB) {
3189   if (Subtarget->is64Bit())
3190     return 0;
3191   if (Subtarget->getTargetTriple().isOSMSVCRT())
3192     return 0;
3193   if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3194       CC == CallingConv::HiPE || CC == CallingConv::Tail ||
3195       CC == CallingConv::SwiftTail)
3196     return 0;
3197 
3198   if (CB)
3199     if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
3200         CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
3201       return 0;
3202 
3203   return 4;
3204 }
3205 
3206 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
3207   auto &OutVals       = CLI.OutVals;
3208   auto &OutFlags      = CLI.OutFlags;
3209   auto &OutRegs       = CLI.OutRegs;
3210   auto &Ins           = CLI.Ins;
3211   auto &InRegs        = CLI.InRegs;
3212   CallingConv::ID CC  = CLI.CallConv;
3213   bool &IsTailCall    = CLI.IsTailCall;
3214   bool IsVarArg       = CLI.IsVarArg;
3215   const Value *Callee = CLI.Callee;
3216   MCSymbol *Symbol    = CLI.Symbol;
3217   const auto *CB      = CLI.CB;
3218 
3219   bool Is64Bit        = Subtarget->is64Bit();
3220   bool IsWin64        = Subtarget->isCallingConvWin64(CC);
3221 
3222   // Call / invoke instructions with NoCfCheck attribute require special
3223   // handling.
3224   if (CB && CB->doesNoCfCheck())
3225     return false;
3226 
3227   // Calls to functions with no_caller_saved_registers need special handling.
3228   if ((CB && isa<CallInst>(CB) && CB->hasFnAttr("no_caller_saved_registers")))
3229     return false;
3230 
3231   // Calls to functions with no_callee_saved_registers need special handling.
3232   if ((CB && CB->hasFnAttr("no_callee_saved_registers")))
3233     return false;
3234 
3235   // Indirect calls with CFI checks need special handling.
3236   if (CB && CB->isIndirectCall() && CB->getOperandBundle(LLVMContext::OB_kcfi))
3237     return false;
3238 
3239   // Functions using thunks for indirect calls need to use SDISel.
3240   if (Subtarget->useIndirectThunkCalls())
3241     return false;
3242 
3243   // Handle only C and fastcc calling conventions for now.
3244   switch (CC) {
3245   default: return false;
3246   case CallingConv::C:
3247   case CallingConv::Fast:
3248   case CallingConv::Tail:
3249   case CallingConv::Swift:
3250   case CallingConv::SwiftTail:
3251   case CallingConv::X86_FastCall:
3252   case CallingConv::X86_StdCall:
3253   case CallingConv::X86_ThisCall:
3254   case CallingConv::Win64:
3255   case CallingConv::X86_64_SysV:
3256   case CallingConv::CFGuard_Check:
3257     break;
3258   }
3259 
3260   // Allow SelectionDAG isel to handle tail calls.
3261   if (IsTailCall)
3262     return false;
3263 
3264   // fastcc with -tailcallopt is intended to provide a guaranteed
3265   // tail call optimization. Fastisel doesn't know how to do that.
3266   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
3267       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
3268     return false;
3269 
3270   // Don't know how to handle Win64 varargs yet. Nothing special is needed for
3271   // x86-32; x86-64 SysV varargs (the XMM count in %al) are handled below.
3272   if (IsVarArg && IsWin64)
3273     return false;
3274 
3275   // Don't know about inalloca yet.
3276   if (CLI.CB && CLI.CB->hasInAllocaArgument())
3277     return false;
3278 
3279   for (auto Flag : CLI.OutFlags)
3280     if (Flag.isSwiftError() || Flag.isPreallocated())
3281       return false;
3282 
3283   SmallVector<MVT, 16> OutVTs;
3284   SmallVector<unsigned, 16> ArgRegs;
3285 
3286   // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
3287   // instruction. This is safe because it is common to all FastISel supported
3288   // calling conventions on x86.
3289   for (int i = 0, e = OutVals.size(); i != e; ++i) {
3290     Value *&Val = OutVals[i];
3291     ISD::ArgFlagsTy Flags = OutFlags[i];
3292     if (auto *CI = dyn_cast<ConstantInt>(Val)) {
3293       if (CI->getBitWidth() < 32) {
3294         if (Flags.isSExt())
3295           Val = ConstantInt::get(CI->getContext(), CI->getValue().sext(32));
3296         else
3297           Val = ConstantInt::get(CI->getContext(), CI->getValue().zext(32));
3298       }
3299     }
3300 
3301     // Passing bools around ends up doing a trunc to i1 and passing it.
3302     // Codegen this as an argument + "and 1".
3303     MVT VT;
3304     auto *TI = dyn_cast<TruncInst>(Val);
3305     unsigned ResultReg;
3306     if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
3307         (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
3308       Value *PrevVal = TI->getOperand(0);
3309       ResultReg = getRegForValue(PrevVal);
3310 
3311       if (!ResultReg)
3312         return false;
3313 
3314       if (!isTypeLegal(PrevVal->getType(), VT))
3315         return false;
3316 
3317       ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, 1);
3318     } else {
3319       if (!isTypeLegal(Val->getType(), VT) ||
3320           (VT.isVector() && VT.getVectorElementType() == MVT::i1))
3321         return false;
3322       ResultReg = getRegForValue(Val);
3323     }
3324 
3325     if (!ResultReg)
3326       return false;
3327 
3328     ArgRegs.push_back(ResultReg);
3329     OutVTs.push_back(VT);
3330   }
3331 
3332   // Analyze operands of the call, assigning locations to each operand.
3333   SmallVector<CCValAssign, 16> ArgLocs;
3334   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
3335 
3336   // Allocate shadow area for Win64
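       // This is the 32-byte register-parameter home area that the Win64 ABI
       // requires the caller to reserve, regardless of how many arguments are
       // actually passed in registers.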
3337   if (IsWin64)
3338     CCInfo.AllocateStack(32, Align(8));
3339 
3340   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
3341 
3342   // Get a count of how many bytes are to be pushed on the stack.
3343   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3344 
3345   // Issue CALLSEQ_START
3346   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
3347   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackDown))
3348     .addImm(NumBytes).addImm(0).addImm(0);
3349 
3350   // Walk the register/memloc assignments, inserting copies/loads.
3351   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3352   for (const CCValAssign &VA : ArgLocs) {
3353     const Value *ArgVal = OutVals[VA.getValNo()];
3354     MVT ArgVT = OutVTs[VA.getValNo()];
3355 
3356     if (ArgVT == MVT::x86mmx)
3357       return false;
3358 
3359     unsigned ArgReg = ArgRegs[VA.getValNo()];
3360 
3361     // Promote the value if needed.
3362     switch (VA.getLocInfo()) {
3363     case CCValAssign::Full: break;
3364     case CCValAssign::SExt: {
3365       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3366              "Unexpected extend");
3367 
3368       if (ArgVT == MVT::i1)
3369         return false;
3370 
3371       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3372                                        ArgVT, ArgReg);
3373       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
3374       ArgVT = VA.getLocVT();
3375       break;
3376     }
3377     case CCValAssign::ZExt: {
3378       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3379              "Unexpected extend");
3380 
3381       // Handle zero-extension from i1 to i8, which is common.
3382       if (ArgVT == MVT::i1) {
3383         // Set the high bits to zero.
3384         ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg);
3385         ArgVT = MVT::i8;
3386 
3387         if (ArgReg == 0)
3388           return false;
3389       }
3390 
3391       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3392                                        ArgVT, ArgReg);
3393       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
3394       ArgVT = VA.getLocVT();
3395       break;
3396     }
3397     case CCValAssign::AExt: {
3398       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3399              "Unexpected extend");
3400       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
3401                                        ArgVT, ArgReg);
3402       if (!Emitted)
3403         Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3404                                     ArgVT, ArgReg);
3405       if (!Emitted)
3406         Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3407                                     ArgVT, ArgReg);
3408 
3409       assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
3410       ArgVT = VA.getLocVT();
3411       break;
3412     }
3413     case CCValAssign::BCvt: {
3414       ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg);
3415       assert(ArgReg && "Failed to emit a bitcast!");
3416       ArgVT = VA.getLocVT();
3417       break;
3418     }
3419     case CCValAssign::VExt:
3420       // VExt has not been implemented, so this should be impossible to reach
3421       // for now. However, fall back to SelectionDAG isel once it is implemented.
3422       return false;
3423     case CCValAssign::AExtUpper:
3424     case CCValAssign::SExtUpper:
3425     case CCValAssign::ZExtUpper:
3426     case CCValAssign::FPExt:
3427     case CCValAssign::Trunc:
3428       llvm_unreachable("Unexpected loc info!");
3429     case CCValAssign::Indirect:
3430       // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
3431       // support this.
3432       return false;
3433     }
3434 
3435     if (VA.isRegLoc()) {
3436       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3437               TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
3438       OutRegs.push_back(VA.getLocReg());
3439     } else {
3440       assert(VA.isMemLoc() && "Unknown value location!");
3441 
3442       // Don't emit stores for undef values.
3443       if (isa<UndefValue>(ArgVal))
3444         continue;
3445 
3446       unsigned LocMemOffset = VA.getLocMemOffset();
3447       X86AddressMode AM;
3448       AM.Base.Reg = RegInfo->getStackRegister();
3449       AM.Disp = LocMemOffset;
3450       ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
3451       Align Alignment = DL.getABITypeAlign(ArgVal->getType());
3452       MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3453           MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
3454           MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
3455       if (Flags.isByVal()) {
3456         X86AddressMode SrcAM;
3457         SrcAM.Base.Reg = ArgReg;
3458         if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
3459           return false;
3460       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
3461         // If this is a really simple value, emit this with the Value* version
3462         // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
3463         // as it can cause us to reevaluate the argument.
3464         if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
3465           return false;
3466       } else {
3467         if (!X86FastEmitStore(ArgVT, ArgReg, AM, MMO))
3468           return false;
3469       }
3470     }
3471   }
3472 
3473   // ELF / PIC requires the GOT pointer to be in EBX before making function
3474   // calls through the PLT.
3475   if (Subtarget->isPICStyleGOT()) {
3476     unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3477     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3478             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
3479   }
3480 
3481   if (Is64Bit && IsVarArg && !IsWin64) {
3482     // From AMD64 ABI document:
3483     // For calls that may call functions that use varargs or stdargs
3484     // (prototype-less calls or calls to functions containing ellipsis (...) in
3485     // the declaration) %al is used as a hidden argument to specify the number
3486     // of SSE registers used. The contents of %al do not need to match exactly
3487     // the number of registers, but must be an upper bound on the number of SSE
3488     // registers used and is in the range 0 - 8 inclusive.
3489 
3490     // Count the number of XMM registers allocated.
3491     static const MCPhysReg XMMArgRegs[] = {
3492       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3493       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3494     };
3495     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3496     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3497            && "SSE registers cannot be used when SSE is disabled");
3498     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
3499             X86::AL).addImm(NumXMMRegs);
3500   }
3501 
3502   // Materialize callee address in a register. FIXME: GV address can be
3503   // handled with a CALLpcrel32 instead.
3504   X86AddressMode CalleeAM;
3505   if (!X86SelectCallAddress(Callee, CalleeAM))
3506     return false;
3507 
3508   unsigned CalleeOp = 0;
3509   const GlobalValue *GV = nullptr;
3510   if (CalleeAM.GV != nullptr) {
3511     GV = CalleeAM.GV;
3512   } else if (CalleeAM.Base.Reg != 0) {
3513     CalleeOp = CalleeAM.Base.Reg;
3514   } else
3515     return false;
3516 
3517   // Issue the call.
3518   MachineInstrBuilder MIB;
3519   if (CalleeOp) {
3520     // Register-indirect call.
3521     unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
3522     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
3523       .addReg(CalleeOp);
3524   } else {
3525     // Direct call.
3526     assert(GV && "Not a direct call");
3527     // See if we need any target-specific flags on the GV operand.
3528     unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
3529     if (OpFlags == X86II::MO_PLT && !Is64Bit &&
3530         TM.getRelocationModel() == Reloc::Static && isa<Function>(GV) &&
3531         cast<Function>(GV)->isIntrinsic())
3532       OpFlags = X86II::MO_NO_FLAG;
3533 
3534     // This will be a direct call, or an indirect call through memory for
3535     // NonLazyBind calls or dllimport calls.
3536     bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
3537                     OpFlags == X86II::MO_GOTPCREL ||
3538                     OpFlags == X86II::MO_GOTPCREL_NORELAX ||
3539                     OpFlags == X86II::MO_COFFSTUB;
3540     unsigned CallOpc = NeedLoad
3541                            ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
3542                            : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
3543 
3544     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc));
3545     if (NeedLoad)
3546       MIB.addReg(Is64Bit ? X86::RIP : X86::NoRegister).addImm(1).addReg(0);
3547     if (Symbol)
3548       MIB.addSym(Symbol, OpFlags);
3549     else
3550       MIB.addGlobalAddress(GV, 0, OpFlags);
3551     if (NeedLoad)
3552       MIB.addReg(0);
3553   }
3554 
3555   // Add a register mask operand representing the call-preserved registers.
3556   // Proper defs for return values will be added by setPhysRegsDeadExcept().
3557   MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
3558 
3559   // Add an implicit use GOT pointer in EBX.
3560   if (Subtarget->isPICStyleGOT())
3561     MIB.addReg(X86::EBX, RegState::Implicit);
3562 
3563   if (Is64Bit && IsVarArg && !IsWin64)
3564     MIB.addReg(X86::AL, RegState::Implicit);
3565 
3566   // Add implicit physical register uses to the call.
3567   for (auto Reg : OutRegs)
3568     MIB.addReg(Reg, RegState::Implicit);
3569 
3570   // Issue CALLSEQ_END
3571   unsigned NumBytesForCalleeToPop =
3572       X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
3573                        TM.Options.GuaranteedTailCallOpt)
3574           ? NumBytes // Callee pops everything.
3575           : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
3576   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
3577   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackUp))
3578     .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
3579 
3580   // Now handle call return values.
3581   SmallVector<CCValAssign, 16> RVLocs;
3582   CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
3583                     CLI.RetTy->getContext());
3584   CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
3585 
3586   // Copy all of the result registers out of their specified physreg.
3587   Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
3588   for (unsigned i = 0; i != RVLocs.size(); ++i) {
3589     CCValAssign &VA = RVLocs[i];
3590     EVT CopyVT = VA.getValVT();
3591     unsigned CopyReg = ResultReg + i;
3592     Register SrcReg = VA.getLocReg();
3593 
3594     // If this is x86-64, and we disabled SSE, we can't return FP values
3595     // If this is x86-64 (or an inreg return) and we disabled SSE, we can't return FP values.
3596         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
3597       report_fatal_error("SSE register return with SSE disabled");
3598     }
3599 
3600     // If we prefer to use the value in xmm registers, copy it out as f80 and
3601     // use a truncate to move it from fp stack reg to xmm reg.
3602     if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
3603         isScalarFPTypeInSSEReg(VA.getValVT())) {
3604       CopyVT = MVT::f80;
3605       CopyReg = createResultReg(&X86::RFP80RegClass);
3606     }
3607 
3608     // Copy out the result.
3609     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3610             TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
3611     InRegs.push_back(VA.getLocReg());
3612 
3613     // Round the f80 to the right size, which also moves it to the appropriate
3614     // xmm register. This is accomplished by storing the f80 value in memory
3615     // and then loading it back.
3616     if (CopyVT != VA.getValVT()) {
3617       EVT ResVT = VA.getValVT();
3618       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
3619       unsigned MemSize = ResVT.getSizeInBits()/8;
3620       int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
3621       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3622                                 TII.get(Opc)), FI)
3623         .addReg(CopyReg);
3624       Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
3625       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3626                                 TII.get(Opc), ResultReg + i), FI);
3627     }
3628   }
3629 
3630   CLI.ResultReg = ResultReg;
3631   CLI.NumResultRegs = RVLocs.size();
3632   CLI.Call = MIB;
3633 
3634   return true;
3635 }
3636 
3637 bool
3638 X86FastISel::fastSelectInstruction(const Instruction *I)  {
3639   switch (I->getOpcode()) {
3640   default: break;
3641   case Instruction::Load:
3642     return X86SelectLoad(I);
3643   case Instruction::Store:
3644     return X86SelectStore(I);
3645   case Instruction::Ret:
3646     return X86SelectRet(I);
3647   case Instruction::ICmp:
3648   case Instruction::FCmp:
3649     return X86SelectCmp(I);
3650   case Instruction::ZExt:
3651     return X86SelectZExt(I);
3652   case Instruction::SExt:
3653     return X86SelectSExt(I);
3654   case Instruction::Br:
3655     return X86SelectBranch(I);
3656   case Instruction::LShr:
3657   case Instruction::AShr:
3658   case Instruction::Shl:
3659     return X86SelectShift(I);
3660   case Instruction::SDiv:
3661   case Instruction::UDiv:
3662   case Instruction::SRem:
3663   case Instruction::URem:
3664     return X86SelectDivRem(I);
3665   case Instruction::Select:
3666     return X86SelectSelect(I);
3667   case Instruction::Trunc:
3668     return X86SelectTrunc(I);
3669   case Instruction::FPExt:
3670     return X86SelectFPExt(I);
3671   case Instruction::FPTrunc:
3672     return X86SelectFPTrunc(I);
3673   case Instruction::SIToFP:
3674     return X86SelectSIToFP(I);
3675   case Instruction::UIToFP:
3676     return X86SelectUIToFP(I);
3677   case Instruction::IntToPtr: // Deliberate fall-through.
3678   case Instruction::PtrToInt: {
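         // A ptr<->int cast of the same width is a no-op, so just reuse the
         // source register; wider or narrower casts are selected like a zext
         // or trunc.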
3679     EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
3680     EVT DstVT = TLI.getValueType(DL, I->getType());
3681     if (DstVT.bitsGT(SrcVT))
3682       return X86SelectZExt(I);
3683     if (DstVT.bitsLT(SrcVT))
3684       return X86SelectTrunc(I);
3685     Register Reg = getRegForValue(I->getOperand(0));
3686     if (Reg == 0) return false;
3687     updateValueMap(I, Reg);
3688     return true;
3689   }
3690   case Instruction::BitCast: {
3691     // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
3692     if (!Subtarget->hasSSE2())
3693       return false;
3694 
3695     MVT SrcVT, DstVT;
3696     if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
3697         !isTypeLegal(I->getType(), DstVT))
3698       return false;
3699 
3700     // Only allow vectors that use xmm/ymm/zmm.
3701     if (!SrcVT.isVector() || !DstVT.isVector() ||
3702         SrcVT.getVectorElementType() == MVT::i1 ||
3703         DstVT.getVectorElementType() == MVT::i1)
3704       return false;
3705 
3706     Register Reg = getRegForValue(I->getOperand(0));
3707     if (!Reg)
3708       return false;
3709 
3710     // Emit a reg-reg copy so we don't propagate cached known bits information
3711     // with the wrong VT if we fall out of fast isel after selecting this.
3712     const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
3713     Register ResultReg = createResultReg(DstClass);
3714     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3715               TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
3716 
3717     updateValueMap(I, ResultReg);
3718     return true;
3719   }
3720   }
3721 
3722   return false;
3723 }
3724 
3725 unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
3726   if (VT > MVT::i64)
3727     return 0;
3728 
3729   uint64_t Imm = CI->getZExtValue();
3730   if (Imm == 0) {
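         // Zero is cheapest as a 32-bit xor (MOV32r0, i.e. 'xorl %eax, %eax');
         // narrower types take a subregister of it, and i64 wraps it in
         // SUBREG_TO_REG since writing a 32-bit register clears bits 63:32.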
3731     Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
3732     switch (VT.SimpleTy) {
3733     default: llvm_unreachable("Unexpected value type");
3734     case MVT::i1:
3735     case MVT::i8:
3736       return fastEmitInst_extractsubreg(MVT::i8, SrcReg, X86::sub_8bit);
3737     case MVT::i16:
3738       return fastEmitInst_extractsubreg(MVT::i16, SrcReg, X86::sub_16bit);
3739     case MVT::i32:
3740       return SrcReg;
3741     case MVT::i64: {
3742       Register ResultReg = createResultReg(&X86::GR64RegClass);
3743       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3744               TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
3745         .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
3746       return ResultReg;
3747     }
3748     }
3749   }
3750 
3751   unsigned Opc = 0;
3752   switch (VT.SimpleTy) {
3753   default: llvm_unreachable("Unexpected value type");
3754   case MVT::i1:
3755     VT = MVT::i8;
3756     [[fallthrough]];
3757   case MVT::i8:  Opc = X86::MOV8ri;  break;
3758   case MVT::i16: Opc = X86::MOV16ri; break;
3759   case MVT::i32: Opc = X86::MOV32ri; break;
3760   case MVT::i64: {
3761     if (isUInt<32>(Imm))
3762       Opc = X86::MOV32ri64;
3763     else if (isInt<32>(Imm))
3764       Opc = X86::MOV64ri32;
3765     else
3766       Opc = X86::MOV64ri;
3767     break;
3768   }
3769   }
3770   return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
3771 }
3772 
3773 unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
3774   if (CFP->isNullValue())
3775     return fastMaterializeFloatZero(CFP);
3776 
3777   // Can't handle code models other than small, medium, or large yet.
3778   CodeModel::Model CM = TM.getCodeModel();
3779   if (CM != CodeModel::Small && CM != CodeModel::Medium &&
3780       CM != CodeModel::Large)
3781     return 0;
3782 
3783   // Get opcode and regclass of the output for the given load instruction.
3784   unsigned Opc = 0;
3785   bool HasSSE1 = Subtarget->hasSSE1();
3786   bool HasSSE2 = Subtarget->hasSSE2();
3787   bool HasAVX = Subtarget->hasAVX();
3788   bool HasAVX512 = Subtarget->hasAVX512();
3789   switch (VT.SimpleTy) {
3790   default: return 0;
3791   case MVT::f32:
3792     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
3793           : HasAVX  ? X86::VMOVSSrm_alt
3794           : HasSSE1 ? X86::MOVSSrm_alt
3795                     : X86::LD_Fp32m;
3796     break;
3797   case MVT::f64:
3798     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
3799           : HasAVX  ? X86::VMOVSDrm_alt
3800           : HasSSE2 ? X86::MOVSDrm_alt
3801                     : X86::LD_Fp64m;
3802     break;
3803   case MVT::f80:
3804     // No f80 support yet.
3805     return 0;
3806   }
3807 
3808   // MachineConstantPool wants an explicit alignment.
3809   Align Alignment = DL.getPrefTypeAlign(CFP->getType());
3810 
3811   // x86-32 PIC requires a PIC base register for constant pools.
3812   unsigned PICBase = 0;
3813   unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
3814   if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
3815     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3816   else if (OpFlag == X86II::MO_GOTOFF)
3817     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3818   else if (Subtarget->is64Bit() && TM.getCodeModel() != CodeModel::Large)
3819     PICBase = X86::RIP;
3820 
3821   // Create the load from the constant pool.
3822   unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
3823   Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
3824 
3825   // Large code model only applies to 64-bit mode.
3826   if (Subtarget->is64Bit() && CM == CodeModel::Large) {
3827     Register AddrReg = createResultReg(&X86::GR64RegClass);
3828     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3829             AddrReg)
3830       .addConstantPoolIndex(CPI, 0, OpFlag);
3831     MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3832                                       TII.get(Opc), ResultReg);
3833     addRegReg(MIB, AddrReg, false, PICBase, false);
3834     MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3835         MachinePointerInfo::getConstantPool(*FuncInfo.MF),
3836         MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
3837     MIB->addMemOperand(*FuncInfo.MF, MMO);
3838     return ResultReg;
3839   }
3840 
3841   addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3842                                    TII.get(Opc), ResultReg),
3843                            CPI, PICBase, OpFlag);
3844   return ResultReg;
3845 }
3846 
3847 unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
3848   // Can't handle large GlobalValues yet.
3849   if (TM.getCodeModel() != CodeModel::Small &&
3850       TM.getCodeModel() != CodeModel::Medium)
3851     return 0;
3852   if (TM.isLargeGlobalValue(GV))
3853     return 0;
3854 
3855   // Materialize addresses with LEA/MOV instructions.
3856   X86AddressMode AM;
3857   if (X86SelectAddress(GV, AM)) {
3858     // If the expression is just a basereg, then we're done; otherwise we need
3859     // to emit an LEA.
3860     if (AM.BaseType == X86AddressMode::RegBase &&
3861         AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
3862       return AM.Base.Reg;
3863 
3864     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3865     if (TM.getRelocationModel() == Reloc::Static &&
3866         TLI.getPointerTy(DL) == MVT::i64) {
3867       // The displacement could be more than 32 bits away, so we need to use
3868       // an instruction with a 64-bit immediate.
3869       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3870               ResultReg)
3871         .addGlobalAddress(GV);
3872     } else {
3873       unsigned Opc =
3874           TLI.getPointerTy(DL) == MVT::i32
3875               ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3876               : X86::LEA64r;
3877       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3878                              TII.get(Opc), ResultReg), AM);
3879     }
3880     return ResultReg;
3881   }
3882   return 0;
3883 }
3884 
3885 unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
3886   EVT CEVT = TLI.getValueType(DL, C->getType(), true);
3887 
3888   // Only handle simple types.
3889   if (!CEVT.isSimple())
3890     return 0;
3891   MVT VT = CEVT.getSimpleVT();
3892 
3893   if (const auto *CI = dyn_cast<ConstantInt>(C))
3894     return X86MaterializeInt(CI, VT);
3895   if (const auto *CFP = dyn_cast<ConstantFP>(C))
3896     return X86MaterializeFP(CFP, VT);
3897   if (const auto *GV = dyn_cast<GlobalValue>(C))
3898     return X86MaterializeGV(GV, VT);
3899   if (isa<UndefValue>(C)) {
3900     unsigned Opc = 0;
3901     switch (VT.SimpleTy) {
3902     default:
3903       break;
3904     case MVT::f32:
3905       if (!Subtarget->hasSSE1())
3906         Opc = X86::LD_Fp032;
3907       break;
3908     case MVT::f64:
3909       if (!Subtarget->hasSSE2())
3910         Opc = X86::LD_Fp064;
3911       break;
3912     case MVT::f80:
3913       Opc = X86::LD_Fp080;
3914       break;
3915     }
3916 
3917     if (Opc) {
3918       Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3919       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
3920               ResultReg);
3921       return ResultReg;
3922     }
3923   }
3924 
3925   return 0;
3926 }
3927 
3928 unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
3929   // Fail on dynamic allocas. At this point, getRegForValue has already
3930   // checked its CSE maps, so if we're here trying to handle a dynamic
3931   // alloca, we're not going to succeed. X86SelectAddress has a
3932   // check for dynamic allocas, because it's called directly from
3933   // various places, but fastMaterializeAlloca also needs a check
3934   // in order to avoid recursion between getRegForValue,
3935   // X86SelectAddress, and fastMaterializeAlloca.
3936   if (!FuncInfo.StaticAllocaMap.count(C))
3937     return 0;
3938   assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
3939 
3940   X86AddressMode AM;
3941   if (!X86SelectAddress(C, AM))
3942     return 0;
3943   unsigned Opc =
3944       TLI.getPointerTy(DL) == MVT::i32
3945           ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3946           : X86::LEA64r;
3947   const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
3948   Register ResultReg = createResultReg(RC);
3949   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3950                          TII.get(Opc), ResultReg), AM);
3951   return ResultReg;
3952 }
3953 
3954 unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
3955   MVT VT;
3956   if (!isTypeLegal(CF->getType(), VT))
3957     return 0;
3958 
3959   // Get opcode and regclass for the given zero.
3960   bool HasSSE1 = Subtarget->hasSSE1();
3961   bool HasSSE2 = Subtarget->hasSSE2();
3962   bool HasAVX512 = Subtarget->hasAVX512();
3963   unsigned Opc = 0;
3964   switch (VT.SimpleTy) {
3965   default: return 0;
3966   case MVT::f16:
3967     Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
3968     break;
3969   case MVT::f32:
3970     Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
3971           : HasSSE1 ? X86::FsFLD0SS
3972                     : X86::LD_Fp032;
3973     break;
3974   case MVT::f64:
3975     Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
3976           : HasSSE2 ? X86::FsFLD0SD
3977                     : X86::LD_Fp064;
3978     break;
3979   case MVT::f80:
3980     // No f80 support yet.
3981     return 0;
3982   }
3983 
3984   Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3985   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
3986   return ResultReg;
3987 }
3988 
3989 
3990 bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
3991                                       const LoadInst *LI) {
3992   const Value *Ptr = LI->getPointerOperand();
3993   X86AddressMode AM;
3994   if (!X86SelectAddress(Ptr, AM))
3995     return false;
3996 
3997   const X86InstrInfo &XII = (const X86InstrInfo &)TII;
3998 
3999   unsigned Size = DL.getTypeAllocSize(LI->getType());
4000 
4001   SmallVector<MachineOperand, 8> AddrOps;
4002   AM.getFullAddress(AddrOps);
4003 
4004   MachineInstr *Result = XII.foldMemoryOperandImpl(
4005       *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
4006       /*AllowCommute=*/true);
4007   if (!Result)
4008     return false;
4009 
4010   // The index register could be in the wrong register class.  Unfortunately,
4011   // foldMemoryOperandImpl could have commuted the instruction, so it's not
4012   // enough to just look at OpNo + the offset to the index reg.  We actually
4013   // need to scan the instruction to find the index reg and check whether it's
4014   // in the correct reg class.
4015   unsigned OperandNo = 0;
4016   for (MachineInstr::mop_iterator I = Result->operands_begin(),
4017        E = Result->operands_end(); I != E; ++I, ++OperandNo) {
4018     MachineOperand &MO = *I;
4019     if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
4020       continue;
4021     // Found the index reg, now try to rewrite it.
4022     Register IndexReg = constrainOperandRegClass(Result->getDesc(),
4023                                                  MO.getReg(), OperandNo);
4024     if (IndexReg == MO.getReg())
4025       continue;
4026     MO.setReg(IndexReg);
4027   }
4028 
4029   Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
4030   Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
4031   MachineBasicBlock::iterator I(MI);
4032   removeDeadCode(I, std::next(I));
4033   return true;
4034 }
4035 
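     // Emit an instruction taking four register operands, constraining each one
     // to the register class its operand slot requires. If the opcode has no
     // explicit def, the result lands in an implicit def, so copy it from that
     // physical register into the result vreg.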
4036 unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
4037                                         const TargetRegisterClass *RC,
4038                                         unsigned Op0, unsigned Op1,
4039                                         unsigned Op2, unsigned Op3) {
4040   const MCInstrDesc &II = TII.get(MachineInstOpcode);
4041 
4042   Register ResultReg = createResultReg(RC);
4043   Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
4044   Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
4045   Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
4046   Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
4047 
4048   if (II.getNumDefs() >= 1)
4049     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
4050         .addReg(Op0)
4051         .addReg(Op1)
4052         .addReg(Op2)
4053         .addReg(Op3);
4054   else {
4055     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
4056         .addReg(Op0)
4057         .addReg(Op1)
4058         .addReg(Op2)
4059         .addReg(Op3);
4060     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
4061             ResultReg)
4062         .addReg(II.implicit_defs()[0]);
4063   }
4064   return ResultReg;
4065 }
4066 
4067 
4068 namespace llvm {
4069   FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
4070                                 const TargetLibraryInfo *libInfo) {
4071     return new X86FastISel(funcInfo, libInfo);
4072   }
4073 }
4074