1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/CodeGen/TargetRegisterInfo.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
48 #include "llvm/Support/raw_ostream.h"
49 #include <optional>
50 
51 #define DEBUG_TYPE "aarch64-isel"
52 
53 using namespace llvm;
54 using namespace MIPatternMatch;
55 using namespace AArch64GISelUtils;
56 
57 namespace llvm {
58 class BlockFrequencyInfo;
59 class ProfileSummaryInfo;
60 }
61 
62 namespace {
63 
64 #define GET_GLOBALISEL_PREDICATE_BITSET
65 #include "AArch64GenGlobalISel.inc"
66 #undef GET_GLOBALISEL_PREDICATE_BITSET
67 
68 
69 class AArch64InstructionSelector : public InstructionSelector {
70 public:
71   AArch64InstructionSelector(const AArch64TargetMachine &TM,
72                              const AArch64Subtarget &STI,
73                              const AArch64RegisterBankInfo &RBI);
74 
75   bool select(MachineInstr &I) override;
76   static const char *getName() { return DEBUG_TYPE; }
77 
78   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
79                CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
80                BlockFrequencyInfo *BFI) override {
81     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
82     MIB.setMF(MF);
83 
84     // hasFnAttribute() is expensive to call on every BRCOND selection, so
85     // cache it here for each run of the selector.
86     ProduceNonFlagSettingCondBr =
87         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
88     MFReturnAddr = Register();
89 
90     processPHIs(MF);
91   }
92 
93 private:
94   /// tblgen-erated 'select' implementation, used as the initial selector for
95   /// the patterns that don't require complex C++.
96   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
97 
98   // A lowering phase that runs before any selection attempts.
99   // Returns true if the instruction was modified.
100   bool preISelLower(MachineInstr &I);
101 
102   // An early selection function that runs before the selectImpl() call.
103   bool earlySelect(MachineInstr &I);
104 
105   /// Save state that is shared between select calls, call select on \p I and
106   /// then restore the saved state. This can be used to recursively call select
107   /// within a select call.
108   bool selectAndRestoreState(MachineInstr &I);
109 
110   // Do some preprocessing of G_PHIs before we begin selection.
111   void processPHIs(MachineFunction &MF);
112 
113   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
114 
115   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
116   bool contractCrossBankCopyIntoStore(MachineInstr &I,
117                                       MachineRegisterInfo &MRI);
118 
119   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
120 
121   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
122                           MachineRegisterInfo &MRI) const;
123   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
124                            MachineRegisterInfo &MRI) const;
125 
126   ///@{
127   /// Helper functions for selectCompareBranch.
128   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
129                                     MachineIRBuilder &MIB) const;
130   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
131                                     MachineIRBuilder &MIB) const;
132   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
133                                     MachineIRBuilder &MIB) const;
134   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
135                                   MachineBasicBlock *DstMBB,
136                                   MachineIRBuilder &MIB) const;
137   ///@}
138 
139   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
140                            MachineRegisterInfo &MRI);
141 
142   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
143   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
144 
145   // Helper to generate an equivalent of scalar_to_vector into a new register
146   // of class 'DstRC'; the defining instruction is returned.
147   MachineInstr *emitScalarToVector(unsigned EltSize,
148                                    const TargetRegisterClass *DstRC,
149                                    Register Scalar,
150                                    MachineIRBuilder &MIRBuilder) const;
151   /// Helper to narrow a vector that was widened by emitScalarToVector.
152   /// Copies the lowest part of a 128-bit or 64-bit vector to a 64-bit or
153   /// 32-bit vector, respectively.
154   MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
155                                  MachineIRBuilder &MIRBuilder,
156                                  MachineRegisterInfo &MRI) const;
157 
158   /// Emit a lane insert into \p DstReg, or a new vector register if
159   /// std::nullopt is provided.
160   ///
161   /// The lane inserted into is defined by \p LaneIdx. The vector source
162   /// register is given by \p SrcReg. The register containing the element is
163   /// given by \p EltReg.
164   MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
165                                Register EltReg, unsigned LaneIdx,
166                                const RegisterBank &RB,
167                                MachineIRBuilder &MIRBuilder) const;
168 
169   /// Emit a sequence of instructions representing a constant \p CV for a
170   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
171   ///
172   /// \returns the last instruction in the sequence on success, and nullptr
173   /// otherwise.
174   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
175                                    MachineIRBuilder &MIRBuilder,
176                                    MachineRegisterInfo &MRI);
177 
178   MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
179                                   MachineIRBuilder &MIRBuilder);
180 
181   MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
182                                    MachineIRBuilder &MIRBuilder, bool Inv);
183 
184   MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
185                                    MachineIRBuilder &MIRBuilder, bool Inv);
186   MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
187                                    MachineIRBuilder &MIRBuilder);
188   MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
189                                      MachineIRBuilder &MIRBuilder, bool Inv);
190   MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
191                                    MachineIRBuilder &MIRBuilder);
192 
193   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
194                               MachineRegisterInfo &MRI);
195   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
196   /// SUBREG_TO_REG.
197   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
198   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
199   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
200   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
201 
202   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
203   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
204   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
205   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
206 
207   /// Helper function to select vector load intrinsics like
208   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
209   /// \p Opc is the opcode that the selected instruction should use.
210   /// \p NumVecs is the number of vector destinations for the instruction.
211   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
212   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
213                                  MachineInstr &I);
214   bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
215                                      MachineInstr &I);
216   void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
217                                   unsigned Opc);
218   bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
219                                       unsigned Opc);
220   bool selectIntrinsicWithSideEffects(MachineInstr &I,
221                                       MachineRegisterInfo &MRI);
222   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
223   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
224   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
225   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
226   bool selectPtrAuthGlobalValue(MachineInstr &I,
227                                 MachineRegisterInfo &MRI) const;
228   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
229   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
230   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
231   void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
232                    unsigned Opc1, unsigned Opc2, bool isExt);
233 
234   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
236   bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
237 
238   unsigned emitConstantPoolEntry(const Constant *CPVal,
239                                  MachineFunction &MF) const;
240   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
241                                          MachineIRBuilder &MIRBuilder) const;
242 
243   // Emit a vector concat operation.
244   MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
245                                  Register Op2,
246                                  MachineIRBuilder &MIRBuilder) const;
247 
248   // Emit an integer compare between LHS and RHS, which checks for Predicate.
249   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
250                                    MachineOperand &Predicate,
251                                    MachineIRBuilder &MIRBuilder) const;
252 
253   /// Emit a floating point comparison between \p LHS and \p RHS.
254   /// \p Pred, if given, is the predicate to use.
255   MachineInstr *
256   emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
257                 std::optional<CmpInst::Predicate> = std::nullopt) const;
258 
259   MachineInstr *
260   emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
261             std::initializer_list<llvm::SrcOp> SrcOps,
262             MachineIRBuilder &MIRBuilder,
263             const ComplexRendererFns &RenderFns = std::nullopt) const;
264   /// Helper function to emit an add or sub instruction.
265   ///
266   /// \p AddrModeAndSizeToOpcode must contain an opcode variant for each
267   /// addressing mode and register size, in the specific order described below.
268   ///
269   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
270   ///
271   /// \code
272   ///   const std::array<std::array<unsigned, 2>, 5> Table {
273   ///    {{AArch64::ADDXri, AArch64::ADDWri},
274   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
275   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
276   ///     {AArch64::SUBXri, AArch64::SUBWri},
277   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
278   /// \endcode
279   ///
280   /// Each row in the table corresponds to a different addressing mode. Each
281   /// column corresponds to a different register size.
282   ///
283   /// \attention Rows must be structured as follows:
284   ///   - Row 0: The ri opcode variants
285   ///   - Row 1: The rs opcode variants
286   ///   - Row 2: The rr opcode variants
287   ///   - Row 3: The ri opcode variants for negative immediates
288   ///   - Row 4: The rx opcode variants
289   ///
290   /// \attention Columns must be structured as follows:
291   ///   - Column 0: The 64-bit opcode variants
292   ///   - Column 1: The 32-bit opcode variants
293   ///
294   /// \p Dst is the destination register of the binop to emit.
295   /// \p LHS is the left-hand operand of the binop to emit.
296   /// \p RHS is the right-hand operand of the binop to emit.
297   MachineInstr *emitAddSub(
298       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
299       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
300       MachineIRBuilder &MIRBuilder) const;
301   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
302                         MachineOperand &RHS,
303                         MachineIRBuilder &MIRBuilder) const;
304   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
305                          MachineIRBuilder &MIRBuilder) const;
306   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
307                          MachineIRBuilder &MIRBuilder) const;
308   MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
309                          MachineIRBuilder &MIRBuilder) const;
310   MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
311                          MachineIRBuilder &MIRBuilder) const;
312   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
313                         MachineIRBuilder &MIRBuilder) const;
314   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
315                         MachineIRBuilder &MIRBuilder) const;
316   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
317                            AArch64CC::CondCode CC,
318                            MachineIRBuilder &MIRBuilder) const;
319   MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
320                                      const RegisterBank &DstRB, LLT ScalarTy,
321                                      Register VecReg, unsigned LaneIdx,
322                                      MachineIRBuilder &MIRBuilder) const;
323   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
324                           AArch64CC::CondCode Pred,
325                           MachineIRBuilder &MIRBuilder) const;
326   /// Emit a CSet for a FP compare.
327   ///
328   /// \p Dst is expected to be a 32-bit scalar register.
329   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
330                                 MachineIRBuilder &MIRBuilder) const;
331 
332   /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
333   /// Might elide the instruction if the previous instruction already sets NZCV
334   /// correctly.
335   MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
336 
337   /// Emit the overflow op for \p Opcode.
338   ///
339   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
340   /// G_USUBO, etc.
341   std::pair<MachineInstr *, AArch64CC::CondCode>
342   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
343                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
344 
345   bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
346 
347   /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
348   /// In some cases this is even possible with OR operations in the expression.
349   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
350                                 MachineIRBuilder &MIB) const;
351   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
352                                           CmpInst::Predicate CC,
353                                           AArch64CC::CondCode Predicate,
354                                           AArch64CC::CondCode OutCC,
355                                           MachineIRBuilder &MIB) const;
356   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
357                                    bool Negate, Register CCOp,
358                                    AArch64CC::CondCode Predicate,
359                                    MachineIRBuilder &MIB) const;
360 
361   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
362   /// \p IsNegative is true if the test should be "not zero".
363   /// This will also optimize the test bit instruction when possible.
364   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
365                             MachineBasicBlock *DstMBB,
366                             MachineIRBuilder &MIB) const;
367 
368   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
369   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
370                         MachineBasicBlock *DestMBB,
371                         MachineIRBuilder &MIB) const;
372 
373   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
374   // We use these manually instead of using the importer since it doesn't
375   // support SDNodeXForm.
376   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
377   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
378   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
379   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
380 
381   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
382   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
383   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
384 
385   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
386                                             unsigned Size) const;
387 
388   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
389     return selectAddrModeUnscaled(Root, 1);
390   }
391   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
392     return selectAddrModeUnscaled(Root, 2);
393   }
394   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
395     return selectAddrModeUnscaled(Root, 4);
396   }
397   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
398     return selectAddrModeUnscaled(Root, 8);
399   }
400   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
401     return selectAddrModeUnscaled(Root, 16);
402   }
403 
404   /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
405   /// from complex pattern matchers like selectAddrModeIndexed().
406   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
407                                           MachineRegisterInfo &MRI) const;
408 
409   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
410                                            unsigned Size) const;
411   template <int Width>
412   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
413     return selectAddrModeIndexed(Root, Width / 8);
414   }
415 
416   std::optional<bool>
417   isWorthFoldingIntoAddrMode(MachineInstr &MI,
418                              const MachineRegisterInfo &MRI) const;
419 
420   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
421                                      const MachineRegisterInfo &MRI,
422                                      bool IsAddrOperand) const;
423   ComplexRendererFns
424   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
425                                   unsigned SizeInBytes) const;
426 
427   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
428   /// or not a shift + extend should be folded into an addressing mode. Returns
429   /// std::nullopt when this is not profitable or possible.
430   ComplexRendererFns
431   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
432                     MachineOperand &Offset, unsigned SizeInBytes,
433                     bool WantsExt) const;
434   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
435   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
436                                        unsigned SizeInBytes) const;
437   template <int Width>
438   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
439     return selectAddrModeXRO(Root, Width / 8);
440   }
441 
442   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
443                                        unsigned SizeInBytes) const;
444   template <int Width>
445   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
446     return selectAddrModeWRO(Root, Width / 8);
447   }
448 
449   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
450                                            bool AllowROR = false) const;
451 
452   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
453     return selectShiftedRegister(Root);
454   }
455 
456   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
457     return selectShiftedRegister(Root, true);
458   }
459 
460   /// Given an extend instruction, determine the correct shift-extend type for
461   /// that instruction.
462   ///
463   /// If the instruction is going to be used in a load or store, pass
464   /// \p IsLoadStore = true.
465   AArch64_AM::ShiftExtendType
466   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
467                        bool IsLoadStore = false) const;
468 
469   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
470   ///
471   /// \returns Either \p Reg if no change was necessary, or the new register
472   /// created by moving \p Reg.
473   ///
474   /// Note: This uses emitCopy right now.
475   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
476                               MachineIRBuilder &MIB) const;
477 
478   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
479 
480   ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
481 
482   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
483                       int OpIdx = -1) const;
484   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
485                           int OpIdx = -1) const;
486   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
487                           int OpIdx = -1) const;
488   void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
489                        int OpIdx) const;
490   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
491                      int OpIdx = -1) const;
492   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
493                      int OpIdx = -1) const;
494   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
495                      int OpIdx = -1) const;
496   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
497                                     const MachineInstr &MI,
498                                     int OpIdx = -1) const;
499 
500   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
501   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
502 
503   // Optimization methods.
504   bool tryOptSelect(GSelect &Sel);
505   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
506   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
507                                       MachineOperand &Predicate,
508                                       MachineIRBuilder &MIRBuilder) const;
509 
510   /// Return true if \p MI is a load or store of \p NumBytes bytes.
511   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
512 
513   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
514   /// register zeroed out. In other words, the result of MI has been explicitly
515   /// zero extended.
516   bool isDef32(const MachineInstr &MI) const;
517 
518   const AArch64TargetMachine &TM;
519   const AArch64Subtarget &STI;
520   const AArch64InstrInfo &TII;
521   const AArch64RegisterInfo &TRI;
522   const AArch64RegisterBankInfo &RBI;
523 
524   bool ProduceNonFlagSettingCondBr = false;
525 
526   // Some cached values used during selection.
527   // We use LR as a live-in register, and we keep track of it here as it can be
528   // clobbered by calls.
529   Register MFReturnAddr;
530 
531   MachineIRBuilder MIB;
532 
533 #define GET_GLOBALISEL_PREDICATES_DECL
534 #include "AArch64GenGlobalISel.inc"
535 #undef GET_GLOBALISEL_PREDICATES_DECL
536 
537 // We declare the temporaries used by selectImpl() in the class to minimize the
538 // cost of constructing placeholder values.
539 #define GET_GLOBALISEL_TEMPORARIES_DECL
540 #include "AArch64GenGlobalISel.inc"
541 #undef GET_GLOBALISEL_TEMPORARIES_DECL
542 };
543 
544 } // end anonymous namespace
545 
546 #define GET_GLOBALISEL_IMPL
547 #include "AArch64GenGlobalISel.inc"
548 #undef GET_GLOBALISEL_IMPL
549 
550 AArch64InstructionSelector::AArch64InstructionSelector(
551     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
552     const AArch64RegisterBankInfo &RBI)
553     : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
554       RBI(RBI),
555 #define GET_GLOBALISEL_PREDICATES_INIT
556 #include "AArch64GenGlobalISel.inc"
557 #undef GET_GLOBALISEL_PREDICATES_INIT
558 #define GET_GLOBALISEL_TEMPORARIES_INIT
559 #include "AArch64GenGlobalISel.inc"
560 #undef GET_GLOBALISEL_TEMPORARIES_INIT
561 {
562 }
563 
564 // FIXME: This should be target-independent, inferred from the types declared
565 // for each class in the bank.
566 //
567 /// Given a register bank and a type, return the smallest register class that
568 /// can represent that combination.
569 static const TargetRegisterClass *
570 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
571                          bool GetAllRegSet = false) {
572   if (RB.getID() == AArch64::GPRRegBankID) {
573     if (Ty.getSizeInBits() <= 32)
574       return GetAllRegSet ? &AArch64::GPR32allRegClass
575                           : &AArch64::GPR32RegClass;
576     if (Ty.getSizeInBits() == 64)
577       return GetAllRegSet ? &AArch64::GPR64allRegClass
578                           : &AArch64::GPR64RegClass;
579     if (Ty.getSizeInBits() == 128)
580       return &AArch64::XSeqPairsClassRegClass;
581     return nullptr;
582   }
583 
584   if (RB.getID() == AArch64::FPRRegBankID) {
585     switch (Ty.getSizeInBits()) {
586     case 8:
587       return &AArch64::FPR8RegClass;
588     case 16:
589       return &AArch64::FPR16RegClass;
590     case 32:
591       return &AArch64::FPR32RegClass;
592     case 64:
593       return &AArch64::FPR64RegClass;
594     case 128:
595       return &AArch64::FPR128RegClass;
596     }
597     return nullptr;
598   }
599 
600   return nullptr;
601 }
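
// Illustrative example (not exhaustive): an s64 value on the GPR bank maps to
// GPR64 (or GPR64all when GetAllRegSet is true), while a 64-bit vector such as
// v2s32 on the FPR bank maps to FPR64. Here GPRBank/FPRBank stand for the
// corresponding RegisterBank objects obtained from the register bank info.
//
//   getRegClassForTypeOnBank(LLT::scalar(64), GPRBank);          // &GPR64RegClass
//   getRegClassForTypeOnBank(LLT::fixed_vector(2, 32), FPRBank); // &FPR64RegClass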
602 
603 /// Given a register bank and a size in bits, return the smallest register
604 /// class that can represent that combination.
605 static const TargetRegisterClass *
606 getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
607                       bool GetAllRegSet = false) {
608   if (SizeInBits.isScalable()) {
609     assert(RB.getID() == AArch64::FPRRegBankID &&
610            "Expected FPR regbank for scalable type size");
611     return &AArch64::ZPRRegClass;
612   }
613 
614   unsigned RegBankID = RB.getID();
615 
616   if (RegBankID == AArch64::GPRRegBankID) {
617     assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
618     if (SizeInBits <= 32)
619       return GetAllRegSet ? &AArch64::GPR32allRegClass
620                           : &AArch64::GPR32RegClass;
621     if (SizeInBits == 64)
622       return GetAllRegSet ? &AArch64::GPR64allRegClass
623                           : &AArch64::GPR64RegClass;
624     if (SizeInBits == 128)
625       return &AArch64::XSeqPairsClassRegClass;
626   }
627 
628   if (RegBankID == AArch64::FPRRegBankID) {
629     if (SizeInBits.isScalable()) {
630       assert(SizeInBits == TypeSize::getScalable(128) &&
631              "Unexpected scalable register size");
632       return &AArch64::ZPRRegClass;
633     }
634 
635     switch (SizeInBits) {
636     default:
637       return nullptr;
638     case 8:
639       return &AArch64::FPR8RegClass;
640     case 16:
641       return &AArch64::FPR16RegClass;
642     case 32:
643       return &AArch64::FPR32RegClass;
644     case 64:
645       return &AArch64::FPR64RegClass;
646     case 128:
647       return &AArch64::FPR128RegClass;
648     }
649   }
650 
651   return nullptr;
652 }
653 
654 /// Returns the correct subregister to use for a given register class.
655 static bool getSubRegForClass(const TargetRegisterClass *RC,
656                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
657   switch (TRI.getRegSizeInBits(*RC)) {
658   case 8:
659     SubReg = AArch64::bsub;
660     break;
661   case 16:
662     SubReg = AArch64::hsub;
663     break;
664   case 32:
665     if (RC != &AArch64::FPR32RegClass)
666       SubReg = AArch64::sub_32;
667     else
668       SubReg = AArch64::ssub;
669     break;
670   case 64:
671     SubReg = AArch64::dsub;
672     break;
673   default:
674     LLVM_DEBUG(
675         dbgs() << "Couldn't find appropriate subregister for register class.");
676     return false;
677   }
678 
679   return true;
680 }
681 
682 /// Returns the minimum size in bits that the given register bank can hold.
683 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
684   switch (RB.getID()) {
685   case AArch64::GPRRegBankID:
686     return 32;
687   case AArch64::FPRRegBankID:
688     return 8;
689   default:
690     llvm_unreachable("Tried to get minimum size for unknown register bank.");
691   }
692 }
693 
694 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
695 /// Helper function for functions like createDTuple and createQTuple.
696 ///
697 /// \p RegClassIDs - The list of register class IDs available for some tuple of
698 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
699 /// expected to contain between 2 and 4 tuple classes.
700 ///
701 /// \p SubRegs - The list of subregister classes associated with each register
702 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
703 /// subregister class. The index of each subregister class is expected to
704 /// correspond with the index of each register class.
705 ///
706 /// \returns Either the destination register of REG_SEQUENCE instruction that
707 /// was created, or the 0th element of \p Regs if \p Regs contains a single
708 /// element.
709 static Register createTuple(ArrayRef<Register> Regs,
710                             const unsigned RegClassIDs[],
711                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
712   unsigned NumRegs = Regs.size();
713   if (NumRegs == 1)
714     return Regs[0];
715   assert(NumRegs >= 2 && NumRegs <= 4 &&
716          "Only support between 2 and 4 registers in a tuple!");
717   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
718   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
719   auto RegSequence =
720       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
721   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
722     RegSequence.addUse(Regs[I]);
723     RegSequence.addImm(SubRegs[I]);
724   }
725   return RegSequence.getReg(0);
726 }
727 
728 /// Create a tuple of D-registers using the registers in \p Regs.
729 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
730   static const unsigned RegClassIDs[] = {
731       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
732   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
733                                      AArch64::dsub2, AArch64::dsub3};
734   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
735 }
736 
737 /// Create a tuple of Q-registers using the registers in \p Regs.
738 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
739   static const unsigned RegClassIDs[] = {
740       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
741   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
742                                      AArch64::qsub2, AArch64::qsub3};
743   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
744 }
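
// Illustrative example: for two D registers %d0 and %d1, createDTuple builds a
// single DD tuple via REG_SEQUENCE, roughly:
//
//   %tup:dd = REG_SEQUENCE %d0, dsub0, %d1, dsub1
//
// createQTuple does the same using the QQ/QQQ/QQQQ classes and qsub0-qsub3.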
745 
746 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
747   auto &MI = *Root.getParent();
748   auto &MBB = *MI.getParent();
749   auto &MF = *MBB.getParent();
750   auto &MRI = MF.getRegInfo();
751   uint64_t Immed;
752   if (Root.isImm())
753     Immed = Root.getImm();
754   else if (Root.isCImm())
755     Immed = Root.getCImm()->getZExtValue();
756   else if (Root.isReg()) {
757     auto ValAndVReg =
758         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
759     if (!ValAndVReg)
760       return std::nullopt;
761     Immed = ValAndVReg->Value.getSExtValue();
762   } else
763     return std::nullopt;
764   return Immed;
765 }
766 
767 /// Check whether \p I is a currently unsupported binary operation:
768 /// - it has an unsized type
769 /// - an operand is not a vreg
770 /// - its operands are not all in the same bank
771 /// These are checks that should someday live in the verifier, but right now,
772 /// these are mostly limitations of the AArch64 selector.
773 static bool unsupportedBinOp(const MachineInstr &I,
774                              const AArch64RegisterBankInfo &RBI,
775                              const MachineRegisterInfo &MRI,
776                              const AArch64RegisterInfo &TRI) {
777   LLT Ty = MRI.getType(I.getOperand(0).getReg());
778   if (!Ty.isValid()) {
779     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
780     return true;
781   }
782 
783   const RegisterBank *PrevOpBank = nullptr;
784   for (auto &MO : I.operands()) {
785     // FIXME: Support non-register operands.
786     if (!MO.isReg()) {
787       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
788       return true;
789     }
790 
791     // FIXME: Can generic operations have physical register operands? If
792     // so, this will need to be taught about that, and we'll need to get the
793     // bank out of the minimal class for the register.
794     // Either way, this needs to be documented (and possibly verified).
795     if (!MO.getReg().isVirtual()) {
796       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
797       return true;
798     }
799 
800     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
801     if (!OpBank) {
802       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
803       return true;
804     }
805 
806     if (PrevOpBank && OpBank != PrevOpBank) {
807       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
808       return true;
809     }
810     PrevOpBank = OpBank;
811   }
812   return false;
813 }
814 
815 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
816 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
817 /// and of size \p OpSize.
818 /// \returns \p GenericOpc if the combination is unsupported.
819 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
820                                unsigned OpSize) {
821   switch (RegBankID) {
822   case AArch64::GPRRegBankID:
823     if (OpSize == 32) {
824       switch (GenericOpc) {
825       case TargetOpcode::G_SHL:
826         return AArch64::LSLVWr;
827       case TargetOpcode::G_LSHR:
828         return AArch64::LSRVWr;
829       case TargetOpcode::G_ASHR:
830         return AArch64::ASRVWr;
831       default:
832         return GenericOpc;
833       }
834     } else if (OpSize == 64) {
835       switch (GenericOpc) {
836       case TargetOpcode::G_PTR_ADD:
837         return AArch64::ADDXrr;
838       case TargetOpcode::G_SHL:
839         return AArch64::LSLVXr;
840       case TargetOpcode::G_LSHR:
841         return AArch64::LSRVXr;
842       case TargetOpcode::G_ASHR:
843         return AArch64::ASRVXr;
844       default:
845         return GenericOpc;
846       }
847     }
848     break;
849   case AArch64::FPRRegBankID:
850     switch (OpSize) {
851     case 32:
852       switch (GenericOpc) {
853       case TargetOpcode::G_FADD:
854         return AArch64::FADDSrr;
855       case TargetOpcode::G_FSUB:
856         return AArch64::FSUBSrr;
857       case TargetOpcode::G_FMUL:
858         return AArch64::FMULSrr;
859       case TargetOpcode::G_FDIV:
860         return AArch64::FDIVSrr;
861       default:
862         return GenericOpc;
863       }
864     case 64:
865       switch (GenericOpc) {
866       case TargetOpcode::G_FADD:
867         return AArch64::FADDDrr;
868       case TargetOpcode::G_FSUB:
869         return AArch64::FSUBDrr;
870       case TargetOpcode::G_FMUL:
871         return AArch64::FMULDrr;
872       case TargetOpcode::G_FDIV:
873         return AArch64::FDIVDrr;
874       case TargetOpcode::G_OR:
875         return AArch64::ORRv8i8;
876       default:
877         return GenericOpc;
878       }
879     }
880     break;
881   }
882   return GenericOpc;
883 }
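
// Illustrative example: a 32-bit G_SHL on the GPR bank selects the variable
// shift instruction, while unsupported combinations fall through and return
// the generic opcode unchanged.
//
//   unsigned Opc = selectBinaryOp(TargetOpcode::G_SHL, AArch64::GPRRegBankID, 32);
//   // Opc == AArch64::LSLVWr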
884 
885 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
886 /// appropriate for the (value) register bank \p RegBankID and of memory access
887 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
888 /// addressing mode (e.g., LDRXui).
889 /// \returns \p GenericOpc if the combination is unsupported.
890 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
891                                     unsigned OpSize) {
892   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
893   switch (RegBankID) {
894   case AArch64::GPRRegBankID:
895     switch (OpSize) {
896     case 8:
897       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
898     case 16:
899       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
900     case 32:
901       return isStore ? AArch64::STRWui : AArch64::LDRWui;
902     case 64:
903       return isStore ? AArch64::STRXui : AArch64::LDRXui;
904     }
905     break;
906   case AArch64::FPRRegBankID:
907     switch (OpSize) {
908     case 8:
909       return isStore ? AArch64::STRBui : AArch64::LDRBui;
910     case 16:
911       return isStore ? AArch64::STRHui : AArch64::LDRHui;
912     case 32:
913       return isStore ? AArch64::STRSui : AArch64::LDRSui;
914     case 64:
915       return isStore ? AArch64::STRDui : AArch64::LDRDui;
916     case 128:
917       return isStore ? AArch64::STRQui : AArch64::LDRQui;
918     }
919     break;
920   }
921   return GenericOpc;
922 }
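
// Illustrative example: the returned opcodes are the unsigned-immediate
// ("ui") addressing-mode variants for the given bank and access size.
//
//   selectLoadStoreUIOp(TargetOpcode::G_LOAD, AArch64::GPRRegBankID, 32);
//   // -> AArch64::LDRWui
//   selectLoadStoreUIOp(TargetOpcode::G_STORE, AArch64::FPRRegBankID, 128);
//   // -> AArch64::STRQui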
923 
924 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
925 /// into a new register of class \p To.
926 ///
927 /// E.g. "To = COPY SrcReg:SubReg"
928 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
929                        const RegisterBankInfo &RBI, Register SrcReg,
930                        const TargetRegisterClass *To, unsigned SubReg) {
931   assert(SrcReg.isValid() && "Expected a valid source register?");
932   assert(To && "Destination register class cannot be null");
933   assert(SubReg && "Expected a valid subregister");
934 
935   MachineIRBuilder MIB(I);
936   auto SubRegCopy =
937       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
938   MachineOperand &RegOp = I.getOperand(1);
939   RegOp.setReg(SubRegCopy.getReg(0));
940 
941   // It's possible that the destination register hasn't been constrained yet.
942   // Make sure it is.
943   if (!I.getOperand(0).getReg().isPhysical())
944     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
945 
946   return true;
947 }
948 
949 /// Helper function to get the source and destination register classes for a
950 /// copy. Returns a std::pair containing the source register class for the
951 /// copy, and the destination register class for the copy. If a register class
952 /// cannot be determined, then it will be nullptr.
953 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
954 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
955                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
956                      const RegisterBankInfo &RBI) {
957   Register DstReg = I.getOperand(0).getReg();
958   Register SrcReg = I.getOperand(1).getReg();
959   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
960   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
961 
962   TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
963   TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
964 
965   // Special casing for cross-bank copies of s1s. We can technically represent
966   // a 1-bit value with any size of register. The minimum size for a GPR is 32
967   // bits. So, we need to put the FPR on 32 bits as well.
968   //
969   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
970   // then we can pull it into the helpers that get the appropriate class for a
971   // register bank. Or make a new helper that carries along some constraint
972   // information.
973   if (SrcRegBank != DstRegBank &&
974       (DstSize == TypeSize::getFixed(1) && SrcSize == TypeSize::getFixed(1)))
975     SrcSize = DstSize = TypeSize::getFixed(32);
976 
977   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
978           getMinClassForRegBank(DstRegBank, DstSize, true)};
979 }
980 
981 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
982 // constrain operands of simple instructions given a TargetRegisterClass
983 // and LLT
984 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
985                              const RegisterBankInfo &RBI) {
986   for (MachineOperand &MO : I.operands()) {
987     if (!MO.isReg())
988       continue;
989     Register Reg = MO.getReg();
990     if (!Reg)
991       continue;
992     if (Reg.isPhysical())
993       continue;
994     LLT Ty = MRI.getType(Reg);
995     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
996     const TargetRegisterClass *RC =
997         dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
998     if (!RC) {
999       const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
1000       RC = getRegClassForTypeOnBank(Ty, RB);
1001       if (!RC) {
1002         LLVM_DEBUG(
1003             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
1004         break;
1005       }
1006     }
1007     RBI.constrainGenericRegister(Reg, *RC, MRI);
1008   }
1009 
1010   return true;
1011 }
1012 
1013 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1014                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1015                        const RegisterBankInfo &RBI) {
1016   Register DstReg = I.getOperand(0).getReg();
1017   Register SrcReg = I.getOperand(1).getReg();
1018   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
1019   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
1020 
1021   // Find the correct register classes for the source and destination registers.
1022   const TargetRegisterClass *SrcRC;
1023   const TargetRegisterClass *DstRC;
1024   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1025 
1026   if (!DstRC) {
1027     LLVM_DEBUG(dbgs() << "Unexpected dest size "
1028                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1029     return false;
1030   }
1031 
1032   // Is this a copy? If so, then we may need to insert a subregister copy.
1033   if (I.isCopy()) {
1034     // Yes. Check if there's anything to fix up.
1035     if (!SrcRC) {
1036       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1037       return false;
1038     }
1039 
1040     const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
1041     const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
1042     unsigned SubReg;
1043 
1044     // If the source bank doesn't support a subregister copy small enough,
1045     // then we first need to copy to the destination bank.
1046     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1047       const TargetRegisterClass *DstTempRC =
1048           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1049       getSubRegForClass(DstRC, TRI, SubReg);
1050 
1051       MachineIRBuilder MIB(I);
1052       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1053       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1054     } else if (SrcSize > DstSize) {
1055       // If the source register is bigger than the destination we need to
1056       // perform a subregister copy.
1057       const TargetRegisterClass *SubRegRC =
1058           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1059       getSubRegForClass(SubRegRC, TRI, SubReg);
1060       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1061     } else if (DstSize > SrcSize) {
1062       // If the destination register is bigger than the source we need to do
1063       // a promotion using SUBREG_TO_REG.
1064       const TargetRegisterClass *PromotionRC =
1065           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1066       getSubRegForClass(SrcRC, TRI, SubReg);
1067 
1068       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1069       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1070               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1071           .addImm(0)
1072           .addUse(SrcReg)
1073           .addImm(SubReg);
1074       MachineOperand &RegOp = I.getOperand(1);
1075       RegOp.setReg(PromoteReg);
1076     }
1077 
1078     // If the destination is a physical register, then there's nothing to
1079     // change, so we're done.
1080     if (DstReg.isPhysical())
1081       return true;
1082   }
1083 
1084   // No need to constrain SrcReg. It will get constrained when we hit another
1085   // of its uses or defs. Copies do not have constraints.
1086   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1087     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1088                       << " operand\n");
1089     return false;
1090   }
1091 
1092   // If this is a GPR ZEXT, we want to just reduce it down into a copy.
1093   // The sizes will be mismatched with the source < 32b, but that's ok.
1094   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1095     I.setDesc(TII.get(AArch64::COPY));
1096     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1097     return selectCopy(I, TII, MRI, TRI, RBI);
1098   }
1099 
1100   I.setDesc(TII.get(AArch64::COPY));
1101   return true;
1102 }
1103 
1104 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1105   if (!DstTy.isScalar() || !SrcTy.isScalar())
1106     return GenericOpc;
1107 
1108   const unsigned DstSize = DstTy.getSizeInBits();
1109   const unsigned SrcSize = SrcTy.getSizeInBits();
1110 
1111   switch (DstSize) {
1112   case 32:
1113     switch (SrcSize) {
1114     case 32:
1115       switch (GenericOpc) {
1116       case TargetOpcode::G_SITOFP:
1117         return AArch64::SCVTFUWSri;
1118       case TargetOpcode::G_UITOFP:
1119         return AArch64::UCVTFUWSri;
1120       case TargetOpcode::G_FPTOSI:
1121         return AArch64::FCVTZSUWSr;
1122       case TargetOpcode::G_FPTOUI:
1123         return AArch64::FCVTZUUWSr;
1124       default:
1125         return GenericOpc;
1126       }
1127     case 64:
1128       switch (GenericOpc) {
1129       case TargetOpcode::G_SITOFP:
1130         return AArch64::SCVTFUXSri;
1131       case TargetOpcode::G_UITOFP:
1132         return AArch64::UCVTFUXSri;
1133       case TargetOpcode::G_FPTOSI:
1134         return AArch64::FCVTZSUWDr;
1135       case TargetOpcode::G_FPTOUI:
1136         return AArch64::FCVTZUUWDr;
1137       default:
1138         return GenericOpc;
1139       }
1140     default:
1141       return GenericOpc;
1142     }
1143   case 64:
1144     switch (SrcSize) {
1145     case 32:
1146       switch (GenericOpc) {
1147       case TargetOpcode::G_SITOFP:
1148         return AArch64::SCVTFUWDri;
1149       case TargetOpcode::G_UITOFP:
1150         return AArch64::UCVTFUWDri;
1151       case TargetOpcode::G_FPTOSI:
1152         return AArch64::FCVTZSUXSr;
1153       case TargetOpcode::G_FPTOUI:
1154         return AArch64::FCVTZUUXSr;
1155       default:
1156         return GenericOpc;
1157       }
1158     case 64:
1159       switch (GenericOpc) {
1160       case TargetOpcode::G_SITOFP:
1161         return AArch64::SCVTFUXDri;
1162       case TargetOpcode::G_UITOFP:
1163         return AArch64::UCVTFUXDri;
1164       case TargetOpcode::G_FPTOSI:
1165         return AArch64::FCVTZSUXDr;
1166       case TargetOpcode::G_FPTOUI:
1167         return AArch64::FCVTZUUXDr;
1168       default:
1169         return GenericOpc;
1170       }
1171     default:
1172       return GenericOpc;
1173     }
1174   default:
1175     return GenericOpc;
1176   }
1177   return GenericOpc;
1178 }
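
// Illustrative example: the conversion opcode is picked from the destination
// and source scalar sizes, e.g.
//
//   selectFPConvOpc(TargetOpcode::G_SITOFP, LLT::scalar(64), LLT::scalar(32));
//   // -> AArch64::SCVTFUWDri (s32 integer to s64 float)
//   selectFPConvOpc(TargetOpcode::G_FPTOSI, LLT::scalar(32), LLT::scalar(64));
//   // -> AArch64::FCVTZSUWDr (s64 float to s32 integer)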
1179 
1180 MachineInstr *
1181 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1182                                        Register False, AArch64CC::CondCode CC,
1183                                        MachineIRBuilder &MIB) const {
1184   MachineRegisterInfo &MRI = *MIB.getMRI();
1185   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1186              RBI.getRegBank(True, MRI, TRI)->getID() &&
1187          "Expected both select operands to have the same regbank?");
1188   LLT Ty = MRI.getType(True);
1189   if (Ty.isVector())
1190     return nullptr;
1191   const unsigned Size = Ty.getSizeInBits();
1192   assert((Size == 32 || Size == 64) &&
1193          "Expected 32 bit or 64 bit select only?");
1194   const bool Is32Bit = Size == 32;
1195   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1196     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1197     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1198     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1199     return &*FCSel;
1200   }
1201 
1202   // By default, we'll try to emit a CSEL.
1203   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1204   bool Optimized = false;
1205   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1206                                  &Optimized](Register &Reg, Register &OtherReg,
1207                                              bool Invert) {
1208     if (Optimized)
1209       return false;
1210 
1211     // Attempt to fold:
1212     //
1213     // %sub = G_SUB 0, %x
1214     // %select = G_SELECT cc, %reg, %sub
1215     //
1216     // Into:
1217     // %select = CSNEG %reg, %x, cc
1218     Register MatchReg;
1219     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1220       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1221       Reg = MatchReg;
1222       if (Invert) {
1223         CC = AArch64CC::getInvertedCondCode(CC);
1224         std::swap(Reg, OtherReg);
1225       }
1226       return true;
1227     }
1228 
1229     // Attempt to fold:
1230     //
1231     // %xor = G_XOR %x, -1
1232     // %select = G_SELECT cc, %reg, %xor
1233     //
1234     // Into:
1235     // %select = CSINV %reg, %x, cc
1236     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1237       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1238       Reg = MatchReg;
1239       if (Invert) {
1240         CC = AArch64CC::getInvertedCondCode(CC);
1241         std::swap(Reg, OtherReg);
1242       }
1243       return true;
1244     }
1245 
1246     // Attempt to fold:
1247     //
1248     // %add = G_ADD %x, 1
1249     // %select = G_SELECT cc, %reg, %add
1250     //
1251     // Into:
1252     // %select = CSINC %reg, %x, cc
1253     if (mi_match(Reg, MRI,
1254                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1255                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1256       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1257       Reg = MatchReg;
1258       if (Invert) {
1259         CC = AArch64CC::getInvertedCondCode(CC);
1260         std::swap(Reg, OtherReg);
1261       }
1262       return true;
1263     }
1264 
1265     return false;
1266   };
1267 
1268   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1269   // true/false values are constants.
1270   // FIXME: All of these patterns already exist in tablegen. We should be
1271   // able to import these.
1272   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1273                           &Optimized]() {
1274     if (Optimized)
1275       return false;
1276     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1277     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1278     if (!TrueCst && !FalseCst)
1279       return false;
1280 
1281     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1282     if (TrueCst && FalseCst) {
1283       int64_t T = TrueCst->Value.getSExtValue();
1284       int64_t F = FalseCst->Value.getSExtValue();
1285 
1286       if (T == 0 && F == 1) {
1287         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1288         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1289         True = ZReg;
1290         False = ZReg;
1291         return true;
1292       }
1293 
1294       if (T == 0 && F == -1) {
1295         // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1296         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1297         True = ZReg;
1298         False = ZReg;
1299         return true;
1300       }
1301     }
1302 
1303     if (TrueCst) {
1304       int64_t T = TrueCst->Value.getSExtValue();
1305       if (T == 1) {
1306         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1307         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1308         True = False;
1309         False = ZReg;
1310         CC = AArch64CC::getInvertedCondCode(CC);
1311         return true;
1312       }
1313 
1314       if (T == -1) {
1315         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1316         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1317         True = False;
1318         False = ZReg;
1319         CC = AArch64CC::getInvertedCondCode(CC);
1320         return true;
1321       }
1322     }
1323 
1324     if (FalseCst) {
1325       int64_t F = FalseCst->Value.getSExtValue();
1326       if (F == 1) {
1327         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1328         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1329         False = ZReg;
1330         return true;
1331       }
1332 
1333       if (F == -1) {
1334         // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1335         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1336         False = ZReg;
1337         return true;
1338       }
1339     }
1340     return false;
1341   };
1342 
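       // Fold a binop feeding the false operand first; this needs no condition
       // inversion. Then try one feeding the true operand, which requires
       // inverting the condition code and swapping the operands.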
1343   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1344   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1345   Optimized |= TryOptSelectCst();
1346   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1347   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1348   return &*SelectInst;
1349 }
1350 
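     /// Convert an IR integer comparison predicate to the equivalent AArch64
     /// condition code, e.g. ICMP_SGT -> GT and ICMP_ULT -> LO.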
1351 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1352   switch (P) {
1353   default:
1354     llvm_unreachable("Unknown condition code!");
1355   case CmpInst::ICMP_NE:
1356     return AArch64CC::NE;
1357   case CmpInst::ICMP_EQ:
1358     return AArch64CC::EQ;
1359   case CmpInst::ICMP_SGT:
1360     return AArch64CC::GT;
1361   case CmpInst::ICMP_SGE:
1362     return AArch64CC::GE;
1363   case CmpInst::ICMP_SLT:
1364     return AArch64CC::LT;
1365   case CmpInst::ICMP_SLE:
1366     return AArch64CC::LE;
1367   case CmpInst::ICMP_UGT:
1368     return AArch64CC::HI;
1369   case CmpInst::ICMP_UGE:
1370     return AArch64CC::HS;
1371   case CmpInst::ICMP_ULT:
1372     return AArch64CC::LO;
1373   case CmpInst::ICMP_ULE:
1374     return AArch64CC::LS;
1375   }
1376 }
1377 
1378 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1379 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1380                                     AArch64CC::CondCode &CondCode,
1381                                     AArch64CC::CondCode &CondCode2) {
1382   CondCode2 = AArch64CC::AL;
1383   switch (CC) {
1384   default:
1385     llvm_unreachable("Unknown FP condition!");
1386   case CmpInst::FCMP_OEQ:
1387     CondCode = AArch64CC::EQ;
1388     break;
1389   case CmpInst::FCMP_OGT:
1390     CondCode = AArch64CC::GT;
1391     break;
1392   case CmpInst::FCMP_OGE:
1393     CondCode = AArch64CC::GE;
1394     break;
1395   case CmpInst::FCMP_OLT:
1396     CondCode = AArch64CC::MI;
1397     break;
1398   case CmpInst::FCMP_OLE:
1399     CondCode = AArch64CC::LS;
1400     break;
1401   case CmpInst::FCMP_ONE:
1402     CondCode = AArch64CC::MI;
1403     CondCode2 = AArch64CC::GT;
1404     break;
1405   case CmpInst::FCMP_ORD:
1406     CondCode = AArch64CC::VC;
1407     break;
1408   case CmpInst::FCMP_UNO:
1409     CondCode = AArch64CC::VS;
1410     break;
1411   case CmpInst::FCMP_UEQ:
1412     CondCode = AArch64CC::EQ;
1413     CondCode2 = AArch64CC::VS;
1414     break;
1415   case CmpInst::FCMP_UGT:
1416     CondCode = AArch64CC::HI;
1417     break;
1418   case CmpInst::FCMP_UGE:
1419     CondCode = AArch64CC::PL;
1420     break;
1421   case CmpInst::FCMP_ULT:
1422     CondCode = AArch64CC::LT;
1423     break;
1424   case CmpInst::FCMP_ULE:
1425     CondCode = AArch64CC::LE;
1426     break;
1427   case CmpInst::FCMP_UNE:
1428     CondCode = AArch64CC::NE;
1429     break;
1430   }
1431 }
1432 
1433 /// Convert an IR fp condition code to an AArch64 CC.
1434 /// This differs from changeFPCCToORAArch64CC in that it returns cond codes that
1435 /// should be AND'ed instead of OR'ed.
1436 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1437                                      AArch64CC::CondCode &CondCode,
1438                                      AArch64CC::CondCode &CondCode2) {
1439   CondCode2 = AArch64CC::AL;
1440   switch (CC) {
1441   default:
1442     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1443     assert(CondCode2 == AArch64CC::AL);
1444     break;
1445   case CmpInst::FCMP_ONE:
1446     // (a one b)
1447     // == ((a olt b) || (a ogt b))
1448     // == ((a ord b) && (a une b))
1449     CondCode = AArch64CC::VC;
1450     CondCode2 = AArch64CC::NE;
1451     break;
1452   case CmpInst::FCMP_UEQ:
1453     // (a ueq b)
1454     // == ((a uno b) || (a oeq b))
1455     // == ((a ule b) && (a uge b))
1456     CondCode = AArch64CC::PL;
1457     CondCode2 = AArch64CC::LE;
1458     break;
1459   }
1460 }
1461 
1462 /// Return a register which can be used as a bit to test in a TB(N)Z.
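     /// Walks back through single-use extends, truncates, shifts, ANDs and XORs
     /// feeding \p Reg, updating \p Bit and \p Invert so the same test can be
     /// applied to a value earlier in the chain.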
1463 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1464                               MachineRegisterInfo &MRI) {
1465   assert(Reg.isValid() && "Expected valid register!");
1466   bool HasZext = false;
1467   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1468     unsigned Opc = MI->getOpcode();
1469 
1470     if (!MI->getOperand(0).isReg() ||
1471         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1472       break;
1473 
1474     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1475     //
1476     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1477     // on the truncated x is the same as the bit number on x.
1478     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1479         Opc == TargetOpcode::G_TRUNC) {
1480       if (Opc == TargetOpcode::G_ZEXT)
1481         HasZext = true;
1482 
1483       Register NextReg = MI->getOperand(1).getReg();
1484       // Did we find something worth folding?
1485       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1486         break;
1487 
1488       // NextReg is worth folding. Keep looking.
1489       Reg = NextReg;
1490       continue;
1491     }
1492 
1493     // Attempt to find a suitable operation with a constant on one side.
1494     std::optional<uint64_t> C;
1495     Register TestReg;
1496     switch (Opc) {
1497     default:
1498       break;
1499     case TargetOpcode::G_AND:
1500     case TargetOpcode::G_XOR: {
1501       TestReg = MI->getOperand(1).getReg();
1502       Register ConstantReg = MI->getOperand(2).getReg();
1503       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1504       if (!VRegAndVal) {
1505         // AND commutes, check the other side for a constant.
1506         // FIXME: Can we canonicalize the constant so that it's always on the
1507         // same side at some point earlier?
1508         std::swap(ConstantReg, TestReg);
1509         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1510       }
1511       if (VRegAndVal) {
1512         if (HasZext)
1513           C = VRegAndVal->Value.getZExtValue();
1514         else
1515           C = VRegAndVal->Value.getSExtValue();
1516       }
1517       break;
1518     }
1519     case TargetOpcode::G_ASHR:
1520     case TargetOpcode::G_LSHR:
1521     case TargetOpcode::G_SHL: {
1522       TestReg = MI->getOperand(1).getReg();
1523       auto VRegAndVal =
1524           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1525       if (VRegAndVal)
1526         C = VRegAndVal->Value.getSExtValue();
1527       break;
1528     }
1529     }
1530 
1531     // Didn't find a constant or viable register. Bail out of the loop.
1532     if (!C || !TestReg.isValid())
1533       break;
1534 
1535     // We found a suitable instruction with a constant. Check to see if we can
1536     // walk through the instruction.
1537     Register NextReg;
1538     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1539     switch (Opc) {
1540     default:
1541       break;
1542     case TargetOpcode::G_AND:
1543       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1544       if ((*C >> Bit) & 1)
1545         NextReg = TestReg;
1546       break;
1547     case TargetOpcode::G_SHL:
1548       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1549       // the type of the register.
1550       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1551         NextReg = TestReg;
1552         Bit = Bit - *C;
1553       }
1554       break;
1555     case TargetOpcode::G_ASHR:
1556       // (tbz (ashr x, c), b) -> (tbz x, b+c), or (tbz x, msb) if b+c is >=
1557       // # bits in x.
1558       NextReg = TestReg;
1559       Bit = Bit + *C;
1560       if (Bit >= TestRegSize)
1561         Bit = TestRegSize - 1;
1562       break;
1563     case TargetOpcode::G_LSHR:
1564       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1565       if ((Bit + *C) < TestRegSize) {
1566         NextReg = TestReg;
1567         Bit = Bit + *C;
1568       }
1569       break;
1570     case TargetOpcode::G_XOR:
1571       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1572       // appropriate.
1573       //
1574       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1575       //
1576       // tbz x', b -> tbnz x, b
1577       //
1578       // Because x' only has the b-th bit set if x does not.
1579       if ((*C >> Bit) & 1)
1580         Invert = !Invert;
1581       NextReg = TestReg;
1582       break;
1583     }
1584 
1585     // Check if we found anything worth folding.
1586     if (!NextReg.isValid())
1587       return Reg;
1588     Reg = NextReg;
1589   }
1590 
1591   return Reg;
1592 }
1593 
1594 MachineInstr *AArch64InstructionSelector::emitTestBit(
1595     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1596     MachineIRBuilder &MIB) const {
1597   assert(TestReg.isValid());
1598   assert(ProduceNonFlagSettingCondBr &&
1599          "Cannot emit TB(N)Z with speculation tracking!");
1600   MachineRegisterInfo &MRI = *MIB.getMRI();
1601 
1602   // Attempt to optimize the test bit by walking over instructions.
1603   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1604   LLT Ty = MRI.getType(TestReg);
1605   unsigned Size = Ty.getSizeInBits();
1606   assert(!Ty.isVector() && "Expected a scalar!");
1607   assert(Bit < 64 && "Bit is too large!");
1608 
1609   // Use the W-register form (TB(N)ZW) when the bit lies in the low 32 bits,
1610   // moving the register to the matching class if its size doesn't line up.
1611   bool UseWReg = Bit < 32;
1612   unsigned NecessarySize = UseWReg ? 32 : 64;
1613   if (Size != NecessarySize)
1614     TestReg = moveScalarRegClass(
1615         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1616         MIB);
1617 
1618   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1619                                           {AArch64::TBZW, AArch64::TBNZW}};
1620   unsigned Opc = OpcTable[UseWReg][IsNegative];
1621   auto TestBitMI =
1622       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1623   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1624   return &*TestBitMI;
1625 }
1626 
1627 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1628     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1629     MachineIRBuilder &MIB) const {
1630   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1631   // Given something like this:
1632   //
1633   //  %x = ...Something...
1634   //  %one = G_CONSTANT i64 1
1635   //  %zero = G_CONSTANT i64 0
1636   //  %and = G_AND %x, %one
1637   //  %cmp = G_ICMP intpred(ne), %and, %zero
1638   //  %cmp_trunc = G_TRUNC %cmp
1639   //  G_BRCOND %cmp_trunc, %bb.3
1640   //
1641   // We want to try and fold the AND into the G_BRCOND and produce either a
1642   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1643   //
1644   // In this case, we'd get
1645   //
1646   // TBNZ %x %bb.3
1647   //
1648 
1649   // Check if the AND has a constant on its RHS which we can use as a mask.
1650   // If it's a power of 2, then it's the same as checking a specific bit.
1651   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1652   auto MaybeBit = getIConstantVRegValWithLookThrough(
1653       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1654   if (!MaybeBit)
1655     return false;
1656 
1657   int32_t Bit = MaybeBit->Value.exactLogBase2();
1658   if (Bit < 0)
1659     return false;
1660 
1661   Register TestReg = AndInst.getOperand(1).getReg();
1662 
1663   // Emit a TB(N)Z.
1664   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1665   return true;
1666 }
1667 
1668 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1669                                                   bool IsNegative,
1670                                                   MachineBasicBlock *DestMBB,
1671                                                   MachineIRBuilder &MIB) const {
1672   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1673   MachineRegisterInfo &MRI = *MIB.getMRI();
1674   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1675              AArch64::GPRRegBankID &&
1676          "Expected GPRs only?");
1677   auto Ty = MRI.getType(CompareReg);
1678   unsigned Width = Ty.getSizeInBits();
1679   assert(!Ty.isVector() && "Expected scalar only?");
1680   assert(Width <= 64 && "Expected width to be at most 64?");
1681   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1682                                           {AArch64::CBNZW, AArch64::CBNZX}};
1683   unsigned Opc = OpcTable[IsNegative][Width == 64];
1684   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1685   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1686   return &*BranchMI;
1687 }
1688 
1689 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1690     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1691   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1692   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1693   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1694   // totally clean.  Some of them require two branches to implement.
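       // For example, an ordered not-equal (FCMP_ONE) maps to the pair MI/GT,
       // and we branch to the destination if either condition holds.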
1695   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1696   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1697                 Pred);
1698   AArch64CC::CondCode CC1, CC2;
1699   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1700   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1701   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1702   if (CC2 != AArch64CC::AL)
1703     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1704   I.eraseFromParent();
1705   return true;
1706 }
1707 
1708 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1709     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1710   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1711   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1712   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1713   //
1714   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1715   // instructions will not be produced, as they are conditional branch
1716   // instructions that do not set flags.
1717   if (!ProduceNonFlagSettingCondBr)
1718     return false;
1719 
1720   MachineRegisterInfo &MRI = *MIB.getMRI();
1721   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1722   auto Pred =
1723       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1724   Register LHS = ICmp.getOperand(2).getReg();
1725   Register RHS = ICmp.getOperand(3).getReg();
1726 
1727   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1728   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1729   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1730 
1731   // When we can emit a TB(N)Z, prefer that.
1732   //
1733   // Handle non-commutative condition codes first.
1734   // Note that we don't want to do this when we have a G_AND because it can
1735   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1736   if (VRegAndVal && !AndInst) {
1737     int64_t C = VRegAndVal->Value.getSExtValue();
1738 
1739     // When we have a greater-than comparison, we can just test if the msb is
1740     // zero.
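         // That is, (x s> -1) holds exactly when the sign bit of x is clear.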
1741     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1742       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1743       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1744       I.eraseFromParent();
1745       return true;
1746     }
1747 
1748     // When we have a less than comparison, we can just test if the msb is not
1749     // zero.
1750     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1751       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1752       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1753       I.eraseFromParent();
1754       return true;
1755     }
1756 
1757     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1758     // we can test if the msb is zero.
1759     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1760       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1761       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1762       I.eraseFromParent();
1763       return true;
1764     }
1765   }
1766 
1767   // Attempt to handle commutative condition codes. Right now, that's only
1768   // eq/ne.
1769   if (ICmpInst::isEquality(Pred)) {
1770     if (!VRegAndVal) {
1771       std::swap(RHS, LHS);
1772       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1773       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1774     }
1775 
1776     if (VRegAndVal && VRegAndVal->Value == 0) {
1777       // If there's a G_AND feeding into this branch, try to fold it away by
1778       // emitting a TB(N)Z instead.
1779       //
1780       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1781       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1782       // would be redundant.
1783       if (AndInst &&
1784           tryOptAndIntoCompareBranch(
1785               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1786         I.eraseFromParent();
1787         return true;
1788       }
1789 
1790       // Otherwise, try to emit a CB(N)Z instead.
1791       auto LHSTy = MRI.getType(LHS);
1792       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1793         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1794         I.eraseFromParent();
1795         return true;
1796       }
1797     }
1798   }
1799 
1800   return false;
1801 }
1802 
1803 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1804     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1805   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1806   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1807   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1808     return true;
1809 
1810   // Couldn't optimize. Emit a compare + a Bcc.
1811   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1812   auto PredOp = ICmp.getOperand(1);
1813   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1814   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1815       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1816   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1817   I.eraseFromParent();
1818   return true;
1819 }
1820 
1821 bool AArch64InstructionSelector::selectCompareBranch(
1822     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1823   Register CondReg = I.getOperand(0).getReg();
1824   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1825   // Try to select the G_BRCOND using whatever is feeding the condition if
1826   // possible.
1827   unsigned CCMIOpc = CCMI->getOpcode();
1828   if (CCMIOpc == TargetOpcode::G_FCMP)
1829     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1830   if (CCMIOpc == TargetOpcode::G_ICMP)
1831     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1832 
1833   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1834   // instructions will not be produced, as they are conditional branch
1835   // instructions that do not set flags.
1836   if (ProduceNonFlagSettingCondBr) {
1837     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1838                 I.getOperand(1).getMBB(), MIB);
1839     I.eraseFromParent();
1840     return true;
1841   }
1842 
1843   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1844   auto TstMI =
1845       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1846   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1847   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1848                  .addImm(AArch64CC::NE)
1849                  .addMBB(I.getOperand(1).getMBB());
1850   I.eraseFromParent();
1851   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1852 }
1853 
1854 /// Returns the element immediate value of a vector shift operand if found.
1855 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1856 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1857                                                 MachineRegisterInfo &MRI) {
1858   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1859   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1860   return getAArch64VectorSplatScalar(*OpMI, MRI);
1861 }
1862 
1863 /// Matches and returns the shift immediate value for a SHL instruction given
1864 /// a shift operand.
1865 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1866                                               MachineRegisterInfo &MRI) {
1867   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1868   if (!ShiftImm)
1869     return std::nullopt;
1870   // Check the immediate is in range for a SHL.
1871   int64_t Imm = *ShiftImm;
1872   if (Imm < 0)
1873     return std::nullopt;
1874   switch (SrcTy.getElementType().getSizeInBits()) {
1875   default:
1876     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1877     return std::nullopt;
1878   case 8:
1879     if (Imm > 7)
1880       return std::nullopt;
1881     break;
1882   case 16:
1883     if (Imm > 15)
1884       return std::nullopt;
1885     break;
1886   case 32:
1887     if (Imm > 31)
1888       return std::nullopt;
1889     break;
1890   case 64:
1891     if (Imm > 63)
1892       return std::nullopt;
1893     break;
1894   }
1895   return Imm;
1896 }
1897 
1898 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1899                                                  MachineRegisterInfo &MRI) {
1900   assert(I.getOpcode() == TargetOpcode::G_SHL);
1901   Register DstReg = I.getOperand(0).getReg();
1902   const LLT Ty = MRI.getType(DstReg);
1903   Register Src1Reg = I.getOperand(1).getReg();
1904   Register Src2Reg = I.getOperand(2).getReg();
1905 
1906   if (!Ty.isVector())
1907     return false;
1908 
1909   // Check if we have a vector of constants on RHS that we can select as the
1910   // immediate form.
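       // For example, shifting a <4 x s32> vector by a splat of 3 selects to
       // SHLv4i32_shift with an immediate of 3, while a non-constant shift
       // falls back to USHLv4i32 with the shift amounts in a register.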
1911   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1912 
1913   unsigned Opc = 0;
1914   if (Ty == LLT::fixed_vector(2, 64)) {
1915     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1916   } else if (Ty == LLT::fixed_vector(4, 32)) {
1917     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1918   } else if (Ty == LLT::fixed_vector(2, 32)) {
1919     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1920   } else if (Ty == LLT::fixed_vector(4, 16)) {
1921     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1922   } else if (Ty == LLT::fixed_vector(8, 16)) {
1923     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1924   } else if (Ty == LLT::fixed_vector(16, 8)) {
1925     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1926   } else if (Ty == LLT::fixed_vector(8, 8)) {
1927     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1928   } else {
1929     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1930     return false;
1931   }
1932 
1933   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1934   if (ImmVal)
1935     Shl.addImm(*ImmVal);
1936   else
1937     Shl.addUse(Src2Reg);
1938   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1939   I.eraseFromParent();
1940   return true;
1941 }
1942 
1943 bool AArch64InstructionSelector::selectVectorAshrLshr(
1944     MachineInstr &I, MachineRegisterInfo &MRI) {
1945   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1946          I.getOpcode() == TargetOpcode::G_LSHR);
1947   Register DstReg = I.getOperand(0).getReg();
1948   const LLT Ty = MRI.getType(DstReg);
1949   Register Src1Reg = I.getOperand(1).getReg();
1950   Register Src2Reg = I.getOperand(2).getReg();
1951 
1952   if (!Ty.isVector())
1953     return false;
1954 
1955   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1956 
1957   // We expect the immediate case to be lowered in the PostLegalCombiner to
1958   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1959 
1960   // There is no shift-right-by-register instruction, but the shift-left-by-
1961   // register instruction takes a signed shift amount, where negative values
1962   // specify a right shift.
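       // So, for example, an arithmetic shift right of a <4 x s32> vector is
       // selected as NEGv4i32 of the shift amounts followed by SSHLv4i32.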
1963 
1964   unsigned Opc = 0;
1965   unsigned NegOpc = 0;
1966   const TargetRegisterClass *RC =
1967       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1968   if (Ty == LLT::fixed_vector(2, 64)) {
1969     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1970     NegOpc = AArch64::NEGv2i64;
1971   } else if (Ty == LLT::fixed_vector(4, 32)) {
1972     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1973     NegOpc = AArch64::NEGv4i32;
1974   } else if (Ty == LLT::fixed_vector(2, 32)) {
1975     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1976     NegOpc = AArch64::NEGv2i32;
1977   } else if (Ty == LLT::fixed_vector(4, 16)) {
1978     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1979     NegOpc = AArch64::NEGv4i16;
1980   } else if (Ty == LLT::fixed_vector(8, 16)) {
1981     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1982     NegOpc = AArch64::NEGv8i16;
1983   } else if (Ty == LLT::fixed_vector(16, 8)) {
1984     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1985     NegOpc = AArch64::NEGv16i8;
1986   } else if (Ty == LLT::fixed_vector(8, 8)) {
1987     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1988     NegOpc = AArch64::NEGv8i8;
1989   } else {
1990     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1991     return false;
1992   }
1993 
1994   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1995   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1996   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1997   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1998   I.eraseFromParent();
1999   return true;
2000 }
2001 
2002 bool AArch64InstructionSelector::selectVaStartAAPCS(
2003     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2004 
2005   if (STI.isCallingConvWin64(MF.getFunction().getCallingConv(),
2006                              MF.getFunction().isVarArg()))
2007     return false;
2008 
2009   // The layout of the va_list struct is specified in the AArch64 Procedure Call
2010   // Standard, section 10.1.5.
2011 
2012   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2013   const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
2014   const auto *PtrRegClass =
2015       STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
2016 
2017   const MCInstrDesc &MCIDAddAddr =
2018       TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
2019   const MCInstrDesc &MCIDStoreAddr =
2020       TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
2021 
2022   /*
2023    * typedef struct va_list {
2024    *  void * stack; // next stack param
2025    *  void * gr_top; // end of GP arg reg save area
2026    *  void * vr_top; // end of FP/SIMD arg reg save area
2027    *  int gr_offs; // offset from gr_top to next GP register arg
2028    *  int vr_offs; // offset from vr_top to next FP/SIMD register arg
2029    * } va_list;
2030    */
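       // With 8-byte pointers the fields live at offsets 0, 8, 16, 24 and 28
       // (32 bytes in total); on ILP32 they live at 0, 4, 8, 12 and 16 (20
       // bytes), matching the assertion on OffsetBytes below.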
2031   const auto VAList = I.getOperand(0).getReg();
2032 
2033   // Our current offset in bytes from the va_list struct (VAList).
2034   unsigned OffsetBytes = 0;
2035 
2036   // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
2037   // and increment OffsetBytes by PtrSize.
2038   const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
2039     const Register Top = MRI.createVirtualRegister(PtrRegClass);
2040     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr)
2041                    .addDef(Top)
2042                    .addFrameIndex(FrameIndex)
2043                    .addImm(Imm)
2044                    .addImm(0);
2045     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2046 
2047     const auto *MMO = *I.memoperands_begin();
2048     MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr)
2049               .addUse(Top)
2050               .addUse(VAList)
2051               .addImm(OffsetBytes / PtrSize)
2052               .addMemOperand(MF.getMachineMemOperand(
2053                   MMO->getPointerInfo().getWithOffset(OffsetBytes),
2054                   MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign()));
2055     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2056 
2057     OffsetBytes += PtrSize;
2058   };
2059 
2060   // void* stack at offset 0
2061   PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
2062 
2063   // void* gr_top at offset 8 (4 on ILP32)
2064   const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
2065   PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
2066 
2067   // void* vr_top at offset 16 (8 on ILP32)
2068   const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
2069   PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
2070 
2071   // Helper function to store a 4-byte integer constant to VAList at offset
2072   // OffsetBytes, and increment OffsetBytes by 4.
2073   const auto PushIntConstant = [&](const int32_t Value) {
2074     constexpr int IntSize = 4;
2075     const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2076     auto MIB =
2077         BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm))
2078             .addDef(Temp)
2079             .addImm(Value);
2080     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2081 
2082     const auto *MMO = *I.memoperands_begin();
2083     MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui))
2084               .addUse(Temp)
2085               .addUse(VAList)
2086               .addImm(OffsetBytes / IntSize)
2087               .addMemOperand(MF.getMachineMemOperand(
2088                   MMO->getPointerInfo().getWithOffset(OffsetBytes),
2089                   MachineMemOperand::MOStore, IntSize, MMO->getBaseAlign()));
2090     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2091     OffsetBytes += IntSize;
2092   };
2093 
2094   // int gr_offs at offset 24 (12 on ILP32)
2095   PushIntConstant(-static_cast<int32_t>(GPRSize));
2096 
2097   // int vr_offs at offset 28 (16 on ILP32)
2098   PushIntConstant(-static_cast<int32_t>(FPRSize));
2099 
2100   assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
2101 
2102   I.eraseFromParent();
2103   return true;
2104 }
2105 
2106 bool AArch64InstructionSelector::selectVaStartDarwin(
2107     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2108   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2109   Register ListReg = I.getOperand(0).getReg();
2110 
2111   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2112 
2113   int FrameIdx = FuncInfo->getVarArgsStackIndex();
2114   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2115           MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) {
2116     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2117                    ? FuncInfo->getVarArgsGPRIndex()
2118                    : FuncInfo->getVarArgsStackIndex();
2119   }
2120 
2121   auto MIB =
2122       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2123           .addDef(ArgsAddrReg)
2124           .addFrameIndex(FrameIdx)
2125           .addImm(0)
2126           .addImm(0);
2127 
2128   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2129 
2130   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2131             .addUse(ArgsAddrReg)
2132             .addUse(ListReg)
2133             .addImm(0)
2134             .addMemOperand(*I.memoperands_begin());
2135 
2136   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2137   I.eraseFromParent();
2138   return true;
2139 }
2140 
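     /// Materialize a 64-bit symbolic address (e.g. for the large code model)
     /// as a MOVZ of the low 16 bits followed by three MOVKs for bits [16,32),
     /// [32,48) and [48,64), each relocated against the same global or block
     /// address.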
2141 void AArch64InstructionSelector::materializeLargeCMVal(
2142     MachineInstr &I, const Value *V, unsigned OpFlags) {
2143   MachineBasicBlock &MBB = *I.getParent();
2144   MachineFunction &MF = *MBB.getParent();
2145   MachineRegisterInfo &MRI = MF.getRegInfo();
2146 
2147   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2148   MovZ->addOperand(MF, I.getOperand(1));
2149   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2150                                      AArch64II::MO_NC);
2151   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2152   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2153 
2154   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2155                        Register ForceDstReg) {
2156     Register DstReg = ForceDstReg
2157                           ? ForceDstReg
2158                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2159     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2160     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2161       MovI->addOperand(MF, MachineOperand::CreateGA(
2162                                GV, MovZ->getOperand(1).getOffset(), Flags));
2163     } else {
2164       MovI->addOperand(
2165           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2166                                        MovZ->getOperand(1).getOffset(), Flags));
2167     }
2168     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2169     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2170     return DstReg;
2171   };
2172   Register DstReg = BuildMovK(MovZ.getReg(0),
2173                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2174   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2175   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2176 }
2177 
2178 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2179   MachineBasicBlock &MBB = *I.getParent();
2180   MachineFunction &MF = *MBB.getParent();
2181   MachineRegisterInfo &MRI = MF.getRegInfo();
2182 
2183   switch (I.getOpcode()) {
2184   case TargetOpcode::G_STORE: {
2185     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2186     MachineOperand &SrcOp = I.getOperand(0);
2187     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2188       // Allow matching with imported patterns for stores of pointers. Unlike
2189       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2190       // and constrain.
2191       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2192       Register NewSrc = Copy.getReg(0);
2193       SrcOp.setReg(NewSrc);
2194       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2195       Changed = true;
2196     }
2197     return Changed;
2198   }
2199   case TargetOpcode::G_PTR_ADD:
2200     return convertPtrAddToAdd(I, MRI);
2201   case TargetOpcode::G_LOAD: {
2202     // For scalar loads of pointers, we try to convert the dest type from p0
2203     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2204     // conversion, this should be ok because all users should have been
2205     // selected already, so the type doesn't matter for them.
2206     Register DstReg = I.getOperand(0).getReg();
2207     const LLT DstTy = MRI.getType(DstReg);
2208     if (!DstTy.isPointer())
2209       return false;
2210     MRI.setType(DstReg, LLT::scalar(64));
2211     return true;
2212   }
2213   case AArch64::G_DUP: {
2214     // Convert the type from p0 to s64 to help selection.
2215     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2216     if (!DstTy.isPointerVector())
2217       return false;
2218     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2219     MRI.setType(I.getOperand(0).getReg(),
2220                 DstTy.changeElementType(LLT::scalar(64)));
2221     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2222     I.getOperand(1).setReg(NewSrc.getReg(0));
2223     return true;
2224   }
2225   case AArch64::G_INSERT_VECTOR_ELT: {
2226     // Convert the type from p0 to s64 to help selection.
2227     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2228     LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg());
2229     if (!SrcVecTy.isPointerVector())
2230       return false;
2231     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg());
2232     MRI.setType(I.getOperand(1).getReg(),
2233                 DstTy.changeElementType(LLT::scalar(64)));
2234     MRI.setType(I.getOperand(0).getReg(),
2235                 DstTy.changeElementType(LLT::scalar(64)));
2236     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2237     I.getOperand(2).setReg(NewSrc.getReg(0));
2238     return true;
2239   }
2240   case TargetOpcode::G_UITOFP:
2241   case TargetOpcode::G_SITOFP: {
2242     // If both source and destination regbanks are FPR, then convert the opcode
2243     // to G_SITOF/G_UITOF so that the importer can select it to an fpr variant.
2244     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2245     // copy.
2246     Register SrcReg = I.getOperand(1).getReg();
2247     LLT SrcTy = MRI.getType(SrcReg);
2248     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2249     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2250       return false;
2251 
2252     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2253       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2254         I.setDesc(TII.get(AArch64::G_SITOF));
2255       else
2256         I.setDesc(TII.get(AArch64::G_UITOF));
2257       return true;
2258     }
2259     return false;
2260   }
2261   default:
2262     return false;
2263   }
2264 }
2265 
2266 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2267 /// them to a standard G_ADD with a COPY on the source.
2268 ///
2269 /// The motivation behind this is to expose the add semantics to the imported
2270 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2271 /// because the selector works bottom up, uses before defs. By the time we
2272 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2273 /// fold this into addressing modes and were therefore unsuccessful.
2274 bool AArch64InstructionSelector::convertPtrAddToAdd(
2275     MachineInstr &I, MachineRegisterInfo &MRI) {
2276   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2277   Register DstReg = I.getOperand(0).getReg();
2278   Register AddOp1Reg = I.getOperand(1).getReg();
2279   const LLT PtrTy = MRI.getType(DstReg);
2280   if (PtrTy.getAddressSpace() != 0)
2281     return false;
2282 
2283   const LLT CastPtrTy =
2284       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2285   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2286   // Set regbanks on the registers.
2287   if (PtrTy.isVector())
2288     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2289   else
2290     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2291 
2292   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2293   // %dst(intty) = G_ADD %intbase, off
2294   I.setDesc(TII.get(TargetOpcode::G_ADD));
2295   MRI.setType(DstReg, CastPtrTy);
2296   I.getOperand(1).setReg(PtrToInt.getReg(0));
2297   if (!select(*PtrToInt)) {
2298     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2299     return false;
2300   }
2301 
2302   // Also take the opportunity here to try to do some optimization.
2303   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2304   Register NegatedReg;
2305   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2306     return true;
2307   I.getOperand(2).setReg(NegatedReg);
2308   I.setDesc(TII.get(TargetOpcode::G_SUB));
2309   return true;
2310 }
2311 
2312 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2313                                                 MachineRegisterInfo &MRI) {
2314   // We try to match the immediate variant of LSL, which is actually an alias
2315   // for a special case of UBFM. Otherwise, we fall back to the imported
2316   // selector which will match the register variant.
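       // For example, a 64-bit shift left by 4 becomes a UBFMXri with immr = 60
       // and imms = 59, which is the encoding of LSL #4.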
2317   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2318   const auto &MO = I.getOperand(2);
2319   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2320   if (!VRegAndVal)
2321     return false;
2322 
2323   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2324   if (DstTy.isVector())
2325     return false;
2326   bool Is64Bit = DstTy.getSizeInBits() == 64;
2327   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2328   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2329 
2330   if (!Imm1Fn || !Imm2Fn)
2331     return false;
2332 
2333   auto NewI =
2334       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2335                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2336 
2337   for (auto &RenderFn : *Imm1Fn)
2338     RenderFn(NewI);
2339   for (auto &RenderFn : *Imm2Fn)
2340     RenderFn(NewI);
2341 
2342   I.eraseFromParent();
2343   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2344 }
2345 
2346 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2347     MachineInstr &I, MachineRegisterInfo &MRI) {
2348   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2349   // If we're storing a scalar, it doesn't matter what register bank that
2350   // scalar is on. All that matters is the size.
2351   //
2352   // So, if we see something like this (with a 32-bit scalar as an example):
2353   //
2354   // %x:gpr(s32) = ... something ...
2355   // %y:fpr(s32) = COPY %x:gpr(s32)
2356   // G_STORE %y:fpr(s32)
2357   //
2358   // We can fix this up into something like this:
2359   //
2360   // G_STORE %x:gpr(s32)
2361   //
2362   // And then continue the selection process normally.
2363   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2364   if (!DefDstReg.isValid())
2365     return false;
2366   LLT DefDstTy = MRI.getType(DefDstReg);
2367   Register StoreSrcReg = I.getOperand(0).getReg();
2368   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2369 
2370   // If we get something strange like a physical register, then we shouldn't
2371   // go any further.
2372   if (!DefDstTy.isValid())
2373     return false;
2374 
2375   // Are the source and dst types the same size?
2376   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2377     return false;
2378 
2379   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2380       RBI.getRegBank(DefDstReg, MRI, TRI))
2381     return false;
2382 
2383   // We have a cross-bank copy, which is entering a store. Let's fold it.
2384   I.getOperand(0).setReg(DefDstReg);
2385   return true;
2386 }
2387 
2388 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2389   assert(I.getParent() && "Instruction should be in a basic block!");
2390   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2391 
2392   MachineBasicBlock &MBB = *I.getParent();
2393   MachineFunction &MF = *MBB.getParent();
2394   MachineRegisterInfo &MRI = MF.getRegInfo();
2395 
2396   switch (I.getOpcode()) {
2397   case AArch64::G_DUP: {
2398     // Before selecting a DUP instruction, check if it is better selected as a
2399     // MOV or load from a constant pool.
2400     Register Src = I.getOperand(1).getReg();
2401     auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2402     if (!ValAndVReg)
2403       return false;
2404     LLVMContext &Ctx = MF.getFunction().getContext();
2405     Register Dst = I.getOperand(0).getReg();
2406     auto *CV = ConstantDataVector::getSplat(
2407         MRI.getType(Dst).getNumElements(),
2408         ConstantInt::get(
2409             Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()),
2410             ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits())));
2411     if (!emitConstantVector(Dst, CV, MIB, MRI))
2412       return false;
2413     I.eraseFromParent();
2414     return true;
2415   }
2416   case TargetOpcode::G_SEXT:
2417     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2418     // over a normal extend.
2419     if (selectUSMovFromExtend(I, MRI))
2420       return true;
2421     return false;
2422   case TargetOpcode::G_BR:
2423     return false;
2424   case TargetOpcode::G_SHL:
2425     return earlySelectSHL(I, MRI);
2426   case TargetOpcode::G_CONSTANT: {
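         // Materialize integer zero as a COPY from WZR/XZR rather than a
         // move-immediate.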
2427     bool IsZero = false;
2428     if (I.getOperand(1).isCImm())
2429       IsZero = I.getOperand(1).getCImm()->isZero();
2430     else if (I.getOperand(1).isImm())
2431       IsZero = I.getOperand(1).getImm() == 0;
2432 
2433     if (!IsZero)
2434       return false;
2435 
2436     Register DefReg = I.getOperand(0).getReg();
2437     LLT Ty = MRI.getType(DefReg);
2438     if (Ty.getSizeInBits() == 64) {
2439       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2440       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2441     } else if (Ty.getSizeInBits() == 32) {
2442       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2443       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2444     } else
2445       return false;
2446 
2447     I.setDesc(TII.get(TargetOpcode::COPY));
2448     return true;
2449   }
2450 
2451   case TargetOpcode::G_ADD: {
2452     // Check if this is being fed by a G_ICMP on either side.
2453     //
2454     // (cmp pred, x, y) + z
2455     //
2456     // In the above case, when the cmp is true, we increment z by 1. So, we can
2457     // fold the add into the cset for the cmp by using cinc.
2458     //
2459     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
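         //
         // For example, %add = G_ADD %z, (G_ICMP eq, %x, %y) becomes an integer
         // compare of %x and %y followed by CSINC %add, %z, %z, ne, which is
         // CINC %add, %z, eq.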
2460     Register AddDst = I.getOperand(0).getReg();
2461     Register AddLHS = I.getOperand(1).getReg();
2462     Register AddRHS = I.getOperand(2).getReg();
2463     // Only handle scalars.
2464     LLT Ty = MRI.getType(AddLHS);
2465     if (Ty.isVector())
2466       return false;
2467     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2468     // bits.
2469     unsigned Size = Ty.getSizeInBits();
2470     if (Size != 32 && Size != 64)
2471       return false;
2472     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2473       if (!MRI.hasOneNonDBGUse(Reg))
2474         return nullptr;
2475       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2476       // compare.
2477       if (Size == 32)
2478         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2479       // We model scalar compares using 32-bit destinations right now.
2480       // If it's a 64-bit compare, it'll have 64-bit sources.
2481       Register ZExt;
2482       if (!mi_match(Reg, MRI,
2483                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2484         return nullptr;
2485       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2486       if (!Cmp ||
2487           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2488         return nullptr;
2489       return Cmp;
2490     };
2491     // Try to match
2492     // z + (cmp pred, x, y)
2493     MachineInstr *Cmp = MatchCmp(AddRHS);
2494     if (!Cmp) {
2495       // (cmp pred, x, y) + z
2496       std::swap(AddLHS, AddRHS);
2497       Cmp = MatchCmp(AddRHS);
2498       if (!Cmp)
2499         return false;
2500     }
2501     auto &PredOp = Cmp->getOperand(1);
2502     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2503     const AArch64CC::CondCode InvCC =
2504         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2505     MIB.setInstrAndDebugLoc(I);
2506     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2507                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2508     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2509     I.eraseFromParent();
2510     return true;
2511   }
2512   case TargetOpcode::G_OR: {
2513     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2514     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2515     // shifting and masking that we can replace with a BFI (encoded as a BFM).
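         //
         // For example, with Size = 32 and ShiftImm = 16:
         //   %dst = G_OR (G_SHL %hi, 16), (G_AND %lo, 0xffff)
         // becomes a BFMWri with immr = 16 and imms = 15 (the BFI alias),
         // inserting the low 16 bits of %hi into bits [31:16] of a copy of %lo.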
2516     Register Dst = I.getOperand(0).getReg();
2517     LLT Ty = MRI.getType(Dst);
2518 
2519     if (!Ty.isScalar())
2520       return false;
2521 
2522     unsigned Size = Ty.getSizeInBits();
2523     if (Size != 32 && Size != 64)
2524       return false;
2525 
2526     Register ShiftSrc;
2527     int64_t ShiftImm;
2528     Register MaskSrc;
2529     int64_t MaskImm;
2530     if (!mi_match(
2531             Dst, MRI,
2532             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2533                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2534       return false;
2535 
2536     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2537       return false;
2538 
2539     int64_t Immr = Size - ShiftImm;
2540     int64_t Imms = Size - ShiftImm - 1;
2541     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2542     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2543     I.eraseFromParent();
2544     return true;
2545   }
2546   case TargetOpcode::G_FENCE: {
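         // Operand 1 is the synchronization scope: a singlethread fence only
         // needs a compiler barrier (MEMBARRIER). Otherwise emit a DMB, using
         // the load-only barrier ISHLD (0x9) for an acquire ordering (operand 0
         // == 4) and a full ISH barrier (0xb) for anything stronger.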
2547     if (I.getOperand(1).getImm() == 0)
2548       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2549     else
2550       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2551           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2552     I.eraseFromParent();
2553     return true;
2554   }
2555   default:
2556     return false;
2557   }
2558 }
2559 
2560 bool AArch64InstructionSelector::select(MachineInstr &I) {
2561   assert(I.getParent() && "Instruction should be in a basic block!");
2562   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2563 
2564   MachineBasicBlock &MBB = *I.getParent();
2565   MachineFunction &MF = *MBB.getParent();
2566   MachineRegisterInfo &MRI = MF.getRegInfo();
2567 
2568   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2569   if (Subtarget->requiresStrictAlign()) {
2570     // We don't support this feature yet.
2571     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2572     return false;
2573   }
2574 
2575   MIB.setInstrAndDebugLoc(I);
2576 
2577   unsigned Opcode = I.getOpcode();
2578   // G_PHI requires same handling as PHI
2579   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2580     // Certain non-generic instructions also need some special handling.
2581 
2582     if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2583       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2584 
2585     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2586       const Register DefReg = I.getOperand(0).getReg();
2587       const LLT DefTy = MRI.getType(DefReg);
2588 
2589       const RegClassOrRegBank &RegClassOrBank =
2590         MRI.getRegClassOrRegBank(DefReg);
2591 
2592       const TargetRegisterClass *DefRC =
2593           dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
2594       if (!DefRC) {
2595         if (!DefTy.isValid()) {
2596           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2597           return false;
2598         }
2599         const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
2600         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2601         if (!DefRC) {
2602           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2603           return false;
2604         }
2605       }
2606 
2607       I.setDesc(TII.get(TargetOpcode::PHI));
2608 
2609       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2610     }
2611 
2612     if (I.isCopy())
2613       return selectCopy(I, TII, MRI, TRI, RBI);
2614 
2615     if (I.isDebugInstr())
2616       return selectDebugInstr(I, MRI, RBI);
2617 
2618     return true;
2619   }
2620 
2621 
2622   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2623     LLVM_DEBUG(
2624         dbgs() << "Generic instruction has unexpected implicit operands\n");
2625     return false;
2626   }
2627 
2628   // Try to do some lowering before we start instruction selecting. These
2629   // lowerings are purely transformations on the input G_MIR and so selection
2630   // must continue after any modification of the instruction.
2631   if (preISelLower(I)) {
2632     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2633   }
2634 
2635   // There may be patterns where the importer can't deal with them optimally,
2636   // but does select it to a suboptimal sequence so our custom C++ selection
2637   // code later never has a chance to work on it. Therefore, we have an early
2638   // selection attempt here to give priority to certain selection routines
2639   // over the imported ones.
2640   if (earlySelect(I))
2641     return true;
2642 
2643   if (selectImpl(I, *CoverageInfo))
2644     return true;
2645 
2646   LLT Ty =
2647       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2648 
2649   switch (Opcode) {
2650   case TargetOpcode::G_SBFX:
2651   case TargetOpcode::G_UBFX: {
2652     static const unsigned OpcTable[2][2] = {
2653         {AArch64::UBFMWri, AArch64::UBFMXri},
2654         {AArch64::SBFMWri, AArch64::SBFMXri}};
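         // G_[SU]BFX %src, lsb, width is encoded as [SU]BFM %src, lsb,
         // lsb + width - 1, matching the [SU]BFX alias.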
2655     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2656     unsigned Size = Ty.getSizeInBits();
2657     unsigned Opc = OpcTable[IsSigned][Size == 64];
2658     auto Cst1 =
2659         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2660     assert(Cst1 && "Should have gotten a constant for src 1?");
2661     auto Cst2 =
2662         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2663     assert(Cst2 && "Should have gotten a constant for src 2?");
2664     auto LSB = Cst1->Value.getZExtValue();
2665     auto Width = Cst2->Value.getZExtValue();
2666     auto BitfieldInst =
2667         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2668             .addImm(LSB)
2669             .addImm(LSB + Width - 1);
2670     I.eraseFromParent();
2671     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2672   }
2673   case TargetOpcode::G_BRCOND:
2674     return selectCompareBranch(I, MF, MRI);
2675 
2676   case TargetOpcode::G_BRINDIRECT: {
2677     const Function &Fn = MF.getFunction();
2678     if (std::optional<uint16_t> BADisc =
2679             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2680       auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2681       MI.addImm(AArch64PACKey::IA);
2682       MI.addImm(*BADisc);
2683       MI.addReg(/*AddrDisc=*/AArch64::XZR);
2684       I.eraseFromParent();
2685       return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2686     }
2687     I.setDesc(TII.get(AArch64::BR));
2688     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2689   }
2690 
2691   case TargetOpcode::G_BRJT:
2692     return selectBrJT(I, MRI);
2693 
2694   case AArch64::G_ADD_LOW: {
2695     // This op may have been separated from its ADRP companion by the localizer
2696     // or some other code motion pass. Given that many CPUs will try to
2697     // macro-fuse these operations anyway, select this into a MOVaddr pseudo
2698     // which will later be expanded into an ADRP+ADD pair after scheduling.
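         // Illustrative shape (virtual register names invented):
         //   %adrp:gpr64 = ADRP @sym
         //   %dst:gpr(p0) = G_ADD_LOW %adrp, @sym
         // becomes a single MOVaddr pseudo here; if the companion ADRP can't be
         // found, we fall back to a plain ADDXri of the low 12 bits instead.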
2699     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2700     if (BaseMI->getOpcode() != AArch64::ADRP) {
2701       I.setDesc(TII.get(AArch64::ADDXri));
2702       I.addOperand(MachineOperand::CreateImm(0));
2703       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2704     }
2705     assert(TM.getCodeModel() == CodeModel::Small &&
2706            "Expected small code model");
2707     auto Op1 = BaseMI->getOperand(1);
2708     auto Op2 = I.getOperand(2);
2709     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2710                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2711                                          Op1.getTargetFlags())
2712                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2713                                          Op2.getTargetFlags());
2714     I.eraseFromParent();
2715     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2716   }
2717 
2718   case TargetOpcode::G_FCONSTANT:
2719   case TargetOpcode::G_CONSTANT: {
2720     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2721 
2722     const LLT s8 = LLT::scalar(8);
2723     const LLT s16 = LLT::scalar(16);
2724     const LLT s32 = LLT::scalar(32);
2725     const LLT s64 = LLT::scalar(64);
2726     const LLT s128 = LLT::scalar(128);
2727     const LLT p0 = LLT::pointer(0, 64);
2728 
2729     const Register DefReg = I.getOperand(0).getReg();
2730     const LLT DefTy = MRI.getType(DefReg);
2731     const unsigned DefSize = DefTy.getSizeInBits();
2732     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2733 
2734     // FIXME: Redundant check, but even less readable when factored out.
2735     if (isFP) {
2736       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2737         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2738                           << " constant, expected: " << s16 << " or " << s32
2739                           << " or " << s64 << " or " << s128 << '\n');
2740         return false;
2741       }
2742 
2743       if (RB.getID() != AArch64::FPRRegBankID) {
2744         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2745                           << " constant on bank: " << RB
2746                           << ", expected: FPR\n");
2747         return false;
2748       }
2749 
2750       // The 0.0 case is covered by tablegen. Reject it here so we can be
2751       // sure tablegen works correctly and isn't rescued by this code.
2752       // However, 0.0 is not covered by tablegen for FP128, so that case is
2753       // handled here.
2754       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2755         return false;
2756     } else {
2757       // s32 and s64 are covered by tablegen.
2758       if (Ty != p0 && Ty != s8 && Ty != s16) {
2759         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2760                           << " constant, expected: " << s32 << ", " << s64
2761                           << ", or " << p0 << '\n');
2762         return false;
2763       }
2764 
2765       if (RB.getID() != AArch64::GPRRegBankID) {
2766         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2767                           << " constant on bank: " << RB
2768                           << ", expected: GPR\n");
2769         return false;
2770       }
2771     }
2772 
2773     if (isFP) {
2774       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2775       // For 16, 64, and 128b values, emit a constant pool load.
2776       switch (DefSize) {
2777       default:
2778         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2779       case 32:
2780       case 64: {
2781         bool OptForSize = shouldOptForSize(&MF);
2782         const auto &TLI = MF.getSubtarget().getTargetLowering();
2783         // If TLI says that this fpimm is illegal, then we'll expand to a
2784         // constant pool load.
2785         if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2786                               EVT::getFloatingPointVT(DefSize), OptForSize))
2787           break;
2788         [[fallthrough]];
2789       }
2790       case 16:
2791       case 128: {
2792         auto *FPImm = I.getOperand(1).getFPImm();
2793         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2794         if (!LoadMI) {
2795           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2796           return false;
2797         }
2798         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2799         I.eraseFromParent();
2800         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2801       }
2802       }
2803 
2804       assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2805       // Either emit an FMOV, or materialize with a normal mov plus a copy.
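           // Roughly, for a legal 32-bit immediate this becomes:
           //   %tmp:gpr32 = MOVi32imm <bits>
           //   %dst:fpr32 = COPY %tmp
           // (the G_FCONSTANT itself is rewritten into the MOVi32imm below).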
2806       const Register DefGPRReg = MRI.createVirtualRegister(
2807           DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2808       MachineOperand &RegOp = I.getOperand(0);
2809       RegOp.setReg(DefGPRReg);
2810       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2811       MIB.buildCopy({DefReg}, {DefGPRReg});
2812 
2813       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2814         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2815         return false;
2816       }
2817 
2818       MachineOperand &ImmOp = I.getOperand(1);
2819       // FIXME: Is going through int64_t always correct?
2820       ImmOp.ChangeToImmediate(
2821           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2822     } else if (I.getOperand(1).isCImm()) {
2823       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2824       I.getOperand(1).ChangeToImmediate(Val);
2825     } else if (I.getOperand(1).isImm()) {
2826       uint64_t Val = I.getOperand(1).getImm();
2827       I.getOperand(1).ChangeToImmediate(Val);
2828     }
2829 
2830     const unsigned MovOpc =
2831         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2832     I.setDesc(TII.get(MovOpc));
2833     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2834     return true;
2835   }
2836   case TargetOpcode::G_EXTRACT: {
2837     Register DstReg = I.getOperand(0).getReg();
2838     Register SrcReg = I.getOperand(1).getReg();
2839     LLT SrcTy = MRI.getType(SrcReg);
2840     LLT DstTy = MRI.getType(DstReg);
2841     (void)DstTy;
2842     unsigned SrcSize = SrcTy.getSizeInBits();
2843 
2844     if (SrcTy.getSizeInBits() > 64) {
2845       // This should be an extract of an s128, which is like a vector extract.
2846       if (SrcTy.getSizeInBits() != 128)
2847         return false;
2848       // Only support extracting 64 bits from an s128 at the moment.
2849       if (DstTy.getSizeInBits() != 64)
2850         return false;
2851 
2852       unsigned Offset = I.getOperand(2).getImm();
2853       if (Offset % 64 != 0)
2854         return false;
2855 
2856       // Always check that we have the right regbank.
2857       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2858       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2859       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2860 
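           // On the GPR bank an s128 lives in an XSeqPairs register, so grabbing
           // bits [63:0] or [127:64] is just a subregister copy, roughly:
           //   %lo:gpr64 = COPY %pair.sube64   (or .subo64 for the high half)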
2861       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2862         auto NewI =
2863             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2864                 .addUse(SrcReg, 0,
2865                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2866         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2867                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2868         I.eraseFromParent();
2869         return true;
2870       }
2871 
2872       // Emit the same code as a vector extract.
2873       // Offset must be a multiple of 64.
2874       unsigned LaneIdx = Offset / 64;
2875       MachineInstr *Extract = emitExtractVectorElt(
2876           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2877       if (!Extract)
2878         return false;
2879       I.eraseFromParent();
2880       return true;
2881     }
2882 
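         // Smaller extracts become a bitfield extract, e.g. (roughly):
         //   %d:(s16) = G_EXTRACT %s:(s32), 8   ==>   %d = UBFMWri %s, 8, 23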
2883     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2884     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2885                                       Ty.getSizeInBits() - 1);
2886 
2887     if (SrcSize < 64) {
2888       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2889              "unexpected G_EXTRACT types");
2890       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2891     }
2892 
2893     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2894     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2895     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2896         .addReg(DstReg, 0, AArch64::sub_32);
2897     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2898                                  AArch64::GPR32RegClass, MRI);
2899     I.getOperand(0).setReg(DstReg);
2900 
2901     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2902   }
2903 
2904   case TargetOpcode::G_INSERT: {
2905     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2906     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2907     unsigned DstSize = DstTy.getSizeInBits();
2908     // Larger inserts are vectors; same-size ones should have become something
2909     // else by now (split up or turned into COPYs).
2910     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2911       return false;
2912 
2913     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2914     unsigned LSB = I.getOperand(3).getImm();
2915     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2916     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2917     MachineInstrBuilder(MF, I).addImm(Width - 1);
2918 
2919     if (DstSize < 64) {
2920       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2921              "unexpected G_INSERT types");
2922       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2923     }
2924 
2925     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2926     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2927             TII.get(AArch64::SUBREG_TO_REG))
2928         .addDef(SrcReg)
2929         .addImm(0)
2930         .addUse(I.getOperand(2).getReg())
2931         .addImm(AArch64::sub_32);
2932     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2933                                  AArch64::GPR32RegClass, MRI);
2934     I.getOperand(2).setReg(SrcReg);
2935 
2936     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2937   }
2938   case TargetOpcode::G_FRAME_INDEX: {
2939     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2940     if (Ty != LLT::pointer(0, 64)) {
2941       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2942                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2943       return false;
2944     }
2945     I.setDesc(TII.get(AArch64::ADDXri));
2946 
2947     // MOs for a #0 shifted immediate.
2948     I.addOperand(MachineOperand::CreateImm(0));
2949     I.addOperand(MachineOperand::CreateImm(0));
2950 
2951     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2952   }
2953 
2954   case TargetOpcode::G_GLOBAL_VALUE: {
2955     const GlobalValue *GV = nullptr;
2956     unsigned OpFlags;
2957     if (I.getOperand(1).isSymbol()) {
2958       OpFlags = I.getOperand(1).getTargetFlags();
2959       // Currently only used by "RtLibUseGOT".
2960       assert(OpFlags == AArch64II::MO_GOT);
2961     } else {
2962       GV = I.getOperand(1).getGlobal();
2963       if (GV->isThreadLocal())
2964         return selectTLSGlobalValue(I, MRI);
2965       OpFlags = STI.ClassifyGlobalReference(GV, TM);
2966     }
2967 
2968     if (OpFlags & AArch64II::MO_GOT) {
2969       I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
2970                             ? AArch64::LOADgotAUTH
2971                             : AArch64::LOADgot));
2972       I.getOperand(1).setTargetFlags(OpFlags);
2973     } else if (TM.getCodeModel() == CodeModel::Large &&
2974                !TM.isPositionIndependent()) {
2975       // Materialize the global using movz/movk instructions.
2976       materializeLargeCMVal(I, GV, OpFlags);
2977       I.eraseFromParent();
2978       return true;
2979     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2980       I.setDesc(TII.get(AArch64::ADR));
2981       I.getOperand(1).setTargetFlags(OpFlags);
2982     } else {
2983       I.setDesc(TII.get(AArch64::MOVaddr));
2984       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2985       MachineInstrBuilder MIB(MF, I);
2986       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2987                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2988     }
2989     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2990   }
2991 
2992   case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2993     return selectPtrAuthGlobalValue(I, MRI);
2994 
2995   case TargetOpcode::G_ZEXTLOAD:
2996   case TargetOpcode::G_LOAD:
2997   case TargetOpcode::G_STORE: {
2998     GLoadStore &LdSt = cast<GLoadStore>(I);
2999     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
3000     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
3001 
3002     if (PtrTy != LLT::pointer(0, 64)) {
3003       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
3004                         << ", expected: " << LLT::pointer(0, 64) << '\n');
3005       return false;
3006     }
3007 
3008     uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
3009     unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
3010     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
3011 
3012     // Need special instructions for atomics that affect ordering.
3013     if (Order != AtomicOrdering::NotAtomic &&
3014         Order != AtomicOrdering::Unordered &&
3015         Order != AtomicOrdering::Monotonic) {
3016       assert(!isa<GZExtLoad>(LdSt));
3017       assert(MemSizeInBytes <= 8 &&
3018              "128-bit atomics should already be custom-legalized");
3019 
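           // E.g. an acquire s32 load becomes LDARW (or LDAPRW with +rcpc when
           // the ordering is weaker than seq_cst); a release s32 store, STLRW.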
3020       if (isa<GLoad>(LdSt)) {
3021         static constexpr unsigned LDAPROpcodes[] = {
3022             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
3023         static constexpr unsigned LDAROpcodes[] = {
3024             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
3025         ArrayRef<unsigned> Opcodes =
3026             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
3027                 ? LDAPROpcodes
3028                 : LDAROpcodes;
3029         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3030       } else {
3031         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
3032                                                AArch64::STLRW, AArch64::STLRX};
3033         Register ValReg = LdSt.getReg(0);
3034         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
3035           // Emit a subreg copy of 32 bits.
3036           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3037           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
3038               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
3039           I.getOperand(0).setReg(NewVal);
3040         }
3041         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3042       }
3043       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3044       return true;
3045     }
3046 
3047 #ifndef NDEBUG
3048     const Register PtrReg = LdSt.getPointerReg();
3049     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
3050     // Check that the pointer register is valid.
3051     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
3052            "Load/Store pointer operand isn't a GPR");
3053     assert(MRI.getType(PtrReg).isPointer() &&
3054            "Load/Store pointer operand isn't a pointer");
3055 #endif
3056 
3057     const Register ValReg = LdSt.getReg(0);
3058     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
3059     LLT ValTy = MRI.getType(ValReg);
3060 
3061     // The code below doesn't support truncating stores, so if the value is
3062     // wider than the memory size, narrow it with a subreg copy first.
3063     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3064       unsigned SubReg;
3065       LLT MemTy = LdSt.getMMO().getMemoryType();
3066       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3067       if (!getSubRegForClass(RC, TRI, SubReg))
3068         return false;
3069 
3070       // Generate a subreg copy.
3071       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
3072                       .addReg(ValReg, 0, SubReg)
3073                       .getReg(0);
3074       RBI.constrainGenericRegister(Copy, *RC, MRI);
3075       LdSt.getOperand(0).setReg(Copy);
3076     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3077       // If this is an any-extending load from the FPR bank, split it into a
3078       // regular load + extend.
3079       if (RB.getID() == AArch64::FPRRegBankID) {
3080         unsigned SubReg;
3081         LLT MemTy = LdSt.getMMO().getMemoryType();
3082         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3083         if (!getSubRegForClass(RC, TRI, SubReg))
3084           return false;
3085         Register OldDst = LdSt.getReg(0);
3086         Register NewDst =
3087             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
3088         LdSt.getOperand(0).setReg(NewDst);
3089         MRI.setRegBank(NewDst, RB);
3090         // Generate a SUBREG_TO_REG to extend it.
3091         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
3092         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
3093             .addImm(0)
3094             .addUse(NewDst)
3095             .addImm(SubReg);
3096         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
3097         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
3098         MIB.setInstr(LdSt);
3099         ValTy = MemTy; // This is no longer an extending load.
3100       }
3101     }
3102 
3103     // Helper lambda for partially selecting I. Either returns the original
3104     // instruction with an updated opcode, or a new instruction.
3105     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3106       bool IsStore = isa<GStore>(I);
3107       const unsigned NewOpc =
3108           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
3109       if (NewOpc == I.getOpcode())
3110         return nullptr;
3111       // Check if we can fold anything into the addressing mode.
3112       auto AddrModeFns =
3113           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
3114       if (!AddrModeFns) {
3115         // Can't fold anything. Use the original instruction.
3116         I.setDesc(TII.get(NewOpc));
3117         I.addOperand(MachineOperand::CreateImm(0));
3118         return &I;
3119       }
3120 
3121       // Folded something. Create a new instruction and return it.
3122       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
3123       Register CurValReg = I.getOperand(0).getReg();
3124       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3125       NewInst.cloneMemRefs(I);
3126       for (auto &Fn : *AddrModeFns)
3127         Fn(NewInst);
3128       I.eraseFromParent();
3129       return &*NewInst;
3130     };
3131 
3132     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3133     if (!LoadStore)
3134       return false;
3135 
3136     // If we're storing a 0, use WZR/XZR.
3137     if (Opcode == TargetOpcode::G_STORE) {
3138       auto CVal = getIConstantVRegValWithLookThrough(
3139           LoadStore->getOperand(0).getReg(), MRI);
3140       if (CVal && CVal->Value == 0) {
3141         switch (LoadStore->getOpcode()) {
3142         case AArch64::STRWui:
3143         case AArch64::STRHHui:
3144         case AArch64::STRBBui:
3145           LoadStore->getOperand(0).setReg(AArch64::WZR);
3146           break;
3147         case AArch64::STRXui:
3148           LoadStore->getOperand(0).setReg(AArch64::XZR);
3149           break;
3150         }
3151       }
3152     }
3153 
3154     if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3155                        ValTy == LLT::scalar(64) && MemSizeInBits == 32)) {
3156       // The any/zextload from a smaller type to i32 should be handled by the
3157       // importer.
3158       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3159         return false;
3160       // If we have an extending load then change the load's type to be a
3161       // narrower reg and zero_extend with SUBREG_TO_REG.
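           // Roughly: %d:gpr(s64) = G_ZEXTLOAD %p :: (load 4)  ==>
           //   %w:gpr32 = LDRWui %p, 0
           //   %d:gpr64 = SUBREG_TO_REG 0, %w, %subreg.sub_32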
3162       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3163       Register DstReg = LoadStore->getOperand(0).getReg();
3164       LoadStore->getOperand(0).setReg(LdReg);
3165 
3166       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3167       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3168           .addImm(0)
3169           .addUse(LdReg)
3170           .addImm(AArch64::sub_32);
3171       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3172       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3173                                           MRI);
3174     }
3175     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3176   }
3177 
3178   case TargetOpcode::G_INDEXED_ZEXTLOAD:
3179   case TargetOpcode::G_INDEXED_SEXTLOAD:
3180     return selectIndexedExtLoad(I, MRI);
3181   case TargetOpcode::G_INDEXED_LOAD:
3182     return selectIndexedLoad(I, MRI);
3183   case TargetOpcode::G_INDEXED_STORE:
3184     return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3185 
3186   case TargetOpcode::G_LSHR:
3187   case TargetOpcode::G_ASHR:
3188     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3189       return selectVectorAshrLshr(I, MRI);
3190     [[fallthrough]];
3191   case TargetOpcode::G_SHL:
3192     if (Opcode == TargetOpcode::G_SHL &&
3193         MRI.getType(I.getOperand(0).getReg()).isVector())
3194       return selectVectorSHL(I, MRI);
3195 
3196     // These shifts were legalized to have 64-bit shift amounts because we
3197     // want to take advantage of the selection patterns that assume the
3198     // immediates are s64s. However, selectBinaryOp will assume both operands
3199     // have the same bit size.
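         // E.g. for %d:(s32) = G_SHL %x:(s32), %amt:(s64), we first emit
         //   %amt32:gpr(s32) = COPY %amt.sub_32
         // so that the binary-op selection below sees two 32-bit operands
         // (illustrative sketch).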
3200     {
3201       Register SrcReg = I.getOperand(1).getReg();
3202       Register ShiftReg = I.getOperand(2).getReg();
3203       const LLT ShiftTy = MRI.getType(ShiftReg);
3204       const LLT SrcTy = MRI.getType(SrcReg);
3205       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3206           ShiftTy.getSizeInBits() == 64) {
3207         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3208         // Insert a subregister copy to implement a 64->32 trunc
3209         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3210                          .addReg(ShiftReg, 0, AArch64::sub_32);
3211         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3212         I.getOperand(2).setReg(Trunc.getReg(0));
3213       }
3214     }
3215     [[fallthrough]];
3216   case TargetOpcode::G_OR: {
3217     // Reject the various things we don't support yet.
3218     if (unsupportedBinOp(I, RBI, MRI, TRI))
3219       return false;
3220 
3221     const unsigned OpSize = Ty.getSizeInBits();
3222 
3223     const Register DefReg = I.getOperand(0).getReg();
3224     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3225 
3226     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3227     if (NewOpc == I.getOpcode())
3228       return false;
3229 
3230     I.setDesc(TII.get(NewOpc));
3231     // FIXME: Should the type be always reset in setDesc?
3232 
3233     // Now that we selected an opcode, we need to constrain the register
3234     // operands to use appropriate classes.
3235     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3236   }
3237 
3238   case TargetOpcode::G_PTR_ADD: {
3239     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3240     I.eraseFromParent();
3241     return true;
3242   }
3243 
3244   case TargetOpcode::G_SADDE:
3245   case TargetOpcode::G_UADDE:
3246   case TargetOpcode::G_SSUBE:
3247   case TargetOpcode::G_USUBE:
3248   case TargetOpcode::G_SADDO:
3249   case TargetOpcode::G_UADDO:
3250   case TargetOpcode::G_SSUBO:
3251   case TargetOpcode::G_USUBO:
3252     return selectOverflowOp(I, MRI);
3253 
3254   case TargetOpcode::G_PTRMASK: {
3255     Register MaskReg = I.getOperand(2).getReg();
3256     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3257     // TODO: Implement arbitrary cases
3258     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3259       return false;
3260 
3261     uint64_t Mask = *MaskVal;
3262     I.setDesc(TII.get(AArch64::ANDXri));
3263     I.getOperand(2).ChangeToImmediate(
3264         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3265 
3266     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3267   }
3268   case TargetOpcode::G_PTRTOINT:
3269   case TargetOpcode::G_TRUNC: {
3270     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3271     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3272 
3273     const Register DstReg = I.getOperand(0).getReg();
3274     const Register SrcReg = I.getOperand(1).getReg();
3275 
3276     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3277     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3278 
3279     if (DstRB.getID() != SrcRB.getID()) {
3280       LLVM_DEBUG(
3281           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3282       return false;
3283     }
3284 
3285     if (DstRB.getID() == AArch64::GPRRegBankID) {
3286       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3287       if (!DstRC)
3288         return false;
3289 
3290       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3291       if (!SrcRC)
3292         return false;
3293 
3294       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3295           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3296         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3297         return false;
3298       }
3299 
3300       if (DstRC == SrcRC) {
3301         // Nothing to be done
3302       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3303                  SrcTy == LLT::scalar(64)) {
3304         llvm_unreachable("TableGen can import this case");
3305         return false;
3306       } else if (DstRC == &AArch64::GPR32RegClass &&
3307                  SrcRC == &AArch64::GPR64RegClass) {
3308         I.getOperand(1).setSubReg(AArch64::sub_32);
3309       } else {
3310         LLVM_DEBUG(
3311             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3312         return false;
3313       }
3314 
3315       I.setDesc(TII.get(TargetOpcode::COPY));
3316       return true;
3317     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3318       if (DstTy == LLT::fixed_vector(4, 16) &&
3319           SrcTy == LLT::fixed_vector(4, 32)) {
3320         I.setDesc(TII.get(AArch64::XTNv4i16));
3321         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3322         return true;
3323       }
3324 
3325       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3326         MachineInstr *Extract = emitExtractVectorElt(
3327             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3328         if (!Extract)
3329           return false;
3330         I.eraseFromParent();
3331         return true;
3332       }
3333 
3334       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3335       if (Opcode == TargetOpcode::G_PTRTOINT) {
3336         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3337         I.setDesc(TII.get(TargetOpcode::COPY));
3338         return selectCopy(I, TII, MRI, TRI, RBI);
3339       }
3340     }
3341 
3342     return false;
3343   }
3344 
3345   case TargetOpcode::G_ANYEXT: {
3346     if (selectUSMovFromExtend(I, MRI))
3347       return true;
3348 
3349     const Register DstReg = I.getOperand(0).getReg();
3350     const Register SrcReg = I.getOperand(1).getReg();
3351 
3352     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3353     if (RBDst.getID() != AArch64::GPRRegBankID) {
3354       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3355                         << ", expected: GPR\n");
3356       return false;
3357     }
3358 
3359     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3360     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3361       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3362                         << ", expected: GPR\n");
3363       return false;
3364     }
3365 
3366     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3367 
3368     if (DstSize == 0) {
3369       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3370       return false;
3371     }
3372 
3373     if (DstSize != 64 && DstSize > 32) {
3374       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3375                         << ", expected: 32 or 64\n");
3376       return false;
3377     }
3378     // At this point G_ANYEXT is just like a plain COPY, but we need
3379     // to explicitly form the 64-bit value if any.
3380     if (DstSize > 32) {
3381       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3382       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3383           .addDef(ExtSrc)
3384           .addImm(0)
3385           .addUse(SrcReg)
3386           .addImm(AArch64::sub_32);
3387       I.getOperand(1).setReg(ExtSrc);
3388     }
3389     return selectCopy(I, TII, MRI, TRI, RBI);
3390   }
3391 
3392   case TargetOpcode::G_ZEXT:
3393   case TargetOpcode::G_SEXT_INREG:
3394   case TargetOpcode::G_SEXT: {
3395     if (selectUSMovFromExtend(I, MRI))
3396       return true;
3397 
3398     unsigned Opcode = I.getOpcode();
3399     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3400     const Register DefReg = I.getOperand(0).getReg();
3401     Register SrcReg = I.getOperand(1).getReg();
3402     const LLT DstTy = MRI.getType(DefReg);
3403     const LLT SrcTy = MRI.getType(SrcReg);
3404     unsigned DstSize = DstTy.getSizeInBits();
3405     unsigned SrcSize = SrcTy.getSizeInBits();
3406 
3407     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3408     // extended is encoded in the imm.
3409     if (Opcode == TargetOpcode::G_SEXT_INREG)
3410       SrcSize = I.getOperand(2).getImm();
3411 
3412     if (DstTy.isVector())
3413       return false; // Should be handled by imported patterns.
3414 
3415     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3416                AArch64::GPRRegBankID &&
3417            "Unexpected ext regbank");
3418 
3419     MachineInstr *ExtI;
3420 
3421     // First check if we're extending the result of a load with a dest type
3422     // smaller than 32 bits; if so, this zext is redundant. GPR32 is the
3423     // smallest GPR register on AArch64, and all smaller loads automatically
3424     // zero-extend the upper bits. E.g.
3425     // %v(s8) = G_LOAD %p, :: (load 1)
3426     // %v2(s32) = G_ZEXT %v(s8)
3427     if (!IsSigned) {
3428       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3429       bool IsGPR =
3430           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3431       if (LoadMI && IsGPR) {
3432         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3433         unsigned BytesLoaded = MemOp->getSize().getValue();
3434         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3435           return selectCopy(I, TII, MRI, TRI, RBI);
3436       }
3437 
3438       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3439       // + SUBREG_TO_REG.
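           // Roughly: %d:gpr(s64) = G_ZEXT %s:gpr(s32)  ==>
           //   %t:gpr32 = ORRWrs $wzr, %s, 0
           //   %d:gpr64 = SUBREG_TO_REG 0, %t, %subreg.sub_32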
3440       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3441         Register SubregToRegSrc =
3442             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3443         const Register ZReg = AArch64::WZR;
3444         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3445             .addImm(0);
3446 
3447         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3448             .addImm(0)
3449             .addUse(SubregToRegSrc)
3450             .addImm(AArch64::sub_32);
3451 
3452         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3453                                           MRI)) {
3454           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3455           return false;
3456         }
3457 
3458         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3459                                           MRI)) {
3460           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3461           return false;
3462         }
3463 
3464         I.eraseFromParent();
3465         return true;
3466       }
3467     }
3468 
3469     if (DstSize == 64) {
3470       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3471         // FIXME: Can we avoid manually doing this?
3472         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3473                                           MRI)) {
3474           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3475                             << " operand\n");
3476           return false;
3477         }
3478         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3479                                 {&AArch64::GPR64RegClass}, {})
3480                      .addImm(0)
3481                      .addUse(SrcReg)
3482                      .addImm(AArch64::sub_32)
3483                      .getReg(0);
3484       }
3485 
3486       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3487                              {DefReg}, {SrcReg})
3488                   .addImm(0)
3489                   .addImm(SrcSize - 1);
3490     } else if (DstSize <= 32) {
3491       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3492                              {DefReg}, {SrcReg})
3493                   .addImm(0)
3494                   .addImm(SrcSize - 1);
3495     } else {
3496       return false;
3497     }
3498 
3499     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3500     I.eraseFromParent();
3501     return true;
3502   }
3503 
3504   case TargetOpcode::G_SITOFP:
3505   case TargetOpcode::G_UITOFP:
3506   case TargetOpcode::G_FPTOSI:
3507   case TargetOpcode::G_FPTOUI: {
3508     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3509               SrcTy = MRI.getType(I.getOperand(1).getReg());
3510     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3511     if (NewOpc == Opcode)
3512       return false;
3513 
3514     I.setDesc(TII.get(NewOpc));
3515     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3516     I.setFlags(MachineInstr::NoFPExcept);
3517 
3518     return true;
3519   }
3520 
3521   case TargetOpcode::G_FREEZE:
3522     return selectCopy(I, TII, MRI, TRI, RBI);
3523 
3524   case TargetOpcode::G_INTTOPTR:
3525     // The importer is currently unable to import pointer types since they
3526     // didn't exist in SelectionDAG.
3527     return selectCopy(I, TII, MRI, TRI, RBI);
3528 
3529   case TargetOpcode::G_BITCAST:
3530     // Imported SelectionDAG rules can handle every bitcast except those that
3531     // bitcast from a type to the same type. Ideally, these shouldn't occur
3532     // but we might not run an optimizer that deletes them. The other exception
3533     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3534     // of them.
3535     return selectCopy(I, TII, MRI, TRI, RBI);
3536 
3537   case TargetOpcode::G_SELECT: {
3538     auto &Sel = cast<GSelect>(I);
3539     const Register CondReg = Sel.getCondReg();
3540     const Register TReg = Sel.getTrueReg();
3541     const Register FReg = Sel.getFalseReg();
3542 
3543     if (tryOptSelect(Sel))
3544       return true;
3545 
3546     // Make sure to use an unused vreg instead of wzr, so that the peephole
3547     // optimizations will be able to optimize these.
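         // Roughly, %d = G_SELECT %c, %t, %f becomes
         //   %dead:gpr32 = ANDSWri %c, <logical imm 1>   (sets NZCV from bit 0)
         //   %d = CSEL %t, %f, ne   (or an optimized form, via emitSelect)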
3548     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3549     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3550                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3551     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3552     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3553       return false;
3554     Sel.eraseFromParent();
3555     return true;
3556   }
3557   case TargetOpcode::G_ICMP: {
3558     if (Ty.isVector())
3559       return false;
3560 
3561     if (Ty != LLT::scalar(32)) {
3562       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3563                         << ", expected: " << LLT::scalar(32) << '\n');
3564       return false;
3565     }
3566 
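         // Lower to a flag-setting compare plus the usual CSET idiom: CSINC of
         // WZR with the inverted condition yields 1 when the original predicate
         // holds, e.g. (roughly) %d:gpr32 = CSINCWr $wzr, $wzr, <inv cc>.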
3567     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3568     const AArch64CC::CondCode InvCC =
3569         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3570     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3571     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3572               /*Src2=*/AArch64::WZR, InvCC, MIB);
3573     I.eraseFromParent();
3574     return true;
3575   }
3576 
3577   case TargetOpcode::G_FCMP: {
3578     CmpInst::Predicate Pred =
3579         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3580     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3581                        Pred) ||
3582         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3583       return false;
3584     I.eraseFromParent();
3585     return true;
3586   }
3587   case TargetOpcode::G_VASTART:
3588     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3589                                 : selectVaStartAAPCS(I, MF, MRI);
3590   case TargetOpcode::G_INTRINSIC:
3591     return selectIntrinsic(I, MRI);
3592   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3593     return selectIntrinsicWithSideEffects(I, MRI);
3594   case TargetOpcode::G_IMPLICIT_DEF: {
3595     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3596     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3597     const Register DstReg = I.getOperand(0).getReg();
3598     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3599     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3600     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3601     return true;
3602   }
3603   case TargetOpcode::G_BLOCK_ADDR: {
3604     Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
3605     if (std::optional<uint16_t> BADisc =
3606             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
3607       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
3608       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
3609       MIB.buildInstr(AArch64::MOVaddrPAC)
3610           .addBlockAddress(I.getOperand(1).getBlockAddress())
3611           .addImm(AArch64PACKey::IA)
3612           .addReg(/*AddrDisc=*/AArch64::XZR)
3613           .addImm(*BADisc)
3614           .constrainAllUses(TII, TRI, RBI);
3615       MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
3616       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
3617                                    AArch64::GPR64RegClass, MRI);
3618       I.eraseFromParent();
3619       return true;
3620     }
3621     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3622       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3623       I.eraseFromParent();
3624       return true;
3625     } else {
3626       I.setDesc(TII.get(AArch64::MOVaddrBA));
3627       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3628                            I.getOperand(0).getReg())
3629                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3630                                         /* Offset */ 0, AArch64II::MO_PAGE)
3631                        .addBlockAddress(
3632                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3633                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3634       I.eraseFromParent();
3635       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3636     }
3637   }
3638   case AArch64::G_DUP: {
3639     // When the scalar of G_DUP is an s8/s16 GPR, it can't be selected by the
3640     // imported patterns, so do it manually here. Avoiding generating an s16
3641     // GPR is difficult because at RBS we may end up pessimizing the FPR case
3642     // if we decide to add an anyextend to fix this. Manual selection is the
3643     // most robust solution for now.
3644     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3645         AArch64::GPRRegBankID)
3646       return false; // We expect the fpr regbank case to be imported.
3647     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3648     if (VecTy == LLT::fixed_vector(8, 8))
3649       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3650     else if (VecTy == LLT::fixed_vector(16, 8))
3651       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3652     else if (VecTy == LLT::fixed_vector(4, 16))
3653       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3654     else if (VecTy == LLT::fixed_vector(8, 16))
3655       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3656     else
3657       return false;
3658     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3659   }
3660   case TargetOpcode::G_BUILD_VECTOR:
3661     return selectBuildVector(I, MRI);
3662   case TargetOpcode::G_MERGE_VALUES:
3663     return selectMergeValues(I, MRI);
3664   case TargetOpcode::G_UNMERGE_VALUES:
3665     return selectUnmergeValues(I, MRI);
3666   case TargetOpcode::G_SHUFFLE_VECTOR:
3667     return selectShuffleVector(I, MRI);
3668   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3669     return selectExtractElt(I, MRI);
3670   case TargetOpcode::G_CONCAT_VECTORS:
3671     return selectConcatVectors(I, MRI);
3672   case TargetOpcode::G_JUMP_TABLE:
3673     return selectJumpTable(I, MRI);
3674   case TargetOpcode::G_MEMCPY:
3675   case TargetOpcode::G_MEMCPY_INLINE:
3676   case TargetOpcode::G_MEMMOVE:
3677   case TargetOpcode::G_MEMSET:
3678     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3679     return selectMOPS(I, MRI);
3680   }
3681 
3682   return false;
3683 }
3684 
3685 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3686   MachineIRBuilderState OldMIBState = MIB.getState();
3687   bool Success = select(I);
3688   MIB.setState(OldMIBState);
3689   return Success;
3690 }
3691 
3692 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3693                                             MachineRegisterInfo &MRI) {
3694   unsigned Mopcode;
3695   switch (GI.getOpcode()) {
3696   case TargetOpcode::G_MEMCPY:
3697   case TargetOpcode::G_MEMCPY_INLINE:
3698     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3699     break;
3700   case TargetOpcode::G_MEMMOVE:
3701     Mopcode = AArch64::MOPSMemoryMovePseudo;
3702     break;
3703   case TargetOpcode::G_MEMSET:
3704     // For tagged memset see llvm.aarch64.mops.memset.tag
3705     Mopcode = AArch64::MOPSMemorySetPseudo;
3706     break;
3707   }
3708 
3709   auto &DstPtr = GI.getOperand(0);
3710   auto &SrcOrVal = GI.getOperand(1);
3711   auto &Size = GI.getOperand(2);
3712 
3713   // Create copies of the registers that can be clobbered.
3714   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3715   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3716   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3717 
3718   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3719   const auto &SrcValRegClass =
3720       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3721 
3722   // Constrain to specific registers
3723   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3724   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3725   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3726 
3727   MIB.buildCopy(DstPtrCopy, DstPtr);
3728   MIB.buildCopy(SrcValCopy, SrcOrVal);
3729   MIB.buildCopy(SizeCopy, Size);
3730 
3731   // The new instruction uses the copied registers since it must update them.
3732   // The defs are not used since they don't exist in G_MEM*, but they are still
3733   // tied.
3734   // Note: the operand order differs from G_MEMSET, G_MEMCPY and G_MEMMOVE.
3735   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3736   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3737   if (IsSet) {
3738     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3739                    {DstPtrCopy, SizeCopy, SrcValCopy});
3740   } else {
3741     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3742     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3743                    {DstPtrCopy, SrcValCopy, SizeCopy});
3744   }
3745 
3746   GI.eraseFromParent();
3747   return true;
3748 }
3749 
3750 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3751                                             MachineRegisterInfo &MRI) {
3752   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3753   Register JTAddr = I.getOperand(0).getReg();
3754   unsigned JTI = I.getOperand(1).getIndex();
3755   Register Index = I.getOperand(2).getReg();
3756 
3757   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3758 
3759   // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3760   // sequence later, to guarantee the integrity of the intermediate values.
3761   if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
3762     CodeModel::Model CM = TM.getCodeModel();
3763     if (STI.isTargetMachO()) {
3764       if (CM != CodeModel::Small && CM != CodeModel::Large)
3765         report_fatal_error("Unsupported code-model for hardened jump-table");
3766     } else {
3767       // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3768       assert(STI.isTargetELF() &&
3769              "jump table hardening only supported on MachO/ELF");
3770       if (CM != CodeModel::Small)
3771         report_fatal_error("Unsupported code-model for hardened jump-table");
3772     }
3773 
3774     MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
3775     MIB.buildInstr(AArch64::BR_JumpTable)
3776         .addJumpTableIndex(I.getOperand(1).getIndex());
3777     I.eraseFromParent();
3778     return true;
3779   }
3780 
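       // Default expansion: compute the target address with a JumpTableDest32
       // pseudo (which adds a loaded table entry to the table base) and branch
       // to it, roughly:
       //   %target, %scratch = JumpTableDest32 %jt_addr, %index, %jump-table.N
       //   BR %target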
3781   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3782   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3783 
3784   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3785                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3786                            .addJumpTableIndex(JTI);
3787   // Save the jump table info.
3788   MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3789                  {static_cast<int64_t>(JTI)});
3790   // Build the indirect branch.
3791   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3792   I.eraseFromParent();
3793   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3794 }
3795 
3796 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3797                                                  MachineRegisterInfo &MRI) {
3798   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3799   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3800 
3801   Register DstReg = I.getOperand(0).getReg();
3802   unsigned JTI = I.getOperand(1).getIndex();
3803   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3804   auto MovMI =
3805       MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3806           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3807           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3808   I.eraseFromParent();
3809   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3810 }
3811 
3812 bool AArch64InstructionSelector::selectTLSGlobalValue(
3813     MachineInstr &I, MachineRegisterInfo &MRI) {
3814   if (!STI.isTargetMachO())
3815     return false;
3816   MachineFunction &MF = *I.getParent()->getParent();
3817   MF.getFrameInfo().setAdjustsStack(true);
3818 
3819   const auto &GlobalOp = I.getOperand(1);
3820   assert(GlobalOp.getOffset() == 0 &&
3821          "Shouldn't have an offset on TLS globals!");
3822   const GlobalValue &GV = *GlobalOp.getGlobal();
3823 
3824   auto LoadGOT =
3825       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3826           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3827 
3828   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3829                              {LoadGOT.getReg(0)})
3830                   .addImm(0);
3831 
3832   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3833   // TLS calls preserve all registers except those that absolutely must be
3834   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3835   // silly).
3836   unsigned Opcode = getBLRCallOpcode(MF);
3837 
3838   // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3839   if (MF.getFunction().hasFnAttribute("ptrauth-calls")) {
3840     assert(Opcode == AArch64::BLR);
3841     Opcode = AArch64::BLRAAZ;
3842   }
3843 
3844   MIB.buildInstr(Opcode, {}, {Load})
3845       .addUse(AArch64::X0, RegState::Implicit)
3846       .addDef(AArch64::X0, RegState::Implicit)
3847       .addRegMask(TRI.getTLSCallPreservedMask());
3848 
3849   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3850   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3851                                MRI);
3852   I.eraseFromParent();
3853   return true;
3854 }
3855 
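     // Places a scalar into lane 0 of an otherwise undefined vector register of
     // class DstRC. A rough sketch for a 32-bit element:
     //   %undef:fpr128 = IMPLICIT_DEF
     //   %vec:fpr128   = INSERT_SUBREG %undef, %scalar, %subreg.ssub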
3856 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3857     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3858     MachineIRBuilder &MIRBuilder) const {
3859   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3860 
3861   auto BuildFn = [&](unsigned SubregIndex) {
3862     auto Ins =
3863         MIRBuilder
3864             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3865             .addImm(SubregIndex);
3866     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3867     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3868     return &*Ins;
3869   };
3870 
3871   switch (EltSize) {
3872   case 8:
3873     return BuildFn(AArch64::bsub);
3874   case 16:
3875     return BuildFn(AArch64::hsub);
3876   case 32:
3877     return BuildFn(AArch64::ssub);
3878   case 64:
3879     return BuildFn(AArch64::dsub);
3880   default:
3881     return nullptr;
3882   }
3883 }
3884 
3885 MachineInstr *
3886 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3887                                              MachineIRBuilder &MIB,
3888                                              MachineRegisterInfo &MRI) const {
3889   LLT DstTy = MRI.getType(DstReg);
3890   const TargetRegisterClass *RC =
3891       getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3892   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3893     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3894     return nullptr;
3895   }
3896   unsigned SubReg = 0;
3897   if (!getSubRegForClass(RC, TRI, SubReg))
3898     return nullptr;
3899   if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3900     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3901                       << DstTy.getSizeInBits() << ")\n");
3902     return nullptr;
3903   }
3904   auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3905                   .addReg(SrcReg, 0, SubReg);
3906   RBI.constrainGenericRegister(DstReg, *RC, MRI);
3907   return Copy;
3908 }
3909 
3910 bool AArch64InstructionSelector::selectMergeValues(
3911     MachineInstr &I, MachineRegisterInfo &MRI) {
3912   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3913   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3914   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3915   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3916   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3917 
3918   if (I.getNumOperands() != 3)
3919     return false;
3920 
3921   // Merging 2 s64s into an s128.
3922   if (DstTy == LLT::scalar(128)) {
3923     if (SrcTy.getSizeInBits() != 64)
3924       return false;
3925     Register DstReg = I.getOperand(0).getReg();
3926     Register Src1Reg = I.getOperand(1).getReg();
3927     Register Src2Reg = I.getOperand(2).getReg();
3928     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3929     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3930                                          /* LaneIdx */ 0, RB, MIB);
3931     if (!InsMI)
3932       return false;
3933     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3934                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3935     if (!Ins2MI)
3936       return false;
3937     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3938     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3939     I.eraseFromParent();
3940     return true;
3941   }
3942 
3943   if (RB.getID() != AArch64::GPRRegBankID)
3944     return false;
3945 
3946   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3947     return false;
3948 
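       // Merge two s32s into an s64 on the GPR bank: widen both halves with
       // SUBREG_TO_REG and combine them with a bitfield move, roughly:
       //   %lo64 = SUBREG_TO_REG 0, %lo, sub_32
       //   %hi64 = SUBREG_TO_REG 0, %hi, sub_32
       //   %dst  = BFMXri %lo64, %hi64, 32, 31   (%hi lands in bits [63:32])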
3949   auto *DstRC = &AArch64::GPR64RegClass;
3950   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3951   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3952                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3953                                 .addDef(SubToRegDef)
3954                                 .addImm(0)
3955                                 .addUse(I.getOperand(1).getReg())
3956                                 .addImm(AArch64::sub_32);
3957   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3958   // Need to anyext the second scalar before we can use bfm
3959   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3960                                      TII.get(TargetOpcode::SUBREG_TO_REG))
3961                                 .addDef(SubToRegDef2)
3962                                 .addImm(0)
3963                                 .addUse(I.getOperand(2).getReg())
3964                                 .addImm(AArch64::sub_32);
3965   MachineInstr &BFM =
3966       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3967            .addDef(I.getOperand(0).getReg())
3968            .addUse(SubToRegDef)
3969            .addUse(SubToRegDef2)
3970            .addImm(32)
3971            .addImm(31);
3972   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3973   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3974   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3975   I.eraseFromParent();
3976   return true;
3977 }
3978 
3979 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3980                               const unsigned EltSize) {
3981   // Choose a lane copy opcode and subregister based on the size of the
3982   // vector's elements.
3983   switch (EltSize) {
3984   case 8:
3985     CopyOpc = AArch64::DUPi8;
3986     ExtractSubReg = AArch64::bsub;
3987     break;
3988   case 16:
3989     CopyOpc = AArch64::DUPi16;
3990     ExtractSubReg = AArch64::hsub;
3991     break;
3992   case 32:
3993     CopyOpc = AArch64::DUPi32;
3994     ExtractSubReg = AArch64::ssub;
3995     break;
3996   case 64:
3997     CopyOpc = AArch64::DUPi64;
3998     ExtractSubReg = AArch64::dsub;
3999     break;
4000   default:
4001     // Unknown size, bail out.
4002     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4003     return false;
4004   }
4005   return true;
4006 }
4007 
4008 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4009     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4010     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4011   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4012   unsigned CopyOpc = 0;
4013   unsigned ExtractSubReg = 0;
4014   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4015     LLVM_DEBUG(
4016         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4017     return nullptr;
4018   }
4019 
4020   const TargetRegisterClass *DstRC =
4021       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4022   if (!DstRC) {
4023     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4024     return nullptr;
4025   }
4026 
4027   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4028   const LLT &VecTy = MRI.getType(VecReg);
4029   const TargetRegisterClass *VecRC =
4030       getRegClassForTypeOnBank(VecTy, VecRB, true);
4031   if (!VecRC) {
4032     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4033     return nullptr;
4034   }
4035 
4036   // The register that we're going to copy into.
4037   Register InsertReg = VecReg;
4038   if (!DstReg)
4039     DstReg = MRI.createVirtualRegister(DstRC);
4040   // If the lane index is 0, we just use a subregister COPY.
4041   if (LaneIdx == 0) {
4042     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4043                     .addReg(VecReg, 0, ExtractSubReg);
4044     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4045     return &*Copy;
4046   }
4047 
4048   // Lane copies require 128-bit wide registers. If we're dealing with an
4049   // unpacked vector, then we need to move up to that width. Insert an implicit
4050   // def and a subregister insert to get us there.
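       // E.g. extracting lane 1 from an unpacked <2 x s32> roughly becomes
       // (sketch):
       //   %undef:fpr128 = IMPLICIT_DEF
       //   %wide:fpr128 = INSERT_SUBREG %undef, %vec, %subreg.dsub
       //   %dst:fpr32 = DUPi32 %wide, 1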
4051   if (VecTy.getSizeInBits() != 128) {
4052     MachineInstr *ScalarToVector = emitScalarToVector(
4053         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4054     if (!ScalarToVector)
4055       return nullptr;
4056     InsertReg = ScalarToVector->getOperand(0).getReg();
4057   }
4058 
4059   MachineInstr *LaneCopyMI =
4060       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4061   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4062 
4063   // Make sure that we actually constrain the initial copy.
4064   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4065   return LaneCopyMI;
4066 }
4067 
4068 bool AArch64InstructionSelector::selectExtractElt(
4069     MachineInstr &I, MachineRegisterInfo &MRI) {
4070   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4071          "unexpected opcode!");
4072   Register DstReg = I.getOperand(0).getReg();
4073   const LLT NarrowTy = MRI.getType(DstReg);
4074   const Register SrcReg = I.getOperand(1).getReg();
4075   const LLT WideTy = MRI.getType(SrcReg);
4076   (void)WideTy;
4077   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4078          "source register size too small!");
4079   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4080 
4081   // Need the lane index to determine the correct copy opcode.
4082   MachineOperand &LaneIdxOp = I.getOperand(2);
4083   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4084 
4085   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4086     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4087     return false;
4088   }
4089 
4090   // Find the index to extract from.
4091   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4092   if (!VRegAndVal)
4093     return false;
4094   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4095 
4097   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4098   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4099                                                LaneIdx, MIB);
4100   if (!Extract)
4101     return false;
4102 
4103   I.eraseFromParent();
4104   return true;
4105 }
4106 
4107 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4108     MachineInstr &I, MachineRegisterInfo &MRI) {
4109   unsigned NumElts = I.getNumOperands() - 1;
4110   Register SrcReg = I.getOperand(NumElts).getReg();
4111   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4112   const LLT SrcTy = MRI.getType(SrcReg);
4113 
4114   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4115   if (SrcTy.getSizeInBits() > 128) {
4116     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4117     return false;
4118   }
4119 
4120   // We implement a split vector operation by treating the sub-vectors as
4121   // scalars and extracting them.
4122   const RegisterBank &DstRB =
4123       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4124   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4125     Register Dst = I.getOperand(OpIdx).getReg();
4126     MachineInstr *Extract =
4127         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4128     if (!Extract)
4129       return false;
4130   }
4131   I.eraseFromParent();
4132   return true;
4133 }
4134 
4135 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4136                                                      MachineRegisterInfo &MRI) {
4137   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4138          "unexpected opcode");
4139 
4140   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4141   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4142           AArch64::FPRRegBankID ||
4143       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4144           AArch64::FPRRegBankID) {
4145     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4146                          "currently unsupported.\n");
4147     return false;
4148   }
4149 
4150   // The last operand is the vector source register, and every other operand is
4151   // a register to unpack into.
4152   unsigned NumElts = I.getNumOperands() - 1;
4153   Register SrcReg = I.getOperand(NumElts).getReg();
4154   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4155   const LLT WideTy = MRI.getType(SrcReg);
4156   (void)WideTy;
4157   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4158          "can only unmerge from vector or s128 types!");
4159   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4160          "source register size too small!");
4161 
4162   if (!NarrowTy.isScalar())
4163     return selectSplitVectorUnmerge(I, MRI);
4164 
4165   // Choose a lane copy opcode and subregister based on the size of the
4166   // vector's elements.
4167   unsigned CopyOpc = 0;
4168   unsigned ExtractSubReg = 0;
4169   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4170     return false;
4171 
4172   // Set up for the lane copies.
4173   MachineBasicBlock &MBB = *I.getParent();
4174 
4175   // Stores the registers we'll be copying from.
4176   SmallVector<Register, 4> InsertRegs;
4177 
4178   // We'll use the first register twice, so we only need NumElts-1 registers.
4179   unsigned NumInsertRegs = NumElts - 1;
4180 
4181   // If our elements fit into exactly 128 bits, then we can copy from the source
4182   // directly. Otherwise, we need to do a bit of setup with some subregister
4183   // inserts.
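       // E.g. (sketch) unmerging a 128-bit <4 x s32> copies straight from the
       // source:
       //   %dst0:fpr32 = COPY %src.ssub
       //   %dstN:fpr32 = DUPi32 %src, N        (for N = 1..3)
       // whereas a 64-bit source such as <4 x s16> is first widened with an
       // IMPLICIT_DEF + INSERT_SUBREG so the lane copies read a 128-bit reg.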
4184   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4185     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4186   } else {
4187     // Otherwise, perform subregister inserts. For each insert, create an
4188     // implicit def and a subregister insert, and save the register we create.
4189     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4190         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4191         *RBI.getRegBank(SrcReg, MRI, TRI));
4192     unsigned SubReg = 0;
4193     bool Found = getSubRegForClass(RC, TRI, SubReg);
4194     (void)Found;
4195     assert(Found && "expected to find last operand's subreg idx");
4196     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4197       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4198       MachineInstr &ImpDefMI =
4199           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4200                    ImpDefReg);
4201 
4202       // Now, create the subregister insert from SrcReg.
4203       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4204       MachineInstr &InsMI =
4205           *BuildMI(MBB, I, I.getDebugLoc(),
4206                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4207                .addUse(ImpDefReg)
4208                .addUse(SrcReg)
4209                .addImm(SubReg);
4210 
4211       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4212       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4213 
4214       // Save the register so that we can copy from it after.
4215       InsertRegs.push_back(InsertReg);
4216     }
4217   }
4218 
4219   // Now that we've created any necessary subregister inserts, we can
4220   // create the copies.
4221   //
4222   // Perform the first copy separately as a subregister copy.
4223   Register CopyTo = I.getOperand(0).getReg();
4224   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4225                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4226   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4227 
4228   // Now, perform the remaining copies as vector lane copies.
4229   unsigned LaneIdx = 1;
4230   for (Register InsReg : InsertRegs) {
4231     Register CopyTo = I.getOperand(LaneIdx).getReg();
4232     MachineInstr &CopyInst =
4233         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4234              .addUse(InsReg)
4235              .addImm(LaneIdx);
4236     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4237     ++LaneIdx;
4238   }
4239 
4240   // Separately constrain the first copy's destination. Because of the
4241   // limitation in constrainOperandRegClass, we can't guarantee that this will
4242   // actually be constrained. So, do it ourselves using the second operand.
4243   const TargetRegisterClass *RC =
4244       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4245   if (!RC) {
4246     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4247     return false;
4248   }
4249 
4250   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4251   I.eraseFromParent();
4252   return true;
4253 }
4254 
4255 bool AArch64InstructionSelector::selectConcatVectors(
4256     MachineInstr &I, MachineRegisterInfo &MRI) {
4257   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4258          "Unexpected opcode");
4259   Register Dst = I.getOperand(0).getReg();
4260   Register Op1 = I.getOperand(1).getReg();
4261   Register Op2 = I.getOperand(2).getReg();
4262   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4263   if (!ConcatMI)
4264     return false;
4265   I.eraseFromParent();
4266   return true;
4267 }
4268 
4269 unsigned
4270 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4271                                                   MachineFunction &MF) const {
4272   Type *CPTy = CPVal->getType();
4273   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4274 
4275   MachineConstantPool *MCP = MF.getConstantPool();
4276   return MCP->getConstantPoolIndex(CPVal, Alignment);
4277 }
4278 
4279 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4280     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4281   const TargetRegisterClass *RC;
4282   unsigned Opc;
4283   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4284   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4285   switch (Size) {
4286   case 16:
4287     RC = &AArch64::FPR128RegClass;
4288     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4289     break;
4290   case 8:
4291     RC = &AArch64::FPR64RegClass;
4292     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4293     break;
4294   case 4:
4295     RC = &AArch64::FPR32RegClass;
4296     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4297     break;
4298   case 2:
4299     RC = &AArch64::FPR16RegClass;
4300     Opc = AArch64::LDRHui;
4301     break;
4302   default:
4303     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4304                       << *CPVal->getType());
4305     return nullptr;
4306   }
4307 
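       // For the default (small) code model the load is emitted as an ADRP of
       // the constant pool entry's page plus a page-offset load; the tiny code
       // model can use a single PC-relative LDR (literal). Roughly (labels and
       // registers are illustrative):
       //   adrp x8, .LCPI0_0
       //   ldr  q0, [x8, :lo12:.LCPI0_0]
       // vs.
       //   ldr  q0, .LCPI0_0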
4308   MachineInstr *LoadMI = nullptr;
4309   auto &MF = MIRBuilder.getMF();
4310   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4311   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4312     // Use load(literal) for tiny code model.
4313     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4314   } else {
4315     auto Adrp =
4316         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4317             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4318 
4319     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4320                    .addConstantPoolIndex(
4321                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4322 
4323     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4324   }
4325 
4326   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4327   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4328                                                     MachineMemOperand::MOLoad,
4329                                                     Size, Align(Size)));
4330   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4331   return LoadMI;
4332 }
4333 
4334 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4335 /// size and RB.
4336 static std::pair<unsigned, unsigned>
4337 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4338   unsigned Opc, SubregIdx;
4339   if (RB.getID() == AArch64::GPRRegBankID) {
4340     if (EltSize == 8) {
4341       Opc = AArch64::INSvi8gpr;
4342       SubregIdx = AArch64::bsub;
4343     } else if (EltSize == 16) {
4344       Opc = AArch64::INSvi16gpr;
4345       SubregIdx = AArch64::ssub;
4346     } else if (EltSize == 32) {
4347       Opc = AArch64::INSvi32gpr;
4348       SubregIdx = AArch64::ssub;
4349     } else if (EltSize == 64) {
4350       Opc = AArch64::INSvi64gpr;
4351       SubregIdx = AArch64::dsub;
4352     } else {
4353       llvm_unreachable("invalid elt size!");
4354     }
4355   } else {
4356     if (EltSize == 8) {
4357       Opc = AArch64::INSvi8lane;
4358       SubregIdx = AArch64::bsub;
4359     } else if (EltSize == 16) {
4360       Opc = AArch64::INSvi16lane;
4361       SubregIdx = AArch64::hsub;
4362     } else if (EltSize == 32) {
4363       Opc = AArch64::INSvi32lane;
4364       SubregIdx = AArch64::ssub;
4365     } else if (EltSize == 64) {
4366       Opc = AArch64::INSvi64lane;
4367       SubregIdx = AArch64::dsub;
4368     } else {
4369       llvm_unreachable("invalid elt size!");
4370     }
4371   }
4372   return std::make_pair(Opc, SubregIdx);
4373 }
4374 
4375 MachineInstr *AArch64InstructionSelector::emitInstr(
4376     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4377     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4378     const ComplexRendererFns &RenderFns) const {
4379   assert(Opcode && "Expected an opcode?");
4380   assert(!isPreISelGenericOpcode(Opcode) &&
4381          "Function should only be used to produce selected instructions!");
4382   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4383   if (RenderFns)
4384     for (auto &Fn : *RenderFns)
4385       Fn(MI);
4386   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4387   return &*MI;
4388 }
4389 
4390 MachineInstr *AArch64InstructionSelector::emitAddSub(
4391     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4392     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4393     MachineIRBuilder &MIRBuilder) const {
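       // AddrModeAndSizeToOpcode is indexed as [addressing mode][is 32-bit],
       // with rows (see e.g. the OpcTable in emitADD below): 0 = ri, 1 = rs,
       // 2 = rr, 3 = ri with a negated immediate, 4 = rx.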
4394   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4395   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4396   auto Ty = MRI.getType(LHS.getReg());
4397   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4398   unsigned Size = Ty.getSizeInBits();
4399   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4400   bool Is32Bit = Size == 32;
4401 
4402   // INSTRri form with positive arithmetic immediate.
4403   if (auto Fns = selectArithImmed(RHS))
4404     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4405                      MIRBuilder, Fns);
4406 
4407   // INSTRri form with negative arithmetic immediate.
4408   if (auto Fns = selectNegArithImmed(RHS))
4409     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4410                      MIRBuilder, Fns);
4411 
4412   // INSTRrx form.
4413   if (auto Fns = selectArithExtendedRegister(RHS))
4414     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4415                      MIRBuilder, Fns);
4416 
4417   // INSTRrs form.
4418   if (auto Fns = selectShiftedRegister(RHS))
4419     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4420                      MIRBuilder, Fns);
4421   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4422                    MIRBuilder);
4423 }
4424 
4425 MachineInstr *
4426 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4427                                     MachineOperand &RHS,
4428                                     MachineIRBuilder &MIRBuilder) const {
4429   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4430       {{AArch64::ADDXri, AArch64::ADDWri},
4431        {AArch64::ADDXrs, AArch64::ADDWrs},
4432        {AArch64::ADDXrr, AArch64::ADDWrr},
4433        {AArch64::SUBXri, AArch64::SUBWri},
4434        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4435   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4436 }
4437 
4438 MachineInstr *
4439 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4440                                      MachineOperand &RHS,
4441                                      MachineIRBuilder &MIRBuilder) const {
4442   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4443       {{AArch64::ADDSXri, AArch64::ADDSWri},
4444        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4445        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4446        {AArch64::SUBSXri, AArch64::SUBSWri},
4447        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4448   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4449 }
4450 
4451 MachineInstr *
4452 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4453                                      MachineOperand &RHS,
4454                                      MachineIRBuilder &MIRBuilder) const {
4455   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4456       {{AArch64::SUBSXri, AArch64::SUBSWri},
4457        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4458        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4459        {AArch64::ADDSXri, AArch64::ADDSWri},
4460        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4461   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4462 }
4463 
4464 MachineInstr *
4465 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4466                                      MachineOperand &RHS,
4467                                      MachineIRBuilder &MIRBuilder) const {
4468   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4469   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4470   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4471   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4472   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4473 }
4474 
4475 MachineInstr *
4476 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4477                                      MachineOperand &RHS,
4478                                      MachineIRBuilder &MIRBuilder) const {
4479   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4480   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4481   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4482   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4483   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4484 }
4485 
4486 MachineInstr *
4487 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4488                                     MachineIRBuilder &MIRBuilder) const {
4489   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4490   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4491   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4492   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4493 }
4494 
4495 MachineInstr *
4496 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4497                                     MachineIRBuilder &MIRBuilder) const {
4498   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4499   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4500   LLT Ty = MRI.getType(LHS.getReg());
4501   unsigned RegSize = Ty.getSizeInBits();
4502   bool Is32Bit = (RegSize == 32);
4503   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4504                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4505                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4506   // ANDS needs a logical immediate for its immediate form. Check if we can
4507   // fold one in.
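       // E.g. a 64-bit TST against 0xff can use ANDSXri with
       // encodeLogicalImmediate(0xff, 64), i.e. roughly "tst x0, #0xff".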
4508   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4509     int64_t Imm = ValAndVReg->Value.getSExtValue();
4510 
4511     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4512       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4513       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4514       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4515       return &*TstMI;
4516     }
4517   }
4518 
4519   if (auto Fns = selectLogicalShiftedRegister(RHS))
4520     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4521   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4522 }
4523 
4524 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4525     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4526     MachineIRBuilder &MIRBuilder) const {
4527   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4528   assert(Predicate.isPredicate() && "Expected predicate?");
4529   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4530   LLT CmpTy = MRI.getType(LHS.getReg());
4531   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4532   unsigned Size = CmpTy.getSizeInBits();
4533   (void)Size;
4534   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4535   // Fold the compare into a cmn or tst if possible.
4536   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4537     return FoldCmp;
4538   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4539   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4540 }
4541 
4542 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4543     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4544   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4545 #ifndef NDEBUG
4546   LLT Ty = MRI.getType(Dst);
4547   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4548          "Expected a 32-bit scalar register?");
4549 #endif
4550   const Register ZReg = AArch64::WZR;
4551   AArch64CC::CondCode CC1, CC2;
4552   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4553   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4554   if (CC2 == AArch64CC::AL)
4555     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4556                      MIRBuilder);
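       // Predicates such as FCMP_UEQ or FCMP_ONE map to two condition codes.
       // In that case, materialize each one with a CSINC (a CSET) and OR the
       // results together, roughly (register names are illustrative):
       //   %a:gpr32 = CSINCWr $wzr, $wzr, <inverted CC1>
       //   %b:gpr32 = CSINCWr $wzr, $wzr, <inverted CC2>
       //   %dst:gpr32 = ORRWrr %a, %b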
4557   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4558   Register Def1Reg = MRI.createVirtualRegister(RC);
4559   Register Def2Reg = MRI.createVirtualRegister(RC);
4560   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4561   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4562   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4563   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4564   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4565   return &*OrMI;
4566 }
4567 
4568 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4569     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4570     std::optional<CmpInst::Predicate> Pred) const {
4571   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4572   LLT Ty = MRI.getType(LHS);
4573   if (Ty.isVector())
4574     return nullptr;
4575   unsigned OpSize = Ty.getSizeInBits();
4576   assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4577 
4578   // If this is a compare against +0.0, then we don't have
4579   // to explicitly materialize a constant.
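       // (The FCMP??ri forms compare against #0.0, e.g. "fcmp s0, #0.0".)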
4580   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4581   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4582 
4583   auto IsEqualityPred = [](CmpInst::Predicate P) {
4584     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4585            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4586   };
4587   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4588     // Try commutating the operands.
4589     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4590     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4591       ShouldUseImm = true;
4592       std::swap(LHS, RHS);
4593     }
4594   }
4595   unsigned CmpOpcTbl[2][3] = {
4596       {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4597       {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4598   unsigned CmpOpc =
4599       CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4600 
4601   // Partially build the compare. Decide if we need to add a use for the RHS
4602   // operand based on whether or not we're comparing against 0.0.
4603   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4604   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4605   if (!ShouldUseImm)
4606     CmpMI.addUse(RHS);
4607   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4608   return &*CmpMI;
4609 }
4610 
4611 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4612     std::optional<Register> Dst, Register Op1, Register Op2,
4613     MachineIRBuilder &MIRBuilder) const {
4614   // We implement a vector concat by:
4615   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4616   // 2. Insert the upper vector into the destination's upper element
4617   // TODO: some of this code is common with G_BUILD_VECTOR handling.
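       // E.g. concatenating two <2 x s32> values into a <4 x s32> is roughly
       // (sketch):
       //   %u1:fpr128 = IMPLICIT_DEF
       //   %w1:fpr128 = INSERT_SUBREG %u1, %op1, %subreg.dsub
       //   (likewise %w2 from %op2)
       //   %dst:fpr128 = INSvi64lane %w1, 1, %w2, 0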
4618   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4619 
4620   const LLT Op1Ty = MRI.getType(Op1);
4621   const LLT Op2Ty = MRI.getType(Op2);
4622 
4623   if (Op1Ty != Op2Ty) {
4624     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4625     return nullptr;
4626   }
4627   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4628 
4629   if (Op1Ty.getSizeInBits() >= 128) {
4630     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4631     return nullptr;
4632   }
4633 
4634   // At the moment we just support 64 bit vector concats.
4635   if (Op1Ty.getSizeInBits() != 64) {
4636     LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4637     return nullptr;
4638   }
4639 
4640   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4641   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4642   const TargetRegisterClass *DstRC =
4643       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4644 
4645   MachineInstr *WidenedOp1 =
4646       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4647   MachineInstr *WidenedOp2 =
4648       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4649   if (!WidenedOp1 || !WidenedOp2) {
4650     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4651     return nullptr;
4652   }
4653 
4654   // Now do the insert of the upper element.
4655   unsigned InsertOpc, InsSubRegIdx;
4656   std::tie(InsertOpc, InsSubRegIdx) =
4657       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4658 
4659   if (!Dst)
4660     Dst = MRI.createVirtualRegister(DstRC);
4661   auto InsElt =
4662       MIRBuilder
4663           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4664           .addImm(1) /* Lane index */
4665           .addUse(WidenedOp2->getOperand(0).getReg())
4666           .addImm(0);
4667   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4668   return &*InsElt;
4669 }
4670 
4671 MachineInstr *
4672 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4673                                       Register Src2, AArch64CC::CondCode Pred,
4674                                       MachineIRBuilder &MIRBuilder) const {
4675   auto &MRI = *MIRBuilder.getMRI();
4676   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4677   // If we used a register class, then this won't necessarily have an LLT.
4678   // Compute the size based off whether or not we have a class or bank.
4679   unsigned Size;
4680   if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
4681     Size = TRI.getRegSizeInBits(*RC);
4682   else
4683     Size = MRI.getType(Dst).getSizeInBits();
4684   // Some opcodes use s1.
4685   assert(Size <= 64 && "Expected 64 bits or less only!");
4686   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4687   unsigned Opc = OpcTable[Size == 64];
4688   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4689   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4690   return &*CSINC;
4691 }
4692 
4693 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4694                                                       Register CarryReg) {
4695   MachineRegisterInfo *MRI = MIB.getMRI();
4696   unsigned Opcode = I.getOpcode();
4697 
4698   // If the instruction is a SUB, we need to negate the carry,
4699   // because borrowing is indicated by carry-flag == 0.
4700   bool NeedsNegatedCarry =
4701       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4702 
4703   // If the previous instruction will already produce the correct carry, do not
4704   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4705   // generated during legalization of wide add/sub. This optimization depends on
4706   // these sequences not being interrupted by other instructions.
4707   // We have to select the previous instruction before the carry-using
4708   // instruction is deleted by the calling function, otherwise the previous
4709   // instruction might become dead and would get deleted.
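       // E.g. a wide add may have been legalized to something like:
       //   %lo, %carry = G_UADDO %a_lo, %b_lo
       //   %hi, %c2    = G_UADDE %a_hi, %b_hi, %carry
       // where selecting the G_UADDO as ADDS already leaves the carry in NZCV.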
4710   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4711   if (SrcMI == I.getPrevNode()) {
4712     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4713       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4714       if (NeedsNegatedCarry == ProducesNegatedCarry &&
4715           CarrySrcMI->isUnsigned() &&
4716           CarrySrcMI->getCarryOutReg() == CarryReg &&
4717           selectAndRestoreState(*SrcMI))
4718         return nullptr;
4719     }
4720   }
4721 
4722   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4723 
4724   if (NeedsNegatedCarry) {
4725     // (0 - Carry) sets !C in NZCV when Carry == 1
4726     Register ZReg = AArch64::WZR;
4727     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4728   }
4729 
4730   // (Carry - 1) sets !C in NZCV when Carry == 0
4731   auto Fns = select12BitValueWithLeftShift(1);
4732   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4733 }
4734 
4735 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4736                                                   MachineRegisterInfo &MRI) {
4737   auto &CarryMI = cast<GAddSubCarryOut>(I);
4738 
4739   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4740     // Set NZCV carry according to carry-in VReg
4741     emitCarryIn(I, CarryInMI->getCarryInReg());
4742   }
4743 
4744   // Emit the operation and get the correct condition code.
4745   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4746                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4747 
4748   Register CarryOutReg = CarryMI.getCarryOutReg();
4749 
4750   // Don't convert carry-out to VReg if it is never used
4751   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4752     // Now, put the overflow result in the register given by the first operand
4753     // to the overflow op. CSINC increments the result when the predicate is
4754     // false, so to get the increment when it's true, we need to use the
4755     // inverse. In this case, we want to increment when carry is set.
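         // (CSINC Wd, WZR, WZR, invert(cc) is the canonical expansion of the
         // "CSET Wd, cc" alias.)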
4756     Register ZReg = AArch64::WZR;
4757     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4758               getInvertedCondCode(OpAndCC.second), MIB);
4759   }
4760 
4761   I.eraseFromParent();
4762   return true;
4763 }
4764 
4765 std::pair<MachineInstr *, AArch64CC::CondCode>
4766 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4767                                            MachineOperand &LHS,
4768                                            MachineOperand &RHS,
4769                                            MachineIRBuilder &MIRBuilder) const {
4770   switch (Opcode) {
4771   default:
4772     llvm_unreachable("Unexpected opcode!");
4773   case TargetOpcode::G_SADDO:
4774     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4775   case TargetOpcode::G_UADDO:
4776     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4777   case TargetOpcode::G_SSUBO:
4778     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4779   case TargetOpcode::G_USUBO:
4780     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4781   case TargetOpcode::G_SADDE:
4782     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4783   case TargetOpcode::G_UADDE:
4784     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4785   case TargetOpcode::G_SSUBE:
4786     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4787   case TargetOpcode::G_USUBE:
4788     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4789   }
4790 }
4791 
4792 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4793 /// expressed as a conjunction.
4794 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4795 ///                     changing the conditions on the CMP tests.
4796 ///                     (this means we can call emitConjunctionRec() with
4797 ///                      Negate==true on this sub-tree)
4798 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4799 ///                     cannot do the negation naturally. We are required to
4800 ///                     emit the subtree first in this case.
4801 /// \param WillNegate   Is true if we are called when the result of this
4802 ///                     subexpression must be negated. This happens when the
4803 ///                     outer expression is an OR. We can use this fact to know
4804 ///                     that we have a double negation (or (or ...) ...) that
4805 ///                     can be implemented for free.
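     /// For example, (icmp eq %a, %b) && (icmp eq %c, %d) feeding a select or
     /// branch can typically be emitted as a SUBS followed by a CCMP, without
     /// materializing either i1 value.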
4806 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4807                                bool WillNegate, MachineRegisterInfo &MRI,
4808                                unsigned Depth = 0) {
4809   if (!MRI.hasOneNonDBGUse(Val))
4810     return false;
4811   MachineInstr *ValDef = MRI.getVRegDef(Val);
4812   unsigned Opcode = ValDef->getOpcode();
4813   if (isa<GAnyCmp>(ValDef)) {
4814     CanNegate = true;
4815     MustBeFirst = false;
4816     return true;
4817   }
4818   // Protect against exponential runtime and stack overflow.
4819   if (Depth > 6)
4820     return false;
4821   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4822     bool IsOR = Opcode == TargetOpcode::G_OR;
4823     Register O0 = ValDef->getOperand(1).getReg();
4824     Register O1 = ValDef->getOperand(2).getReg();
4825     bool CanNegateL;
4826     bool MustBeFirstL;
4827     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4828       return false;
4829     bool CanNegateR;
4830     bool MustBeFirstR;
4831     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4832       return false;
4833 
4834     if (MustBeFirstL && MustBeFirstR)
4835       return false;
4836 
4837     if (IsOR) {
4838       // For an OR expression we need to be able to naturally negate at least
4839       // one side or we cannot do the transformation at all.
4840       if (!CanNegateL && !CanNegateR)
4841         return false;
4842       // If the result of the OR will be negated and we can naturally negate
4843       // the leaves, then this sub-tree as a whole negates naturally.
4844       CanNegate = WillNegate && CanNegateL && CanNegateR;
4845       // If we cannot naturally negate the whole sub-tree, then this must be
4846       // emitted first.
4847       MustBeFirst = !CanNegate;
4848     } else {
4849       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4850       // We cannot naturally negate an AND operation.
4851       CanNegate = false;
4852       MustBeFirst = MustBeFirstL || MustBeFirstR;
4853     }
4854     return true;
4855   }
4856   return false;
4857 }
4858 
4859 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4860     Register LHS, Register RHS, CmpInst::Predicate CC,
4861     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4862     MachineIRBuilder &MIB) const {
4863   auto &MRI = *MIB.getMRI();
4864   LLT OpTy = MRI.getType(LHS);
4865   unsigned CCmpOpc;
4866   std::optional<ValueAndVReg> C;
4867   if (CmpInst::isIntPredicate(CC)) {
4868     assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4869     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4870     if (!C || C->Value.sgt(31) || C->Value.slt(-31))
4871       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4872     else if (C->Value.ule(31))
4873       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4874     else
4875       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4876   } else {
4877     assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4878            OpTy.getSizeInBits() == 64);
4879     switch (OpTy.getSizeInBits()) {
4880     case 16:
4881       assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4882       CCmpOpc = AArch64::FCCMPHrr;
4883       break;
4884     case 32:
4885       CCmpOpc = AArch64::FCCMPSrr;
4886       break;
4887     case 64:
4888       CCmpOpc = AArch64::FCCMPDrr;
4889       break;
4890     default:
4891       return nullptr;
4892     }
4893   }
4894   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4895   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4896   auto CCmp =
4897       MIB.buildInstr(CCmpOpc, {}, {LHS});
4898   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4899     CCmp.addImm(C->Value.getZExtValue());
4900   else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4901     CCmp.addImm(C->Value.abs().getZExtValue());
4902   else
4903     CCmp.addReg(RHS);
4904   CCmp.addImm(NZCV).addImm(Predicate);
4905   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4906   return &*CCmp;
4907 }
4908 
4909 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4910     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4911     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4912   // We're at a tree leaf, produce a conditional comparison operation.
4913   auto &MRI = *MIB.getMRI();
4914   MachineInstr *ValDef = MRI.getVRegDef(Val);
4915   unsigned Opcode = ValDef->getOpcode();
4916   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4917     Register LHS = Cmp->getLHSReg();
4918     Register RHS = Cmp->getRHSReg();
4919     CmpInst::Predicate CC = Cmp->getCond();
4920     if (Negate)
4921       CC = CmpInst::getInversePredicate(CC);
4922     if (isa<GICmp>(Cmp)) {
4923       OutCC = changeICMPPredToAArch64CC(CC);
4924     } else {
4925       // Handle special FP cases.
4926       AArch64CC::CondCode ExtraCC;
4927       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4928       // Some floating point conditions can't be tested with a single condition
4929       // code. Construct an additional comparison in this case.
4930       if (ExtraCC != AArch64CC::AL) {
4931         MachineInstr *ExtraCmp;
4932         if (!CCOp)
4933           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4934         else
4935           ExtraCmp =
4936               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4937         CCOp = ExtraCmp->getOperand(0).getReg();
4938         Predicate = ExtraCC;
4939       }
4940     }
4941 
4942     // Produce a normal comparison if we are first in the chain
4943     if (!CCOp) {
4944       auto Dst = MRI.cloneVirtualRegister(LHS);
4945       if (isa<GICmp>(Cmp))
4946         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4947       return emitFPCompare(Cmp->getOperand(2).getReg(),
4948                            Cmp->getOperand(3).getReg(), MIB);
4949     }
4950     // Otherwise produce a ccmp.
4951     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4952   }
4953   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4954 
4955   bool IsOR = Opcode == TargetOpcode::G_OR;
4956 
4957   Register LHS = ValDef->getOperand(1).getReg();
4958   bool CanNegateL;
4959   bool MustBeFirstL;
4960   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4961   assert(ValidL && "Valid conjunction/disjunction tree");
4962   (void)ValidL;
4963 
4964   Register RHS = ValDef->getOperand(2).getReg();
4965   bool CanNegateR;
4966   bool MustBeFirstR;
4967   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4968   assert(ValidR && "Valid conjunction/disjunction tree");
4969   (void)ValidR;
4970 
4971   // Swap sub-tree that must come first to the right side.
4972   if (MustBeFirstL) {
4973     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4974     std::swap(LHS, RHS);
4975     std::swap(CanNegateL, CanNegateR);
4976     std::swap(MustBeFirstL, MustBeFirstR);
4977   }
4978 
4979   bool NegateR;
4980   bool NegateAfterR;
4981   bool NegateL;
4982   bool NegateAfterAll;
4983   if (Opcode == TargetOpcode::G_OR) {
4984     // Swap the sub-tree that we can negate naturally to the left.
4985     if (!CanNegateL) {
4986       assert(CanNegateR && "at least one side must be negatable");
4987       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4988       assert(!Negate);
4989       std::swap(LHS, RHS);
4990       NegateR = false;
4991       NegateAfterR = true;
4992     } else {
4993       // Negate the left sub-tree if possible, otherwise negate the result.
4994       NegateR = CanNegateR;
4995       NegateAfterR = !CanNegateR;
4996     }
4997     NegateL = true;
4998     NegateAfterAll = !Negate;
4999   } else {
5000     assert(Opcode == TargetOpcode::G_AND &&
5001            "Valid conjunction/disjunction tree");
5002     assert(!Negate && "Valid conjunction/disjunction tree");
5003 
5004     NegateL = false;
5005     NegateR = false;
5006     NegateAfterR = false;
5007     NegateAfterAll = false;
5008   }
5009 
5010   // Emit sub-trees.
5011   AArch64CC::CondCode RHSCC;
5012   MachineInstr *CmpR =
5013       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5014   if (NegateAfterR)
5015     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5016   MachineInstr *CmpL = emitConjunctionRec(
5017       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5018   if (NegateAfterAll)
5019     OutCC = AArch64CC::getInvertedCondCode(OutCC);
5020   return CmpL;
5021 }
5022 
5023 MachineInstr *AArch64InstructionSelector::emitConjunction(
5024     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5025   bool DummyCanNegate;
5026   bool DummyMustBeFirst;
5027   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5028                           *MIB.getMRI()))
5029     return nullptr;
5030   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5031 }
5032 
5033 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5034                                                          MachineInstr &CondMI) {
5035   AArch64CC::CondCode AArch64CC;
5036   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5037   if (!ConjMI)
5038     return false;
5039 
5040   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC,
                  MIB);
5041   SelI.eraseFromParent();
5042   return true;
5043 }
5044 
5045 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5046   MachineRegisterInfo &MRI = *MIB.getMRI();
5047   // We want to recognize this pattern:
5048   //
5049   // $z = G_FCMP pred, $x, $y
5050   // ...
5051   // $w = G_SELECT $z, $a, $b
5052   //
5053   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5054   // some copies/truncs in between).
5055   //
5056   // If we see this, then we can emit something like this:
5057   //
5058   // fcmp $x, $y
5059   // fcsel $w, $a, $b, pred
5060   //
5061   // Rather than emitting both of the rather long sequences in the standard
5062   // G_FCMP/G_SELECT select methods.
5063 
5064   // First, check if the condition is defined by a compare.
5065   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5066 
5067   // We can only fold if all of the defs have one use.
5068   Register CondDefReg = CondDef->getOperand(0).getReg();
5069   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5070     // Unless it's another select.
5071     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5072       if (CondDef == &UI)
5073         continue;
5074       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5075         return false;
5076     }
5077   }
5078 
5079   // Is the condition defined by a compare?
5080   unsigned CondOpc = CondDef->getOpcode();
5081   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5082     if (tryOptSelectConjunction(I, *CondDef))
5083       return true;
5084     return false;
5085   }
5086 
5087   AArch64CC::CondCode CondCode;
5088   if (CondOpc == TargetOpcode::G_ICMP) {
5089     auto Pred =
5090         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5091     CondCode = changeICMPPredToAArch64CC(Pred);
5092     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5093                        CondDef->getOperand(1), MIB);
5094   } else {
5095     // Get the condition code for the select.
5096     auto Pred =
5097         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5098     AArch64CC::CondCode CondCode2;
5099     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5100 
5101     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5102     // instructions to emit the comparison.
5103     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5104     // unnecessary.
5105     if (CondCode2 != AArch64CC::AL)
5106       return false;
5107 
5108     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5109                        CondDef->getOperand(3).getReg(), MIB)) {
5110       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5111       return false;
5112     }
5113   }
5114 
5115   // Emit the select.
5116   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5117              I.getOperand(3).getReg(), CondCode, MIB);
5118   I.eraseFromParent();
5119   return true;
5120 }
5121 
5122 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5123     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5124     MachineIRBuilder &MIRBuilder) const {
5125   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5126          "Unexpected MachineOperand");
5127   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5128   // We want to find this sort of thing:
5129   // x = G_SUB 0, y
5130   // G_ICMP z, x
5131   //
5132   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5133   // e.g:
5134   //
5135   // cmn z, y
5136 
5137   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5138   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5139   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5140   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5141   // Given this:
5142   //
5143   // x = G_SUB 0, y
5144   // G_ICMP x, z
5145   //
5146   // Produce this:
5147   //
5148   // cmn y, z
5149   if (isCMN(LHSDef, P, MRI))
5150     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5151 
5152   // Same idea here, but with the RHS of the compare instead:
5153   //
5154   // Given this:
5155   //
5156   // x = G_SUB 0, y
5157   // G_ICMP z, x
5158   //
5159   // Produce this:
5160   //
5161   // cmn z, y
5162   if (isCMN(RHSDef, P, MRI))
5163     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5164 
5165   // Given this:
5166   //
5167   // z = G_AND x, y
5168   // G_ICMP z, 0
5169   //
5170   // Produce this if the compare is signed:
5171   //
5172   // tst x, y
5173   if (!CmpInst::isUnsigned(P) && LHSDef &&
5174       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5175     // Make sure that the RHS is 0.
5176     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5177     if (!ValAndVReg || ValAndVReg->Value != 0)
5178       return nullptr;
5179 
5180     return emitTST(LHSDef->getOperand(1),
5181                    LHSDef->getOperand(2), MIRBuilder);
5182   }
5183 
5184   return nullptr;
5185 }
5186 
5187 bool AArch64InstructionSelector::selectShuffleVector(
5188     MachineInstr &I, MachineRegisterInfo &MRI) {
5189   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5190   Register Src1Reg = I.getOperand(1).getReg();
5191   const LLT Src1Ty = MRI.getType(Src1Reg);
5192   Register Src2Reg = I.getOperand(2).getReg();
5193   const LLT Src2Ty = MRI.getType(Src2Reg);
5194   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5195 
5196   MachineBasicBlock &MBB = *I.getParent();
5197   MachineFunction &MF = *MBB.getParent();
5198   LLVMContext &Ctx = MF.getFunction().getContext();
5199 
5200   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5201   // it originated from a <1 x T> type. Those should have been lowered into
5202   // G_BUILD_VECTOR earlier.
5203   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5204     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5205     return false;
5206   }
5207 
5208   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5209 
5210   SmallVector<Constant *, 64> CstIdxs;
5211   for (int Val : Mask) {
5212     // For now, we'll just assume any undef indexes to be 0. This should be
5213     // optimized in the future, e.g. to select DUP etc.
5214     Val = Val < 0 ? 0 : Val;
5215     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5216       unsigned Offset = Byte + Val * BytesPerElt;
5217       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5218     }
5219   }
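       // E.g. for a <4 x s32> shuffle with mask <1, 0, 3, 2>, BytesPerElt is 4
       // and the TBL byte indices are
       //   <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>.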
5220 
5221   // Use a constant pool to load the index vector for TBL.
5222   Constant *CPVal = ConstantVector::get(CstIdxs);
5223   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5224   if (!IndexLoad) {
5225     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5226     return false;
5227   }
5228 
5229   if (DstTy.getSizeInBits() != 128) {
5230     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5231     // This case can be done with TBL1.
5232     MachineInstr *Concat =
5233         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5234     if (!Concat) {
5235       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5236       return false;
5237     }
5238 
5239     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5240     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5241                                    IndexLoad->getOperand(0).getReg(), MIB);
5242 
5243     auto TBL1 = MIB.buildInstr(
5244         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5245         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5246     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5247 
5248     auto Copy =
5249         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5250             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5251     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5252     I.eraseFromParent();
5253     return true;
5254   }
5255 
5256   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5257   // Q registers for regalloc.
5258   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5259   auto RegSeq = createQTuple(Regs, MIB);
5260   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5261                              {RegSeq, IndexLoad->getOperand(0)});
5262   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5263   I.eraseFromParent();
5264   return true;
5265 }
5266 
5267 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5268     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5269     unsigned LaneIdx, const RegisterBank &RB,
5270     MachineIRBuilder &MIRBuilder) const {
5271   MachineInstr *InsElt = nullptr;
5272   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5273   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5274 
5275   // Create a register to define with the insert if one wasn't passed in.
5276   if (!DstReg)
5277     DstReg = MRI.createVirtualRegister(DstRC);
5278 
5279   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5280   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5281 
5282   if (RB.getID() == AArch64::FPRRegBankID) {
5283     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5284     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5285                  .addImm(LaneIdx)
5286                  .addUse(InsSub->getOperand(0).getReg())
5287                  .addImm(0);
5288   } else {
5289     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5290                  .addImm(LaneIdx)
5291                  .addUse(EltReg);
5292   }
5293 
5294   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5295   return InsElt;
5296 }
5297 
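// Fold a G_SEXT/G_ZEXT/G_ANYEXT of a G_EXTRACT_VECTOR_ELT into a single
// SMOV/UMOV lane move. Unsigned 64-bit results use a 32-bit UMOV followed by
// SUBREG_TO_REG, since UMOV already zeroes the upper 32 bits.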
5298 bool AArch64InstructionSelector::selectUSMovFromExtend(
5299     MachineInstr &MI, MachineRegisterInfo &MRI) {
5300   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5301       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5302       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5303     return false;
5304   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5305   const Register DefReg = MI.getOperand(0).getReg();
5306   const LLT DstTy = MRI.getType(DefReg);
5307   unsigned DstSize = DstTy.getSizeInBits();
5308 
5309   if (DstSize != 32 && DstSize != 64)
5310     return false;
5311 
5312   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5313                                        MI.getOperand(1).getReg(), MRI);
5314   int64_t Lane;
5315   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5316     return false;
5317   Register Src0 = Extract->getOperand(1).getReg();
5318 
5319   const LLT VecTy = MRI.getType(Src0);
5320   if (VecTy.isScalableVector())
5321     return false;
5322 
5323   if (VecTy.getSizeInBits() != 128) {
5324     const MachineInstr *ScalarToVector = emitScalarToVector(
5325         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5326     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5327     Src0 = ScalarToVector->getOperand(0).getReg();
5328   }
5329 
5330   unsigned Opcode;
5331   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5332     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5333   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5334     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5335   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5336     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5337   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5338     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5339   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5340     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5341   else
5342     llvm_unreachable("Unexpected type combo for S/UMov!");
5343 
5344   // We may need to generate one of these, depending on the type and sign of the
5345   // input:
5346   //  DstReg = SMOV Src0, Lane;
5347   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5348   MachineInstr *ExtI = nullptr;
5349   if (DstSize == 64 && !IsSigned) {
5350     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5351     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5352     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5353                .addImm(0)
5354                .addUse(NewReg)
5355                .addImm(AArch64::sub_32);
5356     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5357   } else
5358     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5359 
5360   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5361   MI.eraseFromParent();
5362   return true;
5363 }
5364 
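// The tryAdvSIMDModImm* helpers below attempt to materialize a splatted
// constant with a single MOVI/MVNI (or FMOV) using the AdvSIMD
// modified-immediate encodings, returning nullptr if the bit pattern cannot
// be encoded.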
5365 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5366     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5367   unsigned int Op;
5368   if (DstSize == 128) {
5369     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5370       return nullptr;
5371     Op = AArch64::MOVIv16b_ns;
5372   } else {
5373     Op = AArch64::MOVIv8b_ns;
5374   }
5375 
5376   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5377 
5378   if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5379     Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5380     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5381     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5382     return &*Mov;
5383   }
5384   return nullptr;
5385 }
5386 
5387 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5388     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5389     bool Inv) {
5390 
5391   unsigned int Op;
5392   if (DstSize == 128) {
5393     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5394       return nullptr;
5395     Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5396   } else {
5397     Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5398   }
5399 
5400   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5401   uint64_t Shift;
5402 
5403   if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5404     Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5405     Shift = 0;
5406   } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5407     Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5408     Shift = 8;
5409   } else
5410     return nullptr;
5411 
5412   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5413   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5414   return &*Mov;
5415 }
5416 
5417 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5418     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5419     bool Inv) {
5420 
5421   unsigned int Op;
5422   if (DstSize == 128) {
5423     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5424       return nullptr;
5425     Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5426   } else {
5427     Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5428   }
5429 
5430   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5431   uint64_t Shift;
5432 
5433   if (AArch64_AM::isAdvSIMDModImmType1(Val)) {
5434     Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5435     Shift = 0;
5436   } else if (AArch64_AM::isAdvSIMDModImmType2(Val)) {
5437     Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5438     Shift = 8;
5439   } else if (AArch64_AM::isAdvSIMDModImmType3(Val)) {
5440     Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5441     Shift = 16;
5442   } else if (AArch64_AM::isAdvSIMDModImmType4(Val)) {
5443     Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5444     Shift = 24;
5445   } else
5446     return nullptr;
5447 
5448   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5449   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5450   return &*Mov;
5451 }
5452 
5453 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5454     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5455 
5456   unsigned int Op;
5457   if (DstSize == 128) {
5458     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5459       return nullptr;
5460     Op = AArch64::MOVIv2d_ns;
5461   } else {
5462     Op = AArch64::MOVID;
5463   }
5464 
5465   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5466   if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5467     Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5468     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5469     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5470     return &*Mov;
5471   }
5472   return nullptr;
5473 }
5474 
5475 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5476     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5477     bool Inv) {
5478 
5479   unsigned int Op;
5480   if (DstSize == 128) {
5481     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5482       return nullptr;
5483     Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5484   } else {
5485     Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5486   }
5487 
5488   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5489   uint64_t Shift;
5490 
5491   if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5492     Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5493     Shift = 264;
5494   } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5495     Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5496     Shift = 272;
5497   } else
5498     return nullptr;
5499 
5500   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5501   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5502   return &*Mov;
5503 }
5504 
5505 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5506     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5507 
5508   unsigned int Op;
5509   bool IsWide = false;
5510   if (DstSize == 128) {
5511     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5512       return nullptr;
5513     Op = AArch64::FMOVv4f32_ns;
5514     IsWide = true;
5515   } else {
5516     Op = AArch64::FMOVv2f32_ns;
5517   }
5518 
5519   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5520 
5521   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5522     Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5523   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5524     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5525     Op = AArch64::FMOVv2f64_ns;
5526   } else
5527     return nullptr;
5528 
5529   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5530   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5531   return &*Mov;
5532 }
5533 
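// Select a pre/post-indexed extending load. Sign-extending loads use the
// LDRS* writeback forms directly; zero/any-extending loads into a 64-bit
// destination load into a W register and are widened via SUBREG_TO_REG.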
5534 bool AArch64InstructionSelector::selectIndexedExtLoad(
5535     MachineInstr &MI, MachineRegisterInfo &MRI) {
5536   auto &ExtLd = cast<GIndexedAnyExtLoad>(MI);
5537   Register Dst = ExtLd.getDstReg();
5538   Register WriteBack = ExtLd.getWritebackReg();
5539   Register Base = ExtLd.getBaseReg();
5540   Register Offset = ExtLd.getOffsetReg();
5541   LLT Ty = MRI.getType(Dst);
5542   assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5543   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5544   bool IsPre = ExtLd.isPre();
5545   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5546   bool InsertIntoXReg = false;
5547   bool IsDst64 = Ty.getSizeInBits() == 64;
5548 
5549   unsigned Opc = 0;
5550   LLT NewLdDstTy;
5551   LLT s32 = LLT::scalar(32);
5552   LLT s64 = LLT::scalar(64);
5553 
5554   if (MemSizeBits == 8) {
5555     if (IsSExt) {
5556       if (IsDst64)
5557         Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5558       else
5559         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5560       NewLdDstTy = IsDst64 ? s64 : s32;
5561     } else {
5562       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5563       InsertIntoXReg = IsDst64;
5564       NewLdDstTy = s32;
5565     }
5566   } else if (MemSizeBits == 16) {
5567     if (IsSExt) {
5568       if (IsDst64)
5569         Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5570       else
5571         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5572       NewLdDstTy = IsDst64 ? s64 : s32;
5573     } else {
5574       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5575       InsertIntoXReg = IsDst64;
5576       NewLdDstTy = s32;
5577     }
5578   } else if (MemSizeBits == 32) {
5579     if (IsSExt) {
5580       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5581       NewLdDstTy = s64;
5582     } else {
5583       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5584       InsertIntoXReg = IsDst64;
5585       NewLdDstTy = s32;
5586     }
5587   } else {
5588     llvm_unreachable("Unexpected size for indexed load");
5589   }
5590 
5591   if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5592     return false; // We should be on gpr.
5593 
5594   auto Cst = getIConstantVRegVal(Offset, MRI);
5595   if (!Cst)
5596     return false; // Shouldn't happen, but just in case.
5597 
5598   auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5599                   .addImm(Cst->getSExtValue());
5600   LdMI.cloneMemRefs(ExtLd);
5601   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5602   // Make sure to select the load with the MemTy as the dest type, and then
5603   // insert into X reg if needed.
5604   if (InsertIntoXReg) {
5605     // Generate a SUBREG_TO_REG.
5606     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5607                         .addImm(0)
5608                         .addUse(LdMI.getReg(1))
5609                         .addImm(AArch64::sub_32);
5610     RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5611                                  MRI);
5612   } else {
5613     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5614     selectCopy(*Copy, TII, MRI, TRI, RBI);
5615   }
5616   MI.eraseFromParent();
5617 
5618   return true;
5619 }
5620 
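// Select a pre/post-indexed load. Extending loads are handled by
// selectIndexedExtLoad; otherwise the writeback opcode is chosen from the
// GPR/FPR tables based on the memory access size.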
5621 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5622                                                    MachineRegisterInfo &MRI) {
5623   auto &Ld = cast<GIndexedLoad>(MI);
5624   Register Dst = Ld.getDstReg();
5625   Register WriteBack = Ld.getWritebackReg();
5626   Register Base = Ld.getBaseReg();
5627   Register Offset = Ld.getOffsetReg();
5628   assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5629          "Unexpected type for indexed load");
5630   unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5631 
5632   if (MemSize < MRI.getType(Dst).getSizeInBytes())
5633     return selectIndexedExtLoad(MI, MRI);
5634 
5635   unsigned Opc = 0;
5636   if (Ld.isPre()) {
5637     static constexpr unsigned GPROpcodes[] = {
5638         AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5639         AArch64::LDRXpre};
5640     static constexpr unsigned FPROpcodes[] = {
5641         AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5642         AArch64::LDRQpre};
5643     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5644       Opc = FPROpcodes[Log2_32(MemSize)];
5645     else
5646       Opc = GPROpcodes[Log2_32(MemSize)];
5647   } else {
5648     static constexpr unsigned GPROpcodes[] = {
5649         AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5650         AArch64::LDRXpost};
5651     static constexpr unsigned FPROpcodes[] = {
5652         AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5653         AArch64::LDRDpost, AArch64::LDRQpost};
5654     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5655       Opc = FPROpcodes[Log2_32(MemSize)];
5656     else
5657       Opc = GPROpcodes[Log2_32(MemSize)];
5658   }
5659   auto Cst = getIConstantVRegVal(Offset, MRI);
5660   if (!Cst)
5661     return false; // Shouldn't happen, but just in case.
5662   auto LdMI =
5663       MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5664   LdMI.cloneMemRefs(Ld);
5665   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5666   MI.eraseFromParent();
5667   return true;
5668 }
5669 
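// Select a pre/post-indexed store, choosing the STR*pre/STR*post opcode from
// the GPR/FPR tables based on the stored value's size and register bank.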
5670 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5671                                                     MachineRegisterInfo &MRI) {
5672   Register Dst = I.getWritebackReg();
5673   Register Val = I.getValueReg();
5674   Register Base = I.getBaseReg();
5675   Register Offset = I.getOffsetReg();
5676   LLT ValTy = MRI.getType(Val);
5677   assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5678 
5679   unsigned Opc = 0;
5680   if (I.isPre()) {
5681     static constexpr unsigned GPROpcodes[] = {
5682         AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5683         AArch64::STRXpre};
5684     static constexpr unsigned FPROpcodes[] = {
5685         AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5686         AArch64::STRQpre};
5687 
5688     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5689       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5690     else
5691       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5692   } else {
5693     static constexpr unsigned GPROpcodes[] = {
5694         AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5695         AArch64::STRXpost};
5696     static constexpr unsigned FPROpcodes[] = {
5697         AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5698         AArch64::STRDpost, AArch64::STRQpost};
5699 
5700     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5701       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5702     else
5703       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5704   }
5705 
5706   auto Cst = getIConstantVRegVal(Offset, MRI);
5707   if (!Cst)
5708     return false; // Shouldn't happen, but just in case.
5709   auto Str =
5710       MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5711   Str.cloneMemRefs(I);
5712   constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5713   I.eraseFromParent();
5714   return true;
5715 }
5716 
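// Materialize a constant vector. All-zero vectors become a MOVI of 0, splats
// are tried against the AdvSIMD modified-immediate encodings (also via a
// sign-flipped constant plus FNEG), and anything else falls back to a
// constant pool load.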
5717 MachineInstr *
5718 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5719                                                MachineIRBuilder &MIRBuilder,
5720                                                MachineRegisterInfo &MRI) {
5721   LLT DstTy = MRI.getType(Dst);
5722   unsigned DstSize = DstTy.getSizeInBits();
5723   if (CV->isNullValue()) {
5724     if (DstSize == 128) {
5725       auto Mov =
5726           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5727       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5728       return &*Mov;
5729     }
5730 
5731     if (DstSize == 64) {
5732       auto Mov =
5733           MIRBuilder
5734               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5735               .addImm(0);
5736       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5737                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5738       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5739       return &*Copy;
5740     }
5741   }
5742 
5743   if (CV->getSplatValue()) {
5744     APInt DefBits = APInt::getSplat(
5745         DstSize, CV->getUniqueInteger().trunc(DstTy.getScalarSizeInBits()));
5746     auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5747       MachineInstr *NewOp;
5748       bool Inv = false;
5749       if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5750           (NewOp =
5751                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5752           (NewOp =
5753                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5754           (NewOp =
5755                tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5756           (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5757           (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5758         return NewOp;
5759 
5760       DefBits = ~DefBits;
5761       Inv = true;
5762       if ((NewOp =
5763                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5764           (NewOp =
5765                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5766           (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5767         return NewOp;
5768       return nullptr;
5769     };
5770 
5771     if (auto *NewOp = TryMOVIWithBits(DefBits))
5772       return NewOp;
5773 
5774     // See if an FNEG of the constant can be materialized with a MOVI, etc.
5775     auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5776                            unsigned NegOpc) -> MachineInstr * {
5777       // FNegate each sub-element of the constant
5778       APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize);
5779       APInt NegBits(DstSize, 0);
5780       unsigned NumElts = DstSize / NumBits;
5781       for (unsigned i = 0; i < NumElts; i++)
5782         NegBits |= Neg << (NumBits * i);
5783       NegBits = DefBits ^ NegBits;
5784 
5785       // Try to create the new constant with MOVI, and if so generate an FNEG
5786       // of it.
5787       if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5788         Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
5789         NewOp->getOperand(0).setReg(NewDst);
5790         return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst});
5791       }
5792       return nullptr;
5793     };
5794     MachineInstr *R;
5795     if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) ||
5796         (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) ||
5797         (STI.hasFullFP16() &&
5798          (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16))))
5799       return R;
5800   }
5801 
5802   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5803   if (!CPLoad) {
5804     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!\n");
5805     return nullptr;
5806   }
5807 
5808   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5809   RBI.constrainGenericRegister(
5810       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5811   return &*Copy;
5812 }
5813 
5814 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5815     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5816   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5817   unsigned DstSize = DstTy.getSizeInBits();
5818   assert(DstSize <= 128 && "Unexpected build_vec type!");
5819   if (DstSize < 32)
5820     return false;
5821   // Check if we're building a constant vector, in which case we want to
5822   // generate a constant pool load instead of a vector insert sequence.
5823   SmallVector<Constant *, 16> Csts;
5824   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5825     // Try to find G_CONSTANT or G_FCONSTANT
5826     auto *OpMI =
5827         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5828     if (OpMI)
5829       Csts.emplace_back(
5830           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5831     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5832                                   I.getOperand(Idx).getReg(), MRI)))
5833       Csts.emplace_back(
5834           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5835     else
5836       return false;
5837   }
5838   Constant *CV = ConstantVector::get(Csts);
5839   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5840     return false;
5841   I.eraseFromParent();
5842   return true;
5843 }
5844 
5845 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5846     MachineInstr &I, MachineRegisterInfo &MRI) {
5847   // Given:
5848   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5849   //
5850   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5851   Register Dst = I.getOperand(0).getReg();
5852   Register EltReg = I.getOperand(1).getReg();
5853   LLT EltTy = MRI.getType(EltReg);
5854   // If the destination vector isn't on the same register bank as its
5855   // elements, then this can't be a SUBREG_TO_REG.
5856   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5857   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5858   if (EltRB != DstRB)
5859     return false;
5860   if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5861         return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5862       }))
5863     return false;
5864   unsigned SubReg;
5865   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5866   if (!EltRC)
5867     return false;
5868   const TargetRegisterClass *DstRC =
5869       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5870   if (!DstRC)
5871     return false;
5872   if (!getSubRegForClass(EltRC, TRI, SubReg))
5873     return false;
5874   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5875                          .addImm(0)
5876                          .addUse(EltReg)
5877                          .addImm(SubReg);
5878   I.eraseFromParent();
5879   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5880   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5881 }
5882 
5883 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5884                                                    MachineRegisterInfo &MRI) {
5885   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5886   // Until we port more of the optimized selections, just use a vector insert
5887   // sequence.
5888   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5889   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5890   unsigned EltSize = EltTy.getSizeInBits();
5891 
5892   if (tryOptConstantBuildVec(I, DstTy, MRI))
5893     return true;
5894   if (tryOptBuildVecToSubregToReg(I, MRI))
5895     return true;
5896 
5897   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5898     return false; // Don't support all element types yet.
5899   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5900 
5901   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5902   MachineInstr *ScalarToVec =
5903       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5904                          I.getOperand(1).getReg(), MIB);
5905   if (!ScalarToVec)
5906     return false;
5907 
5908   Register DstVec = ScalarToVec->getOperand(0).getReg();
5909   unsigned DstSize = DstTy.getSizeInBits();
5910 
5911   // Keep track of the last MI we inserted. Later on, we might be able to save
5912   // a copy using it.
5913   MachineInstr *PrevMI = ScalarToVec;
5914   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5915     // Note that if we don't do a subregister copy, we can end up making an
5916     // extra register.
5917     Register OpReg = I.getOperand(i).getReg();
5918     // Do not emit inserts for undefs
5919     if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
5920       PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
5921       DstVec = PrevMI->getOperand(0).getReg();
5922     }
5923   }
5924 
5925   // If DstTy's size in bits is less than 128, then emit a subregister copy
5926   // from DstVec to the last register we've defined.
5927   if (DstSize < 128) {
5928     // Force this to be FPR using the destination vector.
5929     const TargetRegisterClass *RC =
5930         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5931     if (!RC)
5932       return false;
5933     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5934       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5935       return false;
5936     }
5937 
5938     unsigned SubReg = 0;
5939     if (!getSubRegForClass(RC, TRI, SubReg))
5940       return false;
5941     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5942       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5943                         << ")\n");
5944       return false;
5945     }
5946 
5947     Register Reg = MRI.createVirtualRegister(RC);
5948     Register DstReg = I.getOperand(0).getReg();
5949 
5950     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5951     MachineOperand &RegOp = I.getOperand(1);
5952     RegOp.setReg(Reg);
5953     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5954   } else {
5955     // We either have a vector with all elements (except the first one) undef or
5956     // at least one non-undef non-first element. In the first case, we need to
5957     // constrain the output register ourselves as we may have generated an
5958     // INSERT_SUBREG operation which is a generic operation for which the
5959     // output regclass cannot be automatically chosen.
5960     //
5961     // In the second case, there is no need to do this as it may generate an
5962     // instruction like INSvi32gpr where the regclass can be automatically
5963     // chosen.
5964     //
5965     // Also, we save a copy by re-using the destination register on the final
5966     // insert.
5967     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5968     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5969 
5970     Register DstReg = PrevMI->getOperand(0).getReg();
5971     if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5972       const TargetRegisterClass *RC =
5973           getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5974       RBI.constrainGenericRegister(DstReg, *RC, MRI);
5975     }
5976   }
5977 
5978   I.eraseFromParent();
5979   return true;
5980 }
5981 
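// Emit a multi-vector NEON load with the given opcode and copy each of the
// NumVecs results out of the loaded register tuple via subregister copies.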
5982 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5983                                                            unsigned NumVecs,
5984                                                            MachineInstr &I) {
5985   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5986   assert(Opc && "Expected an opcode?");
5987   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5988   auto &MRI = *MIB.getMRI();
5989   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5990   unsigned Size = Ty.getSizeInBits();
5991   assert((Size == 64 || Size == 128) &&
5992          "Destination must be 64 bits or 128 bits?");
5993   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5994   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5995   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5996   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5997   Load.cloneMemRefs(I);
5998   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5999   Register SelectedLoadDst = Load->getOperand(0).getReg();
6000   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6001     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
6002                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6003     // Emit the subreg copies and immediately select them.
6004     // FIXME: We should refactor our copy code into an emitCopy helper and
6005     // clean up uses of this pattern elsewhere in the selector.
6006     selectCopy(*Vec, TII, MRI, TRI, RBI);
6007   }
6008   return true;
6009 }
6010 
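// Emit a NEON load-lane intrinsic (LD2/LD3/LD4 lane forms). 64-bit source
// vectors are widened to Q registers before forming the tuple, and in that
// case each result is narrowed back with emitNarrowVector.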
6011 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6012     unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6013   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6014   assert(Opc && "Expected an opcode?");
6015   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6016   auto &MRI = *MIB.getMRI();
6017   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6018   bool Narrow = Ty.getSizeInBits() == 64;
6019 
6020   auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6021   SmallVector<Register, 4> Regs(NumVecs);
6022   std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
6023                  [](auto MO) { return MO.getReg(); });
6024 
6025   if (Narrow) {
6026     transform(Regs, Regs.begin(), [this](Register Reg) {
6027       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6028           ->getOperand(0)
6029           .getReg();
6030     });
6031     Ty = Ty.multiplyElements(2);
6032   }
6033 
6034   Register Tuple = createQTuple(Regs, MIB);
6035   auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
6036   if (!LaneNo)
6037     return false;
6038 
6039   Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6040   auto Load = MIB.buildInstr(Opc, {Ty}, {})
6041                   .addReg(Tuple)
6042                   .addImm(LaneNo->getZExtValue())
6043                   .addReg(Ptr);
6044   Load.cloneMemRefs(I);
6045   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6046   Register SelectedLoadDst = Load->getOperand(0).getReg();
6047   unsigned SubReg = AArch64::qsub0;
6048   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6049     auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6050                               {Narrow ? DstOp(&AArch64::FPR128RegClass)
6051                                       : DstOp(I.getOperand(Idx).getReg())},
6052                               {})
6053                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6054     Register WideReg = Vec.getReg(0);
6055     // Emit the subreg copies and immediately select them.
6056     selectCopy(*Vec, TII, MRI, TRI, RBI);
6057     if (Narrow &&
6058         !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
6059       return false;
6060   }
6061   return true;
6062 }
6063 
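// Emit a multi-vector NEON store: the NumVecs source registers are packed
// into a D- or Q-register tuple, which is then stored with the given opcode.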
6064 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6065                                                             unsigned NumVecs,
6066                                                             unsigned Opc) {
6067   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6068   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6069   Register Ptr = I.getOperand(1 + NumVecs).getReg();
6070 
6071   SmallVector<Register, 2> Regs(NumVecs);
6072   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6073                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6074 
6075   Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6076                                              : createDTuple(Regs, MIB);
6077   auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6078   Store.cloneMemRefs(I);
6079   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6080 }
6081 
6082 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6083     MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6084   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6085   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6086   bool Narrow = Ty.getSizeInBits() == 64;
6087 
6088   SmallVector<Register, 2> Regs(NumVecs);
6089   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6090                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6091 
6092   if (Narrow)
6093     transform(Regs, Regs.begin(), [this](Register Reg) {
6094       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6095           ->getOperand(0)
6096           .getReg();
6097     });
6098 
6099   Register Tuple = createQTuple(Regs, MIB);
6100 
6101   auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
6102   if (!LaneNo)
6103     return false;
6104   Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
6105   auto Store = MIB.buildInstr(Opc, {}, {})
6106                    .addReg(Tuple)
6107                    .addImm(LaneNo->getZExtValue())
6108                    .addReg(Ptr);
6109   Store.cloneMemRefs(I);
6110   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6111   return true;
6112 }
6113 
6114 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6115     MachineInstr &I, MachineRegisterInfo &MRI) {
6116   // Find the intrinsic ID.
6117   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6118 
6119   const LLT S8 = LLT::scalar(8);
6120   const LLT S16 = LLT::scalar(16);
6121   const LLT S32 = LLT::scalar(32);
6122   const LLT S64 = LLT::scalar(64);
6123   const LLT P0 = LLT::pointer(0, 64);
6124   // Select the instruction.
6125   switch (IntrinID) {
6126   default:
6127     return false;
6128   case Intrinsic::aarch64_ldxp:
6129   case Intrinsic::aarch64_ldaxp: {
6130     auto NewI = MIB.buildInstr(
6131         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6132         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6133         {I.getOperand(3)});
6134     NewI.cloneMemRefs(I);
6135     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6136     break;
6137   }
6138   case Intrinsic::aarch64_neon_ld1x2: {
6139     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6140     unsigned Opc = 0;
6141     if (Ty == LLT::fixed_vector(8, S8))
6142       Opc = AArch64::LD1Twov8b;
6143     else if (Ty == LLT::fixed_vector(16, S8))
6144       Opc = AArch64::LD1Twov16b;
6145     else if (Ty == LLT::fixed_vector(4, S16))
6146       Opc = AArch64::LD1Twov4h;
6147     else if (Ty == LLT::fixed_vector(8, S16))
6148       Opc = AArch64::LD1Twov8h;
6149     else if (Ty == LLT::fixed_vector(2, S32))
6150       Opc = AArch64::LD1Twov2s;
6151     else if (Ty == LLT::fixed_vector(4, S32))
6152       Opc = AArch64::LD1Twov4s;
6153     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6154       Opc = AArch64::LD1Twov2d;
6155     else if (Ty == S64 || Ty == P0)
6156       Opc = AArch64::LD1Twov1d;
6157     else
6158       llvm_unreachable("Unexpected type for ld1x2!");
6159     selectVectorLoadIntrinsic(Opc, 2, I);
6160     break;
6161   }
6162   case Intrinsic::aarch64_neon_ld1x3: {
6163     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6164     unsigned Opc = 0;
6165     if (Ty == LLT::fixed_vector(8, S8))
6166       Opc = AArch64::LD1Threev8b;
6167     else if (Ty == LLT::fixed_vector(16, S8))
6168       Opc = AArch64::LD1Threev16b;
6169     else if (Ty == LLT::fixed_vector(4, S16))
6170       Opc = AArch64::LD1Threev4h;
6171     else if (Ty == LLT::fixed_vector(8, S16))
6172       Opc = AArch64::LD1Threev8h;
6173     else if (Ty == LLT::fixed_vector(2, S32))
6174       Opc = AArch64::LD1Threev2s;
6175     else if (Ty == LLT::fixed_vector(4, S32))
6176       Opc = AArch64::LD1Threev4s;
6177     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6178       Opc = AArch64::LD1Threev2d;
6179     else if (Ty == S64 || Ty == P0)
6180       Opc = AArch64::LD1Threev1d;
6181     else
6182       llvm_unreachable("Unexpected type for ld1x3!");
6183     selectVectorLoadIntrinsic(Opc, 3, I);
6184     break;
6185   }
6186   case Intrinsic::aarch64_neon_ld1x4: {
6187     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6188     unsigned Opc = 0;
6189     if (Ty == LLT::fixed_vector(8, S8))
6190       Opc = AArch64::LD1Fourv8b;
6191     else if (Ty == LLT::fixed_vector(16, S8))
6192       Opc = AArch64::LD1Fourv16b;
6193     else if (Ty == LLT::fixed_vector(4, S16))
6194       Opc = AArch64::LD1Fourv4h;
6195     else if (Ty == LLT::fixed_vector(8, S16))
6196       Opc = AArch64::LD1Fourv8h;
6197     else if (Ty == LLT::fixed_vector(2, S32))
6198       Opc = AArch64::LD1Fourv2s;
6199     else if (Ty == LLT::fixed_vector(4, S32))
6200       Opc = AArch64::LD1Fourv4s;
6201     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6202       Opc = AArch64::LD1Fourv2d;
6203     else if (Ty == S64 || Ty == P0)
6204       Opc = AArch64::LD1Fourv1d;
6205     else
6206       llvm_unreachable("Unexpected type for ld1x4!");
6207     selectVectorLoadIntrinsic(Opc, 4, I);
6208     break;
6209   }
6210   case Intrinsic::aarch64_neon_ld2: {
6211     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6212     unsigned Opc = 0;
6213     if (Ty == LLT::fixed_vector(8, S8))
6214       Opc = AArch64::LD2Twov8b;
6215     else if (Ty == LLT::fixed_vector(16, S8))
6216       Opc = AArch64::LD2Twov16b;
6217     else if (Ty == LLT::fixed_vector(4, S16))
6218       Opc = AArch64::LD2Twov4h;
6219     else if (Ty == LLT::fixed_vector(8, S16))
6220       Opc = AArch64::LD2Twov8h;
6221     else if (Ty == LLT::fixed_vector(2, S32))
6222       Opc = AArch64::LD2Twov2s;
6223     else if (Ty == LLT::fixed_vector(4, S32))
6224       Opc = AArch64::LD2Twov4s;
6225     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6226       Opc = AArch64::LD2Twov2d;
6227     else if (Ty == S64 || Ty == P0)
6228       Opc = AArch64::LD1Twov1d;
6229     else
6230       llvm_unreachable("Unexpected type for ld2!");
6231     selectVectorLoadIntrinsic(Opc, 2, I);
6232     break;
6233   }
6234   case Intrinsic::aarch64_neon_ld2lane: {
6235     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6236     unsigned Opc;
6237     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6238       Opc = AArch64::LD2i8;
6239     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6240       Opc = AArch64::LD2i16;
6241     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6242       Opc = AArch64::LD2i32;
6243     else if (Ty == LLT::fixed_vector(2, S64) ||
6244              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6245       Opc = AArch64::LD2i64;
6246     else
6247       llvm_unreachable("Unexpected type for ld2lane!");
6248     if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6249       return false;
6250     break;
6251   }
6252   case Intrinsic::aarch64_neon_ld2r: {
6253     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6254     unsigned Opc = 0;
6255     if (Ty == LLT::fixed_vector(8, S8))
6256       Opc = AArch64::LD2Rv8b;
6257     else if (Ty == LLT::fixed_vector(16, S8))
6258       Opc = AArch64::LD2Rv16b;
6259     else if (Ty == LLT::fixed_vector(4, S16))
6260       Opc = AArch64::LD2Rv4h;
6261     else if (Ty == LLT::fixed_vector(8, S16))
6262       Opc = AArch64::LD2Rv8h;
6263     else if (Ty == LLT::fixed_vector(2, S32))
6264       Opc = AArch64::LD2Rv2s;
6265     else if (Ty == LLT::fixed_vector(4, S32))
6266       Opc = AArch64::LD2Rv4s;
6267     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6268       Opc = AArch64::LD2Rv2d;
6269     else if (Ty == S64 || Ty == P0)
6270       Opc = AArch64::LD2Rv1d;
6271     else
6272       llvm_unreachable("Unexpected type for ld2r!");
6273     selectVectorLoadIntrinsic(Opc, 2, I);
6274     break;
6275   }
6276   case Intrinsic::aarch64_neon_ld3: {
6277     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6278     unsigned Opc = 0;
6279     if (Ty == LLT::fixed_vector(8, S8))
6280       Opc = AArch64::LD3Threev8b;
6281     else if (Ty == LLT::fixed_vector(16, S8))
6282       Opc = AArch64::LD3Threev16b;
6283     else if (Ty == LLT::fixed_vector(4, S16))
6284       Opc = AArch64::LD3Threev4h;
6285     else if (Ty == LLT::fixed_vector(8, S16))
6286       Opc = AArch64::LD3Threev8h;
6287     else if (Ty == LLT::fixed_vector(2, S32))
6288       Opc = AArch64::LD3Threev2s;
6289     else if (Ty == LLT::fixed_vector(4, S32))
6290       Opc = AArch64::LD3Threev4s;
6291     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6292       Opc = AArch64::LD3Threev2d;
6293     else if (Ty == S64 || Ty == P0)
6294       Opc = AArch64::LD1Threev1d;
6295     else
6296       llvm_unreachable("Unexpected type for ld3!");
6297     selectVectorLoadIntrinsic(Opc, 3, I);
6298     break;
6299   }
6300   case Intrinsic::aarch64_neon_ld3lane: {
6301     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6302     unsigned Opc;
6303     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6304       Opc = AArch64::LD3i8;
6305     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6306       Opc = AArch64::LD3i16;
6307     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6308       Opc = AArch64::LD3i32;
6309     else if (Ty == LLT::fixed_vector(2, S64) ||
6310              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6311       Opc = AArch64::LD3i64;
6312     else
6313       llvm_unreachable("Unexpected type for ld3lane!");
6314     if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6315       return false;
6316     break;
6317   }
6318   case Intrinsic::aarch64_neon_ld3r: {
6319     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6320     unsigned Opc = 0;
6321     if (Ty == LLT::fixed_vector(8, S8))
6322       Opc = AArch64::LD3Rv8b;
6323     else if (Ty == LLT::fixed_vector(16, S8))
6324       Opc = AArch64::LD3Rv16b;
6325     else if (Ty == LLT::fixed_vector(4, S16))
6326       Opc = AArch64::LD3Rv4h;
6327     else if (Ty == LLT::fixed_vector(8, S16))
6328       Opc = AArch64::LD3Rv8h;
6329     else if (Ty == LLT::fixed_vector(2, S32))
6330       Opc = AArch64::LD3Rv2s;
6331     else if (Ty == LLT::fixed_vector(4, S32))
6332       Opc = AArch64::LD3Rv4s;
6333     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6334       Opc = AArch64::LD3Rv2d;
6335     else if (Ty == S64 || Ty == P0)
6336       Opc = AArch64::LD3Rv1d;
6337     else
6338       llvm_unreachable("Unexpected type for ld3r!");
6339     selectVectorLoadIntrinsic(Opc, 3, I);
6340     break;
6341   }
6342   case Intrinsic::aarch64_neon_ld4: {
6343     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6344     unsigned Opc = 0;
6345     if (Ty == LLT::fixed_vector(8, S8))
6346       Opc = AArch64::LD4Fourv8b;
6347     else if (Ty == LLT::fixed_vector(16, S8))
6348       Opc = AArch64::LD4Fourv16b;
6349     else if (Ty == LLT::fixed_vector(4, S16))
6350       Opc = AArch64::LD4Fourv4h;
6351     else if (Ty == LLT::fixed_vector(8, S16))
6352       Opc = AArch64::LD4Fourv8h;
6353     else if (Ty == LLT::fixed_vector(2, S32))
6354       Opc = AArch64::LD4Fourv2s;
6355     else if (Ty == LLT::fixed_vector(4, S32))
6356       Opc = AArch64::LD4Fourv4s;
6357     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6358       Opc = AArch64::LD4Fourv2d;
6359     else if (Ty == S64 || Ty == P0)
6360       Opc = AArch64::LD1Fourv1d;
6361     else
6362       llvm_unreachable("Unexpected type for ld4!");
6363     selectVectorLoadIntrinsic(Opc, 4, I);
6364     break;
6365   }
6366   case Intrinsic::aarch64_neon_ld4lane: {
6367     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6368     unsigned Opc;
6369     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6370       Opc = AArch64::LD4i8;
6371     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6372       Opc = AArch64::LD4i16;
6373     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6374       Opc = AArch64::LD4i32;
6375     else if (Ty == LLT::fixed_vector(2, S64) ||
6376              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6377       Opc = AArch64::LD4i64;
6378     else
6379       llvm_unreachable("Unexpected type for ld4lane!");
6380     if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6381       return false;
6382     break;
6383   }
6384   case Intrinsic::aarch64_neon_ld4r: {
6385     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6386     unsigned Opc = 0;
6387     if (Ty == LLT::fixed_vector(8, S8))
6388       Opc = AArch64::LD4Rv8b;
6389     else if (Ty == LLT::fixed_vector(16, S8))
6390       Opc = AArch64::LD4Rv16b;
6391     else if (Ty == LLT::fixed_vector(4, S16))
6392       Opc = AArch64::LD4Rv4h;
6393     else if (Ty == LLT::fixed_vector(8, S16))
6394       Opc = AArch64::LD4Rv8h;
6395     else if (Ty == LLT::fixed_vector(2, S32))
6396       Opc = AArch64::LD4Rv2s;
6397     else if (Ty == LLT::fixed_vector(4, S32))
6398       Opc = AArch64::LD4Rv4s;
6399     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6400       Opc = AArch64::LD4Rv2d;
6401     else if (Ty == S64 || Ty == P0)
6402       Opc = AArch64::LD4Rv1d;
6403     else
6404       llvm_unreachable("Unexpected type for ld4r!");
6405     selectVectorLoadIntrinsic(Opc, 4, I);
6406     break;
6407   }
6408   case Intrinsic::aarch64_neon_st1x2: {
6409     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6410     unsigned Opc;
6411     if (Ty == LLT::fixed_vector(8, S8))
6412       Opc = AArch64::ST1Twov8b;
6413     else if (Ty == LLT::fixed_vector(16, S8))
6414       Opc = AArch64::ST1Twov16b;
6415     else if (Ty == LLT::fixed_vector(4, S16))
6416       Opc = AArch64::ST1Twov4h;
6417     else if (Ty == LLT::fixed_vector(8, S16))
6418       Opc = AArch64::ST1Twov8h;
6419     else if (Ty == LLT::fixed_vector(2, S32))
6420       Opc = AArch64::ST1Twov2s;
6421     else if (Ty == LLT::fixed_vector(4, S32))
6422       Opc = AArch64::ST1Twov4s;
6423     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6424       Opc = AArch64::ST1Twov2d;
6425     else if (Ty == S64 || Ty == P0)
6426       Opc = AArch64::ST1Twov1d;
6427     else
6428       llvm_unreachable("Unexpected type for st1x2!");
6429     selectVectorStoreIntrinsic(I, 2, Opc);
6430     break;
6431   }
6432   case Intrinsic::aarch64_neon_st1x3: {
6433     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6434     unsigned Opc;
6435     if (Ty == LLT::fixed_vector(8, S8))
6436       Opc = AArch64::ST1Threev8b;
6437     else if (Ty == LLT::fixed_vector(16, S8))
6438       Opc = AArch64::ST1Threev16b;
6439     else if (Ty == LLT::fixed_vector(4, S16))
6440       Opc = AArch64::ST1Threev4h;
6441     else if (Ty == LLT::fixed_vector(8, S16))
6442       Opc = AArch64::ST1Threev8h;
6443     else if (Ty == LLT::fixed_vector(2, S32))
6444       Opc = AArch64::ST1Threev2s;
6445     else if (Ty == LLT::fixed_vector(4, S32))
6446       Opc = AArch64::ST1Threev4s;
6447     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6448       Opc = AArch64::ST1Threev2d;
6449     else if (Ty == S64 || Ty == P0)
6450       Opc = AArch64::ST1Threev1d;
6451     else
6452       llvm_unreachable("Unexpected type for st1x3!");
6453     selectVectorStoreIntrinsic(I, 3, Opc);
6454     break;
6455   }
6456   case Intrinsic::aarch64_neon_st1x4: {
6457     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6458     unsigned Opc;
6459     if (Ty == LLT::fixed_vector(8, S8))
6460       Opc = AArch64::ST1Fourv8b;
6461     else if (Ty == LLT::fixed_vector(16, S8))
6462       Opc = AArch64::ST1Fourv16b;
6463     else if (Ty == LLT::fixed_vector(4, S16))
6464       Opc = AArch64::ST1Fourv4h;
6465     else if (Ty == LLT::fixed_vector(8, S16))
6466       Opc = AArch64::ST1Fourv8h;
6467     else if (Ty == LLT::fixed_vector(2, S32))
6468       Opc = AArch64::ST1Fourv2s;
6469     else if (Ty == LLT::fixed_vector(4, S32))
6470       Opc = AArch64::ST1Fourv4s;
6471     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6472       Opc = AArch64::ST1Fourv2d;
6473     else if (Ty == S64 || Ty == P0)
6474       Opc = AArch64::ST1Fourv1d;
6475     else
6476       llvm_unreachable("Unexpected type for st1x4!");
6477     selectVectorStoreIntrinsic(I, 4, Opc);
6478     break;
6479   }
6480   case Intrinsic::aarch64_neon_st2: {
6481     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6482     unsigned Opc;
6483     if (Ty == LLT::fixed_vector(8, S8))
6484       Opc = AArch64::ST2Twov8b;
6485     else if (Ty == LLT::fixed_vector(16, S8))
6486       Opc = AArch64::ST2Twov16b;
6487     else if (Ty == LLT::fixed_vector(4, S16))
6488       Opc = AArch64::ST2Twov4h;
6489     else if (Ty == LLT::fixed_vector(8, S16))
6490       Opc = AArch64::ST2Twov8h;
6491     else if (Ty == LLT::fixed_vector(2, S32))
6492       Opc = AArch64::ST2Twov2s;
6493     else if (Ty == LLT::fixed_vector(4, S32))
6494       Opc = AArch64::ST2Twov4s;
6495     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6496       Opc = AArch64::ST2Twov2d;
6497     else if (Ty == S64 || Ty == P0)
6498       Opc = AArch64::ST1Twov1d;
6499     else
6500       llvm_unreachable("Unexpected type for st2!");
6501     selectVectorStoreIntrinsic(I, 2, Opc);
6502     break;
6503   }
6504   case Intrinsic::aarch64_neon_st3: {
6505     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6506     unsigned Opc;
6507     if (Ty == LLT::fixed_vector(8, S8))
6508       Opc = AArch64::ST3Threev8b;
6509     else if (Ty == LLT::fixed_vector(16, S8))
6510       Opc = AArch64::ST3Threev16b;
6511     else if (Ty == LLT::fixed_vector(4, S16))
6512       Opc = AArch64::ST3Threev4h;
6513     else if (Ty == LLT::fixed_vector(8, S16))
6514       Opc = AArch64::ST3Threev8h;
6515     else if (Ty == LLT::fixed_vector(2, S32))
6516       Opc = AArch64::ST3Threev2s;
6517     else if (Ty == LLT::fixed_vector(4, S32))
6518       Opc = AArch64::ST3Threev4s;
6519     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6520       Opc = AArch64::ST3Threev2d;
6521     else if (Ty == S64 || Ty == P0)
6522       Opc = AArch64::ST1Threev1d;
6523     else
6524       llvm_unreachable("Unexpected type for st3!");
6525     selectVectorStoreIntrinsic(I, 3, Opc);
6526     break;
6527   }
6528   case Intrinsic::aarch64_neon_st4: {
6529     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6530     unsigned Opc;
6531     if (Ty == LLT::fixed_vector(8, S8))
6532       Opc = AArch64::ST4Fourv8b;
6533     else if (Ty == LLT::fixed_vector(16, S8))
6534       Opc = AArch64::ST4Fourv16b;
6535     else if (Ty == LLT::fixed_vector(4, S16))
6536       Opc = AArch64::ST4Fourv4h;
6537     else if (Ty == LLT::fixed_vector(8, S16))
6538       Opc = AArch64::ST4Fourv8h;
6539     else if (Ty == LLT::fixed_vector(2, S32))
6540       Opc = AArch64::ST4Fourv2s;
6541     else if (Ty == LLT::fixed_vector(4, S32))
6542       Opc = AArch64::ST4Fourv4s;
6543     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6544       Opc = AArch64::ST4Fourv2d;
6545     else if (Ty == S64 || Ty == P0)
6546       Opc = AArch64::ST1Fourv1d;
6547     else
6548       llvm_unreachable("Unexpected type for st4!");
6549     selectVectorStoreIntrinsic(I, 4, Opc);
6550     break;
6551   }
6552   case Intrinsic::aarch64_neon_st2lane: {
6553     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6554     unsigned Opc;
6555     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6556       Opc = AArch64::ST2i8;
6557     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6558       Opc = AArch64::ST2i16;
6559     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6560       Opc = AArch64::ST2i32;
6561     else if (Ty == LLT::fixed_vector(2, S64) ||
6562              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6563       Opc = AArch64::ST2i64;
6564     else
6565       llvm_unreachable("Unexpected type for st2lane!");
6566     if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6567       return false;
6568     break;
6569   }
6570   case Intrinsic::aarch64_neon_st3lane: {
6571     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6572     unsigned Opc;
6573     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6574       Opc = AArch64::ST3i8;
6575     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6576       Opc = AArch64::ST3i16;
6577     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6578       Opc = AArch64::ST3i32;
6579     else if (Ty == LLT::fixed_vector(2, S64) ||
6580              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6581       Opc = AArch64::ST3i64;
6582     else
6583       llvm_unreachable("Unexpected type for st3lane!");
6584     if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6585       return false;
6586     break;
6587   }
6588   case Intrinsic::aarch64_neon_st4lane: {
6589     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6590     unsigned Opc;
6591     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6592       Opc = AArch64::ST4i8;
6593     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6594       Opc = AArch64::ST4i16;
6595     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6596       Opc = AArch64::ST4i32;
6597     else if (Ty == LLT::fixed_vector(2, S64) ||
6598              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6599       Opc = AArch64::ST4i64;
6600     else
6601       llvm_unreachable("Unexpected type for st4lane!");
6602     if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6603       return false;
6604     break;
6605   }
6606   case Intrinsic::aarch64_mops_memset_tag: {
6607     // Transform
6608     //    %dst:gpr(p0) =
6609     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6610     //      %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6611     // where %dst is updated, into
6612     //    %Rd:GPR64common, %Rn:GPR64 =
6613     //      MOPSMemorySetTaggingPseudo
6614     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6615     // where Rd and Rn are tied.
6616     // It is expected that %val has been extended to s64 in legalization.
6617     // Note that the order of the size/value operands is swapped.
6618 
6619     Register DstDef = I.getOperand(0).getReg();
6620     // I.getOperand(1) is the intrinsic function
6621     Register DstUse = I.getOperand(2).getReg();
6622     Register ValUse = I.getOperand(3).getReg();
6623     Register SizeUse = I.getOperand(4).getReg();
6624 
6625     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6626     // Therefore an additional virtual register is required for the updated size
6627     // operand. This value is not accessible via the semantics of the intrinsic.
6628     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6629 
6630     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6631                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6632     Memset.cloneMemRefs(I);
6633     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6634     break;
6635   }
6636   }
6637 
6638   I.eraseFromParent();
6639   return true;
6640 }
6641 
6642 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6643                                                  MachineRegisterInfo &MRI) {
6644   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6645 
6646   switch (IntrinID) {
6647   default:
6648     break;
6649   case Intrinsic::aarch64_crypto_sha1h: {
6650     Register DstReg = I.getOperand(0).getReg();
6651     Register SrcReg = I.getOperand(2).getReg();
6652 
6653     // FIXME: Should this be an assert?
6654     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6655         MRI.getType(SrcReg).getSizeInBits() != 32)
6656       return false;
6657 
6658     // The operation has to happen on FPRs. Set up some new FPR registers for
6659     // the source and destination if they are on GPRs.
6660     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6661       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6662       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6663 
6664       // Make sure the copy ends up getting constrained properly.
6665       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6666                                    AArch64::GPR32RegClass, MRI);
6667     }
6668 
6669     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6670       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6671 
6672     // Actually insert the instruction.
6673     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6674     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6675 
6676     // Did we create a new register for the destination?
6677     if (DstReg != I.getOperand(0).getReg()) {
6678       // Yep. Copy the result of the instruction back into the original
6679       // destination.
6680       MIB.buildCopy({I.getOperand(0)}, {DstReg});
6681       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6682                                    AArch64::GPR32RegClass, MRI);
6683     }
6684 
6685     I.eraseFromParent();
6686     return true;
6687   }
6688   case Intrinsic::ptrauth_resign: {
6689     Register DstReg = I.getOperand(0).getReg();
6690     Register ValReg = I.getOperand(2).getReg();
6691     uint64_t AUTKey = I.getOperand(3).getImm();
6692     Register AUTDisc = I.getOperand(4).getReg();
6693     uint64_t PACKey = I.getOperand(5).getImm();
6694     Register PACDisc = I.getOperand(6).getReg();
6695 
6696     Register AUTAddrDisc = AUTDisc;
6697     uint16_t AUTConstDiscC = 0;
6698     std::tie(AUTConstDiscC, AUTAddrDisc) =
6699         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6700 
6701     Register PACAddrDisc = PACDisc;
6702     uint16_t PACConstDiscC = 0;
6703     std::tie(PACConstDiscC, PACAddrDisc) =
6704         extractPtrauthBlendDiscriminators(PACDisc, MRI);
6705 
6706     MIB.buildCopy({AArch64::X16}, {ValReg});
6707     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6708     MIB.buildInstr(AArch64::AUTPAC)
6709         .addImm(AUTKey)
6710         .addImm(AUTConstDiscC)
6711         .addUse(AUTAddrDisc)
6712         .addImm(PACKey)
6713         .addImm(PACConstDiscC)
6714         .addUse(PACAddrDisc)
6715         .constrainAllUses(TII, TRI, RBI);
6716     MIB.buildCopy({DstReg}, Register(AArch64::X16));
6717 
6718     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6719     I.eraseFromParent();
6720     return true;
6721   }
6722   case Intrinsic::ptrauth_auth: {
6723     Register DstReg = I.getOperand(0).getReg();
6724     Register ValReg = I.getOperand(2).getReg();
6725     uint64_t AUTKey = I.getOperand(3).getImm();
6726     Register AUTDisc = I.getOperand(4).getReg();
6727 
6728     Register AUTAddrDisc = AUTDisc;
6729     uint16_t AUTConstDiscC = 0;
6730     std::tie(AUTConstDiscC, AUTAddrDisc) =
6731         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6732 
6733     MIB.buildCopy({AArch64::X16}, {ValReg});
6734     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6735     MIB.buildInstr(AArch64::AUT)
6736         .addImm(AUTKey)
6737         .addImm(AUTConstDiscC)
6738         .addUse(AUTAddrDisc)
6739         .constrainAllUses(TII, TRI, RBI);
6740     MIB.buildCopy({DstReg}, Register(AArch64::X16));
6741 
6742     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6743     I.eraseFromParent();
6744     return true;
6745   }
6746   case Intrinsic::frameaddress:
6747   case Intrinsic::returnaddress: {
6748     MachineFunction &MF = *I.getParent()->getParent();
6749     MachineFrameInfo &MFI = MF.getFrameInfo();
6750 
6751     unsigned Depth = I.getOperand(2).getImm();
6752     Register DstReg = I.getOperand(0).getReg();
6753     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6754 
6755     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6756       if (!MFReturnAddr) {
6757         // Insert the copy from LR/X30 into the entry block, before it can be
6758         // clobbered by anything.
6759         MFI.setReturnAddressIsTaken(true);
6760         MFReturnAddr = getFunctionLiveInPhysReg(
6761             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6762       }
6763 
6764       if (STI.hasPAuth()) {
6765         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6766       } else {
6767         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6768         MIB.buildInstr(AArch64::XPACLRI);
6769         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6770       }
6771 
6772       I.eraseFromParent();
6773       return true;
6774     }
6775 
6776     MFI.setFrameAddressIsTaken(true);
6777     Register FrameAddr(AArch64::FP);
6778     while (Depth--) {
6779       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6780       auto Ldr =
6781           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6782       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6783       FrameAddr = NextFrame;
6784     }
6785 
6786     if (IntrinID == Intrinsic::frameaddress)
6787       MIB.buildCopy({DstReg}, {FrameAddr});
6788     else {
6789       MFI.setReturnAddressIsTaken(true);
6790 
6791       if (STI.hasPAuth()) {
6792         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6793         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6794         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6795       } else {
6796         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6797             .addImm(1);
6798         MIB.buildInstr(AArch64::XPACLRI);
6799         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6800       }
6801     }
6802 
6803     I.eraseFromParent();
6804     return true;
6805   }
6806   case Intrinsic::aarch64_neon_tbl2:
6807     SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
6808     return true;
6809   case Intrinsic::aarch64_neon_tbl3:
6810     SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
6811                 false);
6812     return true;
6813   case Intrinsic::aarch64_neon_tbl4:
6814     SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
6815     return true;
6816   case Intrinsic::aarch64_neon_tbx2:
6817     SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
6818     return true;
6819   case Intrinsic::aarch64_neon_tbx3:
6820     SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
6821     return true;
6822   case Intrinsic::aarch64_neon_tbx4:
6823     SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
6824     return true;
6825   case Intrinsic::swift_async_context_addr:
6826     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6827                               {Register(AArch64::FP)})
6828                    .addImm(8)
6829                    .addImm(0);
6830     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6831 
6832     MF->getFrameInfo().setFrameAddressIsTaken(true);
6833     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6834     I.eraseFromParent();
6835     return true;
6836   }
6837   return false;
6838 }
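
// Illustrative example for the frameaddress/returnaddress cases above (a
// rough sketch; register allocation will differ): @llvm.frameaddress(2)
// walks the frame chain twice, roughly
//   ldr x8, [x29, #0]
//   ldr x0, [x8, #0]
// while @llvm.returnaddress(1) loads the saved LR slot of the parent frame,
//   ldr x8, [x29, #0]
//   ldr x0, [x8, #8]
// followed by stripping any pointer-auth signature (XPACI or XPACLRI).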
6839 
6840 // G_PTRAUTH_GLOBAL_VALUE lowering
6841 //
6842 // We have 3 lowering alternatives to choose from:
6843 // - MOVaddrPAC: similar to MOVaddr, with added PAC.
6844 //   If the GV doesn't need a GOT load (i.e., is locally defined)
6845 //   materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6846 //
6847 // - LOADgotPAC: similar to LOADgot, with added PAC.
6848 //   If the GV needs a GOT load, materialize the pointer using the usual
6849 //   GOT adrp+ldr, +pac. Pointers in the GOT are assumed not to be signed; the
6850 //   GOT section is assumed to be read-only (for example, via the relro
6851 //   mechanism). See LowerMOVaddrPAC.
6852 //
6853 // - LOADauthptrstatic: similar to LOADgot, but use a
6854 //   special stub slot instead of a GOT slot.
6855 //   Load a signed pointer for symbol 'sym' from a stub slot named
6856 //   'sym$auth_ptr$key$disc', which is filled in by the dynamic linker during
6857 //   relocation resolution. This usually lowers to adrp+ldr, but also emits an
6858 //   entry into the .data section with an @AUTH relocation. See
6859 //   LowerLOADauthptrstatic.
6860 //
6861 // All 3 are pseudos that are expanded late into longer sequences: this lets us
6862 // provide integrity guarantees on the to-be-signed intermediate values.
6863 //
6864 // LOADauthptrstatic is undesirable because it requires a large section filled
6865 // with often similarly-signed pointers, making it a good harvesting target.
6866 // Thus, it's only used for ptrauth references to extern_weak symbols, to avoid
6867 // breaking null checks.
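//
// For illustration only, a rough sketch of what each alternative becomes
// (the exact sequences are produced later by pseudo expansion and may differ):
//   locally-defined @sym  -> MOVaddrPAC @sym, key, addr-disc, disc
//                            (roughly adrp+add to form the address, then PAC*)
//   GOT-referenced @sym   -> LOADgotPAC @sym, key, addr-disc, disc
//                            (roughly adrp+ldr from the GOT, then PAC*)
//   extern_weak @sym      -> LOADauthptrstatic @sym, key, disc
//                            (adrp+ldr from the 'sym$auth_ptr$...' stub slot)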
6868 
6869 bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6870     MachineInstr &I, MachineRegisterInfo &MRI) const {
6871   Register DefReg = I.getOperand(0).getReg();
6872   Register Addr = I.getOperand(1).getReg();
6873   uint64_t Key = I.getOperand(2).getImm();
6874   Register AddrDisc = I.getOperand(3).getReg();
6875   uint64_t Disc = I.getOperand(4).getImm();
6876   int64_t Offset = 0;
6877 
6878   if (Key > AArch64PACKey::LAST)
6879     report_fatal_error("key in ptrauth global out of range [0, " +
6880                        Twine((int)AArch64PACKey::LAST) + "]");
6881 
6882   // Blend only works if the integer discriminator is 16-bit wide.
6883   if (!isUInt<16>(Disc))
6884     report_fatal_error(
6885         "constant discriminator in ptrauth global out of range [0, 0xffff]");
6886 
6887   // Choosing between 3 lowering alternatives is target-specific.
6888   if (!STI.isTargetELF() && !STI.isTargetMachO())
6889     report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
6890 
6891   if (!MRI.hasOneDef(Addr))
6892     return false;
6893 
6894   // First match any offset we take from the real global.
6895   const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr);
6896   if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6897     Register OffsetReg = DefMI->getOperand(2).getReg();
6898     if (!MRI.hasOneDef(OffsetReg))
6899       return false;
6900     const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg);
6901     if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6902       return false;
6903 
6904     Addr = DefMI->getOperand(1).getReg();
6905     if (!MRI.hasOneDef(Addr))
6906       return false;
6907 
6908     DefMI = &*MRI.def_instr_begin(Addr);
6909     Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue();
6910   }
6911 
6912   // We should be left with a genuine unauthenticated GlobalValue.
6913   const GlobalValue *GV;
6914   if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6915     GV = DefMI->getOperand(1).getGlobal();
6916     Offset += DefMI->getOperand(1).getOffset();
6917   } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6918     GV = DefMI->getOperand(2).getGlobal();
6919     Offset += DefMI->getOperand(2).getOffset();
6920   } else {
6921     return false;
6922   }
6923 
6924   MachineIRBuilder MIB(I);
6925 
6926   // Classify the reference to determine whether it needs a GOT load.
6927   unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6928   const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6929   assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6930          "unsupported non-GOT op flags on ptrauth global reference");
6931   assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6932          "unsupported non-GOT reference to weak ptrauth global");
6933 
6934   std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI);
6935   bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6936 
6937   // Non-extern_weak:
6938   // - No GOT load needed -> MOVaddrPAC
6939   // - GOT load for non-extern_weak -> LOADgotPAC
6940   //   Note that we disallow extern_weak refs to avoid null checks later.
6941   if (!GV->hasExternalWeakLinkage()) {
6942     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
6943     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6944     MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6945         .addGlobalAddress(GV, Offset)
6946         .addImm(Key)
6947         .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR)
6948         .addImm(Disc)
6949         .constrainAllUses(TII, TRI, RBI);
6950     MIB.buildCopy(DefReg, Register(AArch64::X16));
6951     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6952     I.eraseFromParent();
6953     return true;
6954   }
6955 
6956   // extern_weak -> LOADauthptrstatic
6957 
6958   // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
6959   // offset alone as a pointer if the symbol wasn't available, which would
6960   // probably break null checks in users. Ptrauth complicates things further:
6961   // error out.
6962   if (Offset != 0)
6963     report_fatal_error(
6964         "unsupported non-zero offset in weak ptrauth global reference");
6965 
6966   if (HasAddrDisc)
6967     report_fatal_error("unsupported weak addr-div ptrauth global");
6968 
6969   MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {})
6970       .addGlobalAddress(GV, Offset)
6971       .addImm(Key)
6972       .addImm(Disc);
6973   RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6974 
6975   I.eraseFromParent();
6976   return true;
6977 }
6978 
6979 void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6980                                              MachineRegisterInfo &MRI,
6981                                              unsigned NumVec, unsigned Opc1,
6982                                              unsigned Opc2, bool isExt) {
6983   Register DstReg = I.getOperand(0).getReg();
6984   unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
6985 
6986   // Create the REG_SEQUENCE
6987   SmallVector<Register, 4> Regs;
6988   for (unsigned i = 0; i < NumVec; i++)
6989     Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
6990   Register RegSeq = createQTuple(Regs, MIB);
6991 
6992   Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
6993   MachineInstrBuilder Instr;
6994   if (isExt) {
6995     Register Reg = I.getOperand(2).getReg();
6996     Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
6997   } else
6998     Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
6999   constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
7000   I.eraseFromParent();
7001 }
7002 
7003 InstructionSelector::ComplexRendererFns
7004 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
7005   auto MaybeImmed = getImmedFromMO(Root);
7006   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7007     return std::nullopt;
7008   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
7009   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7010 }
7011 
7012 InstructionSelector::ComplexRendererFns
7013 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
7014   auto MaybeImmed = getImmedFromMO(Root);
7015   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7016     return std::nullopt;
7017   uint64_t Enc = 31 - *MaybeImmed;
7018   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7019 }
7020 
7021 InstructionSelector::ComplexRendererFns
7022 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
7023   auto MaybeImmed = getImmedFromMO(Root);
7024   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7025     return std::nullopt;
7026   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
7027   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7028 }
7029 
7030 InstructionSelector::ComplexRendererFns
7031 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7032   auto MaybeImmed = getImmedFromMO(Root);
7033   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7034     return std::nullopt;
7035   uint64_t Enc = 63 - *MaybeImmed;
7036   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7037 }
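
// The four renderers above mirror the i32shift_a/i32shift_b (and their 64-bit
// counterparts) immediate transforms used by the shift-to-UBFM patterns. A
// rough worked example, assuming a G_SHL on s32 is matched by such a pattern:
//   shl w0, w1, #4  ->  immr = (32 - 4) & 0x1f = 28, imms = 31 - 4 = 27
//                   ->  UBFMWri w0, w1, 28, 27  (prints as "lsl w0, w1, #4")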
7038 
7039 /// Helper to select an immediate value that can be represented as a 12-bit
7040 /// value shifted left by either 0 or 12. If it is possible to do so, return
7041 /// the immediate and shift value. If not, return std::nullopt.
7042 ///
7043 /// Used by selectArithImmed and selectNegArithImmed.
7044 InstructionSelector::ComplexRendererFns
7045 AArch64InstructionSelector::select12BitValueWithLeftShift(
7046     uint64_t Immed) const {
7047   unsigned ShiftAmt;
7048   if (Immed >> 12 == 0) {
7049     ShiftAmt = 0;
7050   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7051     ShiftAmt = 12;
7052     Immed = Immed >> 12;
7053   } else
7054     return std::nullopt;
7055 
7056   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
7057   return {{
7058       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
7059       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
7060   }};
7061 }
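
// A short worked example for the renderer above (illustrative only):
//   Immed = 0x5000: the low 12 bits are zero and 0x5000 >> 24 == 0, so
//   ShiftAmt = 12 and Immed becomes 0x5; the rendered operands are imm = 5
//   and shifter = LSL #12, e.g. "add x0, x1, #5, lsl #12".
//   Immed = 0x1001000 fits neither form, so std::nullopt is returned.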
7062 
7063 /// SelectArithImmed - Select an immediate value that can be represented as
7064 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
7065 /// Val set to the 12-bit value and Shift set to the shifter operand.
7066 InstructionSelector::ComplexRendererFns
7067 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
7068   // This function is called from the addsub_shifted_imm ComplexPattern,
7069   // which lists [imm] as the list of opcodes it's interested in; however,
7070   // we still need to check whether the operand is actually an immediate
7071   // here because the ComplexPattern opcode list is only used in
7072   // root-level opcode matching.
7073   auto MaybeImmed = getImmedFromMO(Root);
7074   if (MaybeImmed == std::nullopt)
7075     return std::nullopt;
7076   return select12BitValueWithLeftShift(*MaybeImmed);
7077 }
7078 
7079 /// SelectNegArithImmed - As above, but negates the value before trying to
7080 /// select it.
7081 InstructionSelector::ComplexRendererFns
7082 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7083   // We need a register here, because we need to know if we have a 64 or 32
7084   // bit immediate.
7085   if (!Root.isReg())
7086     return std::nullopt;
7087   auto MaybeImmed = getImmedFromMO(Root);
7088   if (MaybeImmed == std::nullopt)
7089     return std::nullopt;
7090   uint64_t Immed = *MaybeImmed;
7091 
7092   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7093   // have the opposite effect on the C flag, so this pattern mustn't match under
7094   // those circumstances.
7095   if (Immed == 0)
7096     return std::nullopt;
7097 
7098   // Check whether we're dealing with a 32-bit or a 64-bit type on the
7099   // root.
7100   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7101   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
7102     Immed = ~((uint32_t)Immed) + 1;
7103   else
7104     Immed = ~Immed + 1ULL;
7105 
7106   if (Immed & 0xFFFFFFFFFF000000ULL)
7107     return std::nullopt;
7108 
7109   Immed &= 0xFFFFFFULL;
7110   return select12BitValueWithLeftShift(Immed);
7111 }
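
// Worked example for the negation above (illustrative only): for a 32-bit
// compare against -16, Immed = -16 is negated as ~uint32_t(-16) + 1 = 16,
// which fits in 12 bits, so the comparison can be selected in its
// negated-immediate form, e.g. "cmn w0, #16" instead of "cmp w0, #-16".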
7112 
7113 /// Checks if we are sure that folding MI into load/store addressing mode is
7114 /// beneficial or not.
7115 ///
7116 /// Returns:
7117 /// - true if folding MI would be beneficial.
7118 /// - false if folding MI would be bad.
7119 /// - std::nullopt if it is not sure whether folding MI is beneficial.
7120 ///
7121 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7122 ///
7123 /// %13:gpr(s64) = G_CONSTANT i64 1
7124 /// %8:gpr(s64) = G_SHL %6, %13(s64)
7125 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7126 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
7127 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7128     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7129   if (MI.getOpcode() == AArch64::G_SHL) {
7130     // Address operands with shifts are free, except when running on subtargets
7131     // with AddrLSLSlow14.
7132     if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
7133             MI.getOperand(2).getReg(), MRI)) {
7134       const APInt ShiftVal = ValAndVeg->Value;
7135 
7136       // Don't fold if we know this will be slow.
7137       return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7138     }
7139   }
7140   return std::nullopt;
7141 }
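
// For example (illustrative): on a subtarget with AddrLSLSlow14, a G_SHL by 1
// feeding an address (i.e. an "ldrh ..., [x0, x1, lsl #1]" candidate) is
// reported as not worth folding, while a G_SHL by 3 feeding an 8-byte access
// still is. Non-constant shift amounts fall through to std::nullopt.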
7142 
7143 /// Return true if it is worth folding MI into an extended register. That is,
7144 /// if it's safe to pull it into the addressing mode of a load or store as a
7145 /// shift.
7146 /// \p IsAddrOperand whether the def of MI is used as an address operand
7147 /// (e.g. feeding into an LDR/STR).
7148 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7149     MachineInstr &MI, const MachineRegisterInfo &MRI,
7150     bool IsAddrOperand) const {
7151 
7152   // Always fold if there is one use, or if we're optimizing for size.
7153   Register DefReg = MI.getOperand(0).getReg();
7154   if (MRI.hasOneNonDBGUse(DefReg) ||
7155       MI.getParent()->getParent()->getFunction().hasOptSize())
7156     return true;
7157 
7158   if (IsAddrOperand) {
7159     // If we are already sure that folding MI is good or bad, return the result.
7160     if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7161       return *Worth;
7162 
7163     // Fold G_PTR_ADD if its offset operand can be folded
7164     if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7165       MachineInstr *OffsetInst =
7166           getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
7167 
7168       // Note, we already know G_PTR_ADD is used by at least two instructions.
7169       // If we are also sure about whether folding is beneficial or not,
7170       // return the result.
7171       if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
7172         return *Worth;
7173     }
7174   }
7175 
7176   // FIXME: Consider checking HasALULSLFast as appropriate.
7177 
7178   // We have a fastpath, so folding a shift in and potentially computing it
7179   // many times may be beneficial. Check if this is only used in memory ops.
7180   // If it is, then we should fold.
7181   return all_of(MRI.use_nodbg_instructions(DefReg),
7182                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7183 }
7184 
7185 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
7186   switch (Type) {
7187   case AArch64_AM::SXTB:
7188   case AArch64_AM::SXTH:
7189   case AArch64_AM::SXTW:
7190     return true;
7191   default:
7192     return false;
7193   }
7194 }
7195 
7196 InstructionSelector::ComplexRendererFns
7197 AArch64InstructionSelector::selectExtendedSHL(
7198     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7199     unsigned SizeInBytes, bool WantsExt) const {
7200   assert(Base.isReg() && "Expected base to be a register operand");
7201   assert(Offset.isReg() && "Expected offset to be a register operand");
7202 
7203   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7204   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
7205 
7206   unsigned OffsetOpc = OffsetInst->getOpcode();
7207   bool LookedThroughZExt = false;
7208   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7209     // Try to look through a ZEXT.
7210     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7211       return std::nullopt;
7212 
7213     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
7214     OffsetOpc = OffsetInst->getOpcode();
7215     LookedThroughZExt = true;
7216 
7217     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7218       return std::nullopt;
7219   }
7220   // Make sure that the memory op is a valid size.
7221   int64_t LegalShiftVal = Log2_32(SizeInBytes);
7222   if (LegalShiftVal == 0)
7223     return std::nullopt;
7224   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7225     return std::nullopt;
7226 
7227   // Now, try to find the specific G_CONSTANT. Start by assuming that the
7228   // register we will offset is the LHS, and the register containing the
7229   // constant is the RHS.
7230   Register OffsetReg = OffsetInst->getOperand(1).getReg();
7231   Register ConstantReg = OffsetInst->getOperand(2).getReg();
7232   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7233   if (!ValAndVReg) {
7234     // We didn't get a constant on the RHS. If the opcode is a shift, then
7235     // we're done.
7236     if (OffsetOpc == TargetOpcode::G_SHL)
7237       return std::nullopt;
7238 
7239     // If we have a G_MUL, we can use either register. Try looking at the RHS.
7240     std::swap(OffsetReg, ConstantReg);
7241     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7242     if (!ValAndVReg)
7243       return std::nullopt;
7244   }
7245 
7246   // The value must fit into 3 bits, and must be positive. Make sure that is
7247   // true.
7248   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7249 
7250   // Since we're going to pull this into a shift, the constant value must be
7251   // a power of 2. If we got a multiply, then we need to check this.
7252   if (OffsetOpc == TargetOpcode::G_MUL) {
7253     if (!llvm::has_single_bit<uint32_t>(ImmVal))
7254       return std::nullopt;
7255 
7256     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7257     ImmVal = Log2_32(ImmVal);
7258   }
7259 
7260   if ((ImmVal & 0x7) != ImmVal)
7261     return std::nullopt;
7262 
7263   // We are only allowed to shift by LegalShiftVal. This shift value is built
7264   // into the instruction, so we can't just use whatever we want.
7265   if (ImmVal != LegalShiftVal)
7266     return std::nullopt;
7267 
7268   unsigned SignExtend = 0;
7269   if (WantsExt) {
7270     // Check if the offset is defined by an extend, unless we looked through a
7271     // G_ZEXT earlier.
7272     if (!LookedThroughZExt) {
7273       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7274       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7275       if (Ext == AArch64_AM::InvalidShiftExtend)
7276         return std::nullopt;
7277 
7278       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7279       // We only support SXTW for signed extension here.
7280       if (SignExtend && Ext != AArch64_AM::SXTW)
7281         return std::nullopt;
7282       OffsetReg = ExtInst->getOperand(1).getReg();
7283     }
7284 
7285     // Need a 32-bit wide register here.
7286     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7287     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7288   }
7289 
7290   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7291   // offset. Signify that we are shifting by setting the shift flag to 1.
7292   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7293            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7294            [=](MachineInstrBuilder &MIB) {
7295              // Need to add both immediates here to make sure that they are both
7296              // added to the instruction.
7297              MIB.addImm(SignExtend);
7298              MIB.addImm(1);
7299            }}};
7300 }
7301 
7302 /// This is used for computing addresses like this:
7303 ///
7304 /// ldr x1, [x2, x3, lsl #3]
7305 ///
7306 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7307 /// is a constant value specific to this load instruction. That is, we'll never
7308 /// see anything other than a 3 here (which corresponds to the size of the
7309 /// element being loaded).
7310 InstructionSelector::ComplexRendererFns
7311 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7312     MachineOperand &Root, unsigned SizeInBytes) const {
7313   if (!Root.isReg())
7314     return std::nullopt;
7315   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7316 
7317   // We want to find something like this:
7318   //
7319   // val = G_CONSTANT LegalShiftVal
7320   // shift = G_SHL off_reg val
7321   // ptr = G_PTR_ADD base_reg shift
7322   // x = G_LOAD ptr
7323   //
7324   // And fold it into this addressing mode:
7325   //
7326   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7327 
7328   // Check if we can find the G_PTR_ADD.
7329   MachineInstr *PtrAdd =
7330       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7331   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7332     return std::nullopt;
7333 
7334   // Now, try to match an opcode which will match our specific offset.
7335   // We want a G_SHL or a G_MUL.
7336   MachineInstr *OffsetInst =
7337       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7338   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7339                            OffsetInst->getOperand(0), SizeInBytes,
7340                            /*WantsExt=*/false);
7341 }
7342 
7343 /// This is used for computing addresses like this:
7344 ///
7345 /// ldr x1, [x2, x3]
7346 ///
7347 /// Where x2 is the base register, and x3 is an offset register.
7348 ///
7349 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7350 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7351 InstructionSelector::ComplexRendererFns
7352 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7353     MachineOperand &Root) const {
7354   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7355 
7356   // We need a GEP.
7357   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7358   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7359     return std::nullopt;
7360 
7361   // If this is used more than once, let's not bother folding.
7362   // TODO: Check if they are memory ops. If they are, then we can still fold
7363   // without having to recompute anything.
7364   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7365     return std::nullopt;
7366 
7367   // Base is the GEP's LHS, offset is its RHS.
7368   return {{[=](MachineInstrBuilder &MIB) {
7369              MIB.addUse(Gep->getOperand(1).getReg());
7370            },
7371            [=](MachineInstrBuilder &MIB) {
7372              MIB.addUse(Gep->getOperand(2).getReg());
7373            },
7374            [=](MachineInstrBuilder &MIB) {
7375              // Need to add both immediates here to make sure that they are both
7376              // added to the instruction.
7377              MIB.addImm(0);
7378              MIB.addImm(0);
7379            }}};
7380 }
7381 
7382 /// This is intended to be equivalent to selectAddrModeXRO in
7383 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7384 InstructionSelector::ComplexRendererFns
7385 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7386                                               unsigned SizeInBytes) const {
7387   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7388   if (!Root.isReg())
7389     return std::nullopt;
7390   MachineInstr *PtrAdd =
7391       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7392   if (!PtrAdd)
7393     return std::nullopt;
7394 
7395   // Check for an immediate which cannot be encoded in the [base + imm]
7396   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7397   // end up with code like:
7398   //
7399   // mov x0, wide
7400   // add x1 base, x0
7401   // ldr x2, [x1, x0]
7402   //
7403   // In this situation, we can use the [base, xreg] addressing mode to save an
7404   // add/sub:
7405   //
7406   // mov x0, wide
7407   // ldr x2, [base, x0]
7408   auto ValAndVReg =
7409       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7410   if (ValAndVReg) {
7411     unsigned Scale = Log2_32(SizeInBytes);
7412     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7413 
7414     // Skip immediates that can be selected in the load/store addressing
7415     // mode.
7416     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7417         ImmOff < (0x1000 << Scale))
7418       return std::nullopt;
7419 
7420     // Helper lambda to decide whether or not it is preferable to emit an add.
7421     auto isPreferredADD = [](int64_t ImmOff) {
7422       // Constants in [0x0, 0xfff] can be encoded in an add.
7423       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7424         return true;
7425 
7426       // Can it be encoded in an add lsl #12?
7427       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7428         return false;
7429 
7430       // It can be encoded in an add lsl #12, but we may not want to. If it is
7431       // possible to select this as a single movz, then prefer that. A single
7432       // movz is faster than an add with a shift.
7433       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7434              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7435     };
7436 
7437     // If the immediate can be encoded in a single add/sub, then bail out.
7438     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7439       return std::nullopt;
7440   }
7441 
7442   // Try to fold shifts into the addressing mode.
7443   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7444   if (AddrModeFns)
7445     return AddrModeFns;
7446 
7447   // If that doesn't work, see if it's possible to fold in registers from
7448   // a GEP.
7449   return selectAddrModeRegisterOffset(Root);
7450 }
7451 
7452 /// This is used for computing addresses like this:
7453 ///
7454 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7455 ///
7456 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7457 /// extend (which may or may not be signed).
7458 InstructionSelector::ComplexRendererFns
7459 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7460                                               unsigned SizeInBytes) const {
7461   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7462 
7463   MachineInstr *PtrAdd =
7464       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7465   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7466     return std::nullopt;
7467 
7468   MachineOperand &LHS = PtrAdd->getOperand(1);
7469   MachineOperand &RHS = PtrAdd->getOperand(2);
7470   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7471 
7472   // The first case is the same as selectAddrModeXRO, except we need an extend.
7473   // In this case, we try to find a shift and extend, and fold them into the
7474   // addressing mode.
7475   //
7476   // E.g.
7477   //
7478   // off_reg = G_Z/S/ANYEXT ext_reg
7479   // val = G_CONSTANT LegalShiftVal
7480   // shift = G_SHL off_reg val
7481   // ptr = G_PTR_ADD base_reg shift
7482   // x = G_LOAD ptr
7483   //
7484   // In this case we can get a load like this:
7485   //
7486   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7487   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7488                                        SizeInBytes, /*WantsExt=*/true);
7489   if (ExtendedShl)
7490     return ExtendedShl;
7491 
7492   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7493   //
7494   // e.g.
7495   // ldr something, [base_reg, ext_reg, sxtw]
7496   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7497     return std::nullopt;
7498 
7499   // Check if this is an extend. We'll get an extend type if it is.
7500   AArch64_AM::ShiftExtendType Ext =
7501       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7502   if (Ext == AArch64_AM::InvalidShiftExtend)
7503     return std::nullopt;
7504 
7505   // Need a 32-bit wide register.
7506   MachineIRBuilder MIB(*PtrAdd);
7507   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7508                                        AArch64::GPR32RegClass, MIB);
7509   unsigned SignExtend = Ext == AArch64_AM::SXTW;
7510 
7511   // Base is LHS, offset is ExtReg.
7512   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7513            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7514            [=](MachineInstrBuilder &MIB) {
7515              MIB.addImm(SignExtend);
7516              MIB.addImm(0);
7517            }}};
7518 }
7519 
7520 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
7521 /// should only match when there is an offset that is not valid for a scaled
7522 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
7523 /// memory reference, which is needed here to know what is valid for a scaled
7524 /// immediate.
7525 InstructionSelector::ComplexRendererFns
7526 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7527                                                    unsigned Size) const {
7528   MachineRegisterInfo &MRI =
7529       Root.getParent()->getParent()->getParent()->getRegInfo();
7530 
7531   if (!Root.isReg())
7532     return std::nullopt;
7533 
7534   if (!isBaseWithConstantOffset(Root, MRI))
7535     return std::nullopt;
7536 
7537   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7538 
7539   MachineOperand &OffImm = RootDef->getOperand(2);
7540   if (!OffImm.isReg())
7541     return std::nullopt;
7542   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7543   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7544     return std::nullopt;
7545   int64_t RHSC;
7546   MachineOperand &RHSOp1 = RHS->getOperand(1);
7547   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7548     return std::nullopt;
7549   RHSC = RHSOp1.getCImm()->getSExtValue();
7550 
7551   if (RHSC >= -256 && RHSC < 256) {
7552     MachineOperand &Base = RootDef->getOperand(1);
7553     return {{
7554         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7555         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7556     }};
7557   }
7558   return std::nullopt;
7559 }
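
// Illustrative example: for a 4-byte access at "%p + (-4)", the offset is
// negative and therefore invalid for the scaled unsigned-immediate form, but
// it lies in [-256, 256), so this renders base = %p, imm = -4 and the access
// can be selected as an unscaled load/store such as "ldur w0, [x1, #-4]".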
7560 
7561 InstructionSelector::ComplexRendererFns
7562 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7563                                                  unsigned Size,
7564                                                  MachineRegisterInfo &MRI) const {
7565   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7566     return std::nullopt;
7567   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7568   if (Adrp.getOpcode() != AArch64::ADRP)
7569     return std::nullopt;
7570 
7571   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7572   auto Offset = Adrp.getOperand(1).getOffset();
7573   if (Offset % Size != 0)
7574     return std::nullopt;
7575 
7576   auto GV = Adrp.getOperand(1).getGlobal();
7577   if (GV->isThreadLocal())
7578     return std::nullopt;
7579 
7580   auto &MF = *RootDef.getParent()->getParent();
7581   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7582     return std::nullopt;
7583 
7584   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7585   MachineIRBuilder MIRBuilder(RootDef);
7586   Register AdrpReg = Adrp.getOperand(0).getReg();
7587   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7588            [=](MachineInstrBuilder &MIB) {
7589              MIB.addGlobalAddress(GV, Offset,
7590                                   OpFlags | AArch64II::MO_PAGEOFF |
7591                                       AArch64II::MO_NC);
7592            }}};
7593 }
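
// Illustrative example of the fold above (schematic MIR):
//   %page:gpr(p0) = ADRP @var
//   %addr:gpr(p0) = G_ADD_LOW %page(p0), @var
//   %val:gpr(s64) = G_LOAD %addr(p0)
// The G_ADD_LOW is folded into the load's immediate operand, giving roughly
//   "ldr x0, [xN, :lo12:var]"
// provided @var is suitably aligned for the access size and not thread-local.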
7594 
7595 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
7596 /// "Size" argument is the size in bytes of the memory reference, which
7597 /// determines the scale.
7598 InstructionSelector::ComplexRendererFns
7599 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7600                                                   unsigned Size) const {
7601   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7602   MachineRegisterInfo &MRI = MF.getRegInfo();
7603 
7604   if (!Root.isReg())
7605     return std::nullopt;
7606 
7607   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7608   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7609     return {{
7610         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7611         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7612     }};
7613   }
7614 
7615   CodeModel::Model CM = MF.getTarget().getCodeModel();
7616   // Check if we can fold in the ADD of a small-code-model ADRP + ADD address.
7617   if (CM == CodeModel::Small) {
7618     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7619     if (OpFns)
7620       return OpFns;
7621   }
7622 
7623   if (isBaseWithConstantOffset(Root, MRI)) {
7624     MachineOperand &LHS = RootDef->getOperand(1);
7625     MachineOperand &RHS = RootDef->getOperand(2);
7626     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7627     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7628 
7629     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7630     unsigned Scale = Log2_32(Size);
7631     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7632       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7633         return {{
7634             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7635             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7636         }};
7637 
7638       return {{
7639           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7640           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7641       }};
7642     }
7643   }
7644 
7645   // Before falling back to our general case, check if the unscaled
7646   // instructions can handle this. If so, that's preferable.
7647   if (selectAddrModeUnscaled(Root, Size))
7648     return std::nullopt;
7649 
7650   return {{
7651       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7652       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7653   }};
7654 }
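
// Worked example (illustrative): for a 4-byte load of "%p + 16", Size = 4 so
// Scale = 2; RHSC = 16 is a multiple of 4 and below (0x1000 << 2), so this
// renders base = %p and imm = 16 >> 2 = 4, i.e. roughly "ldr w0, [x1, #16]"
// (the assembler prints the byte offset; the encoded field is the scaled one).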
7655 
7656 /// Given a shift instruction, return the correct shift type for that
7657 /// instruction.
7658 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7659   switch (MI.getOpcode()) {
7660   default:
7661     return AArch64_AM::InvalidShiftExtend;
7662   case TargetOpcode::G_SHL:
7663     return AArch64_AM::LSL;
7664   case TargetOpcode::G_LSHR:
7665     return AArch64_AM::LSR;
7666   case TargetOpcode::G_ASHR:
7667     return AArch64_AM::ASR;
7668   case TargetOpcode::G_ROTR:
7669     return AArch64_AM::ROR;
7670   }
7671 }
7672 
7673 /// Select a "shifted register" operand. If the value is not shifted, set the
7674 /// shift operand to a default value of "lsl 0".
7675 InstructionSelector::ComplexRendererFns
7676 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7677                                                   bool AllowROR) const {
7678   if (!Root.isReg())
7679     return std::nullopt;
7680   MachineRegisterInfo &MRI =
7681       Root.getParent()->getParent()->getParent()->getRegInfo();
7682 
7683   // Check if the operand is defined by an instruction which corresponds to
7684   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7685   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7686   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7687   if (ShType == AArch64_AM::InvalidShiftExtend)
7688     return std::nullopt;
7689   if (ShType == AArch64_AM::ROR && !AllowROR)
7690     return std::nullopt;
7691   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
7692     return std::nullopt;
7693 
7694   // Need an immediate on the RHS.
7695   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7696   auto Immed = getImmedFromMO(ShiftRHS);
7697   if (!Immed)
7698     return std::nullopt;
7699 
7700   // We have something that we can fold. Fold in the shift's LHS and RHS into
7701   // the instruction.
7702   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7703   Register ShiftReg = ShiftLHS.getReg();
7704 
7705   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7706   unsigned Val = *Immed & (NumBits - 1);
7707   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7708 
7709   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7710            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7711 }
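
// Illustrative example: a "G_SHL %x, 3" (s64) feeding a G_ADD can be folded
// here into the add's shifted-register operand; the rendered operands are the
// shift's LHS plus getShifterImm(LSL, 3), i.e. roughly
// "add x0, x1, x2, lsl #3". Shift amounts are masked to the register width
// (imm & 63 for s64, imm & 31 for s32).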
7712 
7713 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7714     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7715   unsigned Opc = MI.getOpcode();
7716 
7717   // Handle explicit extend instructions first.
7718   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7719     unsigned Size;
7720     if (Opc == TargetOpcode::G_SEXT)
7721       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7722     else
7723       Size = MI.getOperand(2).getImm();
7724     assert(Size != 64 && "Extend from 64 bits?");
7725     switch (Size) {
7726     case 8:
7727       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7728     case 16:
7729       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7730     case 32:
7731       return AArch64_AM::SXTW;
7732     default:
7733       return AArch64_AM::InvalidShiftExtend;
7734     }
7735   }
7736 
7737   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7738     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7739     assert(Size != 64 && "Extend from 64 bits?");
7740     switch (Size) {
7741     case 8:
7742       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7743     case 16:
7744       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7745     case 32:
7746       return AArch64_AM::UXTW;
7747     default:
7748       return AArch64_AM::InvalidShiftExtend;
7749     }
7750   }
7751 
7752   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7753   // on the RHS.
7754   if (Opc != TargetOpcode::G_AND)
7755     return AArch64_AM::InvalidShiftExtend;
7756 
7757   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7758   if (!MaybeAndMask)
7759     return AArch64_AM::InvalidShiftExtend;
7760   uint64_t AndMask = *MaybeAndMask;
7761   switch (AndMask) {
7762   default:
7763     return AArch64_AM::InvalidShiftExtend;
7764   case 0xFF:
7765     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7766   case 0xFFFF:
7767     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7768   case 0xFFFFFFFF:
7769     return AArch64_AM::UXTW;
7770   }
7771 }
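
// Illustrative examples of the mapping above: a G_AND with mask 0xFF acts as
// UXTB (arithmetic operands only), a G_SEXT from s32 maps to SXTW, and a
// G_ZEXT from s16 maps to UXTH unless this is for a load/store addressing
// mode, where only the W-register extends UXTW/SXTW are usable.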
7772 
7773 Register AArch64InstructionSelector::moveScalarRegClass(
7774     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7775   MachineRegisterInfo &MRI = *MIB.getMRI();
7776   auto Ty = MRI.getType(Reg);
7777   assert(!Ty.isVector() && "Expected scalars only!");
7778   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7779     return Reg;
7780 
7781   // Create a copy and immediately select it.
7782   // FIXME: We should have an emitCopy function?
7783   auto Copy = MIB.buildCopy({&RC}, {Reg});
7784   selectCopy(*Copy, TII, MRI, TRI, RBI);
7785   return Copy.getReg(0);
7786 }
7787 
7788 /// Select an "extended register" operand. This operand folds in an extend
7789 /// followed by an optional left shift.
7790 InstructionSelector::ComplexRendererFns
7791 AArch64InstructionSelector::selectArithExtendedRegister(
7792     MachineOperand &Root) const {
7793   if (!Root.isReg())
7794     return std::nullopt;
7795   MachineRegisterInfo &MRI =
7796       Root.getParent()->getParent()->getParent()->getRegInfo();
7797 
7798   uint64_t ShiftVal = 0;
7799   Register ExtReg;
7800   AArch64_AM::ShiftExtendType Ext;
7801   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7802   if (!RootDef)
7803     return std::nullopt;
7804 
7805   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
7806     return std::nullopt;
7807 
7808   // Check if we can fold a shift and an extend.
7809   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7810     // Look for a constant on the RHS of the shift.
7811     MachineOperand &RHS = RootDef->getOperand(2);
7812     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7813     if (!MaybeShiftVal)
7814       return std::nullopt;
7815     ShiftVal = *MaybeShiftVal;
7816     if (ShiftVal > 4)
7817       return std::nullopt;
7818     // Look for a valid extend instruction on the LHS of the shift.
7819     MachineOperand &LHS = RootDef->getOperand(1);
7820     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7821     if (!ExtDef)
7822       return std::nullopt;
7823     Ext = getExtendTypeForInst(*ExtDef, MRI);
7824     if (Ext == AArch64_AM::InvalidShiftExtend)
7825       return std::nullopt;
7826     ExtReg = ExtDef->getOperand(1).getReg();
7827   } else {
7828     // Didn't get a shift. Try just folding an extend.
7829     Ext = getExtendTypeForInst(*RootDef, MRI);
7830     if (Ext == AArch64_AM::InvalidShiftExtend)
7831       return std::nullopt;
7832     ExtReg = RootDef->getOperand(1).getReg();
7833 
7834     // If we have a 32 bit instruction which zeroes out the high half of a
7835     // register, we get an implicit zero extend for free. Check if we have one.
7836     // FIXME: We actually emit the extend right now even though we don't have
7837     // to.
7838     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7839       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7840       if (isDef32(*ExtInst))
7841         return std::nullopt;
7842     }
7843   }
7844 
7845   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7846   // copy.
7847   MachineIRBuilder MIB(*RootDef);
7848   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7849 
7850   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7851            [=](MachineInstrBuilder &MIB) {
7852              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7853            }}};
7854 }
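
// Illustrative example of the fold above (schematic MIR): for
//   %ext:gpr(s64) = G_SEXT %w:gpr(s32)
//   %sh:gpr(s64)  = G_SHL %ext, 2
// feeding a G_ADD, both the extend and the shift are folded here; the rendered
// operands are the 32-bit source plus getArithExtendImm(SXTW, 2), i.e. roughly
// "add x0, x1, w2, sxtw #2". Shift amounts above 4 are rejected.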
7855 
7856 InstructionSelector::ComplexRendererFns
7857 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7858   if (!Root.isReg())
7859     return std::nullopt;
7860   MachineRegisterInfo &MRI =
7861       Root.getParent()->getParent()->getParent()->getRegInfo();
7862 
7863   auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7864   while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7865          STI.isLittleEndian())
7866     Extract =
7867         getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7868   if (!Extract)
7869     return std::nullopt;
7870 
7871   if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7872     if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7873       Register ExtReg = Extract->MI->getOperand(2).getReg();
7874       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7875     }
7876   }
7877   if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7878     LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7879     auto LaneIdx = getIConstantVRegValWithLookThrough(
7880         Extract->MI->getOperand(2).getReg(), MRI);
7881     if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7882         LaneIdx->Value.getSExtValue() == 1) {
7883       Register ExtReg = Extract->MI->getOperand(1).getReg();
7884       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7885     }
7886   }
7887 
7888   return std::nullopt;
7889 }
7890 
7891 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7892                                                 const MachineInstr &MI,
7893                                                 int OpIdx) const {
7894   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7895   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7896          "Expected G_CONSTANT");
7897   std::optional<int64_t> CstVal =
7898       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7899   assert(CstVal && "Expected constant value");
7900   MIB.addImm(*CstVal);
7901 }
7902 
7903 void AArch64InstructionSelector::renderLogicalImm32(
7904   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7905   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7906          "Expected G_CONSTANT");
7907   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7908   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7909   MIB.addImm(Enc);
7910 }
7911 
7912 void AArch64InstructionSelector::renderLogicalImm64(
7913   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7914   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7915          "Expected G_CONSTANT");
7916   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7917   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7918   MIB.addImm(Enc);
7919 }
7920 
7921 void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7922                                                  const MachineInstr &MI,
7923                                                  int OpIdx) const {
7924   assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7925          "Expected G_UBSANTRAP");
7926   MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
7927 }
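
// For example (illustrative): 'U' is 0x55, so a G_UBSANTRAP with check kind K
// renders the immediate (0x5500 | K), the BRK payload used to identify UBSan
// traps.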
7928 
7929 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7930                                                const MachineInstr &MI,
7931                                                int OpIdx) const {
7932   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7933          "Expected G_FCONSTANT");
7934   MIB.addImm(
7935       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7936 }
7937 
7938 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7939                                                const MachineInstr &MI,
7940                                                int OpIdx) const {
7941   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7942          "Expected G_FCONSTANT");
7943   MIB.addImm(
7944       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7945 }
7946 
7947 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7948                                                const MachineInstr &MI,
7949                                                int OpIdx) const {
7950   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7951          "Expected G_FCONSTANT");
7952   MIB.addImm(
7953       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7954 }
7955 
void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

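// Return true if MI is a load or store whose single memory operand accesses
// exactly NumBytes bytes.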
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

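// Return true if MI defines a 32-bit value that is known to zero the upper
// 32 bits of the corresponding 64-bit destination register.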
bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// onto the same register bank as the destination.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it is on the same regbank as the
  // destination.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

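// Make sure the operands of every G_PHI live on a single register bank,
// inserting cross-bank copies where the register bank selector assigned
// heterogeneous banks to sub-32-bit scalar operands.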
void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs; build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are sub-32-bit
    // scalars and they are split across fpr/gpr banks. Since all types <32b
    // on gpr end up being assigned gpr32 regclasses, we can end up with PHIs
    // here which try to select between a gpr32 and an fpr16. Ideally RBS
    // shouldn't be selecting heterogeneous regbanks for operands if possible,
    // but we still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    //  =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks; we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
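// Factory function used to create the AArch64 GlobalISel instruction
// selector.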
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 const AArch64Subtarget &Subtarget,
                                 const AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
} // namespace llvm