xref: /netbsd-src/external/apache2/llvm/dist/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "AArch64GlobalISelUtils.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "MCTargetDesc/AArch64MCTargetDesc.h"
24 #include "llvm/ADT/Optional.h"
25 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
27 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineConstantPool.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstr.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineMemOperand.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetOpcodes.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/PatternMatch.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/IntrinsicsAArch64.h"
44 #include "llvm/Pass.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/raw_ostream.h"
47 
48 #define DEBUG_TYPE "aarch64-isel"
49 
50 using namespace llvm;
51 using namespace MIPatternMatch;
52 using namespace AArch64GISelUtils;
53 
// Forward declarations only: within this file these types appear solely as
// pointer parameters of AArch64InstructionSelector::setupMF().
54 namespace llvm {
55 class BlockFrequencyInfo;
56 class ProfileSummaryInfo;
57 }
58 
59 namespace {
60 
61 #define GET_GLOBALISEL_PREDICATE_BITSET
62 #include "AArch64GenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATE_BITSET
64 
/// AArch64 implementation of the InstructionSelector interface.
///
/// Besides the overridden select()/setupMF() entry points, the class declares
/// the hand-written selection routines, the emit* helpers, and the complex
/// operand matchers/renderers that sit next to the TableGen-generated
/// selectImpl().
65 class AArch64InstructionSelector : public InstructionSelector {
66 public:
67   AArch64InstructionSelector(const AArch64TargetMachine &TM,
68                              const AArch64Subtarget &STI,
69                              const AArch64RegisterBankInfo &RBI);
70 
71   bool select(MachineInstr &I) override;
  /// \returns DEBUG_TYPE as the name of this selector.
getName()72   static const char *getName() { return DEBUG_TYPE; }
73 
  /// Per-function setup: forwards to InstructionSelector::setupMF(), points
  /// MIB at \p MF, caches the SpeculativeLoadHardening attribute query in
  /// ProduceNonFlagSettingCondBr, resets MFReturnAddr to a default-constructed
  /// Register, and finally calls processPHIs() on \p MF.
setupMF(MachineFunction & MF,GISelKnownBits * KB,CodeGenCoverage & CoverageInfo,ProfileSummaryInfo * PSI,BlockFrequencyInfo * BFI)74   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
75                CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
76                BlockFrequencyInfo *BFI) override {
77     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
78     MIB.setMF(MF);
79 
80     // hasFnAttribute() is expensive to call on every BRCOND selection, so
81     // cache it here for each run of the selector.
82     ProduceNonFlagSettingCondBr =
83         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
84     MFReturnAddr = Register();
85 
86     processPHIs(MF);
87   }
88 
89 private:
90   /// tblgen-erated 'select' implementation, used as the initial selector for
91   /// the patterns that don't require complex C++.
92   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
93 
94   // A lowering phase that runs before any selection attempts.
95   // Returns true if the instruction was modified.
96   bool preISelLower(MachineInstr &I);
97 
98   // An early selection function that runs before the selectImpl() call.
99   bool earlySelect(MachineInstr &I);
100 
101   // Do some preprocessing of G_PHIs before we begin selection.
102   void processPHIs(MachineFunction &MF);
103 
104   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
105 
106   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
107   bool contractCrossBankCopyIntoStore(MachineInstr &I,
108                                       MachineRegisterInfo &MRI);
109 
110   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
111 
112   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
113                           MachineRegisterInfo &MRI) const;
114   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
115                            MachineRegisterInfo &MRI) const;
116 
117   ///@{
118   /// Helper functions for selectCompareBranch.
119   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
120                                     MachineIRBuilder &MIB) const;
121   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
122                                     MachineIRBuilder &MIB) const;
123   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
124                                     MachineIRBuilder &MIB) const;
125   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
126                                   MachineBasicBlock *DstMBB,
127                                   MachineIRBuilder &MIB) const;
128   ///@}
129 
130   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
131                            MachineRegisterInfo &MRI);
132 
133   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
134   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
135 
136   // Helper to generate an equivalent of scalar_to_vector into a new register,
137   // returned via 'Dst'.
138   MachineInstr *emitScalarToVector(unsigned EltSize,
139                                    const TargetRegisterClass *DstRC,
140                                    Register Scalar,
141                                    MachineIRBuilder &MIRBuilder) const;
142 
143   /// Emit a lane insert into \p DstReg, or a new vector register if None is
144   /// provided.
145   ///
146   /// The lane inserted into is defined by \p LaneIdx. The vector source
147   /// register is given by \p SrcReg. The register containing the element is
148   /// given by \p EltReg.
149   MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
150                                Register EltReg, unsigned LaneIdx,
151                                const RegisterBank &RB,
152                                MachineIRBuilder &MIRBuilder) const;
153 
154   /// Emit a sequence of instructions representing a constant \p CV for a
155   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
156   ///
157   /// \returns the last instruction in the sequence on success, and nullptr
158   /// otherwise.
159   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
160                                    MachineIRBuilder &MIRBuilder,
161                                    MachineRegisterInfo &MRI);
162 
163   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
164   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
165                               MachineRegisterInfo &MRI);
166   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
167   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
168   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
169 
170   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
171   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
172   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
173   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
174   bool selectIntrinsicWithSideEffects(MachineInstr &I,
175                                       MachineRegisterInfo &MRI);
176   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
177   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
178   bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
179   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
180   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
181   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
182   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
183   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
184 
185   unsigned emitConstantPoolEntry(const Constant *CPVal,
186                                  MachineFunction &MF) const;
187   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
188                                          MachineIRBuilder &MIRBuilder) const;
189 
190   // Emit a vector concat operation.
191   MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
192                                  Register Op2,
193                                  MachineIRBuilder &MIRBuilder) const;
194 
195   // Emit an integer compare between LHS and RHS, which checks for Predicate.
196   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
197                                    MachineOperand &Predicate,
198                                    MachineIRBuilder &MIRBuilder) const;
199 
200   /// Emit a floating point comparison between \p LHS and \p RHS.
201   /// \p Pred if given is the intended predicate to use.
202   MachineInstr *emitFPCompare(Register LHS, Register RHS,
203                               MachineIRBuilder &MIRBuilder,
204                               Optional<CmpInst::Predicate> = None) const;
205 
206   MachineInstr *emitInstr(unsigned Opcode,
207                           std::initializer_list<llvm::DstOp> DstOps,
208                           std::initializer_list<llvm::SrcOp> SrcOps,
209                           MachineIRBuilder &MIRBuilder,
210                           const ComplexRendererFns &RenderFns = None) const;
211   /// Helper function to emit an add or sub instruction.
212   ///
213   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
  /// listed below, in a specific order.
215   ///
216   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
217   ///
218   /// \code
219   ///   const std::array<std::array<unsigned, 2>, 5> Table {
220   ///    {{AArch64::ADDXri, AArch64::ADDWri},
221   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
222   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
223   ///     {AArch64::SUBXri, AArch64::SUBWri},
224   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
225   /// \endcode
226   ///
227   /// Each row in the table corresponds to a different addressing mode. Each
228   /// column corresponds to a different register size.
229   ///
230   /// \attention Rows must be structured as follows:
231   ///   - Row 0: The ri opcode variants
232   ///   - Row 1: The rs opcode variants
233   ///   - Row 2: The rr opcode variants
234   ///   - Row 3: The ri opcode variants for negative immediates
235   ///   - Row 4: The rx opcode variants
236   ///
237   /// \attention Columns must be structured as follows:
238   ///   - Column 0: The 64-bit opcode variants
239   ///   - Column 1: The 32-bit opcode variants
240   ///
241   /// \p Dst is the destination register of the binop to emit.
242   /// \p LHS is the left-hand operand of the binop to emit.
243   /// \p RHS is the right-hand operand of the binop to emit.
244   MachineInstr *emitAddSub(
245       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
246       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
247       MachineIRBuilder &MIRBuilder) const;
248   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
249                         MachineOperand &RHS,
250                         MachineIRBuilder &MIRBuilder) const;
251   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
252                          MachineIRBuilder &MIRBuilder) const;
253   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
254                          MachineIRBuilder &MIRBuilder) const;
255   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
256                         MachineIRBuilder &MIRBuilder) const;
257   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
258                         MachineIRBuilder &MIRBuilder) const;
259   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
260                            AArch64CC::CondCode CC,
261                            MachineIRBuilder &MIRBuilder) const;
262   MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
263                                      const RegisterBank &DstRB, LLT ScalarTy,
264                                      Register VecReg, unsigned LaneIdx,
265                                      MachineIRBuilder &MIRBuilder) const;
266 
267   /// Emit a CSet for an integer compare.
268   ///
269   /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
270   MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
271                                 MachineIRBuilder &MIRBuilder,
272                                 Register SrcReg = AArch64::WZR) const;
273   /// Emit a CSet for a FP compare.
274   ///
275   /// \p Dst is expected to be a 32-bit scalar register.
276   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
277                                 MachineIRBuilder &MIRBuilder) const;
278 
279   /// Emit the overflow op for \p Opcode.
280   ///
281   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
282   /// G_USUBO, etc.
283   std::pair<MachineInstr *, AArch64CC::CondCode>
284   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
285                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
286 
287   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
288   /// \p IsNegative is true if the test should be "not zero".
289   /// This will also optimize the test bit instruction when possible.
290   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
291                             MachineBasicBlock *DstMBB,
292                             MachineIRBuilder &MIB) const;
293 
294   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
295   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
296                         MachineBasicBlock *DestMBB,
297                         MachineIRBuilder &MIB) const;
298 
299   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
300   // We use these manually instead of using the importer since it doesn't
301   // support SDNodeXForm.
302   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
303   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
304   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
305   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
306 
307   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
308   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
309   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
310 
311   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
312                                             unsigned Size) const;
313 
  // Fixed-size wrappers around selectAddrModeUnscaled(): each one passes its
  // numeric suffix divided by 8 as the Size argument (8 -> 1, ..., 128 -> 16).
  // NOTE(review): Size is presumably a byte count -- confirm against the
  // definition of selectAddrModeUnscaled().
selectAddrModeUnscaled8(MachineOperand & Root) const314   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
315     return selectAddrModeUnscaled(Root, 1);
316   }
selectAddrModeUnscaled16(MachineOperand & Root) const317   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
318     return selectAddrModeUnscaled(Root, 2);
319   }
selectAddrModeUnscaled32(MachineOperand & Root) const320   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
321     return selectAddrModeUnscaled(Root, 4);
322   }
selectAddrModeUnscaled64(MachineOperand & Root) const323   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
324     return selectAddrModeUnscaled(Root, 8);
325   }
selectAddrModeUnscaled128(MachineOperand & Root) const326   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
327     return selectAddrModeUnscaled(Root, 16);
328   }
329 
330   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
331   /// from complex pattern matchers like selectAddrModeIndexed().
332   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
333                                           MachineRegisterInfo &MRI) const;
334 
335   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
336                                            unsigned Size) const;
  /// Compile-time-width overload: forwards \p Width / 8 as the Size argument
  /// of the two-parameter selectAddrModeIndexed().
337   template <int Width>
selectAddrModeIndexed(MachineOperand & Root) const338   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
339     return selectAddrModeIndexed(Root, Width / 8);
340   }
341 
342   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
343                                      const MachineRegisterInfo &MRI) const;
344   ComplexRendererFns
345   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
346                                   unsigned SizeInBytes) const;
347 
348   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
349   /// or not a shift + extend should be folded into an addressing mode. Returns
350   /// None when this is not profitable or possible.
351   ComplexRendererFns
352   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
353                     MachineOperand &Offset, unsigned SizeInBytes,
354                     bool WantsExt) const;
355   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
356   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
357                                        unsigned SizeInBytes) const;
  /// Compile-time-width overload: forwards \p Width / 8 as SizeInBytes.
358   template <int Width>
selectAddrModeXRO(MachineOperand & Root) const359   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
360     return selectAddrModeXRO(Root, Width / 8);
361   }
362 
363   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
364                                        unsigned SizeInBytes) const;
  /// Compile-time-width overload: forwards \p Width / 8 as SizeInBytes.
365   template <int Width>
selectAddrModeWRO(MachineOperand & Root) const366   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
367     return selectAddrModeWRO(Root, Width / 8);
368   }
369 
370   ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
371 
  /// Arithmetic flavour of the shifted-register matcher; simply forwards to
  /// selectShiftedRegister().
selectArithShiftedRegister(MachineOperand & Root) const372   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
373     return selectShiftedRegister(Root);
374   }
375 
  /// Logical flavour of the shifted-register matcher; currently identical to
  /// the arithmetic one (see the TODO inside).
selectLogicalShiftedRegister(MachineOperand & Root) const376   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
377     // TODO: selectShiftedRegister should allow for rotates on logical shifts.
378     // For now, make them the same. The only difference between the two is that
379     // logical shifts are allowed to fold in rotates. Otherwise, these are
380     // functionally the same.
381     return selectShiftedRegister(Root);
382   }
383 
384   /// Given an extend instruction, determine the correct shift-extend type for
385   /// that instruction.
386   ///
387   /// If the instruction is going to be used in a load or store, pass
388   /// \p IsLoadStore = true.
389   AArch64_AM::ShiftExtendType
390   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
391                        bool IsLoadStore = false) const;
392 
393   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
394   ///
395   /// \returns Either \p Reg if no change was necessary, or the new register
396   /// created by moving \p Reg.
397   ///
398   /// Note: This uses emitCopy right now.
399   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
400                               MachineIRBuilder &MIB) const;
401 
402   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
403 
404   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
405                       int OpIdx = -1) const;
406   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
407                           int OpIdx = -1) const;
408   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
409                           int OpIdx = -1) const;
410   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
411                      int OpIdx = -1) const;
412   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
413                      int OpIdx = -1) const;
414   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
415                      int OpIdx = -1) const;
416 
417   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
418   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
419 
420   // Optimization methods.
421   bool tryOptSelect(MachineInstr &MI);
422   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
423                                       MachineOperand &Predicate,
424                                       MachineIRBuilder &MIRBuilder) const;
425 
426   /// Return true if \p MI is a load or store of \p NumBytes bytes.
427   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
428 
429   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
430   /// register zeroed out. In other words, the result of MI has been explicitly
431   /// zero extended.
432   bool isDef32(const MachineInstr &MI) const;
433 
434   const AArch64TargetMachine &TM;
435   const AArch64Subtarget &STI;
436   const AArch64InstrInfo &TII;
437   const AArch64RegisterInfo &TRI;
438   const AArch64RegisterBankInfo &RBI;
439 
  // Recomputed by setupMF() for every function: true when the function does
  // not carry the SpeculativeLoadHardening attribute.
440   bool ProduceNonFlagSettingCondBr = false;
441 
442   // Some cached values used during selection.
443   // We use LR as a live-in register, and we keep track of it here as it can be
444   // clobbered by calls.
445   Register MFReturnAddr;
446 
  // Instruction builder owned by the selector; setupMF() binds it to the
  // function being selected via setMF().
447   MachineIRBuilder MIB;
448 
449 #define GET_GLOBALISEL_PREDICATES_DECL
450 #include "AArch64GenGlobalISel.inc"
451 #undef GET_GLOBALISEL_PREDICATES_DECL
452 
453 // We declare the temporaries used by selectImpl() in the class to minimize the
454 // cost of constructing placeholder values.
455 #define GET_GLOBALISEL_TEMPORARIES_DECL
456 #include "AArch64GenGlobalISel.inc"
457 #undef GET_GLOBALISEL_TEMPORARIES_DECL
458 };
459 
460 } // end anonymous namespace
461 
462 #define GET_GLOBALISEL_IMPL
463 #include "AArch64GenGlobalISel.inc"
464 #undef GET_GLOBALISEL_IMPL
465 
/// Construct the selector.
///
/// Stores references to \p TM, \p STI and \p RBI, and caches the instruction
/// info and register info obtained from \p STI. The mem-initializer list ends
/// with a trailing comma on purpose: it is completed by the
/// GET_GLOBALISEL_PREDICATES_INIT and GET_GLOBALISEL_TEMPORARIES_INIT sections
/// of the generated AArch64GenGlobalISel.inc, so those includes must stay
/// where they are. The constructor body itself is empty.
AArch64InstructionSelector(const AArch64TargetMachine & TM,const AArch64Subtarget & STI,const AArch64RegisterBankInfo & RBI)466 AArch64InstructionSelector::AArch64InstructionSelector(
467     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
468     const AArch64RegisterBankInfo &RBI)
469     : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
470       TRI(*STI.getRegisterInfo()), RBI(RBI),
471 #define GET_GLOBALISEL_PREDICATES_INIT
472 #include "AArch64GenGlobalISel.inc"
473 #undef GET_GLOBALISEL_PREDICATES_INIT
474 #define GET_GLOBALISEL_TEMPORARIES_INIT
475 #include "AArch64GenGlobalISel.inc"
476 #undef GET_GLOBALISEL_TEMPORARIES_INIT
477 {
478 }
479 
480 // FIXME: This should be target-independent, inferred from the types declared
481 // for each class in the bank.
482 static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty,const RegisterBank & RB,const RegisterBankInfo & RBI,bool GetAllRegSet=false)483 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
484                          const RegisterBankInfo &RBI,
485                          bool GetAllRegSet = false) {
486   if (RB.getID() == AArch64::GPRRegBankID) {
487     if (Ty.getSizeInBits() <= 32)
488       return GetAllRegSet ? &AArch64::GPR32allRegClass
489                           : &AArch64::GPR32RegClass;
490     if (Ty.getSizeInBits() == 64)
491       return GetAllRegSet ? &AArch64::GPR64allRegClass
492                           : &AArch64::GPR64RegClass;
493     if (Ty.getSizeInBits() == 128)
494       return &AArch64::XSeqPairsClassRegClass;
495     return nullptr;
496   }
497 
498   if (RB.getID() == AArch64::FPRRegBankID) {
499     if (Ty.getSizeInBits() <= 16)
500       return &AArch64::FPR16RegClass;
501     if (Ty.getSizeInBits() == 32)
502       return &AArch64::FPR32RegClass;
503     if (Ty.getSizeInBits() == 64)
504       return &AArch64::FPR64RegClass;
505     if (Ty.getSizeInBits() == 128)
506       return &AArch64::FPR128RegClass;
507     return nullptr;
508   }
509 
510   return nullptr;
511 }
512 
513 /// Given a register bank, and size in bits, return the smallest register class
514 /// that can represent that combination.
515 static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank & RB,unsigned SizeInBits,bool GetAllRegSet=false)516 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
517                       bool GetAllRegSet = false) {
518   unsigned RegBankID = RB.getID();
519 
520   if (RegBankID == AArch64::GPRRegBankID) {
521     if (SizeInBits <= 32)
522       return GetAllRegSet ? &AArch64::GPR32allRegClass
523                           : &AArch64::GPR32RegClass;
524     if (SizeInBits == 64)
525       return GetAllRegSet ? &AArch64::GPR64allRegClass
526                           : &AArch64::GPR64RegClass;
527     if (SizeInBits == 128)
528       return &AArch64::XSeqPairsClassRegClass;
529   }
530 
531   if (RegBankID == AArch64::FPRRegBankID) {
532     switch (SizeInBits) {
533     default:
534       return nullptr;
535     case 8:
536       return &AArch64::FPR8RegClass;
537     case 16:
538       return &AArch64::FPR16RegClass;
539     case 32:
540       return &AArch64::FPR32RegClass;
541     case 64:
542       return &AArch64::FPR64RegClass;
543     case 128:
544       return &AArch64::FPR128RegClass;
545     }
546   }
547 
548   return nullptr;
549 }
550 
551 /// Returns the correct subregister to use for a given register class.
getSubRegForClass(const TargetRegisterClass * RC,const TargetRegisterInfo & TRI,unsigned & SubReg)552 static bool getSubRegForClass(const TargetRegisterClass *RC,
553                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
554   switch (TRI.getRegSizeInBits(*RC)) {
555   case 8:
556     SubReg = AArch64::bsub;
557     break;
558   case 16:
559     SubReg = AArch64::hsub;
560     break;
561   case 32:
562     if (RC != &AArch64::FPR32RegClass)
563       SubReg = AArch64::sub_32;
564     else
565       SubReg = AArch64::ssub;
566     break;
567   case 64:
568     SubReg = AArch64::dsub;
569     break;
570   default:
571     LLVM_DEBUG(
572         dbgs() << "Couldn't find appropriate subregister for register class.");
573     return false;
574   }
575 
576   return true;
577 }
578 
579 /// Returns the minimum size the given register bank can hold.
getMinSizeForRegBank(const RegisterBank & RB)580 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
581   switch (RB.getID()) {
582   case AArch64::GPRRegBankID:
583     return 32;
584   case AArch64::FPRRegBankID:
585     return 8;
586   default:
587     llvm_unreachable("Tried to get minimum size for unknown register bank.");
588   }
589 }
590 
getImmedFromMO(const MachineOperand & Root)591 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
592   auto &MI = *Root.getParent();
593   auto &MBB = *MI.getParent();
594   auto &MF = *MBB.getParent();
595   auto &MRI = MF.getRegInfo();
596   uint64_t Immed;
597   if (Root.isImm())
598     Immed = Root.getImm();
599   else if (Root.isCImm())
600     Immed = Root.getCImm()->getZExtValue();
601   else if (Root.isReg()) {
602     auto ValAndVReg =
603         getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
604     if (!ValAndVReg)
605       return None;
606     Immed = ValAndVReg->Value.getSExtValue();
607   } else
608     return None;
609   return Immed;
610 }
611 
612 /// Check whether \p I is a currently unsupported binary operation:
613 /// - it has an unsized type
614 /// - an operand is not a vreg
615 /// - all operands are not in the same bank
616 /// These are checks that should someday live in the verifier, but right now,
617 /// these are mostly limitations of the aarch64 selector.
unsupportedBinOp(const MachineInstr & I,const AArch64RegisterBankInfo & RBI,const MachineRegisterInfo & MRI,const AArch64RegisterInfo & TRI)618 static bool unsupportedBinOp(const MachineInstr &I,
619                              const AArch64RegisterBankInfo &RBI,
620                              const MachineRegisterInfo &MRI,
621                              const AArch64RegisterInfo &TRI) {
622   LLT Ty = MRI.getType(I.getOperand(0).getReg());
623   if (!Ty.isValid()) {
624     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
625     return true;
626   }
627 
628   const RegisterBank *PrevOpBank = nullptr;
629   for (auto &MO : I.operands()) {
630     // FIXME: Support non-register operands.
631     if (!MO.isReg()) {
632       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
633       return true;
634     }
635 
636     // FIXME: Can generic operations have physical registers operands? If
637     // so, this will need to be taught about that, and we'll need to get the
638     // bank out of the minimal class for the register.
639     // Either way, this needs to be documented (and possibly verified).
640     if (!Register::isVirtualRegister(MO.getReg())) {
641       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
642       return true;
643     }
644 
645     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
646     if (!OpBank) {
647       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
648       return true;
649     }
650 
651     if (PrevOpBank && OpBank != PrevOpBank) {
652       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
653       return true;
654     }
655     PrevOpBank = OpBank;
656   }
657   return false;
658 }
659 
660 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
661 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
662 /// and of size \p OpSize.
663 /// \returns \p GenericOpc if the combination is unsupported.
selectBinaryOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)664 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
665                                unsigned OpSize) {
666   switch (RegBankID) {
667   case AArch64::GPRRegBankID:
668     if (OpSize == 32) {
669       switch (GenericOpc) {
670       case TargetOpcode::G_SHL:
671         return AArch64::LSLVWr;
672       case TargetOpcode::G_LSHR:
673         return AArch64::LSRVWr;
674       case TargetOpcode::G_ASHR:
675         return AArch64::ASRVWr;
676       default:
677         return GenericOpc;
678       }
679     } else if (OpSize == 64) {
680       switch (GenericOpc) {
681       case TargetOpcode::G_PTR_ADD:
682         return AArch64::ADDXrr;
683       case TargetOpcode::G_SHL:
684         return AArch64::LSLVXr;
685       case TargetOpcode::G_LSHR:
686         return AArch64::LSRVXr;
687       case TargetOpcode::G_ASHR:
688         return AArch64::ASRVXr;
689       default:
690         return GenericOpc;
691       }
692     }
693     break;
694   case AArch64::FPRRegBankID:
695     switch (OpSize) {
696     case 32:
697       switch (GenericOpc) {
698       case TargetOpcode::G_FADD:
699         return AArch64::FADDSrr;
700       case TargetOpcode::G_FSUB:
701         return AArch64::FSUBSrr;
702       case TargetOpcode::G_FMUL:
703         return AArch64::FMULSrr;
704       case TargetOpcode::G_FDIV:
705         return AArch64::FDIVSrr;
706       default:
707         return GenericOpc;
708       }
709     case 64:
710       switch (GenericOpc) {
711       case TargetOpcode::G_FADD:
712         return AArch64::FADDDrr;
713       case TargetOpcode::G_FSUB:
714         return AArch64::FSUBDrr;
715       case TargetOpcode::G_FMUL:
716         return AArch64::FMULDrr;
717       case TargetOpcode::G_FDIV:
718         return AArch64::FDIVDrr;
719       case TargetOpcode::G_OR:
720         return AArch64::ORRv8i8;
721       default:
722         return GenericOpc;
723       }
724     }
725     break;
726   }
727   return GenericOpc;
728 }
729 
730 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
731 /// appropriate for the (value) register bank \p RegBankID and of memory access
732 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
733 /// addressing mode (e.g., LDRXui).
734 /// \returns \p GenericOpc if the combination is unsupported.
selectLoadStoreUIOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)735 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
736                                     unsigned OpSize) {
737   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
738   switch (RegBankID) {
739   case AArch64::GPRRegBankID:
740     switch (OpSize) {
741     case 8:
742       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
743     case 16:
744       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
745     case 32:
746       return isStore ? AArch64::STRWui : AArch64::LDRWui;
747     case 64:
748       return isStore ? AArch64::STRXui : AArch64::LDRXui;
749     }
750     break;
751   case AArch64::FPRRegBankID:
752     switch (OpSize) {
753     case 8:
754       return isStore ? AArch64::STRBui : AArch64::LDRBui;
755     case 16:
756       return isStore ? AArch64::STRHui : AArch64::LDRHui;
757     case 32:
758       return isStore ? AArch64::STRSui : AArch64::LDRSui;
759     case 64:
760       return isStore ? AArch64::STRDui : AArch64::LDRDui;
761     }
762     break;
763   }
764   return GenericOpc;
765 }
766 
767 #ifndef NDEBUG
768 /// Helper function that verifies that we have a valid copy at the end of
769 /// selectCopy. Verifies that the source and dest have the expected sizes and
770 /// then returns true.
isValidCopy(const MachineInstr & I,const RegisterBank & DstBank,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)771 static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
772                         const MachineRegisterInfo &MRI,
773                         const TargetRegisterInfo &TRI,
774                         const RegisterBankInfo &RBI) {
775   const Register DstReg = I.getOperand(0).getReg();
776   const Register SrcReg = I.getOperand(1).getReg();
777   const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
778   const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
779 
780   // Make sure the size of the source and dest line up.
781   assert(
782       (DstSize == SrcSize ||
783        // Copies are a mean to setup initial types, the number of
784        // bits may not exactly match.
785        (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
786        // Copies are a mean to copy bits around, as long as we are
787        // on the same register class, that's fine. Otherwise, that
788        // means we need some SUBREG_TO_REG or AND & co.
789        (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
790       "Copy with different width?!");
791 
792   // Check the size of the destination.
793   assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
794          "GPRs cannot get more than 64-bit width values");
795 
796   return true;
797 }
798 #endif
799 
800 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
801 /// to \p *To.
802 ///
803 /// E.g "To = COPY SrcReg:SubReg"
copySubReg(MachineInstr & I,MachineRegisterInfo & MRI,const RegisterBankInfo & RBI,Register SrcReg,const TargetRegisterClass * To,unsigned SubReg)804 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
805                        const RegisterBankInfo &RBI, Register SrcReg,
806                        const TargetRegisterClass *To, unsigned SubReg) {
807   assert(SrcReg.isValid() && "Expected a valid source register?");
808   assert(To && "Destination register class cannot be null");
809   assert(SubReg && "Expected a valid subregister");
810 
811   MachineIRBuilder MIB(I);
812   auto SubRegCopy =
813       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
814   MachineOperand &RegOp = I.getOperand(1);
815   RegOp.setReg(SubRegCopy.getReg(0));
816 
817   // It's possible that the destination register won't be constrained. Make
818   // sure that happens.
819   if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
820     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
821 
822   return true;
823 }
824 
825 /// Helper function to get the source and destination register classes for a
826 /// copy. Returns a std::pair containing the source register class for the
827 /// copy, and the destination register class for the copy. If a register class
828 /// cannot be determined, then it will be nullptr.
829 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)830 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
831                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
832                      const RegisterBankInfo &RBI) {
833   Register DstReg = I.getOperand(0).getReg();
834   Register SrcReg = I.getOperand(1).getReg();
835   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
836   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
837   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
838   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
839 
840   // Special casing for cross-bank copies of s1s. We can technically represent
841   // a 1-bit value with any size of register. The minimum size for a GPR is 32
842   // bits. So, we need to put the FPR on 32 bits as well.
843   //
844   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
845   // then we can pull it into the helpers that get the appropriate class for a
846   // register bank. Or make a new helper that carries along some constraint
847   // information.
848   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
849     SrcSize = DstSize = 32;
850 
851   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
852           getMinClassForRegBank(DstRegBank, DstSize, true)};
853 }
854 
selectCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)855 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
856                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
857                        const RegisterBankInfo &RBI) {
858   Register DstReg = I.getOperand(0).getReg();
859   Register SrcReg = I.getOperand(1).getReg();
860   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
861   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
862 
863   // Find the correct register classes for the source and destination registers.
864   const TargetRegisterClass *SrcRC;
865   const TargetRegisterClass *DstRC;
866   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
867 
868   if (!DstRC) {
869     LLVM_DEBUG(dbgs() << "Unexpected dest size "
870                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
871     return false;
872   }
873 
874   // A couple helpers below, for making sure that the copy we produce is valid.
875 
876   // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
877   // to verify that the src and dst are the same size, since that's handled by
878   // the SUBREG_TO_REG.
879   bool KnownValid = false;
880 
881   // Returns true, or asserts if something we don't expect happens. Instead of
882   // returning true, we return isValidCopy() to ensure that we verify the
883   // result.
884   auto CheckCopy = [&]() {
885     // If we have a bitcast or something, we can't have physical registers.
886     assert((I.isCopy() ||
887             (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
888              !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
889            "No phys reg on generic operator!");
890     bool ValidCopy = true;
891 #ifndef NDEBUG
892     ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
893     assert(ValidCopy && "Invalid copy.");
894 #endif
895     (void)KnownValid;
896     return ValidCopy;
897   };
898 
899   // Is this a copy? If so, then we may need to insert a subregister copy.
900   if (I.isCopy()) {
901     // Yes. Check if there's anything to fix up.
902     if (!SrcRC) {
903       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
904       return false;
905     }
906 
907     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
908     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
909     unsigned SubReg;
910 
911     // If the source bank doesn't support a subregister copy small enough,
912     // then we first need to copy to the destination bank.
913     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
914       const TargetRegisterClass *DstTempRC =
915           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
916       getSubRegForClass(DstRC, TRI, SubReg);
917 
918       MachineIRBuilder MIB(I);
919       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
920       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
921     } else if (SrcSize > DstSize) {
922       // If the source register is bigger than the destination we need to
923       // perform a subregister copy.
924       const TargetRegisterClass *SubRegRC =
925           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
926       getSubRegForClass(SubRegRC, TRI, SubReg);
927       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
928     } else if (DstSize > SrcSize) {
929       // If the destination register is bigger than the source we need to do
930       // a promotion using SUBREG_TO_REG.
931       const TargetRegisterClass *PromotionRC =
932           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
933       getSubRegForClass(SrcRC, TRI, SubReg);
934 
935       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
936       BuildMI(*I.getParent(), I, I.getDebugLoc(),
937               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
938           .addImm(0)
939           .addUse(SrcReg)
940           .addImm(SubReg);
941       MachineOperand &RegOp = I.getOperand(1);
942       RegOp.setReg(PromoteReg);
943 
944       // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
945       KnownValid = true;
946     }
947 
948     // If the destination is a physical register, then there's nothing to
949     // change, so we're done.
950     if (Register::isPhysicalRegister(DstReg))
951       return CheckCopy();
952   }
953 
954   // No need to constrain SrcReg. It will get constrained when we hit another
955   // of its use or its defs. Copies do not have constraints.
956   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
957     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
958                       << " operand\n");
959     return false;
960   }
961   I.setDesc(TII.get(AArch64::COPY));
962   return CheckCopy();
963 }
964 
selectFPConvOpc(unsigned GenericOpc,LLT DstTy,LLT SrcTy)965 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
966   if (!DstTy.isScalar() || !SrcTy.isScalar())
967     return GenericOpc;
968 
969   const unsigned DstSize = DstTy.getSizeInBits();
970   const unsigned SrcSize = SrcTy.getSizeInBits();
971 
972   switch (DstSize) {
973   case 32:
974     switch (SrcSize) {
975     case 32:
976       switch (GenericOpc) {
977       case TargetOpcode::G_SITOFP:
978         return AArch64::SCVTFUWSri;
979       case TargetOpcode::G_UITOFP:
980         return AArch64::UCVTFUWSri;
981       case TargetOpcode::G_FPTOSI:
982         return AArch64::FCVTZSUWSr;
983       case TargetOpcode::G_FPTOUI:
984         return AArch64::FCVTZUUWSr;
985       default:
986         return GenericOpc;
987       }
988     case 64:
989       switch (GenericOpc) {
990       case TargetOpcode::G_SITOFP:
991         return AArch64::SCVTFUXSri;
992       case TargetOpcode::G_UITOFP:
993         return AArch64::UCVTFUXSri;
994       case TargetOpcode::G_FPTOSI:
995         return AArch64::FCVTZSUWDr;
996       case TargetOpcode::G_FPTOUI:
997         return AArch64::FCVTZUUWDr;
998       default:
999         return GenericOpc;
1000       }
1001     default:
1002       return GenericOpc;
1003     }
1004   case 64:
1005     switch (SrcSize) {
1006     case 32:
1007       switch (GenericOpc) {
1008       case TargetOpcode::G_SITOFP:
1009         return AArch64::SCVTFUWDri;
1010       case TargetOpcode::G_UITOFP:
1011         return AArch64::UCVTFUWDri;
1012       case TargetOpcode::G_FPTOSI:
1013         return AArch64::FCVTZSUXSr;
1014       case TargetOpcode::G_FPTOUI:
1015         return AArch64::FCVTZUUXSr;
1016       default:
1017         return GenericOpc;
1018       }
1019     case 64:
1020       switch (GenericOpc) {
1021       case TargetOpcode::G_SITOFP:
1022         return AArch64::SCVTFUXDri;
1023       case TargetOpcode::G_UITOFP:
1024         return AArch64::UCVTFUXDri;
1025       case TargetOpcode::G_FPTOSI:
1026         return AArch64::FCVTZSUXDr;
1027       case TargetOpcode::G_FPTOUI:
1028         return AArch64::FCVTZUUXDr;
1029       default:
1030         return GenericOpc;
1031       }
1032     default:
1033       return GenericOpc;
1034     }
1035   default:
1036     return GenericOpc;
1037   };
1038   return GenericOpc;
1039 }
1040 
1041 MachineInstr *
emitSelect(Register Dst,Register True,Register False,AArch64CC::CondCode CC,MachineIRBuilder & MIB) const1042 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1043                                        Register False, AArch64CC::CondCode CC,
1044                                        MachineIRBuilder &MIB) const {
1045   MachineRegisterInfo &MRI = *MIB.getMRI();
1046   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1047              RBI.getRegBank(True, MRI, TRI)->getID() &&
1048          "Expected both select operands to have the same regbank?");
1049   LLT Ty = MRI.getType(True);
1050   if (Ty.isVector())
1051     return nullptr;
1052   const unsigned Size = Ty.getSizeInBits();
1053   assert((Size == 32 || Size == 64) &&
1054          "Expected 32 bit or 64 bit select only?");
1055   const bool Is32Bit = Size == 32;
1056   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1057     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1058     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1059     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1060     return &*FCSel;
1061   }
1062 
1063   // By default, we'll try and emit a CSEL.
1064   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1065   bool Optimized = false;
1066   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1067                                  &Optimized](Register &Reg, Register &OtherReg,
1068                                              bool Invert) {
1069     if (Optimized)
1070       return false;
1071 
1072     // Attempt to fold:
1073     //
1074     // %sub = G_SUB 0, %x
1075     // %select = G_SELECT cc, %reg, %sub
1076     //
1077     // Into:
1078     // %select = CSNEG %reg, %x, cc
1079     Register MatchReg;
1080     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1081       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1082       Reg = MatchReg;
1083       if (Invert) {
1084         CC = AArch64CC::getInvertedCondCode(CC);
1085         std::swap(Reg, OtherReg);
1086       }
1087       return true;
1088     }
1089 
1090     // Attempt to fold:
1091     //
1092     // %xor = G_XOR %x, -1
1093     // %select = G_SELECT cc, %reg, %xor
1094     //
1095     // Into:
1096     // %select = CSINV %reg, %x, cc
1097     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1098       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1099       Reg = MatchReg;
1100       if (Invert) {
1101         CC = AArch64CC::getInvertedCondCode(CC);
1102         std::swap(Reg, OtherReg);
1103       }
1104       return true;
1105     }
1106 
1107     // Attempt to fold:
1108     //
1109     // %add = G_ADD %x, 1
1110     // %select = G_SELECT cc, %reg, %add
1111     //
1112     // Into:
1113     // %select = CSINC %reg, %x, cc
1114     if (mi_match(Reg, MRI,
1115                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1116                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1117       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1118       Reg = MatchReg;
1119       if (Invert) {
1120         CC = AArch64CC::getInvertedCondCode(CC);
1121         std::swap(Reg, OtherReg);
1122       }
1123       return true;
1124     }
1125 
1126     return false;
1127   };
1128 
1129   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1130   // true/false values are constants.
1131   // FIXME: All of these patterns already exist in tablegen. We should be
1132   // able to import these.
1133   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1134                           &Optimized]() {
1135     if (Optimized)
1136       return false;
1137     auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
1138     auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
1139     if (!TrueCst && !FalseCst)
1140       return false;
1141 
1142     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1143     if (TrueCst && FalseCst) {
1144       int64_t T = TrueCst->Value.getSExtValue();
1145       int64_t F = FalseCst->Value.getSExtValue();
1146 
1147       if (T == 0 && F == 1) {
1148         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1149         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1150         True = ZReg;
1151         False = ZReg;
1152         return true;
1153       }
1154 
1155       if (T == 0 && F == -1) {
1156         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1157         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1158         True = ZReg;
1159         False = ZReg;
1160         return true;
1161       }
1162     }
1163 
1164     if (TrueCst) {
1165       int64_t T = TrueCst->Value.getSExtValue();
1166       if (T == 1) {
1167         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1168         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1169         True = False;
1170         False = ZReg;
1171         CC = AArch64CC::getInvertedCondCode(CC);
1172         return true;
1173       }
1174 
1175       if (T == -1) {
1176         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1177         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1178         True = False;
1179         False = ZReg;
1180         CC = AArch64CC::getInvertedCondCode(CC);
1181         return true;
1182       }
1183     }
1184 
1185     if (FalseCst) {
1186       int64_t F = FalseCst->Value.getSExtValue();
1187       if (F == 1) {
1188         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1189         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1190         False = ZReg;
1191         return true;
1192       }
1193 
1194       if (F == -1) {
1195         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1196         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1197         False = ZReg;
1198         return true;
1199       }
1200     }
1201     return false;
1202   };
1203 
1204   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1205   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1206   Optimized |= TryOptSelectCst();
1207   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1208   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1209   return &*SelectInst;
1210 }
1211 
changeICMPPredToAArch64CC(CmpInst::Predicate P)1212 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1213   switch (P) {
1214   default:
1215     llvm_unreachable("Unknown condition code!");
1216   case CmpInst::ICMP_NE:
1217     return AArch64CC::NE;
1218   case CmpInst::ICMP_EQ:
1219     return AArch64CC::EQ;
1220   case CmpInst::ICMP_SGT:
1221     return AArch64CC::GT;
1222   case CmpInst::ICMP_SGE:
1223     return AArch64CC::GE;
1224   case CmpInst::ICMP_SLT:
1225     return AArch64CC::LT;
1226   case CmpInst::ICMP_SLE:
1227     return AArch64CC::LE;
1228   case CmpInst::ICMP_UGT:
1229     return AArch64CC::HI;
1230   case CmpInst::ICMP_UGE:
1231     return AArch64CC::HS;
1232   case CmpInst::ICMP_ULT:
1233     return AArch64CC::LO;
1234   case CmpInst::ICMP_ULE:
1235     return AArch64CC::LS;
1236   }
1237 }
1238 
1239 /// Return a register which can be used as a bit to test in a TB(N)Z.
getTestBitReg(Register Reg,uint64_t & Bit,bool & Invert,MachineRegisterInfo & MRI)1240 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1241                               MachineRegisterInfo &MRI) {
1242   assert(Reg.isValid() && "Expected valid register!");
1243   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1244     unsigned Opc = MI->getOpcode();
1245 
1246     if (!MI->getOperand(0).isReg() ||
1247         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1248       break;
1249 
1250     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1251     //
1252     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1253     // on the truncated x is the same as the bit number on x.
1254     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1255         Opc == TargetOpcode::G_TRUNC) {
1256       Register NextReg = MI->getOperand(1).getReg();
1257       // Did we find something worth folding?
1258       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1259         break;
1260 
1261       // NextReg is worth folding. Keep looking.
1262       Reg = NextReg;
1263       continue;
1264     }
1265 
1266     // Attempt to find a suitable operation with a constant on one side.
1267     Optional<uint64_t> C;
1268     Register TestReg;
1269     switch (Opc) {
1270     default:
1271       break;
1272     case TargetOpcode::G_AND:
1273     case TargetOpcode::G_XOR: {
1274       TestReg = MI->getOperand(1).getReg();
1275       Register ConstantReg = MI->getOperand(2).getReg();
1276       auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1277       if (!VRegAndVal) {
1278         // AND commutes, check the other side for a constant.
1279         // FIXME: Can we canonicalize the constant so that it's always on the
1280         // same side at some point earlier?
1281         std::swap(ConstantReg, TestReg);
1282         VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1283       }
1284       if (VRegAndVal)
1285         C = VRegAndVal->Value.getSExtValue();
1286       break;
1287     }
1288     case TargetOpcode::G_ASHR:
1289     case TargetOpcode::G_LSHR:
1290     case TargetOpcode::G_SHL: {
1291       TestReg = MI->getOperand(1).getReg();
1292       auto VRegAndVal =
1293           getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1294       if (VRegAndVal)
1295         C = VRegAndVal->Value.getSExtValue();
1296       break;
1297     }
1298     }
1299 
1300     // Didn't find a constant or viable register. Bail out of the loop.
1301     if (!C || !TestReg.isValid())
1302       break;
1303 
1304     // We found a suitable instruction with a constant. Check to see if we can
1305     // walk through the instruction.
1306     Register NextReg;
1307     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1308     switch (Opc) {
1309     default:
1310       break;
1311     case TargetOpcode::G_AND:
1312       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1313       if ((*C >> Bit) & 1)
1314         NextReg = TestReg;
1315       break;
1316     case TargetOpcode::G_SHL:
1317       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1318       // the type of the register.
1319       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1320         NextReg = TestReg;
1321         Bit = Bit - *C;
1322       }
1323       break;
1324     case TargetOpcode::G_ASHR:
1325       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1326       // in x
1327       NextReg = TestReg;
1328       Bit = Bit + *C;
1329       if (Bit >= TestRegSize)
1330         Bit = TestRegSize - 1;
1331       break;
1332     case TargetOpcode::G_LSHR:
1333       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1334       if ((Bit + *C) < TestRegSize) {
1335         NextReg = TestReg;
1336         Bit = Bit + *C;
1337       }
1338       break;
1339     case TargetOpcode::G_XOR:
1340       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1341       // appropriate.
1342       //
1343       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1344       //
1345       // tbz x', b -> tbnz x, b
1346       //
1347       // Because x' only has the b-th bit set if x does not.
1348       if ((*C >> Bit) & 1)
1349         Invert = !Invert;
1350       NextReg = TestReg;
1351       break;
1352     }
1353 
1354     // Check if we found anything worth folding.
1355     if (!NextReg.isValid())
1356       return Reg;
1357     Reg = NextReg;
1358   }
1359 
1360   return Reg;
1361 }
1362 
emitTestBit(Register TestReg,uint64_t Bit,bool IsNegative,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1363 MachineInstr *AArch64InstructionSelector::emitTestBit(
1364     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1365     MachineIRBuilder &MIB) const {
1366   assert(TestReg.isValid());
1367   assert(ProduceNonFlagSettingCondBr &&
1368          "Cannot emit TB(N)Z with speculation tracking!");
1369   MachineRegisterInfo &MRI = *MIB.getMRI();
1370 
1371   // Attempt to optimize the test bit by walking over instructions.
1372   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1373   LLT Ty = MRI.getType(TestReg);
1374   unsigned Size = Ty.getSizeInBits();
1375   assert(!Ty.isVector() && "Expected a scalar!");
1376   assert(Bit < 64 && "Bit is too large!");
1377 
1378   // When the test register is a 64-bit register, we have to narrow to make
1379   // TBNZW work.
1380   bool UseWReg = Bit < 32;
1381   unsigned NecessarySize = UseWReg ? 32 : 64;
1382   if (Size != NecessarySize)
1383     TestReg = moveScalarRegClass(
1384         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1385         MIB);
1386 
1387   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1388                                           {AArch64::TBZW, AArch64::TBNZW}};
1389   unsigned Opc = OpcTable[UseWReg][IsNegative];
1390   auto TestBitMI =
1391       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1392   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1393   return &*TestBitMI;
1394 }
1395 
tryOptAndIntoCompareBranch(MachineInstr & AndInst,bool Invert,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1396 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1397     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1398     MachineIRBuilder &MIB) const {
1399   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1400   // Given something like this:
1401   //
1402   //  %x = ...Something...
1403   //  %one = G_CONSTANT i64 1
1404   //  %zero = G_CONSTANT i64 0
1405   //  %and = G_AND %x, %one
1406   //  %cmp = G_ICMP intpred(ne), %and, %zero
1407   //  %cmp_trunc = G_TRUNC %cmp
1408   //  G_BRCOND %cmp_trunc, %bb.3
1409   //
1410   // We want to try and fold the AND into the G_BRCOND and produce either a
1411   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1412   //
1413   // In this case, we'd get
1414   //
1415   // TBNZ %x %bb.3
1416   //
1417 
1418   // Check if the AND has a constant on its RHS which we can use as a mask.
1419   // If it's a power of 2, then it's the same as checking a specific bit.
1420   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1421   auto MaybeBit = getConstantVRegValWithLookThrough(
1422       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1423   if (!MaybeBit)
1424     return false;
1425 
1426   int32_t Bit = MaybeBit->Value.exactLogBase2();
1427   if (Bit < 0)
1428     return false;
1429 
1430   Register TestReg = AndInst.getOperand(1).getReg();
1431 
1432   // Emit a TB(N)Z.
1433   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1434   return true;
1435 }
1436 
emitCBZ(Register CompareReg,bool IsNegative,MachineBasicBlock * DestMBB,MachineIRBuilder & MIB) const1437 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1438                                                   bool IsNegative,
1439                                                   MachineBasicBlock *DestMBB,
1440                                                   MachineIRBuilder &MIB) const {
1441   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1442   MachineRegisterInfo &MRI = *MIB.getMRI();
1443   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1444              AArch64::GPRRegBankID &&
1445          "Expected GPRs only?");
1446   auto Ty = MRI.getType(CompareReg);
1447   unsigned Width = Ty.getSizeInBits();
1448   assert(!Ty.isVector() && "Expected scalar only?");
1449   assert(Width <= 64 && "Expected width to be at most 64?");
1450   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1451                                           {AArch64::CBNZW, AArch64::CBNZX}};
1452   unsigned Opc = OpcTable[IsNegative][Width == 64];
1453   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1454   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1455   return &*BranchMI;
1456 }
1457 
selectCompareBranchFedByFCmp(MachineInstr & I,MachineInstr & FCmp,MachineIRBuilder & MIB) const1458 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1459     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1460   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1461   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1462   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1463   // totally clean.  Some of them require two branches to implement.
1464   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1465   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1466                 Pred);
1467   AArch64CC::CondCode CC1, CC2;
1468   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1469   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1470   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1471   if (CC2 != AArch64CC::AL)
1472     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1473   I.eraseFromParent();
1474   return true;
1475 }
1476 
tryOptCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1477 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1478     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1479   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1480   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1481   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1482   //
1483   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1484   // instructions will not be produced, as they are conditional branch
1485   // instructions that do not set flags.
1486   if (!ProduceNonFlagSettingCondBr)
1487     return false;
1488 
1489   MachineRegisterInfo &MRI = *MIB.getMRI();
1490   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1491   auto Pred =
1492       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1493   Register LHS = ICmp.getOperand(2).getReg();
1494   Register RHS = ICmp.getOperand(3).getReg();
1495 
1496   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1497   auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1498   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1499 
1500   // When we can emit a TB(N)Z, prefer that.
1501   //
1502   // Handle non-commutative condition codes first.
1503   // Note that we don't want to do this when we have a G_AND because it can
1504   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1505   if (VRegAndVal && !AndInst) {
1506     int64_t C = VRegAndVal->Value.getSExtValue();
1507 
1508     // When we have a greater-than comparison, we can just test if the msb is
1509     // zero.
1510     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1511       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1512       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1513       I.eraseFromParent();
1514       return true;
1515     }
1516 
1517     // When we have a less than comparison, we can just test if the msb is not
1518     // zero.
1519     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1520       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1521       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1522       I.eraseFromParent();
1523       return true;
1524     }
1525   }
1526 
1527   // Attempt to handle commutative condition codes. Right now, that's only
1528   // eq/ne.
1529   if (ICmpInst::isEquality(Pred)) {
1530     if (!VRegAndVal) {
1531       std::swap(RHS, LHS);
1532       VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1533       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1534     }
1535 
1536     if (VRegAndVal && VRegAndVal->Value == 0) {
1537       // If there's a G_AND feeding into this branch, try to fold it away by
1538       // emitting a TB(N)Z instead.
1539       //
1540       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1541       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1542       // would be redundant.
1543       if (AndInst &&
1544           tryOptAndIntoCompareBranch(
1545               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1546         I.eraseFromParent();
1547         return true;
1548       }
1549 
1550       // Otherwise, try to emit a CB(N)Z instead.
1551       auto LHSTy = MRI.getType(LHS);
1552       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1553         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1554         I.eraseFromParent();
1555         return true;
1556       }
1557     }
1558   }
1559 
1560   return false;
1561 }
1562 
selectCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1563 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1564     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1565   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1566   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1567   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1568     return true;
1569 
1570   // Couldn't optimize. Emit a compare + a Bcc.
1571   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1572   auto PredOp = ICmp.getOperand(1);
1573   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1574   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1575       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1576   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1577   I.eraseFromParent();
1578   return true;
1579 }
1580 
selectCompareBranch(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI)1581 bool AArch64InstructionSelector::selectCompareBranch(
1582     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1583   Register CondReg = I.getOperand(0).getReg();
1584   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1585   if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
1586     CondReg = CCMI->getOperand(1).getReg();
1587     CCMI = MRI.getVRegDef(CondReg);
1588   }
1589 
1590   // Try to select the G_BRCOND using whatever is feeding the condition if
1591   // possible.
1592   unsigned CCMIOpc = CCMI->getOpcode();
1593   if (CCMIOpc == TargetOpcode::G_FCMP)
1594     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1595   if (CCMIOpc == TargetOpcode::G_ICMP)
1596     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1597 
1598   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1599   // instructions will not be produced, as they are conditional branch
1600   // instructions that do not set flags.
1601   if (ProduceNonFlagSettingCondBr) {
1602     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1603                 I.getOperand(1).getMBB(), MIB);
1604     I.eraseFromParent();
1605     return true;
1606   }
1607 
1608   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1609   auto TstMI =
1610       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1611   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1612   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1613                  .addImm(AArch64CC::EQ)
1614                  .addMBB(I.getOperand(1).getMBB());
1615   I.eraseFromParent();
1616   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1617 }
1618 
1619 /// Returns the element immediate value of a vector shift operand if found.
1620 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
getVectorShiftImm(Register Reg,MachineRegisterInfo & MRI)1621 static Optional<int64_t> getVectorShiftImm(Register Reg,
1622                                            MachineRegisterInfo &MRI) {
1623   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1624   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1625   assert(OpMI && "Expected to find a vreg def for vector shift operand");
1626   return getAArch64VectorSplatScalar(*OpMI, MRI);
1627 }
1628 
1629 /// Matches and returns the shift immediate value for a SHL instruction given
1630 /// a shift operand.
getVectorSHLImm(LLT SrcTy,Register Reg,MachineRegisterInfo & MRI)1631 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1632   Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1633   if (!ShiftImm)
1634     return None;
1635   // Check the immediate is in range for a SHL.
1636   int64_t Imm = *ShiftImm;
1637   if (Imm < 0)
1638     return None;
1639   switch (SrcTy.getElementType().getSizeInBits()) {
1640   default:
1641     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1642     return None;
1643   case 8:
1644     if (Imm > 7)
1645       return None;
1646     break;
1647   case 16:
1648     if (Imm > 15)
1649       return None;
1650     break;
1651   case 32:
1652     if (Imm > 31)
1653       return None;
1654     break;
1655   case 64:
1656     if (Imm > 63)
1657       return None;
1658     break;
1659   }
1660   return Imm;
1661 }
1662 
selectVectorSHL(MachineInstr & I,MachineRegisterInfo & MRI)1663 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1664                                                  MachineRegisterInfo &MRI) {
1665   assert(I.getOpcode() == TargetOpcode::G_SHL);
1666   Register DstReg = I.getOperand(0).getReg();
1667   const LLT Ty = MRI.getType(DstReg);
1668   Register Src1Reg = I.getOperand(1).getReg();
1669   Register Src2Reg = I.getOperand(2).getReg();
1670 
1671   if (!Ty.isVector())
1672     return false;
1673 
1674   // Check if we have a vector of constants on RHS that we can select as the
1675   // immediate form.
1676   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1677 
1678   unsigned Opc = 0;
1679   if (Ty == LLT::vector(2, 64)) {
1680     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1681   } else if (Ty == LLT::vector(4, 32)) {
1682     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1683   } else if (Ty == LLT::vector(2, 32)) {
1684     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1685   } else if (Ty == LLT::vector(4, 16)) {
1686     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1687   } else if (Ty == LLT::vector(8, 16)) {
1688     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1689   } else if (Ty == LLT::vector(16, 8)) {
1690     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1691   } else if (Ty == LLT::vector(8, 8)) {
1692     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1693   } else {
1694     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1695     return false;
1696   }
1697 
1698   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1699   if (ImmVal)
1700     Shl.addImm(*ImmVal);
1701   else
1702     Shl.addUse(Src2Reg);
1703   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1704   I.eraseFromParent();
1705   return true;
1706 }
1707 
selectVectorAshrLshr(MachineInstr & I,MachineRegisterInfo & MRI)1708 bool AArch64InstructionSelector::selectVectorAshrLshr(
1709     MachineInstr &I, MachineRegisterInfo &MRI) {
1710   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1711          I.getOpcode() == TargetOpcode::G_LSHR);
1712   Register DstReg = I.getOperand(0).getReg();
1713   const LLT Ty = MRI.getType(DstReg);
1714   Register Src1Reg = I.getOperand(1).getReg();
1715   Register Src2Reg = I.getOperand(2).getReg();
1716 
1717   if (!Ty.isVector())
1718     return false;
1719 
1720   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1721 
1722   // We expect the immediate case to be lowered in the PostLegalCombiner to
1723   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1724 
1725   // There is not a shift right register instruction, but the shift left
1726   // register instruction takes a signed value, where negative numbers specify a
1727   // right shift.
1728 
1729   unsigned Opc = 0;
1730   unsigned NegOpc = 0;
1731   const TargetRegisterClass *RC =
1732       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
1733   if (Ty == LLT::vector(2, 64)) {
1734     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1735     NegOpc = AArch64::NEGv2i64;
1736   } else if (Ty == LLT::vector(4, 32)) {
1737     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1738     NegOpc = AArch64::NEGv4i32;
1739   } else if (Ty == LLT::vector(2, 32)) {
1740     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1741     NegOpc = AArch64::NEGv2i32;
1742   } else if (Ty == LLT::vector(4, 16)) {
1743     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1744     NegOpc = AArch64::NEGv4i16;
1745   } else if (Ty == LLT::vector(8, 16)) {
1746     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1747     NegOpc = AArch64::NEGv8i16;
1748   } else if (Ty == LLT::vector(16, 8)) {
1749     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1750     NegOpc = AArch64::NEGv16i8;
1751   } else if (Ty == LLT::vector(8, 8)) {
1752     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1753     NegOpc = AArch64::NEGv8i8;
1754   } else {
1755     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1756     return false;
1757   }
1758 
1759   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1760   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1761   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1762   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1763   I.eraseFromParent();
1764   return true;
1765 }
1766 
selectVaStartAAPCS(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const1767 bool AArch64InstructionSelector::selectVaStartAAPCS(
1768     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1769   return false;
1770 }
1771 
selectVaStartDarwin(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const1772 bool AArch64InstructionSelector::selectVaStartDarwin(
1773     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1774   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1775   Register ListReg = I.getOperand(0).getReg();
1776 
1777   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1778 
1779   auto MIB =
1780       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1781           .addDef(ArgsAddrReg)
1782           .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1783           .addImm(0)
1784           .addImm(0);
1785 
1786   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1787 
1788   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1789             .addUse(ArgsAddrReg)
1790             .addUse(ListReg)
1791             .addImm(0)
1792             .addMemOperand(*I.memoperands_begin());
1793 
1794   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1795   I.eraseFromParent();
1796   return true;
1797 }
1798 
materializeLargeCMVal(MachineInstr & I,const Value * V,unsigned OpFlags)1799 void AArch64InstructionSelector::materializeLargeCMVal(
1800     MachineInstr &I, const Value *V, unsigned OpFlags) {
1801   MachineBasicBlock &MBB = *I.getParent();
1802   MachineFunction &MF = *MBB.getParent();
1803   MachineRegisterInfo &MRI = MF.getRegInfo();
1804 
1805   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1806   MovZ->addOperand(MF, I.getOperand(1));
1807   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1808                                      AArch64II::MO_NC);
1809   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1810   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1811 
1812   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1813                        Register ForceDstReg) {
1814     Register DstReg = ForceDstReg
1815                           ? ForceDstReg
1816                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1817     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1818     if (auto *GV = dyn_cast<GlobalValue>(V)) {
1819       MovI->addOperand(MF, MachineOperand::CreateGA(
1820                                GV, MovZ->getOperand(1).getOffset(), Flags));
1821     } else {
1822       MovI->addOperand(
1823           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1824                                        MovZ->getOperand(1).getOffset(), Flags));
1825     }
1826     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1827     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1828     return DstReg;
1829   };
1830   Register DstReg = BuildMovK(MovZ.getReg(0),
1831                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1832   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1833   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1834 }
1835 
preISelLower(MachineInstr & I)1836 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1837   MachineBasicBlock &MBB = *I.getParent();
1838   MachineFunction &MF = *MBB.getParent();
1839   MachineRegisterInfo &MRI = MF.getRegInfo();
1840 
1841   switch (I.getOpcode()) {
1842   case TargetOpcode::G_SHL:
1843   case TargetOpcode::G_ASHR:
1844   case TargetOpcode::G_LSHR: {
1845     // These shifts are legalized to have 64 bit shift amounts because we want
1846     // to take advantage of the existing imported selection patterns that assume
1847     // the immediates are s64s. However, if the shifted type is 32 bits and for
1848     // some reason we receive input GMIR that has an s64 shift amount that's not
1849     // a G_CONSTANT, insert a truncate so that we can still select the s32
1850     // register-register variant.
1851     Register SrcReg = I.getOperand(1).getReg();
1852     Register ShiftReg = I.getOperand(2).getReg();
1853     const LLT ShiftTy = MRI.getType(ShiftReg);
1854     const LLT SrcTy = MRI.getType(SrcReg);
1855     if (SrcTy.isVector())
1856       return false;
1857     assert(!ShiftTy.isVector() && "unexpected vector shift ty");
1858     if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
1859       return false;
1860     auto *AmtMI = MRI.getVRegDef(ShiftReg);
1861     assert(AmtMI && "could not find a vreg definition for shift amount");
1862     if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
1863       // Insert a subregister copy to implement a 64->32 trunc
1864       auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
1865                        .addReg(ShiftReg, 0, AArch64::sub_32);
1866       MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
1867       I.getOperand(2).setReg(Trunc.getReg(0));
1868     }
1869     return true;
1870   }
1871   case TargetOpcode::G_STORE: {
1872     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1873     MachineOperand &SrcOp = I.getOperand(0);
1874     if (MRI.getType(SrcOp.getReg()).isPointer()) {
1875       // Allow matching with imported patterns for stores of pointers. Unlike
1876       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1877       // and constrain.
1878       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1879       Register NewSrc = Copy.getReg(0);
1880       SrcOp.setReg(NewSrc);
1881       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1882       Changed = true;
1883     }
1884     return Changed;
1885   }
1886   case TargetOpcode::G_PTR_ADD:
1887     return convertPtrAddToAdd(I, MRI);
1888   case TargetOpcode::G_LOAD: {
1889     // For scalar loads of pointers, we try to convert the dest type from p0
1890     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1891     // conversion, this should be ok because all users should have been
1892     // selected already, so the type doesn't matter for them.
1893     Register DstReg = I.getOperand(0).getReg();
1894     const LLT DstTy = MRI.getType(DstReg);
1895     if (!DstTy.isPointer())
1896       return false;
1897     MRI.setType(DstReg, LLT::scalar(64));
1898     return true;
1899   }
1900   case AArch64::G_DUP: {
1901     // Convert the type from p0 to s64 to help selection.
1902     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1903     if (!DstTy.getElementType().isPointer())
1904       return false;
1905     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1906     MRI.setType(I.getOperand(0).getReg(),
1907                 DstTy.changeElementType(LLT::scalar(64)));
1908     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
1909     I.getOperand(1).setReg(NewSrc.getReg(0));
1910     return true;
1911   }
1912   case TargetOpcode::G_UITOFP:
1913   case TargetOpcode::G_SITOFP: {
1914     // If both source and destination regbanks are FPR, then convert the opcode
1915     // to G_SITOF so that the importer can select it to an fpr variant.
1916     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
1917     // copy.
1918     Register SrcReg = I.getOperand(1).getReg();
1919     LLT SrcTy = MRI.getType(SrcReg);
1920     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1921     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
1922       return false;
1923 
1924     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
1925       if (I.getOpcode() == TargetOpcode::G_SITOFP)
1926         I.setDesc(TII.get(AArch64::G_SITOF));
1927       else
1928         I.setDesc(TII.get(AArch64::G_UITOF));
1929       return true;
1930     }
1931     return false;
1932   }
1933   default:
1934     return false;
1935   }
1936 }
1937 
1938 /// This lowering tries to look for G_PTR_ADD instructions and then converts
1939 /// them to a standard G_ADD with a COPY on the source.
1940 ///
1941 /// The motivation behind this is to expose the add semantics to the imported
1942 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
1943 /// because the selector works bottom up, uses before defs. By the time we
1944 /// end up trying to select a G_PTR_ADD, we should have already attempted to
1945 /// fold this into addressing modes and were therefore unsuccessful.
convertPtrAddToAdd(MachineInstr & I,MachineRegisterInfo & MRI)1946 bool AArch64InstructionSelector::convertPtrAddToAdd(
1947     MachineInstr &I, MachineRegisterInfo &MRI) {
1948   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1949   Register DstReg = I.getOperand(0).getReg();
1950   Register AddOp1Reg = I.getOperand(1).getReg();
1951   const LLT PtrTy = MRI.getType(DstReg);
1952   if (PtrTy.getAddressSpace() != 0)
1953     return false;
1954 
1955   const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
1956   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
1957   // Set regbanks on the registers.
1958   if (PtrTy.isVector())
1959     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
1960   else
1961     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
1962 
1963   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
1964   // %dst(intty) = G_ADD %intbase, off
1965   I.setDesc(TII.get(TargetOpcode::G_ADD));
1966   MRI.setType(DstReg, CastPtrTy);
1967   I.getOperand(1).setReg(PtrToInt.getReg(0));
1968   if (!select(*PtrToInt)) {
1969     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
1970     return false;
1971   }
1972 
1973   // Also take the opportunity here to try to do some optimization.
1974   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
1975   Register NegatedReg;
1976   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
1977     return true;
1978   I.getOperand(2).setReg(NegatedReg);
1979   I.setDesc(TII.get(TargetOpcode::G_SUB));
1980   return true;
1981 }
1982 
earlySelectSHL(MachineInstr & I,MachineRegisterInfo & MRI)1983 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
1984                                                 MachineRegisterInfo &MRI) {
1985   // We try to match the immediate variant of LSL, which is actually an alias
1986   // for a special case of UBFM. Otherwise, we fall back to the imported
1987   // selector which will match the register variant.
1988   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
1989   const auto &MO = I.getOperand(2);
1990   auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
1991   if (!VRegAndVal)
1992     return false;
1993 
1994   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1995   if (DstTy.isVector())
1996     return false;
1997   bool Is64Bit = DstTy.getSizeInBits() == 64;
1998   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
1999   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2000 
2001   if (!Imm1Fn || !Imm2Fn)
2002     return false;
2003 
2004   auto NewI =
2005       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2006                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2007 
2008   for (auto &RenderFn : *Imm1Fn)
2009     RenderFn(NewI);
2010   for (auto &RenderFn : *Imm2Fn)
2011     RenderFn(NewI);
2012 
2013   I.eraseFromParent();
2014   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2015 }
2016 
contractCrossBankCopyIntoStore(MachineInstr & I,MachineRegisterInfo & MRI)2017 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2018     MachineInstr &I, MachineRegisterInfo &MRI) {
2019   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2020   // If we're storing a scalar, it doesn't matter what register bank that
2021   // scalar is on. All that matters is the size.
2022   //
2023   // So, if we see something like this (with a 32-bit scalar as an example):
2024   //
2025   // %x:gpr(s32) = ... something ...
2026   // %y:fpr(s32) = COPY %x:gpr(s32)
2027   // G_STORE %y:fpr(s32)
2028   //
2029   // We can fix this up into something like this:
2030   //
2031   // G_STORE %x:gpr(s32)
2032   //
2033   // And then continue the selection process normally.
2034   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2035   if (!DefDstReg.isValid())
2036     return false;
2037   LLT DefDstTy = MRI.getType(DefDstReg);
2038   Register StoreSrcReg = I.getOperand(0).getReg();
2039   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2040 
2041   // If we get something strange like a physical register, then we shouldn't
2042   // go any further.
2043   if (!DefDstTy.isValid())
2044     return false;
2045 
2046   // Are the source and dst types the same size?
2047   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2048     return false;
2049 
2050   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2051       RBI.getRegBank(DefDstReg, MRI, TRI))
2052     return false;
2053 
2054   // We have a cross-bank copy, which is entering a store. Let's fold it.
2055   I.getOperand(0).setReg(DefDstReg);
2056   return true;
2057 }
2058 
earlySelect(MachineInstr & I)2059 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2060   assert(I.getParent() && "Instruction should be in a basic block!");
2061   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2062 
2063   MachineBasicBlock &MBB = *I.getParent();
2064   MachineFunction &MF = *MBB.getParent();
2065   MachineRegisterInfo &MRI = MF.getRegInfo();
2066 
2067   switch (I.getOpcode()) {
2068   case AArch64::G_DUP: {
2069     // Before selecting a DUP instruction, check if it is better selected as a
2070     // MOV or load from a constant pool.
2071     Register Src = I.getOperand(1).getReg();
2072     auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
2073     if (!ValAndVReg)
2074       return false;
2075     LLVMContext &Ctx = MF.getFunction().getContext();
2076     Register Dst = I.getOperand(0).getReg();
2077     auto *CV = ConstantDataVector::getSplat(
2078         MRI.getType(Dst).getNumElements(),
2079         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2080                          ValAndVReg->Value));
2081     if (!emitConstantVector(Dst, CV, MIB, MRI))
2082       return false;
2083     I.eraseFromParent();
2084     return true;
2085   }
2086   case TargetOpcode::G_BR: {
2087     // If the branch jumps to the fallthrough block, don't bother emitting it.
2088     // Only do this for -O0 for a good code size improvement, because when
2089     // optimizations are enabled we want to leave this choice to
2090     // MachineBlockPlacement.
2091     bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
2092     if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
2093       return false;
2094     I.eraseFromParent();
2095     return true;
2096   }
2097   case TargetOpcode::G_SHL:
2098     return earlySelectSHL(I, MRI);
2099   case TargetOpcode::G_CONSTANT: {
2100     bool IsZero = false;
2101     if (I.getOperand(1).isCImm())
2102       IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2103     else if (I.getOperand(1).isImm())
2104       IsZero = I.getOperand(1).getImm() == 0;
2105 
2106     if (!IsZero)
2107       return false;
2108 
2109     Register DefReg = I.getOperand(0).getReg();
2110     LLT Ty = MRI.getType(DefReg);
2111     if (Ty.getSizeInBits() == 64) {
2112       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2113       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2114     } else if (Ty.getSizeInBits() == 32) {
2115       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2116       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2117     } else
2118       return false;
2119 
2120     I.setDesc(TII.get(TargetOpcode::COPY));
2121     return true;
2122   }
2123 
2124   case TargetOpcode::G_ADD: {
2125     // Check if this is being fed by a G_ICMP on either side.
2126     //
2127     // (cmp pred, x, y) + z
2128     //
2129     // In the above case, when the cmp is true, we increment z by 1. So, we can
2130     // fold the add into the cset for the cmp by using cinc.
2131     //
2132     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2133     Register X = I.getOperand(1).getReg();
2134 
2135     // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
2136     // early if we see it.
2137     LLT Ty = MRI.getType(X);
2138     if (Ty.isVector() || Ty.getSizeInBits() != 32)
2139       return false;
2140 
2141     Register CmpReg = I.getOperand(2).getReg();
2142     MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2143     if (!Cmp) {
2144       std::swap(X, CmpReg);
2145       Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2146       if (!Cmp)
2147         return false;
2148     }
2149     auto Pred =
2150         static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
2151     emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
2152                        Cmp->getOperand(1), MIB);
2153     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
2154     I.eraseFromParent();
2155     return true;
2156   }
2157   default:
2158     return false;
2159   }
2160 }
2161 
select(MachineInstr & I)2162 bool AArch64InstructionSelector::select(MachineInstr &I) {
2163   assert(I.getParent() && "Instruction should be in a basic block!");
2164   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2165 
2166   MachineBasicBlock &MBB = *I.getParent();
2167   MachineFunction &MF = *MBB.getParent();
2168   MachineRegisterInfo &MRI = MF.getRegInfo();
2169 
2170   const AArch64Subtarget *Subtarget =
2171       &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
2172   if (Subtarget->requiresStrictAlign()) {
2173     // We don't support this feature yet.
2174     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2175     return false;
2176   }
2177 
2178   MIB.setInstrAndDebugLoc(I);
2179 
2180   unsigned Opcode = I.getOpcode();
2181   // G_PHI requires same handling as PHI
2182   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2183     // Certain non-generic instructions also need some special handling.
2184 
2185     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2186       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2187 
2188     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2189       const Register DefReg = I.getOperand(0).getReg();
2190       const LLT DefTy = MRI.getType(DefReg);
2191 
2192       const RegClassOrRegBank &RegClassOrBank =
2193         MRI.getRegClassOrRegBank(DefReg);
2194 
2195       const TargetRegisterClass *DefRC
2196         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2197       if (!DefRC) {
2198         if (!DefTy.isValid()) {
2199           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2200           return false;
2201         }
2202         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2203         DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
2204         if (!DefRC) {
2205           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2206           return false;
2207         }
2208       }
2209 
2210       I.setDesc(TII.get(TargetOpcode::PHI));
2211 
2212       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2213     }
2214 
2215     if (I.isCopy())
2216       return selectCopy(I, TII, MRI, TRI, RBI);
2217 
2218     return true;
2219   }
2220 
2221 
2222   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2223     LLVM_DEBUG(
2224         dbgs() << "Generic instruction has unexpected implicit operands\n");
2225     return false;
2226   }
2227 
2228   // Try to do some lowering before we start instruction selecting. These
2229   // lowerings are purely transformations on the input G_MIR and so selection
2230   // must continue after any modification of the instruction.
2231   if (preISelLower(I)) {
2232     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2233   }
2234 
2235   // There may be patterns that the importer can't handle optimally, but
2236   // which it still selects to a suboptimal sequence, so our custom C++
2237   // selection code later never gets a chance to work on them. Therefore, we
2238   // have an early selection attempt here to give priority to certain
2239   // selection routines over the imported ones.
2240   if (earlySelect(I))
2241     return true;
2242 
2243   if (selectImpl(I, *CoverageInfo))
2244     return true;
2245 
2246   LLT Ty =
2247       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2248 
2249   switch (Opcode) {
2250   case TargetOpcode::G_SBFX:
2251   case TargetOpcode::G_UBFX: {
2252     static const unsigned OpcTable[2][2] = {
2253         {AArch64::UBFMWri, AArch64::UBFMXri},
2254         {AArch64::SBFMWri, AArch64::SBFMXri}};
2255     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2256     unsigned Size = Ty.getSizeInBits();
2257     unsigned Opc = OpcTable[IsSigned][Size == 64];
2258     auto Cst1 =
2259         getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2260     assert(Cst1 && "Should have gotten a constant for src 1?");
2261     auto Cst2 =
2262         getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2263     assert(Cst2 && "Should have gotten a constant for src 2?");
2264     auto LSB = Cst1->Value.getZExtValue();
2265     auto Width = Cst2->Value.getZExtValue();
2266     auto BitfieldInst =
2267         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2268             .addImm(LSB)
2269             .addImm(LSB + Width - 1);
2270     I.eraseFromParent();
2271     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2272   }
2273   case TargetOpcode::G_BRCOND:
2274     return selectCompareBranch(I, MF, MRI);
2275 
2276   case TargetOpcode::G_BRINDIRECT: {
2277     I.setDesc(TII.get(AArch64::BR));
2278     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2279   }
2280 
2281   case TargetOpcode::G_BRJT:
2282     return selectBrJT(I, MRI);
2283 
2284   case AArch64::G_ADD_LOW: {
2285     // This op may have been separated from its ADRP companion by the localizer
2286     // or some other code motion pass. Given that many CPUs will try to
2287     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2288     // which will later be expanded into an ADRP+ADD pair after scheduling.
2289     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2290     if (BaseMI->getOpcode() != AArch64::ADRP) {
2291       I.setDesc(TII.get(AArch64::ADDXri));
2292       I.addOperand(MachineOperand::CreateImm(0));
2293       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2294     }
2295     assert(TM.getCodeModel() == CodeModel::Small &&
2296            "Expected small code model");
2297     auto Op1 = BaseMI->getOperand(1);
2298     auto Op2 = I.getOperand(2);
2299     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2300                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2301                                          Op1.getTargetFlags())
2302                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2303                                          Op2.getTargetFlags());
2304     I.eraseFromParent();
2305     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2306   }
2307 
2308   case TargetOpcode::G_BSWAP: {
2309     // Handle vector types for G_BSWAP directly.
2310     Register DstReg = I.getOperand(0).getReg();
2311     LLT DstTy = MRI.getType(DstReg);
2312 
2313     // We should only get vector types here; everything else is handled by the
2314     // importer right now.
2315     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2316       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2317       return false;
2318     }
2319 
2320     // Only handle 4 and 2 element vectors for now.
2321     // TODO: 16-bit elements.
2322     unsigned NumElts = DstTy.getNumElements();
2323     if (NumElts != 4 && NumElts != 2) {
2324       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2325       return false;
2326     }
2327 
2328     // Choose the correct opcode for the supported types. Right now, that's
2329     // v2s32, v4s32, and v2s64.
2330     unsigned Opc = 0;
2331     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2332     if (EltSize == 32)
2333       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2334                                           : AArch64::REV32v16i8;
2335     else if (EltSize == 64)
2336       Opc = AArch64::REV64v16i8;
2337 
2338     // We should always get something by the time we get here...
2339     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2340 
2341     I.setDesc(TII.get(Opc));
2342     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2343   }
2344 
2345   case TargetOpcode::G_FCONSTANT:
2346   case TargetOpcode::G_CONSTANT: {
2347     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2348 
2349     const LLT s8 = LLT::scalar(8);
2350     const LLT s16 = LLT::scalar(16);
2351     const LLT s32 = LLT::scalar(32);
2352     const LLT s64 = LLT::scalar(64);
2353     const LLT s128 = LLT::scalar(128);
2354     const LLT p0 = LLT::pointer(0, 64);
2355 
2356     const Register DefReg = I.getOperand(0).getReg();
2357     const LLT DefTy = MRI.getType(DefReg);
2358     const unsigned DefSize = DefTy.getSizeInBits();
2359     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2360 
2361     // FIXME: Redundant check, but even less readable when factored out.
2362     if (isFP) {
2363       if (Ty != s32 && Ty != s64 && Ty != s128) {
2364         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2365                           << " constant, expected: " << s32 << " or " << s64
2366                           << " or " << s128 << '\n');
2367         return false;
2368       }
2369 
2370       if (RB.getID() != AArch64::FPRRegBankID) {
2371         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2372                           << " constant on bank: " << RB
2373                           << ", expected: FPR\n");
2374         return false;
2375       }
2376 
2377       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2378       // can be sure tablegen works correctly and isn't rescued by this code.
2379       // 0.0 is not covered by tablegen for FP128. So we will handle this
2380       // scenario in the code here.
2381       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2382         return false;
2383     } else {
2384       // s32 and s64 are covered by tablegen.
2385       if (Ty != p0 && Ty != s8 && Ty != s16) {
2386         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2387                           << " constant, expected: " << s32 << ", " << s64
2388                           << ", or " << p0 << '\n');
2389         return false;
2390       }
2391 
2392       if (RB.getID() != AArch64::GPRRegBankID) {
2393         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2394                           << " constant on bank: " << RB
2395                           << ", expected: GPR\n");
2396         return false;
2397       }
2398     }
2399 
2400     // We allow G_CONSTANT of types < 32b.
2401     const unsigned MovOpc =
2402         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2403 
2404     if (isFP) {
2405       // Either emit a FMOV, or emit a copy to emit a normal mov.
2406       const TargetRegisterClass &GPRRC =
2407           DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
2408       const TargetRegisterClass &FPRRC =
2409           DefSize == 32 ? AArch64::FPR32RegClass
2410                         : (DefSize == 64 ? AArch64::FPR64RegClass
2411                                          : AArch64::FPR128RegClass);
2412 
2413       // For 64b values, emit a constant pool load instead.
2414       // For s32, use a cp load if we have optsize/minsize.
2415       if (DefSize == 64 || DefSize == 128 ||
2416           (DefSize == 32 && shouldOptForSize(&MF))) {
2417         auto *FPImm = I.getOperand(1).getFPImm();
2418         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2419         if (!LoadMI) {
2420           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2421           return false;
2422         }
2423         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2424         I.eraseFromParent();
2425         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2426       }
2427 
2428       // Nope. Emit a copy and use a normal mov instead.
2429       const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
2430       MachineOperand &RegOp = I.getOperand(0);
2431       RegOp.setReg(DefGPRReg);
2432       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2433       MIB.buildCopy({DefReg}, {DefGPRReg});
2434 
2435       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2436         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2437         return false;
2438       }
2439 
2440       MachineOperand &ImmOp = I.getOperand(1);
2441       // FIXME: Is going through int64_t always correct?
2442       ImmOp.ChangeToImmediate(
2443           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2444     } else if (I.getOperand(1).isCImm()) {
2445       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2446       I.getOperand(1).ChangeToImmediate(Val);
2447     } else if (I.getOperand(1).isImm()) {
2448       uint64_t Val = I.getOperand(1).getImm();
2449       I.getOperand(1).ChangeToImmediate(Val);
2450     }
2451 
2452     I.setDesc(TII.get(MovOpc));
2453     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2454     return true;
2455   }
2456   case TargetOpcode::G_EXTRACT: {
2457     Register DstReg = I.getOperand(0).getReg();
2458     Register SrcReg = I.getOperand(1).getReg();
2459     LLT SrcTy = MRI.getType(SrcReg);
2460     LLT DstTy = MRI.getType(DstReg);
2461     (void)DstTy;
2462     unsigned SrcSize = SrcTy.getSizeInBits();
2463 
2464     if (SrcTy.getSizeInBits() > 64) {
2465       // This should be an extract of an s128, which is like a vector extract.
2466       if (SrcTy.getSizeInBits() != 128)
2467         return false;
2468       // Only support extracting 64 bits from an s128 at the moment.
2469       if (DstTy.getSizeInBits() != 64)
2470         return false;
2471 
2472       unsigned Offset = I.getOperand(2).getImm();
2473       if (Offset % 64 != 0)
2474         return false;
2475 
2476       // Check we have the right regbank always.
2477       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2478       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2479       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2480 
2481       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2482         MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2483             .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2484         I.eraseFromParent();
2485         return true;
2486       }
2487 
2488       // Emit the same code as a vector extract.
2489       // Offset must be a multiple of 64.
2490       unsigned LaneIdx = Offset / 64;
2491       MachineInstr *Extract = emitExtractVectorElt(
2492           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2493       if (!Extract)
2494         return false;
2495       I.eraseFromParent();
2496       return true;
2497     }
2498 
2499     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2500     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2501                                       Ty.getSizeInBits() - 1);
2502 
2503     if (SrcSize < 64) {
2504       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2505              "unexpected G_EXTRACT types");
2506       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2507     }
2508 
2509     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2510     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2511     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2512         .addReg(DstReg, 0, AArch64::sub_32);
2513     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2514                                  AArch64::GPR32RegClass, MRI);
2515     I.getOperand(0).setReg(DstReg);
2516 
2517     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2518   }
2519 
2520   case TargetOpcode::G_INSERT: {
2521     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2522     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2523     unsigned DstSize = DstTy.getSizeInBits();
2524     // Larger inserts are vectors, same-size ones should be something else by
2525     // now (split up or turned into COPYs).
2526     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2527       return false;
2528 
2529     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2530     unsigned LSB = I.getOperand(3).getImm();
2531     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2532     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2533     MachineInstrBuilder(MF, I).addImm(Width - 1);
2534 
2535     if (DstSize < 64) {
2536       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2537              "unexpected G_INSERT types");
2538       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2539     }
2540 
2541     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2542     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2543             TII.get(AArch64::SUBREG_TO_REG))
2544         .addDef(SrcReg)
2545         .addImm(0)
2546         .addUse(I.getOperand(2).getReg())
2547         .addImm(AArch64::sub_32);
2548     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2549                                  AArch64::GPR32RegClass, MRI);
2550     I.getOperand(2).setReg(SrcReg);
2551 
2552     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2553   }
2554   case TargetOpcode::G_FRAME_INDEX: {
2555     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2556     if (Ty != LLT::pointer(0, 64)) {
2557       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2558                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2559       return false;
2560     }
2561     I.setDesc(TII.get(AArch64::ADDXri));
2562 
2563     // MOs for a #0 shifted immediate.
2564     I.addOperand(MachineOperand::CreateImm(0));
2565     I.addOperand(MachineOperand::CreateImm(0));
2566 
2567     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2568   }
2569 
2570   case TargetOpcode::G_GLOBAL_VALUE: {
2571     auto GV = I.getOperand(1).getGlobal();
2572     if (GV->isThreadLocal())
2573       return selectTLSGlobalValue(I, MRI);
2574 
2575     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2576     if (OpFlags & AArch64II::MO_GOT) {
2577       I.setDesc(TII.get(AArch64::LOADgot));
2578       I.getOperand(1).setTargetFlags(OpFlags);
2579     } else if (TM.getCodeModel() == CodeModel::Large) {
2580       // Materialize the global using movz/movk instructions.
2581       materializeLargeCMVal(I, GV, OpFlags);
2582       I.eraseFromParent();
2583       return true;
2584     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2585       I.setDesc(TII.get(AArch64::ADR));
2586       I.getOperand(1).setTargetFlags(OpFlags);
2587     } else {
2588       I.setDesc(TII.get(AArch64::MOVaddr));
2589       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2590       MachineInstrBuilder MIB(MF, I);
2591       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2592                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2593     }
2594     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2595   }
2596 
2597   case TargetOpcode::G_ZEXTLOAD:
2598   case TargetOpcode::G_LOAD:
2599   case TargetOpcode::G_STORE: {
2600     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2601     LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
2602 
2603     if (PtrTy != LLT::pointer(0, 64)) {
2604       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2605                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2606       return false;
2607     }
2608 
2609     auto &MemOp = **I.memoperands_begin();
2610     uint64_t MemSizeInBytes = MemOp.getSize();
2611     unsigned MemSizeInBits = MemSizeInBytes * 8;
2612     AtomicOrdering Order = MemOp.getOrdering();
2613 
2614     // Need special instructions for atomics that affect ordering.
2615     if (Order != AtomicOrdering::NotAtomic &&
2616         Order != AtomicOrdering::Unordered &&
2617         Order != AtomicOrdering::Monotonic)
2618       return false;
2619 
2620 #ifndef NDEBUG
2621     const Register PtrReg = I.getOperand(1).getReg();
2622     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2623     // Sanity-check the pointer register.
2624     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2625            "Load/Store pointer operand isn't a GPR");
2626     assert(MRI.getType(PtrReg).isPointer() &&
2627            "Load/Store pointer operand isn't a pointer");
2628 #endif
2629 
2630     const Register ValReg = I.getOperand(0).getReg();
2631     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2632 
2633     // Helper lambda for partially selecting I. Either returns the original
2634     // instruction with an updated opcode, or a new instruction.
2635     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2636       bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
2637       const unsigned NewOpc =
2638           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2639       if (NewOpc == I.getOpcode())
2640         return nullptr;
2641       // Check if we can fold anything into the addressing mode.
2642       auto AddrModeFns =
2643           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2644       if (!AddrModeFns) {
2645         // Can't fold anything. Use the original instruction.
2646         I.setDesc(TII.get(NewOpc));
2647         I.addOperand(MachineOperand::CreateImm(0));
2648         return &I;
2649       }
2650 
2651       // Folded something. Create a new instruction and return it.
2652       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2653       IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
2654       NewInst.cloneMemRefs(I);
2655       for (auto &Fn : *AddrModeFns)
2656         Fn(NewInst);
2657       I.eraseFromParent();
2658       return &*NewInst;
2659     };
2660 
2661     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2662     if (!LoadStore)
2663       return false;
2664 
2665     // If we're storing a 0, use WZR/XZR.
2666     if (Opcode == TargetOpcode::G_STORE) {
2667       auto CVal = getConstantVRegValWithLookThrough(
2668           LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
2669           /*HandleFConstants = */ false);
2670       if (CVal && CVal->Value == 0) {
2671         switch (LoadStore->getOpcode()) {
2672         case AArch64::STRWui:
2673         case AArch64::STRHHui:
2674         case AArch64::STRBBui:
2675           LoadStore->getOperand(0).setReg(AArch64::WZR);
2676           break;
2677         case AArch64::STRXui:
2678           LoadStore->getOperand(0).setReg(AArch64::XZR);
2679           break;
2680         }
2681       }
2682     }
2683 
2684     if (IsZExtLoad) {
2685       // The zextload from a smaller type to i32 should be handled by the
2686       // importer.
2687       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2688         return false;
2689       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2690       // and zero_extend with SUBREG_TO_REG.
2691       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2692       Register DstReg = LoadStore->getOperand(0).getReg();
2693       LoadStore->getOperand(0).setReg(LdReg);
2694 
2695       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2696       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2697           .addImm(0)
2698           .addUse(LdReg)
2699           .addImm(AArch64::sub_32);
2700       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2701       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2702                                           MRI);
2703     }
2704     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2705   }
2706 
2707   case TargetOpcode::G_SMULH:
2708   case TargetOpcode::G_UMULH: {
2709     // Reject the various things we don't support yet.
2710     if (unsupportedBinOp(I, RBI, MRI, TRI))
2711       return false;
2712 
2713     const Register DefReg = I.getOperand(0).getReg();
2714     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2715 
2716     if (RB.getID() != AArch64::GPRRegBankID) {
2717       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2718       return false;
2719     }
2720 
2721     if (Ty != LLT::scalar(64)) {
2722       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2723                         << ", expected: " << LLT::scalar(64) << '\n');
2724       return false;
2725     }
2726 
2727     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2728                                                              : AArch64::UMULHrr;
2729     I.setDesc(TII.get(NewOpc));
2730 
2731     // Now that we selected an opcode, we need to constrain the register
2732     // operands to use appropriate classes.
2733     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2734   }
2735   case TargetOpcode::G_LSHR:
2736   case TargetOpcode::G_ASHR:
2737     if (MRI.getType(I.getOperand(0).getReg()).isVector())
2738       return selectVectorAshrLshr(I, MRI);
2739     LLVM_FALLTHROUGH;
2740   case TargetOpcode::G_SHL:
2741     if (Opcode == TargetOpcode::G_SHL &&
2742         MRI.getType(I.getOperand(0).getReg()).isVector())
2743       return selectVectorSHL(I, MRI);
2744     LLVM_FALLTHROUGH;
2745   case TargetOpcode::G_FADD:
2746   case TargetOpcode::G_FSUB:
2747   case TargetOpcode::G_FMUL:
2748   case TargetOpcode::G_FDIV:
2749   case TargetOpcode::G_OR: {
2750     // Reject the various things we don't support yet.
2751     if (unsupportedBinOp(I, RBI, MRI, TRI))
2752       return false;
2753 
2754     const unsigned OpSize = Ty.getSizeInBits();
2755 
2756     const Register DefReg = I.getOperand(0).getReg();
2757     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2758 
2759     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
2760     if (NewOpc == I.getOpcode())
2761       return false;
2762 
2763     I.setDesc(TII.get(NewOpc));
2764     // FIXME: Should the type be always reset in setDesc?
2765 
2766     // Now that we selected an opcode, we need to constrain the register
2767     // operands to use appropriate classes.
2768     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2769   }
2770 
2771   case TargetOpcode::G_PTR_ADD: {
2772     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
2773     I.eraseFromParent();
2774     return true;
2775   }
2776   case TargetOpcode::G_SADDO:
2777   case TargetOpcode::G_UADDO:
2778   case TargetOpcode::G_SSUBO:
2779   case TargetOpcode::G_USUBO: {
2780     // Emit the operation and get the correct condition code.
2781     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
2782                                   I.getOperand(2), I.getOperand(3), MIB);
2783 
2784     // Now, put the overflow result in the register given by the first operand
2785     // to the overflow op. CSINC increments the result when the predicate is
2786     // false, so to get the increment when it's true, we need to use the
2787     // inverse. In this case, we want to increment when carry is set.
2788     Register ZReg = AArch64::WZR;
2789     auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
2790                                  {ZReg, ZReg})
2791                       .addImm(getInvertedCondCode(OpAndCC.second));
2792     constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
2793     I.eraseFromParent();
2794     return true;
2795   }
2796 
2797   case TargetOpcode::G_PTRMASK: {
2798     Register MaskReg = I.getOperand(2).getReg();
2799     Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
2800     // TODO: Implement arbitrary cases
2801     if (!MaskVal || !isShiftedMask_64(*MaskVal))
2802       return false;
2803 
2804     uint64_t Mask = *MaskVal;
2805     I.setDesc(TII.get(AArch64::ANDXri));
2806     I.getOperand(2).ChangeToImmediate(
2807         AArch64_AM::encodeLogicalImmediate(Mask, 64));
2808 
2809     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2810   }
2811   case TargetOpcode::G_PTRTOINT:
2812   case TargetOpcode::G_TRUNC: {
2813     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2814     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
2815 
2816     const Register DstReg = I.getOperand(0).getReg();
2817     const Register SrcReg = I.getOperand(1).getReg();
2818 
2819     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2820     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2821 
2822     if (DstRB.getID() != SrcRB.getID()) {
2823       LLVM_DEBUG(
2824           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
2825       return false;
2826     }
2827 
2828     if (DstRB.getID() == AArch64::GPRRegBankID) {
2829       const TargetRegisterClass *DstRC =
2830           getRegClassForTypeOnBank(DstTy, DstRB, RBI);
2831       if (!DstRC)
2832         return false;
2833 
2834       const TargetRegisterClass *SrcRC =
2835           getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
2836       if (!SrcRC)
2837         return false;
2838 
2839       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
2840           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
2841         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
2842         return false;
2843       }
2844 
2845       if (DstRC == SrcRC) {
2846         // Nothing to be done
2847       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
2848                  SrcTy == LLT::scalar(64)) {
2849         llvm_unreachable("TableGen can import this case");
2850         return false;
2851       } else if (DstRC == &AArch64::GPR32RegClass &&
2852                  SrcRC == &AArch64::GPR64RegClass) {
2853         I.getOperand(1).setSubReg(AArch64::sub_32);
2854       } else {
2855         LLVM_DEBUG(
2856             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
2857         return false;
2858       }
2859 
2860       I.setDesc(TII.get(TargetOpcode::COPY));
2861       return true;
2862     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
2863       if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
2864         I.setDesc(TII.get(AArch64::XTNv4i16));
2865         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2866         return true;
2867       }
2868 
2869       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
2870         MachineInstr *Extract = emitExtractVectorElt(
2871             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
2872         if (!Extract)
2873           return false;
2874         I.eraseFromParent();
2875         return true;
2876       }
2877 
2878       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
2879       if (Opcode == TargetOpcode::G_PTRTOINT) {
2880         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
2881         I.setDesc(TII.get(TargetOpcode::COPY));
2882         return true;
2883       }
2884     }
2885 
2886     return false;
2887   }
2888 
2889   case TargetOpcode::G_ANYEXT: {
2890     const Register DstReg = I.getOperand(0).getReg();
2891     const Register SrcReg = I.getOperand(1).getReg();
2892 
2893     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
2894     if (RBDst.getID() != AArch64::GPRRegBankID) {
2895       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
2896                         << ", expected: GPR\n");
2897       return false;
2898     }
2899 
2900     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
2901     if (RBSrc.getID() != AArch64::GPRRegBankID) {
2902       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
2903                         << ", expected: GPR\n");
2904       return false;
2905     }
2906 
2907     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2908 
2909     if (DstSize == 0) {
2910       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
2911       return false;
2912     }
2913 
2914     if (DstSize != 64 && DstSize > 32) {
2915       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
2916                         << ", expected: 32 or 64\n");
2917       return false;
2918     }
2919     // At this point G_ANYEXT is just like a plain COPY, but we need
2920     // to explicitly form the 64-bit value if any.
2921     if (DstSize > 32) {
2922       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
2923       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
2924           .addDef(ExtSrc)
2925           .addImm(0)
2926           .addUse(SrcReg)
2927           .addImm(AArch64::sub_32);
2928       I.getOperand(1).setReg(ExtSrc);
2929     }
2930     return selectCopy(I, TII, MRI, TRI, RBI);
2931   }
2932 
2933   case TargetOpcode::G_ZEXT:
2934   case TargetOpcode::G_SEXT_INREG:
2935   case TargetOpcode::G_SEXT: {
2936     unsigned Opcode = I.getOpcode();
2937     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
2938     const Register DefReg = I.getOperand(0).getReg();
2939     Register SrcReg = I.getOperand(1).getReg();
2940     const LLT DstTy = MRI.getType(DefReg);
2941     const LLT SrcTy = MRI.getType(SrcReg);
2942     unsigned DstSize = DstTy.getSizeInBits();
2943     unsigned SrcSize = SrcTy.getSizeInBits();
2944 
2945     // SEXT_INREG has the same src reg size as dst, the size of the value to be
2946     // extended is encoded in the imm.
2947     if (Opcode == TargetOpcode::G_SEXT_INREG)
2948       SrcSize = I.getOperand(2).getImm();
2949 
2950     if (DstTy.isVector())
2951       return false; // Should be handled by imported patterns.
2952 
2953     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
2954                AArch64::GPRRegBankID &&
2955            "Unexpected ext regbank");
2956 
2957     MachineInstr *ExtI;
2958 
2959     // First check if we're extending the result of a load which has a dest type
2960     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
2961     // GPR register on AArch64 and all loads which are smaller automatically
2962     // zero-extend the upper bits. E.g.
2963     // %v(s8) = G_LOAD %p, :: (load 1)
2964     // %v2(s32) = G_ZEXT %v(s8)
2965     if (!IsSigned) {
2966       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
2967       bool IsGPR =
2968           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
2969       if (LoadMI && IsGPR) {
2970         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
2971         unsigned BytesLoaded = MemOp->getSize();
2972         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
2973           return selectCopy(I, TII, MRI, TRI, RBI);
2974       }
2975 
2976       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
2977       // + SUBREG_TO_REG.
2978       //
2979       // If we are zero extending from 32 bits to 64 bits, it's possible that
2980       // the instruction implicitly does the zero extend for us. In that case,
2981       // we only need the SUBREG_TO_REG.
2982       if (IsGPR && SrcSize == 32 && DstSize == 64) {
2983         // Unlike with the G_LOAD case, we don't want to look through copies
2984         // here. (See isDef32.)
2985         MachineInstr *Def = MRI.getVRegDef(SrcReg);
2986         Register SubregToRegSrc = SrcReg;
2987 
2988         // Does the instruction implicitly zero extend?
2989         if (!Def || !isDef32(*Def)) {
2990           // No. Zero out using an OR.
2991           Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2992           const Register ZReg = AArch64::WZR;
2993           MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
2994           SubregToRegSrc = OrDst;
2995         }
2996 
2997         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
2998             .addImm(0)
2999             .addUse(SubregToRegSrc)
3000             .addImm(AArch64::sub_32);
3001 
3002         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3003                                           MRI)) {
3004           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3005           return false;
3006         }
3007 
3008         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3009                                           MRI)) {
3010           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3011           return false;
3012         }
3013 
3014         I.eraseFromParent();
3015         return true;
3016       }
3017     }
3018 
3019     if (DstSize == 64) {
3020       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3021         // FIXME: Can we avoid manually doing this?
3022         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3023                                           MRI)) {
3024           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3025                             << " operand\n");
3026           return false;
3027         }
3028         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3029                                 {&AArch64::GPR64RegClass}, {})
3030                      .addImm(0)
3031                      .addUse(SrcReg)
3032                      .addImm(AArch64::sub_32)
3033                      .getReg(0);
3034       }
3035 
3036       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3037                              {DefReg}, {SrcReg})
3038                   .addImm(0)
3039                   .addImm(SrcSize - 1);
3040     } else if (DstSize <= 32) {
3041       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3042                              {DefReg}, {SrcReg})
3043                   .addImm(0)
3044                   .addImm(SrcSize - 1);
3045     } else {
3046       return false;
3047     }
3048 
3049     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3050     I.eraseFromParent();
3051     return true;
3052   }
3053 
3054   case TargetOpcode::G_SITOFP:
3055   case TargetOpcode::G_UITOFP:
3056   case TargetOpcode::G_FPTOSI:
3057   case TargetOpcode::G_FPTOUI: {
3058     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3059               SrcTy = MRI.getType(I.getOperand(1).getReg());
3060     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3061     if (NewOpc == Opcode)
3062       return false;
3063 
3064     I.setDesc(TII.get(NewOpc));
3065     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3066 
3067     return true;
3068   }
3069 
3070   case TargetOpcode::G_FREEZE:
3071     return selectCopy(I, TII, MRI, TRI, RBI);
3072 
3073   case TargetOpcode::G_INTTOPTR:
3074     // The importer is currently unable to import pointer types since they
3075     // didn't exist in SelectionDAG.
3076     return selectCopy(I, TII, MRI, TRI, RBI);
3077 
3078   case TargetOpcode::G_BITCAST:
3079     // Imported SelectionDAG rules can handle every bitcast except those that
3080     // bitcast from a type to the same type. Ideally, these shouldn't occur
3081     // but we might not run an optimizer that deletes them. The other exception
3082     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3083     // of them.
3084     return selectCopy(I, TII, MRI, TRI, RBI);
3085 
3086   case TargetOpcode::G_SELECT: {
3087     if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
3088       LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
3089                         << ", expected: " << LLT::scalar(1) << '\n');
3090       return false;
3091     }
3092 
3093     const Register CondReg = I.getOperand(1).getReg();
3094     const Register TReg = I.getOperand(2).getReg();
3095     const Register FReg = I.getOperand(3).getReg();
3096 
3097     if (tryOptSelect(I))
3098       return true;
3099 
3100     // Make sure to use an unused vreg instead of wzr, so that the peephole
3101     // optimizations will be able to optimize these.
3102     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3103     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3104                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3105     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3106     if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
3107       return false;
3108     I.eraseFromParent();
3109     return true;
3110   }
3111   case TargetOpcode::G_ICMP: {
3112     if (Ty.isVector())
3113       return selectVectorICmp(I, MRI);
3114 
3115     if (Ty != LLT::scalar(32)) {
3116       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3117                         << ", expected: " << LLT::scalar(32) << '\n');
3118       return false;
3119     }
3120 
3121     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3122     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
3123                        MIB);
3124     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
3125     I.eraseFromParent();
3126     return true;
3127   }
3128 
3129   case TargetOpcode::G_FCMP: {
3130     CmpInst::Predicate Pred =
3131         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3132     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3133                        Pred) ||
3134         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3135       return false;
3136     I.eraseFromParent();
3137     return true;
3138   }
3139   case TargetOpcode::G_VASTART:
3140     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3141                                 : selectVaStartAAPCS(I, MF, MRI);
3142   case TargetOpcode::G_INTRINSIC:
3143     return selectIntrinsic(I, MRI);
3144   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3145     return selectIntrinsicWithSideEffects(I, MRI);
3146   case TargetOpcode::G_IMPLICIT_DEF: {
3147     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3148     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3149     const Register DstReg = I.getOperand(0).getReg();
3150     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3151     const TargetRegisterClass *DstRC =
3152         getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3153     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3154     return true;
3155   }
3156   case TargetOpcode::G_BLOCK_ADDR: {
3157     if (TM.getCodeModel() == CodeModel::Large) {
3158       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3159       I.eraseFromParent();
3160       return true;
3161     } else {
3162       I.setDesc(TII.get(AArch64::MOVaddrBA));
3163       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3164                            I.getOperand(0).getReg())
3165                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3166                                         /* Offset */ 0, AArch64II::MO_PAGE)
3167                        .addBlockAddress(
3168                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3169                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3170       I.eraseFromParent();
3171       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3172     }
3173   }
3174   case AArch64::G_DUP: {
3175     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3176     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3177     // difficult because at RBS we may end up pessimizing the fpr case if we
3178     // decided to add an anyextend to fix this. Manual selection is the most
3179     // robust solution for now.
3180     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3181         AArch64::GPRRegBankID)
3182       return false; // We expect the fpr regbank case to be imported.
3183     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3184     if (VecTy == LLT::vector(8, 8))
3185       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3186     else if (VecTy == LLT::vector(16, 8))
3187       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3188     else if (VecTy == LLT::vector(4, 16))
3189       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3190     else if (VecTy == LLT::vector(8, 16))
3191       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3192     else
3193       return false;
3194     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3195   }
3196   case TargetOpcode::G_INTRINSIC_TRUNC:
3197     return selectIntrinsicTrunc(I, MRI);
3198   case TargetOpcode::G_INTRINSIC_ROUND:
3199     return selectIntrinsicRound(I, MRI);
3200   case TargetOpcode::G_BUILD_VECTOR:
3201     return selectBuildVector(I, MRI);
3202   case TargetOpcode::G_MERGE_VALUES:
3203     return selectMergeValues(I, MRI);
3204   case TargetOpcode::G_UNMERGE_VALUES:
3205     return selectUnmergeValues(I, MRI);
3206   case TargetOpcode::G_SHUFFLE_VECTOR:
3207     return selectShuffleVector(I, MRI);
3208   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3209     return selectExtractElt(I, MRI);
3210   case TargetOpcode::G_INSERT_VECTOR_ELT:
3211     return selectInsertElt(I, MRI);
3212   case TargetOpcode::G_CONCAT_VECTORS:
3213     return selectConcatVectors(I, MRI);
3214   case TargetOpcode::G_JUMP_TABLE:
3215     return selectJumpTable(I, MRI);
3216   case TargetOpcode::G_VECREDUCE_FADD:
3217   case TargetOpcode::G_VECREDUCE_ADD:
3218     return selectReduction(I, MRI);
3219   }
3220 
3221   return false;
3222 }
3223 
selectReduction(MachineInstr & I,MachineRegisterInfo & MRI)3224 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3225                                                  MachineRegisterInfo &MRI) {
3226   Register VecReg = I.getOperand(1).getReg();
3227   LLT VecTy = MRI.getType(VecReg);
3228   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3229     // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3230     // a subregister copy afterwards.
3231     if (VecTy == LLT::vector(2, 32)) {
3232       Register DstReg = I.getOperand(0).getReg();
3233       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3234                                  {VecReg, VecReg});
3235       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3236                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3237                       .getReg(0);
3238       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3239       I.eraseFromParent();
3240       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3241     }
3242 
3243     unsigned Opc = 0;
3244     if (VecTy == LLT::vector(16, 8))
3245       Opc = AArch64::ADDVv16i8v;
3246     else if (VecTy == LLT::vector(8, 16))
3247       Opc = AArch64::ADDVv8i16v;
3248     else if (VecTy == LLT::vector(4, 32))
3249       Opc = AArch64::ADDVv4i32v;
3250     else if (VecTy == LLT::vector(2, 64))
3251       Opc = AArch64::ADDPv2i64p;
3252     else {
3253       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3254       return false;
3255     }
3256     I.setDesc(TII.get(Opc));
3257     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3258   }
3259 
3260   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3261     unsigned Opc = 0;
3262     if (VecTy == LLT::vector(2, 32))
3263       Opc = AArch64::FADDPv2i32p;
3264     else if (VecTy == LLT::vector(2, 64))
3265       Opc = AArch64::FADDPv2i64p;
3266     else {
3267       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3268       return false;
3269     }
3270     I.setDesc(TII.get(Opc));
3271     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3272   }
3273   return false;
3274 }
3275 
selectBrJT(MachineInstr & I,MachineRegisterInfo & MRI)3276 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3277                                             MachineRegisterInfo &MRI) {
3278   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3279   Register JTAddr = I.getOperand(0).getReg();
3280   unsigned JTI = I.getOperand(1).getIndex();
3281   Register Index = I.getOperand(2).getReg();
3282 
3283   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3284   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3285 
3286   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3287   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3288                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3289                            .addJumpTableIndex(JTI);
3290   // Build the indirect branch.
3291   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3292   I.eraseFromParent();
3293   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3294 }
3295 
selectJumpTable(MachineInstr & I,MachineRegisterInfo & MRI)3296 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3297                                                  MachineRegisterInfo &MRI) {
3298   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3299   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3300 
3301   Register DstReg = I.getOperand(0).getReg();
3302   unsigned JTI = I.getOperand(1).getIndex();
3303   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3304   auto MovMI =
3305     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3306           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3307           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3308   I.eraseFromParent();
3309   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3310 }
3311 
selectTLSGlobalValue(MachineInstr & I,MachineRegisterInfo & MRI)3312 bool AArch64InstructionSelector::selectTLSGlobalValue(
3313     MachineInstr &I, MachineRegisterInfo &MRI) {
3314   if (!STI.isTargetMachO())
3315     return false;
3316   MachineFunction &MF = *I.getParent()->getParent();
3317   MF.getFrameInfo().setAdjustsStack(true);
3318 
3319   const auto &GlobalOp = I.getOperand(1);
3320   assert(GlobalOp.getOffset() == 0 &&
3321          "Shouldn't have an offset on TLS globals!");
3322   const GlobalValue &GV = *GlobalOp.getGlobal();
3323 
3324   auto LoadGOT =
3325       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3326           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3327 
3328   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3329                              {LoadGOT.getReg(0)})
3330                   .addImm(0);
3331 
3332   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3333   // TLS calls preserve all registers except those that absolutely must be
3334   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3335   // silly).
3336   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3337       .addUse(AArch64::X0, RegState::Implicit)
3338       .addDef(AArch64::X0, RegState::Implicit)
3339       .addRegMask(TRI.getTLSCallPreservedMask());
3340 
3341   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3342   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3343                                MRI);
3344   I.eraseFromParent();
3345   return true;
3346 }
3347 
selectIntrinsicTrunc(MachineInstr & I,MachineRegisterInfo & MRI) const3348 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3349     MachineInstr &I, MachineRegisterInfo &MRI) const {
3350   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3351 
3352   // Select the correct opcode.
3353   unsigned Opc = 0;
3354   if (!SrcTy.isVector()) {
3355     switch (SrcTy.getSizeInBits()) {
3356     default:
3357     case 16:
3358       Opc = AArch64::FRINTZHr;
3359       break;
3360     case 32:
3361       Opc = AArch64::FRINTZSr;
3362       break;
3363     case 64:
3364       Opc = AArch64::FRINTZDr;
3365       break;
3366     }
3367   } else {
3368     unsigned NumElts = SrcTy.getNumElements();
3369     switch (SrcTy.getElementType().getSizeInBits()) {
3370     default:
3371       break;
3372     case 16:
3373       if (NumElts == 4)
3374         Opc = AArch64::FRINTZv4f16;
3375       else if (NumElts == 8)
3376         Opc = AArch64::FRINTZv8f16;
3377       break;
3378     case 32:
3379       if (NumElts == 2)
3380         Opc = AArch64::FRINTZv2f32;
3381       else if (NumElts == 4)
3382         Opc = AArch64::FRINTZv4f32;
3383       break;
3384     case 64:
3385       if (NumElts == 2)
3386         Opc = AArch64::FRINTZv2f64;
3387       break;
3388     }
3389   }
3390 
3391   if (!Opc) {
3392     // Didn't get an opcode above, bail.
3393     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3394     return false;
3395   }
3396 
3397   // Legalization would have set us up perfectly for this; we just need to
3398   // set the opcode and move on.
3399   I.setDesc(TII.get(Opc));
3400   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3401 }
3402 
selectIntrinsicRound(MachineInstr & I,MachineRegisterInfo & MRI) const3403 bool AArch64InstructionSelector::selectIntrinsicRound(
3404     MachineInstr &I, MachineRegisterInfo &MRI) const {
3405   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3406 
3407   // Select the correct opcode.
3408   unsigned Opc = 0;
3409   if (!SrcTy.isVector()) {
3410     switch (SrcTy.getSizeInBits()) {
3411     default:
3412     case 16:
3413       Opc = AArch64::FRINTAHr;
3414       break;
3415     case 32:
3416       Opc = AArch64::FRINTASr;
3417       break;
3418     case 64:
3419       Opc = AArch64::FRINTADr;
3420       break;
3421     }
3422   } else {
3423     unsigned NumElts = SrcTy.getNumElements();
3424     switch (SrcTy.getElementType().getSizeInBits()) {
3425     default:
3426       break;
3427     case 16:
3428       if (NumElts == 4)
3429         Opc = AArch64::FRINTAv4f16;
3430       else if (NumElts == 8)
3431         Opc = AArch64::FRINTAv8f16;
3432       break;
3433     case 32:
3434       if (NumElts == 2)
3435         Opc = AArch64::FRINTAv2f32;
3436       else if (NumElts == 4)
3437         Opc = AArch64::FRINTAv4f32;
3438       break;
3439     case 64:
3440       if (NumElts == 2)
3441         Opc = AArch64::FRINTAv2f64;
3442       break;
3443     }
3444   }
3445 
3446   if (!Opc) {
3447     // Didn't get an opcode above, bail.
3448     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3449     return false;
3450   }
3451 
3452   // Legalization would have set us up perfectly for this; we just need to
3453   // set the opcode and move on.
3454   I.setDesc(TII.get(Opc));
3455   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3456 }
3457 
selectVectorICmp(MachineInstr & I,MachineRegisterInfo & MRI)3458 bool AArch64InstructionSelector::selectVectorICmp(
3459     MachineInstr &I, MachineRegisterInfo &MRI) {
3460   Register DstReg = I.getOperand(0).getReg();
3461   LLT DstTy = MRI.getType(DstReg);
3462   Register SrcReg = I.getOperand(2).getReg();
3463   Register Src2Reg = I.getOperand(3).getReg();
3464   LLT SrcTy = MRI.getType(SrcReg);
3465 
3466   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3467   unsigned NumElts = DstTy.getNumElements();
3468 
3469   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3470   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3471   // Third index is cc opcode:
3472   // 0 == eq
3473   // 1 == ugt
3474   // 2 == uge
3475   // 3 == ult
3476   // 4 == ule
3477   // 5 == sgt
3478   // 6 == sge
3479   // 7 == slt
3480   // 8 == sle
3481   // ne is done by negating 'eq' result.
3482 
3483   // This table below assumes that for some comparisons the operands will be
3484   // commuted.
3485   // ult op == commute + ugt op
3486   // ule op == commute + uge op
3487   // slt op == commute + sgt op
3488   // sle op == commute + sge op
3489   unsigned PredIdx = 0;
3490   bool SwapOperands = false;
3491   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3492   switch (Pred) {
3493   case CmpInst::ICMP_NE:
3494   case CmpInst::ICMP_EQ:
3495     PredIdx = 0;
3496     break;
3497   case CmpInst::ICMP_UGT:
3498     PredIdx = 1;
3499     break;
3500   case CmpInst::ICMP_UGE:
3501     PredIdx = 2;
3502     break;
3503   case CmpInst::ICMP_ULT:
3504     PredIdx = 3;
3505     SwapOperands = true;
3506     break;
3507   case CmpInst::ICMP_ULE:
3508     PredIdx = 4;
3509     SwapOperands = true;
3510     break;
3511   case CmpInst::ICMP_SGT:
3512     PredIdx = 5;
3513     break;
3514   case CmpInst::ICMP_SGE:
3515     PredIdx = 6;
3516     break;
3517   case CmpInst::ICMP_SLT:
3518     PredIdx = 7;
3519     SwapOperands = true;
3520     break;
3521   case CmpInst::ICMP_SLE:
3522     PredIdx = 8;
3523     SwapOperands = true;
3524     break;
3525   default:
3526     llvm_unreachable("Unhandled icmp predicate");
3527     return false;
3528   }
3529 
3530   // This table obviously should be tablegen'd when we have our GISel native
3531   // tablegen selector.
3532 
3533   static const unsigned OpcTable[4][4][9] = {
3534       {
3535           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3536            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3537            0 /* invalid */},
3538           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3539            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3540            0 /* invalid */},
3541           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3542            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3543            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3544           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3545            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3546            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3547       },
3548       {
3549           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3550            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3551            0 /* invalid */},
3552           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3553            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3554            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3555           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3556            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3557            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3558           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3559            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3560            0 /* invalid */}
3561       },
3562       {
3563           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3564            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3565            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3566           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3567            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3568            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3569           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3570            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3571            0 /* invalid */},
3572           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3573            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3574            0 /* invalid */}
3575       },
3576       {
3577           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3578            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3579            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3580           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3581            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3582            0 /* invalid */},
3583           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3584            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3585            0 /* invalid */},
3586           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3587            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3588            0 /* invalid */}
3589       },
3590   };
3591   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3592   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3593   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3594   if (!Opc) {
3595     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3596     return false;
3597   }
3598 
3599   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3600   const TargetRegisterClass *SrcRC =
3601       getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
3602   if (!SrcRC) {
3603     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3604     return false;
3605   }
3606 
3607   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3608   if (SrcTy.getSizeInBits() == 128)
3609     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3610 
3611   if (SwapOperands)
3612     std::swap(SrcReg, Src2Reg);
3613 
3614   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3615   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3616 
3617   // Invert if we had a 'ne' cc.
3618   if (NotOpc) {
3619     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3620     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3621   } else {
3622     MIB.buildCopy(DstReg, Cmp.getReg(0));
3623   }
3624   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3625   I.eraseFromParent();
3626   return true;
3627 }
3628 
emitScalarToVector(unsigned EltSize,const TargetRegisterClass * DstRC,Register Scalar,MachineIRBuilder & MIRBuilder) const3629 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3630     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3631     MachineIRBuilder &MIRBuilder) const {
3632   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3633 
3634   auto BuildFn = [&](unsigned SubregIndex) {
3635     auto Ins =
3636         MIRBuilder
3637             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3638             .addImm(SubregIndex);
3639     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3640     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3641     return &*Ins;
3642   };
3643 
3644   switch (EltSize) {
3645   case 16:
3646     return BuildFn(AArch64::hsub);
3647   case 32:
3648     return BuildFn(AArch64::ssub);
3649   case 64:
3650     return BuildFn(AArch64::dsub);
3651   default:
3652     return nullptr;
3653   }
3654 }
3655 
selectMergeValues(MachineInstr & I,MachineRegisterInfo & MRI)3656 bool AArch64InstructionSelector::selectMergeValues(
3657     MachineInstr &I, MachineRegisterInfo &MRI) {
3658   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3659   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3660   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3661   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3662   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3663 
3664   if (I.getNumOperands() != 3)
3665     return false;
3666 
3667   // Merging 2 s64s into an s128.
3668   if (DstTy == LLT::scalar(128)) {
3669     if (SrcTy.getSizeInBits() != 64)
3670       return false;
3671     Register DstReg = I.getOperand(0).getReg();
3672     Register Src1Reg = I.getOperand(1).getReg();
3673     Register Src2Reg = I.getOperand(2).getReg();
3674     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3675     MachineInstr *InsMI =
3676         emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3677     if (!InsMI)
3678       return false;
3679     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3680                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3681     if (!Ins2MI)
3682       return false;
3683     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3684     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3685     I.eraseFromParent();
3686     return true;
3687   }
3688 
3689   if (RB.getID() != AArch64::GPRRegBankID)
3690     return false;
3691 
3692   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3693     return false;
3694 
3695   auto *DstRC = &AArch64::GPR64RegClass;
3696   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3697   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3698                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3699                                 .addDef(SubToRegDef)
3700                                 .addImm(0)
3701                                 .addUse(I.getOperand(1).getReg())
3702                                 .addImm(AArch64::sub_32);
3703   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3704   // Need to anyext the second scalar before we can use bfm
3705   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3706                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3707                                 .addDef(SubToRegDef2)
3708                                 .addImm(0)
3709                                 .addUse(I.getOperand(2).getReg())
3710                                 .addImm(AArch64::sub_32);
3711   MachineInstr &BFM =
3712       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3713            .addDef(I.getOperand(0).getReg())
3714            .addUse(SubToRegDef)
3715            .addUse(SubToRegDef2)
3716            .addImm(32)
3717            .addImm(31);
3718   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3719   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3720   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3721   I.eraseFromParent();
3722   return true;
3723 }
3724 
getLaneCopyOpcode(unsigned & CopyOpc,unsigned & ExtractSubReg,const unsigned EltSize)3725 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3726                               const unsigned EltSize) {
3727   // Choose a lane copy opcode and subregister based off of the size of the
3728   // vector's elements.
3729   switch (EltSize) {
3730   case 16:
3731     CopyOpc = AArch64::CPYi16;
3732     ExtractSubReg = AArch64::hsub;
3733     break;
3734   case 32:
3735     CopyOpc = AArch64::CPYi32;
3736     ExtractSubReg = AArch64::ssub;
3737     break;
3738   case 64:
3739     CopyOpc = AArch64::CPYi64;
3740     ExtractSubReg = AArch64::dsub;
3741     break;
3742   default:
3743     // Unknown size, bail out.
3744     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3745     return false;
3746   }
3747   return true;
3748 }
3749 
emitExtractVectorElt(Optional<Register> DstReg,const RegisterBank & DstRB,LLT ScalarTy,Register VecReg,unsigned LaneIdx,MachineIRBuilder & MIRBuilder) const3750 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3751     Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3752     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3753   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3754   unsigned CopyOpc = 0;
3755   unsigned ExtractSubReg = 0;
3756   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
3757     LLVM_DEBUG(
3758         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3759     return nullptr;
3760   }
3761 
3762   const TargetRegisterClass *DstRC =
3763       getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
3764   if (!DstRC) {
3765     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3766     return nullptr;
3767   }
3768 
3769   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3770   const LLT &VecTy = MRI.getType(VecReg);
3771   const TargetRegisterClass *VecRC =
3772       getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
3773   if (!VecRC) {
3774     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3775     return nullptr;
3776   }
3777 
3778   // The register that we're going to copy into.
3779   Register InsertReg = VecReg;
3780   if (!DstReg)
3781     DstReg = MRI.createVirtualRegister(DstRC);
3782   // If the lane index is 0, we just use a subregister COPY.
3783   if (LaneIdx == 0) {
3784     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
3785                     .addReg(VecReg, 0, ExtractSubReg);
3786     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3787     return &*Copy;
3788   }
3789 
3790   // Lane copies require 128-bit wide registers. If we're dealing with an
3791   // unpacked vector, then we need to move up to that width. Insert an implicit
3792   // def and a subregister insert to get us there.
3793   if (VecTy.getSizeInBits() != 128) {
3794     MachineInstr *ScalarToVector = emitScalarToVector(
3795         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
3796     if (!ScalarToVector)
3797       return nullptr;
3798     InsertReg = ScalarToVector->getOperand(0).getReg();
3799   }
3800 
3801   MachineInstr *LaneCopyMI =
3802       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
3803   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
3804 
3805   // Make sure that we actually constrain the initial copy.
3806   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3807   return LaneCopyMI;
3808 }
3809 
selectExtractElt(MachineInstr & I,MachineRegisterInfo & MRI)3810 bool AArch64InstructionSelector::selectExtractElt(
3811     MachineInstr &I, MachineRegisterInfo &MRI) {
3812   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
3813          "unexpected opcode!");
3814   Register DstReg = I.getOperand(0).getReg();
3815   const LLT NarrowTy = MRI.getType(DstReg);
3816   const Register SrcReg = I.getOperand(1).getReg();
3817   const LLT WideTy = MRI.getType(SrcReg);
3818   (void)WideTy;
3819   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
3820          "source register size too small!");
3821   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
3822 
3823   // Need the lane index to determine the correct copy opcode.
3824   MachineOperand &LaneIdxOp = I.getOperand(2);
3825   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
3826 
3827   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
3828     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
3829     return false;
3830   }
3831 
3832   // Find the index to extract from.
3833   auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
3834   if (!VRegAndVal)
3835     return false;
3836   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
3837 
3838 
3839   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3840   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
3841                                                LaneIdx, MIB);
3842   if (!Extract)
3843     return false;
3844 
3845   I.eraseFromParent();
3846   return true;
3847 }
3848 
selectSplitVectorUnmerge(MachineInstr & I,MachineRegisterInfo & MRI)3849 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
3850     MachineInstr &I, MachineRegisterInfo &MRI) {
3851   unsigned NumElts = I.getNumOperands() - 1;
3852   Register SrcReg = I.getOperand(NumElts).getReg();
3853   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
3854   const LLT SrcTy = MRI.getType(SrcReg);
3855 
3856   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
3857   if (SrcTy.getSizeInBits() > 128) {
3858     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
3859     return false;
3860   }
3861 
3862   // We implement a split vector operation by treating the sub-vectors as
3863   // scalars and extracting them.
3864   const RegisterBank &DstRB =
3865       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
3866   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
3867     Register Dst = I.getOperand(OpIdx).getReg();
3868     MachineInstr *Extract =
3869         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
3870     if (!Extract)
3871       return false;
3872   }
3873   I.eraseFromParent();
3874   return true;
3875 }
3876 
selectUnmergeValues(MachineInstr & I,MachineRegisterInfo & MRI)3877 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
3878                                                      MachineRegisterInfo &MRI) {
3879   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
3880          "unexpected opcode");
3881 
3882   // TODO: Handle unmerging into GPRs and from scalars to scalars.
3883   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
3884           AArch64::FPRRegBankID ||
3885       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3886           AArch64::FPRRegBankID) {
3887     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
3888                          "currently unsupported.\n");
3889     return false;
3890   }
3891 
3892   // The last operand is the vector source register, and every other operand is
3893   // a register to unpack into.
3894   unsigned NumElts = I.getNumOperands() - 1;
3895   Register SrcReg = I.getOperand(NumElts).getReg();
3896   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
3897   const LLT WideTy = MRI.getType(SrcReg);
3898   (void)WideTy;
3899   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
3900          "can only unmerge from vector or s128 types!");
3901   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
3902          "source register size too small!");
3903 
3904   if (!NarrowTy.isScalar())
3905     return selectSplitVectorUnmerge(I, MRI);
3906 
3907   // Choose a lane copy opcode and subregister based off of the size of the
3908   // vector's elements.
3909   unsigned CopyOpc = 0;
3910   unsigned ExtractSubReg = 0;
3911   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
3912     return false;
3913 
3914   // Set up for the lane copies.
3915   MachineBasicBlock &MBB = *I.getParent();
3916 
3917   // Stores the registers we'll be copying from.
3918   SmallVector<Register, 4> InsertRegs;
3919 
3920   // We'll use the first register twice, so we only need NumElts-1 registers.
3921   unsigned NumInsertRegs = NumElts - 1;
3922 
3923   // If our elements fit into exactly 128 bits, then we can copy from the source
3924   // directly. Otherwise, we need to do a bit of setup with some subregister
3925   // inserts.
3926   if (NarrowTy.getSizeInBits() * NumElts == 128) {
3927     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
3928   } else {
3929     // No. We have to perform subregister inserts. For each insert, create an
3930     // implicit def and a subregister insert, and save the register we create.
3931     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
3932       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
3933       MachineInstr &ImpDefMI =
3934           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
3935                    ImpDefReg);
3936 
3937       // Now, create the subregister insert from SrcReg.
3938       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
3939       MachineInstr &InsMI =
3940           *BuildMI(MBB, I, I.getDebugLoc(),
3941                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
3942                .addUse(ImpDefReg)
3943                .addUse(SrcReg)
3944                .addImm(AArch64::dsub);
3945 
3946       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
3947       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
3948 
3949       // Save the register so that we can copy from it after.
3950       InsertRegs.push_back(InsertReg);
3951     }
3952   }
3953 
3954   // Now that we've created any necessary subregister inserts, we can
3955   // create the copies.
3956   //
3957   // Perform the first copy separately as a subregister copy.
3958   Register CopyTo = I.getOperand(0).getReg();
3959   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
3960                        .addReg(InsertRegs[0], 0, ExtractSubReg);
3961   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
3962 
3963   // Now, perform the remaining copies as vector lane copies.
3964   unsigned LaneIdx = 1;
3965   for (Register InsReg : InsertRegs) {
3966     Register CopyTo = I.getOperand(LaneIdx).getReg();
3967     MachineInstr &CopyInst =
3968         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
3969              .addUse(InsReg)
3970              .addImm(LaneIdx);
3971     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
3972     ++LaneIdx;
3973   }
3974 
3975   // Separately constrain the first copy's destination. Because of the
3976   // limitation in constrainOperandRegClass, we can't guarantee that this will
3977   // actually be constrained. So, do it ourselves using the second operand.
3978   const TargetRegisterClass *RC =
3979       MRI.getRegClassOrNull(I.getOperand(1).getReg());
3980   if (!RC) {
3981     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
3982     return false;
3983   }
3984 
3985   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
3986   I.eraseFromParent();
3987   return true;
3988 }
3989 
selectConcatVectors(MachineInstr & I,MachineRegisterInfo & MRI)3990 bool AArch64InstructionSelector::selectConcatVectors(
3991     MachineInstr &I, MachineRegisterInfo &MRI)  {
3992   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
3993          "Unexpected opcode");
3994   Register Dst = I.getOperand(0).getReg();
3995   Register Op1 = I.getOperand(1).getReg();
3996   Register Op2 = I.getOperand(2).getReg();
3997   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
3998   if (!ConcatMI)
3999     return false;
4000   I.eraseFromParent();
4001   return true;
4002 }
4003 
4004 unsigned
emitConstantPoolEntry(const Constant * CPVal,MachineFunction & MF) const4005 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4006                                                   MachineFunction &MF) const {
4007   Type *CPTy = CPVal->getType();
4008   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4009 
4010   MachineConstantPool *MCP = MF.getConstantPool();
4011   return MCP->getConstantPoolIndex(CPVal, Alignment);
4012 }
4013 
emitLoadFromConstantPool(const Constant * CPVal,MachineIRBuilder & MIRBuilder) const4014 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4015     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4016   auto &MF = MIRBuilder.getMF();
4017   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4018 
4019   auto Adrp =
4020       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4021           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4022 
4023   MachineInstr *LoadMI = nullptr;
4024   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4025   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4026   switch (Size) {
4027   case 16:
4028     LoadMI =
4029         &*MIRBuilder
4030               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4031               .addConstantPoolIndex(CPIdx, 0,
4032                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4033     break;
4034   case 8:
4035     LoadMI =
4036         &*MIRBuilder
4037               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4038               .addConstantPoolIndex(CPIdx, 0,
4039                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4040     break;
4041   case 4:
4042     LoadMI =
4043         &*MIRBuilder
4044               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4045               .addConstantPoolIndex(CPIdx, 0,
4046                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4047     break;
4048   default:
4049     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4050                       << *CPVal->getType());
4051     return nullptr;
4052   }
4053   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4054                                                     MachineMemOperand::MOLoad,
4055                                                     Size, Align(Size)));
4056   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4057   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4058   return LoadMI;
4059 }
4060 
4061 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4062 /// size and RB.
4063 static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank & RB,unsigned EltSize)4064 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4065   unsigned Opc, SubregIdx;
4066   if (RB.getID() == AArch64::GPRRegBankID) {
4067     if (EltSize == 16) {
4068       Opc = AArch64::INSvi16gpr;
4069       SubregIdx = AArch64::ssub;
4070     } else if (EltSize == 32) {
4071       Opc = AArch64::INSvi32gpr;
4072       SubregIdx = AArch64::ssub;
4073     } else if (EltSize == 64) {
4074       Opc = AArch64::INSvi64gpr;
4075       SubregIdx = AArch64::dsub;
4076     } else {
4077       llvm_unreachable("invalid elt size!");
4078     }
4079   } else {
4080     if (EltSize == 8) {
4081       Opc = AArch64::INSvi8lane;
4082       SubregIdx = AArch64::bsub;
4083     } else if (EltSize == 16) {
4084       Opc = AArch64::INSvi16lane;
4085       SubregIdx = AArch64::hsub;
4086     } else if (EltSize == 32) {
4087       Opc = AArch64::INSvi32lane;
4088       SubregIdx = AArch64::ssub;
4089     } else if (EltSize == 64) {
4090       Opc = AArch64::INSvi64lane;
4091       SubregIdx = AArch64::dsub;
4092     } else {
4093       llvm_unreachable("invalid elt size!");
4094     }
4095   }
4096   return std::make_pair(Opc, SubregIdx);
4097 }
4098 
emitInstr(unsigned Opcode,std::initializer_list<llvm::DstOp> DstOps,std::initializer_list<llvm::SrcOp> SrcOps,MachineIRBuilder & MIRBuilder,const ComplexRendererFns & RenderFns) const4099 MachineInstr *AArch64InstructionSelector::emitInstr(
4100     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4101     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4102     const ComplexRendererFns &RenderFns) const {
4103   assert(Opcode && "Expected an opcode?");
4104   assert(!isPreISelGenericOpcode(Opcode) &&
4105          "Function should only be used to produce selected instructions!");
4106   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4107   if (RenderFns)
4108     for (auto &Fn : *RenderFns)
4109       Fn(MI);
4110   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4111   return &*MI;
4112 }
4113 
emitAddSub(const std::array<std::array<unsigned,2>,5> & AddrModeAndSizeToOpcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4114 MachineInstr *AArch64InstructionSelector::emitAddSub(
4115     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4116     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4117     MachineIRBuilder &MIRBuilder) const {
4118   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4119   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4120   auto Ty = MRI.getType(LHS.getReg());
4121   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4122   unsigned Size = Ty.getSizeInBits();
4123   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4124   bool Is32Bit = Size == 32;
4125 
4126   // INSTRri form with positive arithmetic immediate.
4127   if (auto Fns = selectArithImmed(RHS))
4128     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4129                      MIRBuilder, Fns);
4130 
4131   // INSTRri form with negative arithmetic immediate.
4132   if (auto Fns = selectNegArithImmed(RHS))
4133     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4134                      MIRBuilder, Fns);
4135 
4136   // INSTRrx form.
4137   if (auto Fns = selectArithExtendedRegister(RHS))
4138     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4139                      MIRBuilder, Fns);
4140 
4141   // INSTRrs form.
4142   if (auto Fns = selectShiftedRegister(RHS))
4143     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4144                      MIRBuilder, Fns);
4145   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4146                    MIRBuilder);
4147 }
4148 
4149 MachineInstr *
emitADD(Register DefReg,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4150 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4151                                     MachineOperand &RHS,
4152                                     MachineIRBuilder &MIRBuilder) const {
4153   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4154       {{AArch64::ADDXri, AArch64::ADDWri},
4155        {AArch64::ADDXrs, AArch64::ADDWrs},
4156        {AArch64::ADDXrr, AArch64::ADDWrr},
4157        {AArch64::SUBXri, AArch64::SUBWri},
4158        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4159   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4160 }
4161 
4162 MachineInstr *
emitADDS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4163 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4164                                      MachineOperand &RHS,
4165                                      MachineIRBuilder &MIRBuilder) const {
4166   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4167       {{AArch64::ADDSXri, AArch64::ADDSWri},
4168        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4169        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4170        {AArch64::SUBSXri, AArch64::SUBSWri},
4171        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4172   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4173 }
4174 
4175 MachineInstr *
emitSUBS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4176 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4177                                      MachineOperand &RHS,
4178                                      MachineIRBuilder &MIRBuilder) const {
4179   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4180       {{AArch64::SUBSXri, AArch64::SUBSWri},
4181        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4182        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4183        {AArch64::ADDSXri, AArch64::ADDSWri},
4184        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4185   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4186 }
4187 
4188 MachineInstr *
emitCMN(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4189 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4190                                     MachineIRBuilder &MIRBuilder) const {
4191   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4192   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4193   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4194   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4195 }
4196 
4197 MachineInstr *
emitTST(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4198 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4199                                     MachineIRBuilder &MIRBuilder) const {
4200   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4201   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4202   LLT Ty = MRI.getType(LHS.getReg());
4203   unsigned RegSize = Ty.getSizeInBits();
4204   bool Is32Bit = (RegSize == 32);
4205   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4206                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4207                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4208   // ANDS needs a logical immediate for its immediate form. Check if we can
4209   // fold one in.
4210   if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4211     int64_t Imm = ValAndVReg->Value.getSExtValue();
4212 
4213     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4214       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4215       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4216       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4217       return &*TstMI;
4218     }
4219   }
4220 
4221   if (auto Fns = selectLogicalShiftedRegister(RHS))
4222     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4223   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4224 }
4225 
emitIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const4226 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4227     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4228     MachineIRBuilder &MIRBuilder) const {
4229   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4230   assert(Predicate.isPredicate() && "Expected predicate?");
4231   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4232   LLT CmpTy = MRI.getType(LHS.getReg());
4233   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4234   unsigned Size = CmpTy.getSizeInBits();
4235   (void)Size;
4236   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4237   // Fold the compare into a cmn or tst if possible.
4238   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4239     return FoldCmp;
4240   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4241   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4242 }
4243 
emitCSetForFCmp(Register Dst,CmpInst::Predicate Pred,MachineIRBuilder & MIRBuilder) const4244 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4245     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4246   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4247 #ifndef NDEBUG
4248   LLT Ty = MRI.getType(Dst);
4249   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4250          "Expected a 32-bit scalar register?");
4251 #endif
4252   const Register ZeroReg = AArch64::WZR;
4253   auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
4254     auto CSet =
4255         MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
4256             .addImm(getInvertedCondCode(CC));
4257     constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
4258     return &*CSet;
4259   };
4260 
4261   AArch64CC::CondCode CC1, CC2;
4262   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4263   if (CC2 == AArch64CC::AL)
4264     return EmitCSet(Dst, CC1);
4265 
4266   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4267   Register Def1Reg = MRI.createVirtualRegister(RC);
4268   Register Def2Reg = MRI.createVirtualRegister(RC);
4269   EmitCSet(Def1Reg, CC1);
4270   EmitCSet(Def2Reg, CC2);
4271   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4272   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4273   return &*OrMI;
4274 }
4275 
4276 MachineInstr *
emitFPCompare(Register LHS,Register RHS,MachineIRBuilder & MIRBuilder,Optional<CmpInst::Predicate> Pred) const4277 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4278                                           MachineIRBuilder &MIRBuilder,
4279                                           Optional<CmpInst::Predicate> Pred) const {
4280   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4281   LLT Ty = MRI.getType(LHS);
4282   if (Ty.isVector())
4283     return nullptr;
4284   unsigned OpSize = Ty.getSizeInBits();
4285   if (OpSize != 32 && OpSize != 64)
4286     return nullptr;
4287 
4288   // If this is a compare against +0.0, then we don't have
4289   // to explicitly materialize a constant.
4290   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4291   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4292 
4293   auto IsEqualityPred = [](CmpInst::Predicate P) {
4294     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4295            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4296   };
4297   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4298     // Try commutating the operands.
4299     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4300     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4301       ShouldUseImm = true;
4302       std::swap(LHS, RHS);
4303     }
4304   }
4305   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4306                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4307   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4308 
4309   // Partially build the compare. Decide if we need to add a use for the
4310   // third operand based off whether or not we're comparing against 0.0.
4311   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4312   if (!ShouldUseImm)
4313     CmpMI.addUse(RHS);
4314   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4315   return &*CmpMI;
4316 }
4317 
emitVectorConcat(Optional<Register> Dst,Register Op1,Register Op2,MachineIRBuilder & MIRBuilder) const4318 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4319     Optional<Register> Dst, Register Op1, Register Op2,
4320     MachineIRBuilder &MIRBuilder) const {
4321   // We implement a vector concat by:
4322   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4323   // 2. Insert the upper vector into the destination's upper element
4324   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4325   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4326 
4327   const LLT Op1Ty = MRI.getType(Op1);
4328   const LLT Op2Ty = MRI.getType(Op2);
4329 
4330   if (Op1Ty != Op2Ty) {
4331     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4332     return nullptr;
4333   }
4334   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4335 
4336   if (Op1Ty.getSizeInBits() >= 128) {
4337     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4338     return nullptr;
4339   }
4340 
4341   // At the moment we just support 64 bit vector concats.
4342   if (Op1Ty.getSizeInBits() != 64) {
4343     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4344     return nullptr;
4345   }
4346 
4347   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4348   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4349   const TargetRegisterClass *DstRC =
4350       getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4351 
4352   MachineInstr *WidenedOp1 =
4353       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4354   MachineInstr *WidenedOp2 =
4355       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4356   if (!WidenedOp1 || !WidenedOp2) {
4357     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4358     return nullptr;
4359   }
4360 
4361   // Now do the insert of the upper element.
4362   unsigned InsertOpc, InsSubRegIdx;
4363   std::tie(InsertOpc, InsSubRegIdx) =
4364       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4365 
4366   if (!Dst)
4367     Dst = MRI.createVirtualRegister(DstRC);
4368   auto InsElt =
4369       MIRBuilder
4370           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4371           .addImm(1) /* Lane index */
4372           .addUse(WidenedOp2->getOperand(0).getReg())
4373           .addImm(0);
4374   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4375   return &*InsElt;
4376 }
4377 
4378 MachineInstr *
emitCSetForICMP(Register DefReg,unsigned Pred,MachineIRBuilder & MIRBuilder,Register SrcReg) const4379 AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
4380                                             MachineIRBuilder &MIRBuilder,
4381                                             Register SrcReg) const {
4382   // CSINC increments the result when the predicate is false. Invert it.
4383   const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
4384       CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
4385   auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
4386                .addImm(InvCC);
4387   constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
4388   return &*I;
4389 }
4390 
4391 std::pair<MachineInstr *, AArch64CC::CondCode>
emitOverflowOp(unsigned Opcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4392 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4393                                            MachineOperand &LHS,
4394                                            MachineOperand &RHS,
4395                                            MachineIRBuilder &MIRBuilder) const {
4396   switch (Opcode) {
4397   default:
4398     llvm_unreachable("Unexpected opcode!");
4399   case TargetOpcode::G_SADDO:
4400     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4401   case TargetOpcode::G_UADDO:
4402     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4403   case TargetOpcode::G_SSUBO:
4404     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4405   case TargetOpcode::G_USUBO:
4406     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4407   }
4408 }
4409 
tryOptSelect(MachineInstr & I)4410 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
4411   MachineRegisterInfo &MRI = *MIB.getMRI();
4412   // We want to recognize this pattern:
4413   //
4414   // $z = G_FCMP pred, $x, $y
4415   // ...
4416   // $w = G_SELECT $z, $a, $b
4417   //
4418   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4419   // some copies/truncs in between.)
4420   //
4421   // If we see this, then we can emit something like this:
4422   //
4423   // fcmp $x, $y
4424   // fcsel $w, $a, $b, pred
4425   //
4426   // Rather than emitting both of the rather long sequences in the standard
4427   // G_FCMP/G_SELECT select methods.
4428 
4429   // First, check if the condition is defined by a compare.
4430   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4431   while (CondDef) {
4432     // We can only fold if all of the defs have one use.
4433     Register CondDefReg = CondDef->getOperand(0).getReg();
4434     if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4435       // Unless it's another select.
4436       for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4437         if (CondDef == &UI)
4438           continue;
4439         if (UI.getOpcode() != TargetOpcode::G_SELECT)
4440           return false;
4441       }
4442     }
4443 
4444     // We can skip over G_TRUNC since the condition is 1-bit.
4445     // Truncating/extending can have no impact on the value.
4446     unsigned Opc = CondDef->getOpcode();
4447     if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
4448       break;
4449 
4450     // Can't see past copies from physregs.
4451     if (Opc == TargetOpcode::COPY &&
4452         Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
4453       return false;
4454 
4455     CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
4456   }
4457 
4458   // Is the condition defined by a compare?
4459   if (!CondDef)
4460     return false;
4461 
4462   unsigned CondOpc = CondDef->getOpcode();
4463   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
4464     return false;
4465 
4466   AArch64CC::CondCode CondCode;
4467   if (CondOpc == TargetOpcode::G_ICMP) {
4468     auto Pred =
4469         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4470     CondCode = changeICMPPredToAArch64CC(Pred);
4471     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4472                        CondDef->getOperand(1), MIB);
4473   } else {
4474     // Get the condition code for the select.
4475     auto Pred =
4476         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4477     AArch64CC::CondCode CondCode2;
4478     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4479 
4480     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4481     // instructions to emit the comparison.
4482     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4483     // unnecessary.
4484     if (CondCode2 != AArch64CC::AL)
4485       return false;
4486 
4487     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4488                        CondDef->getOperand(3).getReg(), MIB)) {
4489       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4490       return false;
4491     }
4492   }
4493 
4494   // Emit the select.
4495   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4496              I.getOperand(3).getReg(), CondCode, MIB);
4497   I.eraseFromParent();
4498   return true;
4499 }
4500 
tryFoldIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const4501 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4502     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4503     MachineIRBuilder &MIRBuilder) const {
4504   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
4505          "Unexpected MachineOperand");
4506   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4507   // We want to find this sort of thing:
4508   // x = G_SUB 0, y
4509   // G_ICMP z, x
4510   //
4511   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
4512   // e.g:
4513   //
4514   // cmn z, y
4515 
4516   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
4517   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
4518   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
4519   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
4520   // Given this:
4521   //
4522   // x = G_SUB 0, y
4523   // G_ICMP x, z
4524   //
4525   // Produce this:
4526   //
4527   // cmn y, z
4528   if (isCMN(LHSDef, P, MRI))
4529     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
4530 
4531   // Same idea here, but with the RHS of the compare instead:
4532   //
4533   // Given this:
4534   //
4535   // x = G_SUB 0, y
4536   // G_ICMP z, x
4537   //
4538   // Produce this:
4539   //
4540   // cmn z, y
4541   if (isCMN(RHSDef, P, MRI))
4542     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
4543 
4544   // Given this:
4545   //
4546   // z = G_AND x, y
4547   // G_ICMP z, 0
4548   //
4549   // Produce this if the compare is signed:
4550   //
4551   // tst x, y
4552   if (!CmpInst::isUnsigned(P) && LHSDef &&
4553       LHSDef->getOpcode() == TargetOpcode::G_AND) {
4554     // Make sure that the RHS is 0.
4555     auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4556     if (!ValAndVReg || ValAndVReg->Value != 0)
4557       return nullptr;
4558 
4559     return emitTST(LHSDef->getOperand(1),
4560                    LHSDef->getOperand(2), MIRBuilder);
4561   }
4562 
4563   return nullptr;
4564 }
4565 
selectShuffleVector(MachineInstr & I,MachineRegisterInfo & MRI)4566 bool AArch64InstructionSelector::selectShuffleVector(
4567     MachineInstr &I, MachineRegisterInfo &MRI) {
4568   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4569   Register Src1Reg = I.getOperand(1).getReg();
4570   const LLT Src1Ty = MRI.getType(Src1Reg);
4571   Register Src2Reg = I.getOperand(2).getReg();
4572   const LLT Src2Ty = MRI.getType(Src2Reg);
4573   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
4574 
4575   MachineBasicBlock &MBB = *I.getParent();
4576   MachineFunction &MF = *MBB.getParent();
4577   LLVMContext &Ctx = MF.getFunction().getContext();
4578 
4579   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
4580   // it's originated from a <1 x T> type. Those should have been lowered into
4581   // G_BUILD_VECTOR earlier.
4582   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
4583     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
4584     return false;
4585   }
4586 
4587   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
4588 
4589   SmallVector<Constant *, 64> CstIdxs;
4590   for (int Val : Mask) {
4591     // For now, any undef indexes we'll just assume to be 0. This should be
4592     // optimized in future, e.g. to select DUP etc.
4593     Val = Val < 0 ? 0 : Val;
4594     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4595       unsigned Offset = Byte + Val * BytesPerElt;
4596       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
4597     }
4598   }
4599 
4600   // Use a constant pool to load the index vector for TBL.
4601   Constant *CPVal = ConstantVector::get(CstIdxs);
4602   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
4603   if (!IndexLoad) {
4604     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
4605     return false;
4606   }
4607 
4608   if (DstTy.getSizeInBits() != 128) {
4609     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
4610     // This case can be done with TBL1.
4611     MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
4612     if (!Concat) {
4613       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
4614       return false;
4615     }
4616 
4617     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
4618     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
4619                                    IndexLoad->getOperand(0).getReg(), MIB);
4620 
4621     auto TBL1 = MIB.buildInstr(
4622         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
4623         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
4624     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
4625 
4626     auto Copy =
4627         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
4628             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
4629     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
4630     I.eraseFromParent();
4631     return true;
4632   }
4633 
4634   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
4635   // Q registers for regalloc.
4636   auto RegSeq = MIB.buildInstr(TargetOpcode::REG_SEQUENCE,
4637                                {&AArch64::QQRegClass}, {Src1Reg})
4638                     .addImm(AArch64::qsub0)
4639                     .addUse(Src2Reg)
4640                     .addImm(AArch64::qsub1);
4641 
4642   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
4643                              {RegSeq, IndexLoad->getOperand(0)});
4644   constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
4645   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
4646   I.eraseFromParent();
4647   return true;
4648 }
4649 
emitLaneInsert(Optional<Register> DstReg,Register SrcReg,Register EltReg,unsigned LaneIdx,const RegisterBank & RB,MachineIRBuilder & MIRBuilder) const4650 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
4651     Optional<Register> DstReg, Register SrcReg, Register EltReg,
4652     unsigned LaneIdx, const RegisterBank &RB,
4653     MachineIRBuilder &MIRBuilder) const {
4654   MachineInstr *InsElt = nullptr;
4655   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4656   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4657 
4658   // Create a register to define with the insert if one wasn't passed in.
4659   if (!DstReg)
4660     DstReg = MRI.createVirtualRegister(DstRC);
4661 
4662   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
4663   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
4664 
4665   if (RB.getID() == AArch64::FPRRegBankID) {
4666     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
4667     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4668                  .addImm(LaneIdx)
4669                  .addUse(InsSub->getOperand(0).getReg())
4670                  .addImm(0);
4671   } else {
4672     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4673                  .addImm(LaneIdx)
4674                  .addUse(EltReg);
4675   }
4676 
4677   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4678   return InsElt;
4679 }
4680 
selectInsertElt(MachineInstr & I,MachineRegisterInfo & MRI)4681 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
4682                                                  MachineRegisterInfo &MRI) {
4683   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
4684 
4685   // Get information on the destination.
4686   Register DstReg = I.getOperand(0).getReg();
4687   const LLT DstTy = MRI.getType(DstReg);
4688   unsigned VecSize = DstTy.getSizeInBits();
4689 
4690   // Get information on the element we want to insert into the destination.
4691   Register EltReg = I.getOperand(2).getReg();
4692   const LLT EltTy = MRI.getType(EltReg);
4693   unsigned EltSize = EltTy.getSizeInBits();
4694   if (EltSize < 16 || EltSize > 64)
4695     return false; // Don't support all element types yet.
4696 
4697   // Find the definition of the index. Bail out if it's not defined by a
4698   // G_CONSTANT.
4699   Register IdxReg = I.getOperand(3).getReg();
4700   auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
4701   if (!VRegAndVal)
4702     return false;
4703   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4704 
4705   // Perform the lane insert.
4706   Register SrcReg = I.getOperand(1).getReg();
4707   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
4708 
4709   if (VecSize < 128) {
4710     // If the vector we're inserting into is smaller than 128 bits, widen it
4711     // to 128 to do the insert.
4712     MachineInstr *ScalarToVec =
4713         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
4714     if (!ScalarToVec)
4715       return false;
4716     SrcReg = ScalarToVec->getOperand(0).getReg();
4717   }
4718 
4719   // Create an insert into a new FPR128 register.
4720   // Note that if our vector is already 128 bits, we end up emitting an extra
4721   // register.
4722   MachineInstr *InsMI =
4723       emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
4724 
4725   if (VecSize < 128) {
4726     // If we had to widen to perform the insert, then we have to demote back to
4727     // the original size to get the result we want.
4728     Register DemoteVec = InsMI->getOperand(0).getReg();
4729     const TargetRegisterClass *RC =
4730         getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
4731     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4732       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4733       return false;
4734     }
4735     unsigned SubReg = 0;
4736     if (!getSubRegForClass(RC, TRI, SubReg))
4737       return false;
4738     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
4739       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
4740                         << "\n");
4741       return false;
4742     }
4743     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
4744         .addReg(DemoteVec, 0, SubReg);
4745     RBI.constrainGenericRegister(DstReg, *RC, MRI);
4746   } else {
4747     // No widening needed.
4748     InsMI->getOperand(0).setReg(DstReg);
4749     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4750   }
4751 
4752   I.eraseFromParent();
4753   return true;
4754 }
4755 
4756 MachineInstr *
emitConstantVector(Register Dst,Constant * CV,MachineIRBuilder & MIRBuilder,MachineRegisterInfo & MRI)4757 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
4758                                                MachineIRBuilder &MIRBuilder,
4759                                                MachineRegisterInfo &MRI) {
4760   LLT DstTy = MRI.getType(Dst);
4761   unsigned DstSize = DstTy.getSizeInBits();
4762   if (CV->isNullValue()) {
4763     if (DstSize == 128) {
4764       auto Mov =
4765           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
4766       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
4767       return &*Mov;
4768     }
4769 
4770     if (DstSize == 64) {
4771       auto Mov =
4772           MIRBuilder
4773               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
4774               .addImm(0);
4775       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
4776                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
4777       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
4778       return &*Copy;
4779     }
4780   }
4781 
4782   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
4783   if (!CPLoad) {
4784     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
4785     return nullptr;
4786   }
4787 
4788   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
4789   RBI.constrainGenericRegister(
4790       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
4791   return &*Copy;
4792 }
4793 
tryOptConstantBuildVec(MachineInstr & I,LLT DstTy,MachineRegisterInfo & MRI)4794 bool AArch64InstructionSelector::tryOptConstantBuildVec(
4795     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
4796   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4797   unsigned DstSize = DstTy.getSizeInBits();
4798   assert(DstSize <= 128 && "Unexpected build_vec type!");
4799   if (DstSize < 32)
4800     return false;
4801   // Check if we're building a constant vector, in which case we want to
4802   // generate a constant pool load instead of a vector insert sequence.
4803   SmallVector<Constant *, 16> Csts;
4804   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
4805     // Try to find G_CONSTANT or G_FCONSTANT
4806     auto *OpMI =
4807         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
4808     if (OpMI)
4809       Csts.emplace_back(
4810           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
4811     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
4812                                   I.getOperand(Idx).getReg(), MRI)))
4813       Csts.emplace_back(
4814           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
4815     else
4816       return false;
4817   }
4818   Constant *CV = ConstantVector::get(Csts);
4819   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
4820     return false;
4821   I.eraseFromParent();
4822   return true;
4823 }
4824 
selectBuildVector(MachineInstr & I,MachineRegisterInfo & MRI)4825 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
4826                                                    MachineRegisterInfo &MRI) {
4827   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4828   // Until we port more of the optimized selections, for now just use a vector
4829   // insert sequence.
4830   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4831   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
4832   unsigned EltSize = EltTy.getSizeInBits();
4833 
4834   if (tryOptConstantBuildVec(I, DstTy, MRI))
4835     return true;
4836   if (EltSize < 16 || EltSize > 64)
4837     return false; // Don't support all element types yet.
4838   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
4839 
4840   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4841   MachineInstr *ScalarToVec =
4842       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
4843                          I.getOperand(1).getReg(), MIB);
4844   if (!ScalarToVec)
4845     return false;
4846 
4847   Register DstVec = ScalarToVec->getOperand(0).getReg();
4848   unsigned DstSize = DstTy.getSizeInBits();
4849 
4850   // Keep track of the last MI we inserted. Later on, we might be able to save
4851   // a copy using it.
4852   MachineInstr *PrevMI = nullptr;
4853   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
4854     // Note that if we don't do a subregister copy, we can end up making an
4855     // extra register.
4856     PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
4857                               MIB);
4858     DstVec = PrevMI->getOperand(0).getReg();
4859   }
4860 
4861   // If DstTy's size in bits is less than 128, then emit a subregister copy
4862   // from DstVec to the last register we've defined.
4863   if (DstSize < 128) {
4864     // Force this to be FPR using the destination vector.
4865     const TargetRegisterClass *RC =
4866         getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
4867     if (!RC)
4868       return false;
4869     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4870       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4871       return false;
4872     }
4873 
4874     unsigned SubReg = 0;
4875     if (!getSubRegForClass(RC, TRI, SubReg))
4876       return false;
4877     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
4878       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
4879                         << "\n");
4880       return false;
4881     }
4882 
4883     Register Reg = MRI.createVirtualRegister(RC);
4884     Register DstReg = I.getOperand(0).getReg();
4885 
4886     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
4887     MachineOperand &RegOp = I.getOperand(1);
4888     RegOp.setReg(Reg);
4889     RBI.constrainGenericRegister(DstReg, *RC, MRI);
4890   } else {
4891     // We don't need a subregister copy. Save a copy by re-using the
4892     // destination register on the final insert.
4893     assert(PrevMI && "PrevMI was null?");
4894     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
4895     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
4896   }
4897 
4898   I.eraseFromParent();
4899   return true;
4900 }
4901 
4902 /// Helper function to find an intrinsic ID on an a MachineInstr. Returns the
4903 /// ID if it exists, and 0 otherwise.
findIntrinsicID(MachineInstr & I)4904 static unsigned findIntrinsicID(MachineInstr &I) {
4905   auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
4906     return Op.isIntrinsicID();
4907   });
4908   if (IntrinOp == I.operands_end())
4909     return 0;
4910   return IntrinOp->getIntrinsicID();
4911 }
4912 
selectIntrinsicWithSideEffects(MachineInstr & I,MachineRegisterInfo & MRI)4913 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
4914     MachineInstr &I, MachineRegisterInfo &MRI) {
4915   // Find the intrinsic ID.
4916   unsigned IntrinID = findIntrinsicID(I);
4917   if (!IntrinID)
4918     return false;
4919 
4920   // Select the instruction.
4921   switch (IntrinID) {
4922   default:
4923     return false;
4924   case Intrinsic::aarch64_ldxp:
4925   case Intrinsic::aarch64_ldaxp: {
4926     auto NewI = MIB.buildInstr(
4927         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
4928         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
4929         {I.getOperand(3)});
4930     NewI.cloneMemRefs(I);
4931     break;
4932   }
4933   case Intrinsic::trap:
4934     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
4935     break;
4936   case Intrinsic::debugtrap:
4937     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
4938     break;
4939   case Intrinsic::ubsantrap:
4940     MIB.buildInstr(AArch64::BRK, {}, {})
4941         .addImm(I.getOperand(1).getImm() | ('U' << 8));
4942     break;
4943   }
4944 
4945   I.eraseFromParent();
4946   return true;
4947 }
4948 
selectIntrinsic(MachineInstr & I,MachineRegisterInfo & MRI)4949 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
4950                                                  MachineRegisterInfo &MRI) {
4951   unsigned IntrinID = findIntrinsicID(I);
4952   if (!IntrinID)
4953     return false;
4954 
4955   switch (IntrinID) {
4956   default:
4957     break;
4958   case Intrinsic::aarch64_crypto_sha1h: {
4959     Register DstReg = I.getOperand(0).getReg();
4960     Register SrcReg = I.getOperand(2).getReg();
4961 
4962     // FIXME: Should this be an assert?
4963     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
4964         MRI.getType(SrcReg).getSizeInBits() != 32)
4965       return false;
4966 
4967     // The operation has to happen on FPRs. Set up some new FPR registers for
4968     // the source and destination if they are on GPRs.
4969     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4970       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
4971       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
4972 
4973       // Make sure the copy ends up getting constrained properly.
4974       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
4975                                    AArch64::GPR32RegClass, MRI);
4976     }
4977 
4978     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
4979       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
4980 
4981     // Actually insert the instruction.
4982     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
4983     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
4984 
4985     // Did we create a new register for the destination?
4986     if (DstReg != I.getOperand(0).getReg()) {
4987       // Yep. Copy the result of the instruction back into the original
4988       // destination.
4989       MIB.buildCopy({I.getOperand(0)}, {DstReg});
4990       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
4991                                    AArch64::GPR32RegClass, MRI);
4992     }
4993 
4994     I.eraseFromParent();
4995     return true;
4996   }
4997   case Intrinsic::frameaddress:
4998   case Intrinsic::returnaddress: {
4999     MachineFunction &MF = *I.getParent()->getParent();
5000     MachineFrameInfo &MFI = MF.getFrameInfo();
5001 
5002     unsigned Depth = I.getOperand(2).getImm();
5003     Register DstReg = I.getOperand(0).getReg();
5004     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5005 
5006     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5007       if (!MFReturnAddr) {
5008         // Insert the copy from LR/X30 into the entry block, before it can be
5009         // clobbered by anything.
5010         MFI.setReturnAddressIsTaken(true);
5011         MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
5012                                                 AArch64::GPR64RegClass);
5013       }
5014 
5015       if (STI.hasPAuth()) {
5016         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5017       } else {
5018         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5019         MIB.buildInstr(AArch64::XPACLRI);
5020         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5021       }
5022 
5023       I.eraseFromParent();
5024       return true;
5025     }
5026 
5027     MFI.setFrameAddressIsTaken(true);
5028     Register FrameAddr(AArch64::FP);
5029     while (Depth--) {
5030       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5031       auto Ldr =
5032           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5033       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5034       FrameAddr = NextFrame;
5035     }
5036 
5037     if (IntrinID == Intrinsic::frameaddress)
5038       MIB.buildCopy({DstReg}, {FrameAddr});
5039     else {
5040       MFI.setReturnAddressIsTaken(true);
5041 
5042       if (STI.hasPAuth()) {
5043         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5044         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5045         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5046       } else {
5047         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5048             .addImm(1);
5049         MIB.buildInstr(AArch64::XPACLRI);
5050         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5051       }
5052     }
5053 
5054     I.eraseFromParent();
5055     return true;
5056   }
5057   case Intrinsic::swift_async_context_addr:
5058     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5059                               {Register(AArch64::FP)})
5060                    .addImm(8)
5061                    .addImm(0);
5062     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5063 
5064     MF->getFrameInfo().setFrameAddressIsTaken(true);
5065     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5066     I.eraseFromParent();
5067     return true;
5068   }
5069   return false;
5070 }
5071 
5072 InstructionSelector::ComplexRendererFns
selectShiftA_32(const MachineOperand & Root) const5073 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5074   auto MaybeImmed = getImmedFromMO(Root);
5075   if (MaybeImmed == None || *MaybeImmed > 31)
5076     return None;
5077   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5078   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5079 }
5080 
5081 InstructionSelector::ComplexRendererFns
selectShiftB_32(const MachineOperand & Root) const5082 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5083   auto MaybeImmed = getImmedFromMO(Root);
5084   if (MaybeImmed == None || *MaybeImmed > 31)
5085     return None;
5086   uint64_t Enc = 31 - *MaybeImmed;
5087   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5088 }
5089 
5090 InstructionSelector::ComplexRendererFns
selectShiftA_64(const MachineOperand & Root) const5091 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5092   auto MaybeImmed = getImmedFromMO(Root);
5093   if (MaybeImmed == None || *MaybeImmed > 63)
5094     return None;
5095   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5096   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5097 }
5098 
5099 InstructionSelector::ComplexRendererFns
selectShiftB_64(const MachineOperand & Root) const5100 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5101   auto MaybeImmed = getImmedFromMO(Root);
5102   if (MaybeImmed == None || *MaybeImmed > 63)
5103     return None;
5104   uint64_t Enc = 63 - *MaybeImmed;
5105   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5106 }
5107 
5108 /// Helper to select an immediate value that can be represented as a 12-bit
5109 /// value shifted left by either 0 or 12. If it is possible to do so, return
5110 /// the immediate and shift value. If not, return None.
5111 ///
5112 /// Used by selectArithImmed and selectNegArithImmed.
5113 InstructionSelector::ComplexRendererFns
select12BitValueWithLeftShift(uint64_t Immed) const5114 AArch64InstructionSelector::select12BitValueWithLeftShift(
5115     uint64_t Immed) const {
5116   unsigned ShiftAmt;
5117   if (Immed >> 12 == 0) {
5118     ShiftAmt = 0;
5119   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5120     ShiftAmt = 12;
5121     Immed = Immed >> 12;
5122   } else
5123     return None;
5124 
5125   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5126   return {{
5127       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5128       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5129   }};
5130 }
5131 
5132 /// SelectArithImmed - Select an immediate value that can be represented as
5133 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
5134 /// Val set to the 12-bit value and Shift set to the shifter operand.
5135 InstructionSelector::ComplexRendererFns
selectArithImmed(MachineOperand & Root) const5136 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5137   // This function is called from the addsub_shifted_imm ComplexPattern,
5138   // which lists [imm] as the list of opcode it's interested in, however
5139   // we still need to check whether the operand is actually an immediate
5140   // here because the ComplexPattern opcode list is only used in
5141   // root-level opcode matching.
5142   auto MaybeImmed = getImmedFromMO(Root);
5143   if (MaybeImmed == None)
5144     return None;
5145   return select12BitValueWithLeftShift(*MaybeImmed);
5146 }
5147 
5148 /// SelectNegArithImmed - As above, but negates the value before trying to
5149 /// select it.
5150 InstructionSelector::ComplexRendererFns
selectNegArithImmed(MachineOperand & Root) const5151 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5152   // We need a register here, because we need to know if we have a 64 or 32
5153   // bit immediate.
5154   if (!Root.isReg())
5155     return None;
5156   auto MaybeImmed = getImmedFromMO(Root);
5157   if (MaybeImmed == None)
5158     return None;
5159   uint64_t Immed = *MaybeImmed;
5160 
5161   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5162   // have the opposite effect on the C flag, so this pattern mustn't match under
5163   // those circumstances.
5164   if (Immed == 0)
5165     return None;
5166 
5167   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5168   // the root.
5169   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5170   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5171     Immed = ~((uint32_t)Immed) + 1;
5172   else
5173     Immed = ~Immed + 1ULL;
5174 
5175   if (Immed & 0xFFFFFFFFFF000000ULL)
5176     return None;
5177 
5178   Immed &= 0xFFFFFFULL;
5179   return select12BitValueWithLeftShift(Immed);
5180 }
5181 
5182 /// Return true if it is worth folding MI into an extended register. That is,
5183 /// if it's safe to pull it into the addressing mode of a load or store as a
5184 /// shift.
isWorthFoldingIntoExtendedReg(MachineInstr & MI,const MachineRegisterInfo & MRI) const5185 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
5186     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
5187   // Always fold if there is one use, or if we're optimizing for size.
5188   Register DefReg = MI.getOperand(0).getReg();
5189   if (MRI.hasOneNonDBGUse(DefReg) ||
5190       MI.getParent()->getParent()->getFunction().hasOptSize())
5191     return true;
5192 
5193   // It's better to avoid folding and recomputing shifts when we don't have a
5194   // fastpath.
5195   if (!STI.hasLSLFast())
5196     return false;
5197 
5198   // We have a fastpath, so folding a shift in and potentially computing it
5199   // many times may be beneficial. Check if this is only used in memory ops.
5200   // If it is, then we should fold.
5201   return all_of(MRI.use_nodbg_instructions(DefReg),
5202                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
5203 }
5204 
isSignExtendShiftType(AArch64_AM::ShiftExtendType Type)5205 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
5206   switch (Type) {
5207   case AArch64_AM::SXTB:
5208   case AArch64_AM::SXTH:
5209   case AArch64_AM::SXTW:
5210     return true;
5211   default:
5212     return false;
5213   }
5214 }
5215 
5216 InstructionSelector::ComplexRendererFns
selectExtendedSHL(MachineOperand & Root,MachineOperand & Base,MachineOperand & Offset,unsigned SizeInBytes,bool WantsExt) const5217 AArch64InstructionSelector::selectExtendedSHL(
5218     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
5219     unsigned SizeInBytes, bool WantsExt) const {
5220   assert(Base.isReg() && "Expected base to be a register operand");
5221   assert(Offset.isReg() && "Expected offset to be a register operand");
5222 
5223   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5224   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
5225   if (!OffsetInst)
5226     return None;
5227 
5228   unsigned OffsetOpc = OffsetInst->getOpcode();
5229   bool LookedThroughZExt = false;
5230   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
5231     // Try to look through a ZEXT.
5232     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
5233       return None;
5234 
5235     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
5236     OffsetOpc = OffsetInst->getOpcode();
5237     LookedThroughZExt = true;
5238 
5239     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
5240       return None;
5241   }
5242   // Make sure that the memory op is a valid size.
5243   int64_t LegalShiftVal = Log2_32(SizeInBytes);
5244   if (LegalShiftVal == 0)
5245     return None;
5246   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5247     return None;
5248 
5249   // Now, try to find the specific G_CONSTANT. Start by assuming that the
5250   // register we will offset is the LHS, and the register containing the
5251   // constant is the RHS.
5252   Register OffsetReg = OffsetInst->getOperand(1).getReg();
5253   Register ConstantReg = OffsetInst->getOperand(2).getReg();
5254   auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5255   if (!ValAndVReg) {
5256     // We didn't get a constant on the RHS. If the opcode is a shift, then
5257     // we're done.
5258     if (OffsetOpc == TargetOpcode::G_SHL)
5259       return None;
5260 
5261     // If we have a G_MUL, we can use either register. Try looking at the RHS.
5262     std::swap(OffsetReg, ConstantReg);
5263     ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5264     if (!ValAndVReg)
5265       return None;
5266   }
5267 
5268   // The value must fit into 3 bits, and must be positive. Make sure that is
5269   // true.
5270   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
5271 
5272   // Since we're going to pull this into a shift, the constant value must be
5273   // a power of 2. If we got a multiply, then we need to check this.
5274   if (OffsetOpc == TargetOpcode::G_MUL) {
5275     if (!isPowerOf2_32(ImmVal))
5276       return None;
5277 
5278     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
5279     ImmVal = Log2_32(ImmVal);
5280   }
5281 
5282   if ((ImmVal & 0x7) != ImmVal)
5283     return None;
5284 
5285   // We are only allowed to shift by LegalShiftVal. This shift value is built
5286   // into the instruction, so we can't just use whatever we want.
5287   if (ImmVal != LegalShiftVal)
5288     return None;
5289 
5290   unsigned SignExtend = 0;
5291   if (WantsExt) {
5292     // Check if the offset is defined by an extend, unless we looked through a
5293     // G_ZEXT earlier.
5294     if (!LookedThroughZExt) {
5295       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
5296       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
5297       if (Ext == AArch64_AM::InvalidShiftExtend)
5298         return None;
5299 
5300       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
5301       // We only support SXTW for signed extension here.
5302       if (SignExtend && Ext != AArch64_AM::SXTW)
5303         return None;
5304       OffsetReg = ExtInst->getOperand(1).getReg();
5305     }
5306 
5307     // Need a 32-bit wide register here.
5308     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
5309     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
5310   }
5311 
5312   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
5313   // offset. Signify that we are shifting by setting the shift flag to 1.
5314   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
5315            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
5316            [=](MachineInstrBuilder &MIB) {
5317              // Need to add both immediates here to make sure that they are both
5318              // added to the instruction.
5319              MIB.addImm(SignExtend);
5320              MIB.addImm(1);
5321            }}};
5322 }
5323 
5324 /// This is used for computing addresses like this:
5325 ///
5326 /// ldr x1, [x2, x3, lsl #3]
5327 ///
5328 /// Where x2 is the base register, and x3 is an offset register. The shift-left
5329 /// is a constant value specific to this load instruction. That is, we'll never
5330 /// see anything other than a 3 here (which corresponds to the size of the
5331 /// element being loaded.)
5332 InstructionSelector::ComplexRendererFns
selectAddrModeShiftedExtendXReg(MachineOperand & Root,unsigned SizeInBytes) const5333 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
5334     MachineOperand &Root, unsigned SizeInBytes) const {
5335   if (!Root.isReg())
5336     return None;
5337   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5338 
5339   // We want to find something like this:
5340   //
5341   // val = G_CONSTANT LegalShiftVal
5342   // shift = G_SHL off_reg val
5343   // ptr = G_PTR_ADD base_reg shift
5344   // x = G_LOAD ptr
5345   //
5346   // And fold it into this addressing mode:
5347   //
5348   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
5349 
5350   // Check if we can find the G_PTR_ADD.
5351   MachineInstr *PtrAdd =
5352       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5353   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5354     return None;
5355 
5356   // Now, try to match an opcode which will match our specific offset.
5357   // We want a G_SHL or a G_MUL.
5358   MachineInstr *OffsetInst =
5359       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
5360   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
5361                            OffsetInst->getOperand(0), SizeInBytes,
5362                            /*WantsExt=*/false);
5363 }
5364 
5365 /// This is used for computing addresses like this:
5366 ///
5367 /// ldr x1, [x2, x3]
5368 ///
5369 /// Where x2 is the base register, and x3 is an offset register.
5370 ///
5371 /// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
5372 /// this will do so. Otherwise, it will return None.
5373 InstructionSelector::ComplexRendererFns
selectAddrModeRegisterOffset(MachineOperand & Root) const5374 AArch64InstructionSelector::selectAddrModeRegisterOffset(
5375     MachineOperand &Root) const {
5376   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5377 
5378   // We need a GEP.
5379   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
5380   if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
5381     return None;
5382 
5383   // If this is used more than once, let's not bother folding.
5384   // TODO: Check if they are memory ops. If they are, then we can still fold
5385   // without having to recompute anything.
5386   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
5387     return None;
5388 
5389   // Base is the GEP's LHS, offset is its RHS.
5390   return {{[=](MachineInstrBuilder &MIB) {
5391              MIB.addUse(Gep->getOperand(1).getReg());
5392            },
5393            [=](MachineInstrBuilder &MIB) {
5394              MIB.addUse(Gep->getOperand(2).getReg());
5395            },
5396            [=](MachineInstrBuilder &MIB) {
5397              // Need to add both immediates here to make sure that they are both
5398              // added to the instruction.
5399              MIB.addImm(0);
5400              MIB.addImm(0);
5401            }}};
5402 }
5403 
5404 /// This is intended to be equivalent to selectAddrModeXRO in
5405 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
5406 InstructionSelector::ComplexRendererFns
selectAddrModeXRO(MachineOperand & Root,unsigned SizeInBytes) const5407 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
5408                                               unsigned SizeInBytes) const {
5409   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5410   if (!Root.isReg())
5411     return None;
5412   MachineInstr *PtrAdd =
5413       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5414   if (!PtrAdd)
5415     return None;
5416 
5417   // Check for an immediates which cannot be encoded in the [base + imm]
5418   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
5419   // end up with code like:
5420   //
5421   // mov x0, wide
5422   // add x1 base, x0
5423   // ldr x2, [x1, x0]
5424   //
5425   // In this situation, we can use the [base, xreg] addressing mode to save an
5426   // add/sub:
5427   //
5428   // mov x0, wide
5429   // ldr x2, [base, x0]
5430   auto ValAndVReg =
5431       getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
5432   if (ValAndVReg) {
5433     unsigned Scale = Log2_32(SizeInBytes);
5434     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
5435 
5436     // Skip immediates that can be selected in the load/store addresing
5437     // mode.
5438     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
5439         ImmOff < (0x1000 << Scale))
5440       return None;
5441 
5442     // Helper lambda to decide whether or not it is preferable to emit an add.
5443     auto isPreferredADD = [](int64_t ImmOff) {
5444       // Constants in [0x0, 0xfff] can be encoded in an add.
5445       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
5446         return true;
5447 
5448       // Can it be encoded in an add lsl #12?
5449       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
5450         return false;
5451 
5452       // It can be encoded in an add lsl #12, but we may not want to. If it is
5453       // possible to select this as a single movz, then prefer that. A single
5454       // movz is faster than an add with a shift.
5455       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
5456              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
5457     };
5458 
5459     // If the immediate can be encoded in a single add/sub, then bail out.
5460     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
5461       return None;
5462   }
5463 
5464   // Try to fold shifts into the addressing mode.
5465   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
5466   if (AddrModeFns)
5467     return AddrModeFns;
5468 
5469   // If that doesn't work, see if it's possible to fold in registers from
5470   // a GEP.
5471   return selectAddrModeRegisterOffset(Root);
5472 }
5473 
5474 /// This is used for computing addresses like this:
5475 ///
5476 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
5477 ///
5478 /// Where we have a 64-bit base register, a 32-bit offset register, and an
5479 /// extend (which may or may not be signed).
5480 InstructionSelector::ComplexRendererFns
selectAddrModeWRO(MachineOperand & Root,unsigned SizeInBytes) const5481 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
5482                                               unsigned SizeInBytes) const {
5483   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5484 
5485   MachineInstr *PtrAdd =
5486       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5487   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5488     return None;
5489 
5490   MachineOperand &LHS = PtrAdd->getOperand(1);
5491   MachineOperand &RHS = PtrAdd->getOperand(2);
5492   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
5493 
5494   // The first case is the same as selectAddrModeXRO, except we need an extend.
5495   // In this case, we try to find a shift and extend, and fold them into the
5496   // addressing mode.
5497   //
5498   // E.g.
5499   //
5500   // off_reg = G_Z/S/ANYEXT ext_reg
5501   // val = G_CONSTANT LegalShiftVal
5502   // shift = G_SHL off_reg val
5503   // ptr = G_PTR_ADD base_reg shift
5504   // x = G_LOAD ptr
5505   //
5506   // In this case we can get a load like this:
5507   //
5508   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
5509   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
5510                                        SizeInBytes, /*WantsExt=*/true);
5511   if (ExtendedShl)
5512     return ExtendedShl;
5513 
5514   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
5515   //
5516   // e.g.
5517   // ldr something, [base_reg, ext_reg, sxtw]
5518   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5519     return None;
5520 
5521   // Check if this is an extend. We'll get an extend type if it is.
5522   AArch64_AM::ShiftExtendType Ext =
5523       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
5524   if (Ext == AArch64_AM::InvalidShiftExtend)
5525     return None;
5526 
5527   // Need a 32-bit wide register.
5528   MachineIRBuilder MIB(*PtrAdd);
5529   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
5530                                        AArch64::GPR32RegClass, MIB);
5531   unsigned SignExtend = Ext == AArch64_AM::SXTW;
5532 
5533   // Base is LHS, offset is ExtReg.
5534   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
5535            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
5536            [=](MachineInstrBuilder &MIB) {
5537              MIB.addImm(SignExtend);
5538              MIB.addImm(0);
5539            }}};
5540 }
5541 
5542 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
5543 /// should only match when there is an offset that is not valid for a scaled
5544 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
5545 /// memory reference, which is needed here to know what is valid for a scaled
5546 /// immediate.
5547 InstructionSelector::ComplexRendererFns
selectAddrModeUnscaled(MachineOperand & Root,unsigned Size) const5548 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
5549                                                    unsigned Size) const {
5550   MachineRegisterInfo &MRI =
5551       Root.getParent()->getParent()->getParent()->getRegInfo();
5552 
5553   if (!Root.isReg())
5554     return None;
5555 
5556   if (!isBaseWithConstantOffset(Root, MRI))
5557     return None;
5558 
5559   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5560   if (!RootDef)
5561     return None;
5562 
5563   MachineOperand &OffImm = RootDef->getOperand(2);
5564   if (!OffImm.isReg())
5565     return None;
5566   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
5567   if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
5568     return None;
5569   int64_t RHSC;
5570   MachineOperand &RHSOp1 = RHS->getOperand(1);
5571   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
5572     return None;
5573   RHSC = RHSOp1.getCImm()->getSExtValue();
5574 
5575   // If the offset is valid as a scaled immediate, don't match here.
5576   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
5577     return None;
5578   if (RHSC >= -256 && RHSC < 256) {
5579     MachineOperand &Base = RootDef->getOperand(1);
5580     return {{
5581         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
5582         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
5583     }};
5584   }
5585   return None;
5586 }
5587 
5588 InstructionSelector::ComplexRendererFns
tryFoldAddLowIntoImm(MachineInstr & RootDef,unsigned Size,MachineRegisterInfo & MRI) const5589 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
5590                                                  unsigned Size,
5591                                                  MachineRegisterInfo &MRI) const {
5592   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
5593     return None;
5594   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
5595   if (Adrp.getOpcode() != AArch64::ADRP)
5596     return None;
5597 
5598   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
5599   auto Offset = Adrp.getOperand(1).getOffset();
5600   if (Offset % Size != 0)
5601     return None;
5602 
5603   auto GV = Adrp.getOperand(1).getGlobal();
5604   if (GV->isThreadLocal())
5605     return None;
5606 
5607   auto &MF = *RootDef.getParent()->getParent();
5608   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
5609     return None;
5610 
5611   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
5612   MachineIRBuilder MIRBuilder(RootDef);
5613   Register AdrpReg = Adrp.getOperand(0).getReg();
5614   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
5615            [=](MachineInstrBuilder &MIB) {
5616              MIB.addGlobalAddress(GV, Offset,
5617                                   OpFlags | AArch64II::MO_PAGEOFF |
5618                                       AArch64II::MO_NC);
5619            }}};
5620 }
5621 
5622 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
5623 /// "Size" argument is the size in bytes of the memory reference, which
5624 /// determines the scale.
5625 InstructionSelector::ComplexRendererFns
selectAddrModeIndexed(MachineOperand & Root,unsigned Size) const5626 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
5627                                                   unsigned Size) const {
5628   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
5629   MachineRegisterInfo &MRI = MF.getRegInfo();
5630 
5631   if (!Root.isReg())
5632     return None;
5633 
5634   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5635   if (!RootDef)
5636     return None;
5637 
5638   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
5639     return {{
5640         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
5641         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5642     }};
5643   }
5644 
5645   CodeModel::Model CM = MF.getTarget().getCodeModel();
5646   // Check if we can fold in the ADD of small code model ADRP + ADD address.
5647   if (CM == CodeModel::Small) {
5648     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
5649     if (OpFns)
5650       return OpFns;
5651   }
5652 
5653   if (isBaseWithConstantOffset(Root, MRI)) {
5654     MachineOperand &LHS = RootDef->getOperand(1);
5655     MachineOperand &RHS = RootDef->getOperand(2);
5656     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
5657     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
5658     if (LHSDef && RHSDef) {
5659       int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
5660       unsigned Scale = Log2_32(Size);
5661       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
5662         if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
5663           return {{
5664               [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
5665               [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5666           }};
5667 
5668         return {{
5669             [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
5670             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5671         }};
5672       }
5673     }
5674   }
5675 
5676   // Before falling back to our general case, check if the unscaled
5677   // instructions can handle this. If so, that's preferable.
5678   if (selectAddrModeUnscaled(Root, Size).hasValue())
5679     return None;
5680 
5681   return {{
5682       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
5683       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5684   }};
5685 }
5686 
5687 /// Given a shift instruction, return the correct shift type for that
5688 /// instruction.
getShiftTypeForInst(MachineInstr & MI)5689 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
5690   // TODO: Handle AArch64_AM::ROR
5691   switch (MI.getOpcode()) {
5692   default:
5693     return AArch64_AM::InvalidShiftExtend;
5694   case TargetOpcode::G_SHL:
5695     return AArch64_AM::LSL;
5696   case TargetOpcode::G_LSHR:
5697     return AArch64_AM::LSR;
5698   case TargetOpcode::G_ASHR:
5699     return AArch64_AM::ASR;
5700   }
5701 }
5702 
5703 /// Select a "shifted register" operand. If the value is not shifted, set the
5704 /// shift operand to a default value of "lsl 0".
5705 ///
5706 /// TODO: Allow shifted register to be rotated in logical instructions.
5707 InstructionSelector::ComplexRendererFns
selectShiftedRegister(MachineOperand & Root) const5708 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
5709   if (!Root.isReg())
5710     return None;
5711   MachineRegisterInfo &MRI =
5712       Root.getParent()->getParent()->getParent()->getRegInfo();
5713 
5714   // Check if the operand is defined by an instruction which corresponds to
5715   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
5716   //
5717   // TODO: Handle AArch64_AM::ROR for logical instructions.
5718   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
5719   if (!ShiftInst)
5720     return None;
5721   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
5722   if (ShType == AArch64_AM::InvalidShiftExtend)
5723     return None;
5724   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
5725     return None;
5726 
5727   // Need an immediate on the RHS.
5728   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
5729   auto Immed = getImmedFromMO(ShiftRHS);
5730   if (!Immed)
5731     return None;
5732 
5733   // We have something that we can fold. Fold in the shift's LHS and RHS into
5734   // the instruction.
5735   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
5736   Register ShiftReg = ShiftLHS.getReg();
5737 
5738   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
5739   unsigned Val = *Immed & (NumBits - 1);
5740   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
5741 
5742   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
5743            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
5744 }
5745 
getExtendTypeForInst(MachineInstr & MI,MachineRegisterInfo & MRI,bool IsLoadStore) const5746 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
5747     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
5748   unsigned Opc = MI.getOpcode();
5749 
5750   // Handle explicit extend instructions first.
5751   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
5752     unsigned Size;
5753     if (Opc == TargetOpcode::G_SEXT)
5754       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5755     else
5756       Size = MI.getOperand(2).getImm();
5757     assert(Size != 64 && "Extend from 64 bits?");
5758     switch (Size) {
5759     case 8:
5760       return AArch64_AM::SXTB;
5761     case 16:
5762       return AArch64_AM::SXTH;
5763     case 32:
5764       return AArch64_AM::SXTW;
5765     default:
5766       return AArch64_AM::InvalidShiftExtend;
5767     }
5768   }
5769 
5770   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
5771     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5772     assert(Size != 64 && "Extend from 64 bits?");
5773     switch (Size) {
5774     case 8:
5775       return AArch64_AM::UXTB;
5776     case 16:
5777       return AArch64_AM::UXTH;
5778     case 32:
5779       return AArch64_AM::UXTW;
5780     default:
5781       return AArch64_AM::InvalidShiftExtend;
5782     }
5783   }
5784 
5785   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
5786   // on the RHS.
5787   if (Opc != TargetOpcode::G_AND)
5788     return AArch64_AM::InvalidShiftExtend;
5789 
5790   Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
5791   if (!MaybeAndMask)
5792     return AArch64_AM::InvalidShiftExtend;
5793   uint64_t AndMask = *MaybeAndMask;
5794   switch (AndMask) {
5795   default:
5796     return AArch64_AM::InvalidShiftExtend;
5797   case 0xFF:
5798     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
5799   case 0xFFFF:
5800     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
5801   case 0xFFFFFFFF:
5802     return AArch64_AM::UXTW;
5803   }
5804 }
5805 
moveScalarRegClass(Register Reg,const TargetRegisterClass & RC,MachineIRBuilder & MIB) const5806 Register AArch64InstructionSelector::moveScalarRegClass(
5807     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
5808   MachineRegisterInfo &MRI = *MIB.getMRI();
5809   auto Ty = MRI.getType(Reg);
5810   assert(!Ty.isVector() && "Expected scalars only!");
5811   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
5812     return Reg;
5813 
5814   // Create a copy and immediately select it.
5815   // FIXME: We should have an emitCopy function?
5816   auto Copy = MIB.buildCopy({&RC}, {Reg});
5817   selectCopy(*Copy, TII, MRI, TRI, RBI);
5818   return Copy.getReg(0);
5819 }
5820 
5821 /// Select an "extended register" operand. This operand folds in an extend
5822 /// followed by an optional left shift.
5823 InstructionSelector::ComplexRendererFns
selectArithExtendedRegister(MachineOperand & Root) const5824 AArch64InstructionSelector::selectArithExtendedRegister(
5825     MachineOperand &Root) const {
5826   if (!Root.isReg())
5827     return None;
5828   MachineRegisterInfo &MRI =
5829       Root.getParent()->getParent()->getParent()->getRegInfo();
5830 
5831   uint64_t ShiftVal = 0;
5832   Register ExtReg;
5833   AArch64_AM::ShiftExtendType Ext;
5834   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
5835   if (!RootDef)
5836     return None;
5837 
5838   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
5839     return None;
5840 
5841   // Check if we can fold a shift and an extend.
5842   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
5843     // Look for a constant on the RHS of the shift.
5844     MachineOperand &RHS = RootDef->getOperand(2);
5845     Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
5846     if (!MaybeShiftVal)
5847       return None;
5848     ShiftVal = *MaybeShiftVal;
5849     if (ShiftVal > 4)
5850       return None;
5851     // Look for a valid extend instruction on the LHS of the shift.
5852     MachineOperand &LHS = RootDef->getOperand(1);
5853     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5854     if (!ExtDef)
5855       return None;
5856     Ext = getExtendTypeForInst(*ExtDef, MRI);
5857     if (Ext == AArch64_AM::InvalidShiftExtend)
5858       return None;
5859     ExtReg = ExtDef->getOperand(1).getReg();
5860   } else {
5861     // Didn't get a shift. Try just folding an extend.
5862     Ext = getExtendTypeForInst(*RootDef, MRI);
5863     if (Ext == AArch64_AM::InvalidShiftExtend)
5864       return None;
5865     ExtReg = RootDef->getOperand(1).getReg();
5866 
5867     // If we have a 32 bit instruction which zeroes out the high half of a
5868     // register, we get an implicit zero extend for free. Check if we have one.
5869     // FIXME: We actually emit the extend right now even though we don't have
5870     // to.
5871     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
5872       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
5873       if (ExtInst && isDef32(*ExtInst))
5874         return None;
5875     }
5876   }
5877 
5878   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
5879   // copy.
5880   MachineIRBuilder MIB(*RootDef);
5881   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
5882 
5883   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
5884            [=](MachineInstrBuilder &MIB) {
5885              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
5886            }}};
5887 }
5888 
renderTruncImm(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const5889 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
5890                                                 const MachineInstr &MI,
5891                                                 int OpIdx) const {
5892   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5893   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5894          "Expected G_CONSTANT");
5895   Optional<int64_t> CstVal =
5896       getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
5897   assert(CstVal && "Expected constant value");
5898   MIB.addImm(CstVal.getValue());
5899 }
5900 
renderLogicalImm32(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const5901 void AArch64InstructionSelector::renderLogicalImm32(
5902   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
5903   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5904          "Expected G_CONSTANT");
5905   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
5906   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
5907   MIB.addImm(Enc);
5908 }
5909 
renderLogicalImm64(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const5910 void AArch64InstructionSelector::renderLogicalImm64(
5911   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
5912   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5913          "Expected G_CONSTANT");
5914   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
5915   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
5916   MIB.addImm(Enc);
5917 }
5918 
renderFPImm16(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const5919 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
5920                                                const MachineInstr &MI,
5921                                                int OpIdx) const {
5922   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
5923          "Expected G_FCONSTANT");
5924   MIB.addImm(
5925       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
5926 }
5927 
renderFPImm32(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const5928 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
5929                                                const MachineInstr &MI,
5930                                                int OpIdx) const {
5931   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
5932          "Expected G_FCONSTANT");
5933   MIB.addImm(
5934       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
5935 }
5936 
renderFPImm64(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const5937 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
5938                                                const MachineInstr &MI,
5939                                                int OpIdx) const {
5940   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
5941          "Expected G_FCONSTANT");
5942   MIB.addImm(
5943       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
5944 }
5945 
isLoadStoreOfNumBytes(const MachineInstr & MI,unsigned NumBytes) const5946 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
5947     const MachineInstr &MI, unsigned NumBytes) const {
5948   if (!MI.mayLoadOrStore())
5949     return false;
5950   assert(MI.hasOneMemOperand() &&
5951          "Expected load/store to have only one mem op!");
5952   return (*MI.memoperands_begin())->getSize() == NumBytes;
5953 }
5954 
isDef32(const MachineInstr & MI) const5955 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
5956   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5957   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
5958     return false;
5959 
5960   // Only return true if we know the operation will zero-out the high half of
5961   // the 64-bit register. Truncates can be subregister copies, which don't
5962   // zero out the high bits. Copies and other copy-like instructions can be
5963   // fed by truncates, or could be lowered as subregister copies.
5964   switch (MI.getOpcode()) {
5965   default:
5966     return true;
5967   case TargetOpcode::COPY:
5968   case TargetOpcode::G_BITCAST:
5969   case TargetOpcode::G_TRUNC:
5970   case TargetOpcode::G_PHI:
5971     return false;
5972   }
5973 }
5974 
5975 
5976 // Perform fixups on the given PHI instruction's operands to force them all
5977 // to be the same as the destination regbank.
fixupPHIOpBanks(MachineInstr & MI,MachineRegisterInfo & MRI,const AArch64RegisterBankInfo & RBI)5978 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
5979                             const AArch64RegisterBankInfo &RBI) {
5980   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
5981   Register DstReg = MI.getOperand(0).getReg();
5982   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
5983   assert(DstRB && "Expected PHI dst to have regbank assigned");
5984   MachineIRBuilder MIB(MI);
5985 
5986   // Go through each operand and ensure it has the same regbank.
5987   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
5988     MachineOperand &MO = MI.getOperand(OpIdx);
5989     if (!MO.isReg())
5990       continue;
5991     Register OpReg = MO.getReg();
5992     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
5993     if (RB != DstRB) {
5994       // Insert a cross-bank copy.
5995       auto *OpDef = MRI.getVRegDef(OpReg);
5996       const LLT &Ty = MRI.getType(OpReg);
5997       MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
5998       auto Copy = MIB.buildCopy(Ty, OpReg);
5999       MRI.setRegBank(Copy.getReg(0), *DstRB);
6000       MO.setReg(Copy.getReg(0));
6001     }
6002   }
6003 }
6004 
processPHIs(MachineFunction & MF)6005 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6006   // We're looking for PHIs, build a list so we don't invalidate iterators.
6007   MachineRegisterInfo &MRI = MF.getRegInfo();
6008   SmallVector<MachineInstr *, 32> Phis;
6009   for (auto &BB : MF) {
6010     for (auto &MI : BB) {
6011       if (MI.getOpcode() == TargetOpcode::G_PHI)
6012         Phis.emplace_back(&MI);
6013     }
6014   }
6015 
6016   for (auto *MI : Phis) {
6017     // We need to do some work here if the operand types are < 16 bit and they
6018     // are split across fpr/gpr banks. Since all types <32b on gpr
6019     // end up being assigned gpr32 regclasses, we can end up with PHIs here
6020     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6021     // be selecting heterogenous regbanks for operands if possible, but we
6022     // still need to be able to deal with it here.
6023     //
6024     // To fix this, if we have a gpr-bank operand < 32b in size and at least
6025     // one other operand is on the fpr bank, then we add cross-bank copies
6026     // to homogenize the operand banks. For simplicity the bank that we choose
6027     // to settle on is whatever bank the def operand has. For example:
6028     //
6029     // %endbb:
6030     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6031     //  =>
6032     // %bb2:
6033     //   ...
6034     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6035     //   ...
6036     // %endbb:
6037     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6038     bool HasGPROp = false, HasFPROp = false;
6039     for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
6040       const auto &MO = MI->getOperand(OpIdx);
6041       if (!MO.isReg())
6042         continue;
6043       const LLT &Ty = MRI.getType(MO.getReg());
6044       if (!Ty.isValid() || !Ty.isScalar())
6045         break;
6046       if (Ty.getSizeInBits() >= 32)
6047         break;
6048       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6049       // If for some reason we don't have a regbank yet. Don't try anything.
6050       if (!RB)
6051         break;
6052 
6053       if (RB->getID() == AArch64::GPRRegBankID)
6054         HasGPROp = true;
6055       else
6056         HasFPROp = true;
6057     }
6058     // We have heterogenous regbanks, need to fixup.
6059     if (HasGPROp && HasFPROp)
6060       fixupPHIOpBanks(*MI, MRI, RBI);
6061   }
6062 }
6063 
6064 namespace llvm {
6065 InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine & TM,AArch64Subtarget & Subtarget,AArch64RegisterBankInfo & RBI)6066 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6067                                  AArch64Subtarget &Subtarget,
6068                                  AArch64RegisterBankInfo &RBI) {
6069   return new AArch64InstructionSelector(TM, Subtarget, RBI);
6070 }
6071 }
6072