1 //===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the AArch64 target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64MachineFunctionInfo.h"
14 #include "AArch64TargetMachine.h"
15 #include "MCTargetDesc/AArch64AddressingModes.h"
16 #include "llvm/ADT/APSInt.h"
17 #include "llvm/CodeGen/SelectionDAGISel.h"
18 #include "llvm/IR/Function.h" // To access function attributes.
19 #include "llvm/IR/GlobalValue.h"
20 #include "llvm/IR/Intrinsics.h"
21 #include "llvm/IR/IntrinsicsAArch64.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/KnownBits.h"
25 #include "llvm/Support/MathExtras.h"
26 #include "llvm/Support/raw_ostream.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "aarch64-isel"
31 
32 //===--------------------------------------------------------------------===//
33 /// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
34 /// instructions for SelectionDAG operations.
35 ///
36 namespace {
37 
38 class AArch64DAGToDAGISel : public SelectionDAGISel {
39 
40   /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
41   /// make the right decision when generating code for different targets.
42   const AArch64Subtarget *Subtarget;
43 
44 public:
45   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
46                                CodeGenOpt::Level OptLevel)
47       : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
48 
49   StringRef getPassName() const override {
50     return "AArch64 Instruction Selection";
51   }
52 
53   bool runOnMachineFunction(MachineFunction &MF) override {
54     Subtarget = &MF.getSubtarget<AArch64Subtarget>();
55     return SelectionDAGISel::runOnMachineFunction(MF);
56   }
57 
58   void Select(SDNode *Node) override;
59 
60   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
61   /// inline asm expressions.
62   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
63                                     unsigned ConstraintID,
64                                     std::vector<SDValue> &OutOps) override;
65 
66   template <signed Low, signed High, signed Scale>
67   bool SelectRDVLImm(SDValue N, SDValue &Imm);
68 
69   bool tryMLAV64LaneV128(SDNode *N);
70   bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
71   bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
72   bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
73   bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
74   bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
75     return SelectShiftedRegister(N, false, Reg, Shift);
76   }
77   bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
78     return SelectShiftedRegister(N, true, Reg, Shift);
79   }
80   bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
81     return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
82   }
83   bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
84     return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
85   }
86   bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
87     return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
88   }
89   bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
90     return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
91   }
92   bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
93     return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
94   }
95   bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
96     return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
97   }
98   bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
99     return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
100   }
101   bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
102     return SelectAddrModeIndexed(N, 1, Base, OffImm);
103   }
104   bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
105     return SelectAddrModeIndexed(N, 2, Base, OffImm);
106   }
107   bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
108     return SelectAddrModeIndexed(N, 4, Base, OffImm);
109   }
110   bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
111     return SelectAddrModeIndexed(N, 8, Base, OffImm);
112   }
113   bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
114     return SelectAddrModeIndexed(N, 16, Base, OffImm);
115   }
116   bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
117     return SelectAddrModeUnscaled(N, 1, Base, OffImm);
118   }
119   bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
120     return SelectAddrModeUnscaled(N, 2, Base, OffImm);
121   }
122   bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
123     return SelectAddrModeUnscaled(N, 4, Base, OffImm);
124   }
125   bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
126     return SelectAddrModeUnscaled(N, 8, Base, OffImm);
127   }
128   bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
129     return SelectAddrModeUnscaled(N, 16, Base, OffImm);
130   }
131 
132   template<int Width>
133   bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
134                          SDValue &SignExtend, SDValue &DoShift) {
135     return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
136   }
137 
138   template<int Width>
139   bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
140                          SDValue &SignExtend, SDValue &DoShift) {
141     return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
142   }
143 
144   bool SelectDupZeroOrUndef(SDValue N) {
145     switch(N->getOpcode()) {
146     case ISD::UNDEF:
147       return true;
148     case AArch64ISD::DUP:
149     case ISD::SPLAT_VECTOR: {
150       auto Opnd0 = N->getOperand(0);
151       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
152         if (CN->isNullValue())
153           return true;
154       if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
155         if (CN->isZero())
156           return true;
157       break;
158     }
159     default:
160       break;
161     }
162 
163     return false;
164   }
165 
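  // As above, but without accepting ISD::UNDEF: the operand must be a
  // DUP/SPLAT_VECTOR of constant (integer or FP) zero.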
166   bool SelectDupZero(SDValue N) {
167     switch(N->getOpcode()) {
168     case AArch64ISD::DUP:
169     case ISD::SPLAT_VECTOR: {
170       auto Opnd0 = N->getOperand(0);
171       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
172         if (CN->isNullValue())
173           return true;
174       if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
175         if (CN->isZero())
176           return true;
177       break;
178     }
179     }
180 
181     return false;
182   }
183 
184   template<MVT::SimpleValueType VT>
185   bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
186     return SelectSVEAddSubImm(N, VT, Imm, Shift);
187   }
188 
189   template <MVT::SimpleValueType VT, bool Invert = false>
190   bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
191     return SelectSVELogicalImm(N, VT, Imm, Invert);
192   }
193 
194   template <MVT::SimpleValueType VT>
195   bool SelectSVEArithImm(SDValue N, SDValue &Imm) {
196     return SelectSVEArithImm(N, VT, Imm);
197   }
198 
199   template <unsigned Low, unsigned High, bool AllowSaturation = false>
200   bool SelectSVEShiftImm(SDValue N, SDValue &Imm) {
201     return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm);
202   }
203 
204   // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
205   template<signed Min, signed Max, signed Scale, bool Shift>
206   bool SelectCntImm(SDValue N, SDValue &Imm) {
207     if (!isa<ConstantSDNode>(N))
208       return false;
209 
210     int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
211     if (Shift)
212       MulImm = 1LL << MulImm;
213 
214     if ((MulImm % std::abs(Scale)) != 0)
215       return false;
216 
217     MulImm /= Scale;
218     if ((MulImm >= Min) && (MulImm <= Max)) {
219       Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
220       return true;
221     }
222 
223     return false;
224   }
225 
226   /// Form sequences of consecutive 64/128-bit registers for use in NEON
227   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
228   /// between 1 and 4 elements. If it contains a single element, that element is
229   /// returned unchanged; otherwise a REG_SEQUENCE value is returned.
230   SDValue createDTuple(ArrayRef<SDValue> Vecs);
231   SDValue createQTuple(ArrayRef<SDValue> Vecs);
232   // Form a sequence of SVE registers for instructions using list of vectors,
233   // e.g. structured loads and stores (ldN, stN).
234   SDValue createZTuple(ArrayRef<SDValue> Vecs);
235 
236   /// Generic helper for the createDTuple/createQTuple
237   /// functions. Those should almost always be called instead.
238   SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
239                       const unsigned SubRegs[]);
240 
241   void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
242 
243   bool tryIndexedLoad(SDNode *N);
244 
245   bool trySelectStackSlotTagP(SDNode *N);
246   void SelectTagP(SDNode *N);
247 
248   void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
249                      unsigned SubRegIdx);
250   void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
251                          unsigned SubRegIdx);
252   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
253   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
254   void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
255                             unsigned Opc_rr, unsigned Opc_ri);
256 
257   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
258   /// SVE Reg+Imm addressing mode.
259   template <int64_t Min, int64_t Max>
260   bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
261                                 SDValue &OffImm);
262   /// SVE Reg+Reg address mode.
263   template <unsigned Scale>
264   bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
265     return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
266   }
267 
268   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
269   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
270   void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
271   void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
272   void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale,
273                              unsigned Opc_rr, unsigned Opc_ri);
274   std::tuple<unsigned, SDValue, SDValue>
275   findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
276                            const SDValue &OldBase, const SDValue &OldOffset,
277                            unsigned Scale);
278 
279   bool tryBitfieldExtractOp(SDNode *N);
280   bool tryBitfieldExtractOpFromSExt(SDNode *N);
281   bool tryBitfieldInsertOp(SDNode *N);
282   bool tryBitfieldInsertInZeroOp(SDNode *N);
283   bool tryShiftAmountMod(SDNode *N);
284   bool tryHighFPExt(SDNode *N);
285 
286   bool tryReadRegister(SDNode *N);
287   bool tryWriteRegister(SDNode *N);
288 
289 // Include the pieces autogenerated from the target description.
290 #include "AArch64GenDAGISel.inc"
291 
292 private:
293   bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
294                              SDValue &Shift);
295   bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
296                                SDValue &OffImm) {
297     return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
298   }
299   bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
300                                      unsigned Size, SDValue &Base,
301                                      SDValue &OffImm);
302   bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
303                              SDValue &OffImm);
304   bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
305                               SDValue &OffImm);
306   bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
307                          SDValue &Offset, SDValue &SignExtend,
308                          SDValue &DoShift);
309   bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
310                          SDValue &Offset, SDValue &SignExtend,
311                          SDValue &DoShift);
312   bool isWorthFolding(SDValue V) const;
313   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
314                          SDValue &Offset, SDValue &SignExtend);
315 
316   template<unsigned RegWidth>
317   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
318     return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
319   }
320 
321   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
322 
323   bool SelectCMP_SWAP(SDNode *N);
324 
325   bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
326 
327   bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
328 
329   bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert);
330 
331   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
332   bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High,
333                          bool AllowSaturation, SDValue &Imm);
334 
335   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
336   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
337                                SDValue &Offset);
338 
339   bool SelectAllActivePredicate(SDValue N);
340 };
341 } // end anonymous namespace
342 
343 /// isIntImmediate - This method tests to see if the node is a constant
344 /// operand. If so Imm will receive the 32-bit value.
345 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
346   if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
347     Imm = C->getZExtValue();
348     return true;
349   }
350   return false;
351 }
352 
353 // isIntImmediate - This method tests to see if the operand is a constant.
354 // If so, Imm will receive the value.
355 static bool isIntImmediate(SDValue N, uint64_t &Imm) {
356   return isIntImmediate(N.getNode(), Imm);
357 }
358 
359 // isOpcWithIntImmediate - This method tests to see if the node is a specific
360 // opcode and that it has an immediate integer right operand.
361 // If so, Imm will receive the 32-bit value.
362 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
363                                   uint64_t &Imm) {
364   return N->getOpcode() == Opc &&
365          isIntImmediate(N->getOperand(1).getNode(), Imm);
366 }
367 
368 bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
369     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
370   switch(ConstraintID) {
371   default:
372     llvm_unreachable("Unexpected asm memory constraint");
373   case InlineAsm::Constraint_m:
374   case InlineAsm::Constraint_o:
375   case InlineAsm::Constraint_Q:
376     // We need to make sure that this one operand does not end up in XZR, thus
377     // require the address to be in a PointerRegClass register.
378     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
379     const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
380     SDLoc dl(Op);
381     SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
382     SDValue NewOp =
383         SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
384                                        dl, Op.getValueType(),
385                                        Op, RC), 0);
386     OutOps.push_back(NewOp);
387     return false;
388   }
389   return true;
390 }
391 
392 /// SelectArithImmed - Select an immediate value that can be represented as
393 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
394 /// Val set to the 12-bit value and Shift set to the shifter operand.
395 bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
396                                            SDValue &Shift) {
397   // This function is called from the addsub_shifted_imm ComplexPattern,
398   // which lists [imm] as the list of opcodes it's interested in; however,
399   // we still need to check whether the operand is actually an immediate
400   // here because the ComplexPattern opcode list is only used in
401   // root-level opcode matching.
402   if (!isa<ConstantSDNode>(N.getNode()))
403     return false;
404 
405   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
406   unsigned ShiftAmt;
407 
408   if (Immed >> 12 == 0) {
409     ShiftAmt = 0;
410   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
411     ShiftAmt = 12;
412     Immed = Immed >> 12;
413   } else
414     return false;
415 
416   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
417   SDLoc dl(N);
418   Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
419   Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
420   return true;
421 }
422 
423 /// SelectNegArithImmed - As above, but negates the value before trying to
424 /// select it.
425 bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
426                                               SDValue &Shift) {
427   // This function is called from the addsub_shifted_imm ComplexPattern,
428   // which lists [imm] as the list of opcodes it's interested in; however,
429   // we still need to check whether the operand is actually an immediate
430   // here because the ComplexPattern opcode list is only used in
431   // root-level opcode matching.
432   if (!isa<ConstantSDNode>(N.getNode()))
433     return false;
434 
435   // The immediate operand must be a 24-bit zero-extended immediate.
436   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
437 
438   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
439   // have the opposite effect on the C flag, so this pattern mustn't match under
440   // those circumstances.
441   if (Immed == 0)
442     return false;
443 
444   if (N.getValueType() == MVT::i32)
445     Immed = ~((uint32_t)Immed) + 1;
446   else
447     Immed = ~Immed + 1ULL;
448   if (Immed & 0xFFFFFFFFFF000000ULL)
449     return false;
450 
451   Immed &= 0xFFFFFFULL;
452   return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
453                           Shift);
454 }
455 
456 /// getShiftTypeForNode - Translate a shift node to the corresponding
457 /// ShiftType value.
458 static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
459   switch (N.getOpcode()) {
460   default:
461     return AArch64_AM::InvalidShiftExtend;
462   case ISD::SHL:
463     return AArch64_AM::LSL;
464   case ISD::SRL:
465     return AArch64_AM::LSR;
466   case ISD::SRA:
467     return AArch64_AM::ASR;
468   case ISD::ROTR:
469     return AArch64_AM::ROR;
470   }
471 }
472 
473 /// Determine whether it is worth it to fold SHL into the addressing
474 /// mode.
475 static bool isWorthFoldingSHL(SDValue V) {
476   assert(V.getOpcode() == ISD::SHL && "invalid opcode");
477   // It is worth folding logical shift of up to three places.
478   auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
479   if (!CSD)
480     return false;
481   unsigned ShiftVal = CSD->getZExtValue();
482   if (ShiftVal > 3)
483     return false;
484 
485   // Check if this particular node is reused in any non-memory related
486   // operation.  If yes, do not try to fold this node into the address
487   // computation, since the computation will be kept.
488   const SDNode *Node = V.getNode();
489   for (SDNode *UI : Node->uses())
490     if (!isa<MemSDNode>(*UI))
491       for (SDNode *UII : UI->uses())
492         if (!isa<MemSDNode>(*UII))
493           return false;
494   return true;
495 }
496 
497 /// Determine whether it is worth folding V into an extended register.
498 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
499   // Trivial if we are optimizing for code size or if there is only
500   // one use of the value.
501   if (CurDAG->shouldOptForSize() || V.hasOneUse())
502     return true;
503   // If a subtarget has a fastpath LSL we can fold a logical shift into
504   // the addressing mode and save a cycle.
505   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
506       isWorthFoldingSHL(V))
507     return true;
508   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
509     const SDValue LHS = V.getOperand(0);
510     const SDValue RHS = V.getOperand(1);
511     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
512       return true;
513     if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
514       return true;
515   }
516 
517   // It hurts otherwise, since the value will be reused.
518   return false;
519 }
520 
521 /// SelectShiftedRegister - Select a "shifted register" operand.  If the value
522 /// is not shifted, set the Shift operand to default of "LSL 0".  The logical
523 /// instructions allow the shifted register to be rotated, but the arithmetic
524 /// instructions do not.  The AllowROR parameter specifies whether ROR is
525 /// supported.
526 bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
527                                                 SDValue &Reg, SDValue &Shift) {
528   AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
529   if (ShType == AArch64_AM::InvalidShiftExtend)
530     return false;
531   if (!AllowROR && ShType == AArch64_AM::ROR)
532     return false;
533 
534   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
535     unsigned BitSize = N.getValueSizeInBits();
536     unsigned Val = RHS->getZExtValue() & (BitSize - 1);
537     unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
538 
539     Reg = N.getOperand(0);
540     Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
541     return isWorthFolding(N);
542   }
543 
544   return false;
545 }
546 
547 /// getExtendTypeForNode - Translate an extend node to the corresponding
548 /// ExtendType value.
549 static AArch64_AM::ShiftExtendType
550 getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
551   if (N.getOpcode() == ISD::SIGN_EXTEND ||
552       N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
553     EVT SrcVT;
554     if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
555       SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
556     else
557       SrcVT = N.getOperand(0).getValueType();
558 
559     if (!IsLoadStore && SrcVT == MVT::i8)
560       return AArch64_AM::SXTB;
561     else if (!IsLoadStore && SrcVT == MVT::i16)
562       return AArch64_AM::SXTH;
563     else if (SrcVT == MVT::i32)
564       return AArch64_AM::SXTW;
565     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
566 
567     return AArch64_AM::InvalidShiftExtend;
568   } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
569              N.getOpcode() == ISD::ANY_EXTEND) {
570     EVT SrcVT = N.getOperand(0).getValueType();
571     if (!IsLoadStore && SrcVT == MVT::i8)
572       return AArch64_AM::UXTB;
573     else if (!IsLoadStore && SrcVT == MVT::i16)
574       return AArch64_AM::UXTH;
575     else if (SrcVT == MVT::i32)
576       return AArch64_AM::UXTW;
577     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
578 
579     return AArch64_AM::InvalidShiftExtend;
580   } else if (N.getOpcode() == ISD::AND) {
581     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
582     if (!CSD)
583       return AArch64_AM::InvalidShiftExtend;
584     uint64_t AndMask = CSD->getZExtValue();
585 
586     switch (AndMask) {
587     default:
588       return AArch64_AM::InvalidShiftExtend;
589     case 0xFF:
590       return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
591     case 0xFFFF:
592       return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
593     case 0xFFFFFFFF:
594       return AArch64_AM::UXTW;
595     }
596   }
597 
598   return AArch64_AM::InvalidShiftExtend;
599 }
600 
601 // Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
602 static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
603   if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
604       DL->getOpcode() != AArch64ISD::DUPLANE32)
605     return false;
606 
607   SDValue SV = DL->getOperand(0);
608   if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
609     return false;
610 
611   SDValue EV = SV.getOperand(1);
612   if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
613     return false;
614 
615   ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
616   ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
617   LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
618   LaneOp = EV.getOperand(0);
619 
620   return true;
621 }
622 
623 // Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
624 // high lane extract.
625 static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
626                              SDValue &LaneOp, int &LaneIdx) {
627 
628   if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
629     std::swap(Op0, Op1);
630     if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
631       return false;
632   }
633   StdOp = Op1;
634   return true;
635 }
636 
637 /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
638 /// is a lane in the upper half of a 128-bit vector.  Recognize and select this
639 /// so that we don't emit unnecessary lane extracts.
640 bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
641   SDLoc dl(N);
642   SDValue Op0 = N->getOperand(0);
643   SDValue Op1 = N->getOperand(1);
644   SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
645   SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
646   int LaneIdx = -1; // Will hold the lane index.
647 
648   if (Op1.getOpcode() != ISD::MUL ||
649       !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
650                         LaneIdx)) {
651     std::swap(Op0, Op1);
652     if (Op1.getOpcode() != ISD::MUL ||
653         !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
654                           LaneIdx))
655       return false;
656   }
657 
658   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
659 
660   SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
661 
662   unsigned MLAOpc = ~0U;
663 
664   switch (N->getSimpleValueType(0).SimpleTy) {
665   default:
666     llvm_unreachable("Unrecognized MLA.");
667   case MVT::v4i16:
668     MLAOpc = AArch64::MLAv4i16_indexed;
669     break;
670   case MVT::v8i16:
671     MLAOpc = AArch64::MLAv8i16_indexed;
672     break;
673   case MVT::v2i32:
674     MLAOpc = AArch64::MLAv2i32_indexed;
675     break;
676   case MVT::v4i32:
677     MLAOpc = AArch64::MLAv4i32_indexed;
678     break;
679   }
680 
681   ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
682   return true;
683 }
684 
685 bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
686   SDLoc dl(N);
687   SDValue SMULLOp0;
688   SDValue SMULLOp1;
689   int LaneIdx;
690 
691   if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
692                         LaneIdx))
693     return false;
694 
695   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
696 
697   SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
698 
699   unsigned SMULLOpc = ~0U;
700 
701   if (IntNo == Intrinsic::aarch64_neon_smull) {
702     switch (N->getSimpleValueType(0).SimpleTy) {
703     default:
704       llvm_unreachable("Unrecognized SMULL.");
705     case MVT::v4i32:
706       SMULLOpc = AArch64::SMULLv4i16_indexed;
707       break;
708     case MVT::v2i64:
709       SMULLOpc = AArch64::SMULLv2i32_indexed;
710       break;
711     }
712   } else if (IntNo == Intrinsic::aarch64_neon_umull) {
713     switch (N->getSimpleValueType(0).SimpleTy) {
714     default:
715       llvm_unreachable("Unrecognized SMULL.");
716     case MVT::v4i32:
717       SMULLOpc = AArch64::UMULLv4i16_indexed;
718       break;
719     case MVT::v2i64:
720       SMULLOpc = AArch64::UMULLv2i32_indexed;
721       break;
722     }
723   } else
724     llvm_unreachable("Unrecognized intrinsic.");
725 
726   ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
727   return true;
728 }
729 
730 /// Instructions that accept extend modifiers like UXTW expect the register
731 /// being extended to be a GPR32, but the incoming DAG might be acting on a
732 /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
733 /// this is the case.
734 static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
735   if (N.getValueType() == MVT::i32)
736     return N;
737 
738   SDLoc dl(N);
739   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
740   MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
741                                                dl, MVT::i32, N, SubReg);
742   return SDValue(Node, 0);
743 }
744 
745 // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
746 template<signed Low, signed High, signed Scale>
747 bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
748   if (!isa<ConstantSDNode>(N))
749     return false;
750 
751   int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
752   if ((MulImm % std::abs(Scale)) == 0) {
753     int64_t RDVLImm = MulImm / Scale;
754     if ((RDVLImm >= Low) && (RDVLImm <= High)) {
755       Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
756       return true;
757     }
758   }
759 
760   return false;
761 }
762 
763 /// SelectArithExtendedRegister - Select a "extended register" operand.  This
764 /// operand folds in an extend followed by an optional left shift.
765 bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
766                                                       SDValue &Shift) {
767   unsigned ShiftVal = 0;
768   AArch64_AM::ShiftExtendType Ext;
769 
770   if (N.getOpcode() == ISD::SHL) {
771     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
772     if (!CSD)
773       return false;
774     ShiftVal = CSD->getZExtValue();
775     if (ShiftVal > 4)
776       return false;
777 
778     Ext = getExtendTypeForNode(N.getOperand(0));
779     if (Ext == AArch64_AM::InvalidShiftExtend)
780       return false;
781 
782     Reg = N.getOperand(0).getOperand(0);
783   } else {
784     Ext = getExtendTypeForNode(N);
785     if (Ext == AArch64_AM::InvalidShiftExtend)
786       return false;
787 
788     Reg = N.getOperand(0);
789 
790     // Don't match if free 32-bit -> 64-bit zext can be used instead.
791     if (Ext == AArch64_AM::UXTW &&
792         Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
793       return false;
794   }
795 
796   // AArch64 mandates that the RHS of the operation must use the smallest
797   // register class that could contain the size being extended from.  Thus,
798   // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
799   // there might not be an actual 32-bit value in the program.  We can
800   // (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
801   assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
802   Reg = narrowIfNeeded(CurDAG, Reg);
803   Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
804                                     MVT::i32);
805   return isWorthFolding(N);
806 }
807 
808 /// If there's a use of this ADDlow that's not itself a load/store then we'll
809 /// need to create a real ADD instruction from it anyway and there's no point in
810 /// folding it into the mem op. Theoretically, it shouldn't matter, but there's
811 /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
812 /// leads to duplicated ADRP instructions.
813 static bool isWorthFoldingADDlow(SDValue N) {
814   for (auto Use : N->uses()) {
815     if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
816         Use->getOpcode() != ISD::ATOMIC_LOAD &&
817         Use->getOpcode() != ISD::ATOMIC_STORE)
818       return false;
819 
820     // ldar and stlr have much more restrictive addressing modes (just a
821     // register).
822     if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
823       return false;
824   }
825 
826   return true;
827 }
828 
829 /// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
830 /// immediate" address.  The "Size" argument is the size in bytes of the memory
831 /// reference, which determines the scale.
832 bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
833                                                         unsigned BW, unsigned Size,
834                                                         SDValue &Base,
835                                                         SDValue &OffImm) {
836   SDLoc dl(N);
837   const DataLayout &DL = CurDAG->getDataLayout();
838   const TargetLowering *TLI = getTargetLowering();
839   if (N.getOpcode() == ISD::FrameIndex) {
840     int FI = cast<FrameIndexSDNode>(N)->getIndex();
841     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
842     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
843     return true;
844   }
845 
846   // As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
847   // addressing mode selected here doesn't support labels/immediates, only base+offset.
848   if (CurDAG->isBaseWithConstantOffset(N)) {
849     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
850       if (IsSignedImm) {
851         int64_t RHSC = RHS->getSExtValue();
852         unsigned Scale = Log2_32(Size);
853         int64_t Range = 0x1LL << (BW - 1);
854 
855         if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
856             RHSC < (Range << Scale)) {
857           Base = N.getOperand(0);
858           if (Base.getOpcode() == ISD::FrameIndex) {
859             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
860             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
861           }
862           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
863           return true;
864         }
865       } else {
866         // unsigned Immediate
867         uint64_t RHSC = RHS->getZExtValue();
868         unsigned Scale = Log2_32(Size);
869         uint64_t Range = 0x1ULL << BW;
870 
871         if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
872           Base = N.getOperand(0);
873           if (Base.getOpcode() == ISD::FrameIndex) {
874             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
875             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
876           }
877           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
878           return true;
879         }
880       }
881     }
882   }
883   // Base only. The address will be materialized into a register before
884   // the memory is accessed.
885   //    add x0, Xbase, #offset
886   //    stp x1, x2, [x0]
887   Base = N;
888   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
889   return true;
890 }
891 
892 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
893 /// immediate" address.  The "Size" argument is the size in bytes of the memory
894 /// reference, which determines the scale.
895 bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
896                                               SDValue &Base, SDValue &OffImm) {
897   SDLoc dl(N);
898   const DataLayout &DL = CurDAG->getDataLayout();
899   const TargetLowering *TLI = getTargetLowering();
900   if (N.getOpcode() == ISD::FrameIndex) {
901     int FI = cast<FrameIndexSDNode>(N)->getIndex();
902     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
903     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
904     return true;
905   }
906 
907   if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
908     GlobalAddressSDNode *GAN =
909         dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
910     Base = N.getOperand(0);
911     OffImm = N.getOperand(1);
912     if (!GAN)
913       return true;
914 
915     if (GAN->getOffset() % Size == 0 &&
916         GAN->getGlobal()->getPointerAlignment(DL) >= Size)
917       return true;
918   }
919 
920   if (CurDAG->isBaseWithConstantOffset(N)) {
921     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
922       int64_t RHSC = (int64_t)RHS->getZExtValue();
923       unsigned Scale = Log2_32(Size);
924       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
925         Base = N.getOperand(0);
926         if (Base.getOpcode() == ISD::FrameIndex) {
927           int FI = cast<FrameIndexSDNode>(Base)->getIndex();
928           Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
929         }
930         OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
931         return true;
932       }
933     }
934   }
935 
936   // Before falling back to our general case, check if the unscaled
937   // instructions can handle this. If so, that's preferable.
938   if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
939     return false;
940 
941   // Base only. The address will be materialized into a register before
942   // the memory is accessed.
943   //    add x0, Xbase, #offset
944   //    ldr x0, [x0]
945   Base = N;
946   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
947   return true;
948 }
949 
950 /// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
951 /// immediate" address.  This should only match when there is an offset that
952 /// is not valid for a scaled immediate addressing mode.  The "Size" argument
953 /// is the size in bytes of the memory reference, which is needed here to know
954 /// what is valid for a scaled immediate.
955 bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
956                                                  SDValue &Base,
957                                                  SDValue &OffImm) {
958   if (!CurDAG->isBaseWithConstantOffset(N))
959     return false;
960   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
961     int64_t RHSC = RHS->getSExtValue();
962     // If the offset is valid as a scaled immediate, don't match here.
963     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
964         RHSC < (0x1000 << Log2_32(Size)))
965       return false;
966     if (RHSC >= -256 && RHSC < 256) {
967       Base = N.getOperand(0);
968       if (Base.getOpcode() == ISD::FrameIndex) {
969         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
970         const TargetLowering *TLI = getTargetLowering();
971         Base = CurDAG->getTargetFrameIndex(
972             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
973       }
974       OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
975       return true;
976     }
977   }
978   return false;
979 }
980 
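// Widen a 32-bit value to 64 bits by inserting it into the sub_32 subregister
// of an IMPLICIT_DEF GPR64; the upper 32 bits are left undefined.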
981 static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
982   SDLoc dl(N);
983   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
984   SDValue ImpDef = SDValue(
985       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
986   MachineSDNode *Node = CurDAG->getMachineNode(
987       TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
988   return SDValue(Node, 0);
989 }
990 
991 /// Check if the given SHL node (\p N), can be used to form an
992 /// extended register for an addressing mode.
993 bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
994                                             bool WantExtend, SDValue &Offset,
995                                             SDValue &SignExtend) {
996   assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
997   ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
998   if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
999     return false;
1000 
1001   SDLoc dl(N);
1002   if (WantExtend) {
1003     AArch64_AM::ShiftExtendType Ext =
1004         getExtendTypeForNode(N.getOperand(0), true);
1005     if (Ext == AArch64_AM::InvalidShiftExtend)
1006       return false;
1007 
1008     Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
1009     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1010                                            MVT::i32);
1011   } else {
1012     Offset = N.getOperand(0);
1013     SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
1014   }
1015 
1016   unsigned LegalShiftVal = Log2_32(Size);
1017   unsigned ShiftVal = CSD->getZExtValue();
1018 
1019   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
1020     return false;
1021 
1022   return isWorthFolding(N);
1023 }
1024 
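// Select a "base register + extended/shifted W-register offset" (WRO)
// addressing mode for an ADD node. Base and Offset receive the two registers,
// SignExtend selects between UXTW and SXTW, and DoShift indicates whether the
// offset is shifted left by log2(Size).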
1025 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
1026                                             SDValue &Base, SDValue &Offset,
1027                                             SDValue &SignExtend,
1028                                             SDValue &DoShift) {
1029   if (N.getOpcode() != ISD::ADD)
1030     return false;
1031   SDValue LHS = N.getOperand(0);
1032   SDValue RHS = N.getOperand(1);
1033   SDLoc dl(N);
1034 
1035   // We don't want to match immediate adds here, because they are better lowered
1036   // to the register-immediate addressing modes.
1037   if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
1038     return false;
1039 
1040   // Check if this particular node is reused in any non-memory related
1041   // operation.  If yes, do not try to fold this node into the address
1042   // computation, since the computation will be kept.
1043   const SDNode *Node = N.getNode();
1044   for (SDNode *UI : Node->uses()) {
1045     if (!isa<MemSDNode>(*UI))
1046       return false;
1047   }
1048 
1049   // Remember if it is worth folding N when it produces extended register.
1050   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1051 
1052   // Try to match a shifted extend on the RHS.
1053   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1054       SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
1055     Base = LHS;
1056     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1057     return true;
1058   }
1059 
1060   // Try to match a shifted extend on the LHS.
1061   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1062       SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
1063     Base = RHS;
1064     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1065     return true;
1066   }
1067 
1068   // There was no shift, whatever else we find.
1069   DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
1070 
1071   AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
1072   // Try to match an unshifted extend on the LHS.
1073   if (IsExtendedRegisterWorthFolding &&
1074       (Ext = getExtendTypeForNode(LHS, true)) !=
1075           AArch64_AM::InvalidShiftExtend) {
1076     Base = RHS;
1077     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
1078     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1079                                            MVT::i32);
1080     if (isWorthFolding(LHS))
1081       return true;
1082   }
1083 
1084   // Try to match an unshifted extend on the RHS.
1085   if (IsExtendedRegisterWorthFolding &&
1086       (Ext = getExtendTypeForNode(RHS, true)) !=
1087           AArch64_AM::InvalidShiftExtend) {
1088     Base = LHS;
1089     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
1090     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1091                                            MVT::i32);
1092     if (isWorthFolding(RHS))
1093       return true;
1094   }
1095 
1096   return false;
1097 }
1098 
1099 // Check if the given immediate is preferred by ADD. If an immediate can be
1100 // encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
1101 // encoded by one MOVZ, return true.
1102 static bool isPreferredADD(int64_t ImmOff) {
1103   // Constant in [0x0, 0xfff] can be encoded in ADD.
1104   if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
1105     return true;
1106   // Check if it can be encoded in an "ADD LSL #12".
1107   if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
1108     // As a single MOVZ is faster than an "ADD LSL #12", ignore such constants.
1109     return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
1110            (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
1111   return false;
1112 }
1113 
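// Select a "base register + X-register offset" (XRO) addressing mode for an
// ADD node. Wide immediates that fit neither the scaled reg+imm forms nor a
// single ADD/SUB are first materialized with MOVi64imm so they can be folded
// in as the offset register.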
1114 bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
1115                                             SDValue &Base, SDValue &Offset,
1116                                             SDValue &SignExtend,
1117                                             SDValue &DoShift) {
1118   if (N.getOpcode() != ISD::ADD)
1119     return false;
1120   SDValue LHS = N.getOperand(0);
1121   SDValue RHS = N.getOperand(1);
1122   SDLoc DL(N);
1123 
1124   // Check if this particular node is reused in any non-memory related
1125   // operation.  If yes, do not try to fold this node into the address
1126   // computation, since the computation will be kept.
1127   const SDNode *Node = N.getNode();
1128   for (SDNode *UI : Node->uses()) {
1129     if (!isa<MemSDNode>(*UI))
1130       return false;
1131   }
1132 
1133   // Watch out if RHS is a wide immediate: it cannot be selected into the
1134   // [BaseReg+Imm] addressing mode, and it may not be encodable in an ADD/SUB
1135   // either. Instead it will use the [BaseReg + 0] address mode and generate
1136   // instructions like:
1137   //     MOV  X0, WideImmediate
1138   //     ADD  X1, BaseReg, X0
1139   //     LDR  X2, [X1, 0]
1140   // For such situation, using [BaseReg, XReg] addressing mode can save one
1141   // ADD/SUB:
1142   //     MOV  X0, WideImmediate
1143   //     LDR  X2, [BaseReg, X0]
1144   if (isa<ConstantSDNode>(RHS)) {
1145     int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
1146     unsigned Scale = Log2_32(Size);
1147     // Skip if the immediate can be selected by the load/store addressing mode.
1148     // Also skip if the immediate can be encoded by a single ADD (SUB is also
1149     // checked by using -ImmOff).
1150     if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
1151         isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
1152       return false;
1153 
1154     SDValue Ops[] = { RHS };
1155     SDNode *MOVI =
1156         CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
1157     SDValue MOVIV = SDValue(MOVI, 0);
1158     // This ADD of two X registers will be selected into [Reg+Reg] mode.
1159     N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
1160   }
1161 
1162   // Remember if it is worth folding N when it produces extended register.
1163   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1164 
1165   // Try to match a shifted extend on the RHS.
1166   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1167       SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
1168     Base = LHS;
1169     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1170     return true;
1171   }
1172 
1173   // Try to match a shifted extend on the LHS.
1174   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1175       SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
1176     Base = RHS;
1177     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1178     return true;
1179   }
1180 
1181   // Match any non-shifted, non-extend, non-immediate add expression.
1182   Base = LHS;
1183   Offset = RHS;
1184   SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
1185   DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
1186   // Reg1 + Reg2 is free: no check needed.
1187   return true;
1188 }
1189 
1190 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
1191   static const unsigned RegClassIDs[] = {
1192       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
1193   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
1194                                      AArch64::dsub2, AArch64::dsub3};
1195 
1196   return createTuple(Regs, RegClassIDs, SubRegs);
1197 }
1198 
1199 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
1200   static const unsigned RegClassIDs[] = {
1201       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
1202   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
1203                                      AArch64::qsub2, AArch64::qsub3};
1204 
1205   return createTuple(Regs, RegClassIDs, SubRegs);
1206 }
1207 
1208 SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
1209   static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
1210                                          AArch64::ZPR3RegClassID,
1211                                          AArch64::ZPR4RegClassID};
1212   static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
1213                                      AArch64::zsub2, AArch64::zsub3};
1214 
1215   return createTuple(Regs, RegClassIDs, SubRegs);
1216 }
1217 
1218 SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
1219                                          const unsigned RegClassIDs[],
1220                                          const unsigned SubRegs[]) {
1221   // There's no special register-class for a vector-list of 1 element: it's just
1222   // a vector.
1223   if (Regs.size() == 1)
1224     return Regs[0];
1225 
1226   assert(Regs.size() >= 2 && Regs.size() <= 4);
1227 
1228   SDLoc DL(Regs[0]);
1229 
1230   SmallVector<SDValue, 4> Ops;
1231 
1232   // First operand of REG_SEQUENCE is the desired RegClass.
1233   Ops.push_back(
1234       CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
1235 
1236   // Then we get pairs of source & subregister-position for the components.
1237   for (unsigned i = 0; i < Regs.size(); ++i) {
1238     Ops.push_back(Regs[i]);
1239     Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
1240   }
1241 
1242   SDNode *N =
1243       CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
1244   return SDValue(N, 0);
1245 }
1246 
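// Select a table-lookup node: gather the NumVecs table vectors into a
// Q-register REG_SEQUENCE tuple and emit Opc. When isExt is set, the node's
// first vector operand is forwarded ahead of the tuple as an extra source.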
1247 void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
1248                                       bool isExt) {
1249   SDLoc dl(N);
1250   EVT VT = N->getValueType(0);
1251 
1252   unsigned ExtOff = isExt;
1253 
1254   // Form a REG_SEQUENCE to force register allocation.
1255   unsigned Vec0Off = ExtOff + 1;
1256   SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
1257                                N->op_begin() + Vec0Off + NumVecs);
1258   SDValue RegSeq = createQTuple(Regs);
1259 
1260   SmallVector<SDValue, 6> Ops;
1261   if (isExt)
1262     Ops.push_back(N->getOperand(1));
1263   Ops.push_back(RegSeq);
1264   Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
1265   ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
1266 }
1267 
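// Try to select a pre- or post-indexed load. The opcode is chosen from the
// memory VT, the extension type and the addressing mode (pre/post increment);
// returns false if the load is unindexed or has no matching indexed form.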
1268 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
1269   LoadSDNode *LD = cast<LoadSDNode>(N);
1270   if (LD->isUnindexed())
1271     return false;
1272   EVT VT = LD->getMemoryVT();
1273   EVT DstVT = N->getValueType(0);
1274   ISD::MemIndexedMode AM = LD->getAddressingMode();
1275   bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
1276 
1277   // We're not doing validity checking here. That was done when checking
1278   // if we should mark the load as indexed or not. We're just selecting
1279   // the right instruction.
1280   unsigned Opcode = 0;
1281 
1282   ISD::LoadExtType ExtType = LD->getExtensionType();
1283   bool InsertTo64 = false;
1284   if (VT == MVT::i64)
1285     Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
1286   else if (VT == MVT::i32) {
1287     if (ExtType == ISD::NON_EXTLOAD)
1288       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1289     else if (ExtType == ISD::SEXTLOAD)
1290       Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
1291     else {
1292       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1293       InsertTo64 = true;
1294       // The result of the load is only i32. It's the subreg_to_reg that makes
1295       // it into an i64.
1296       DstVT = MVT::i32;
1297     }
1298   } else if (VT == MVT::i16) {
1299     if (ExtType == ISD::SEXTLOAD) {
1300       if (DstVT == MVT::i64)
1301         Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
1302       else
1303         Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
1304     } else {
1305       Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
1306       InsertTo64 = DstVT == MVT::i64;
1307       // The result of the load is only i32. It's the subreg_to_reg that makes
1308       // it into an i64.
1309       DstVT = MVT::i32;
1310     }
1311   } else if (VT == MVT::i8) {
1312     if (ExtType == ISD::SEXTLOAD) {
1313       if (DstVT == MVT::i64)
1314         Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
1315       else
1316         Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
1317     } else {
1318       Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
1319       InsertTo64 = DstVT == MVT::i64;
1320       // The result of the load is only i32. It's the subreg_to_reg that makes
1321       // it into an i64.
1322       DstVT = MVT::i32;
1323     }
1324   } else if (VT == MVT::f16) {
1325     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1326   } else if (VT == MVT::bf16) {
1327     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1328   } else if (VT == MVT::f32) {
1329     Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
1330   } else if (VT == MVT::f64 || VT.is64BitVector()) {
1331     Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
1332   } else if (VT.is128BitVector()) {
1333     Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
1334   } else
1335     return false;
1336   SDValue Chain = LD->getChain();
1337   SDValue Base = LD->getBasePtr();
1338   ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
1339   int OffsetVal = (int)OffsetOp->getZExtValue();
1340   SDLoc dl(N);
1341   SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
1342   SDValue Ops[] = { Base, Offset, Chain };
1343   SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
1344                                        MVT::Other, Ops);
1345 
1346   // Transfer memoperands.
1347   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
1348   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Res), {MemOp});
1349 
1350   // Either way, we're replacing the node, so tell the caller that.
1351   SDValue LoadedVal = SDValue(Res, 1);
1352   if (InsertTo64) {
1353     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1354     LoadedVal =
1355         SDValue(CurDAG->getMachineNode(
1356                     AArch64::SUBREG_TO_REG, dl, MVT::i64,
1357                     CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
1358                     SubReg),
1359                 0);
1360   }
1361 
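  // The machine node's results are (writeback base : i64, loaded value : DstVT,
  // chain), whereas the indexed load N produces (loaded value, writeback base,
  // chain), so the first two results are swapped when rewiring the uses below.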
1362   ReplaceUses(SDValue(N, 0), LoadedVal);
1363   ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
1364   ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
1365   CurDAG->RemoveDeadNode(N);
1366   return true;
1367 }
1368 
1369 void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
1370                                      unsigned SubRegIdx) {
1371   SDLoc dl(N);
1372   EVT VT = N->getValueType(0);
1373   SDValue Chain = N->getOperand(0);
1374 
1375   SDValue Ops[] = {N->getOperand(2), // Mem operand;
1376                    Chain};
1377 
1378   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1379 
1380   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1381   SDValue SuperReg = SDValue(Ld, 0);
1382   for (unsigned i = 0; i < NumVecs; ++i)
1383     ReplaceUses(SDValue(N, i),
1384         CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1385 
1386   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1387 
1388   // Transfer memoperands. In the case of AArch64::LD64B, there won't be one,
1389   // because it's too simple to have needed special treatment during lowering.
1390   if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) {
1391     MachineMemOperand *MemOp = MemIntr->getMemOperand();
1392     CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
1393   }
1394 
1395   CurDAG->RemoveDeadNode(N);
1396 }
1397 
1398 void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
1399                                          unsigned Opc, unsigned SubRegIdx) {
1400   SDLoc dl(N);
1401   EVT VT = N->getValueType(0);
1402   SDValue Chain = N->getOperand(0);
1403 
1404   SDValue Ops[] = {N->getOperand(1), // Mem operand
1405                    N->getOperand(2), // Incremental
1406                    Chain};
1407 
1408   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1409                         MVT::Untyped, MVT::Other};
1410 
1411   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1412 
1413   // Update uses of write back register
1414   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1415 
1416   // Update uses of vector list
1417   SDValue SuperReg = SDValue(Ld, 1);
1418   if (NumVecs == 1)
1419     ReplaceUses(SDValue(N, 0), SuperReg);
1420   else
1421     for (unsigned i = 0; i < NumVecs; ++i)
1422       ReplaceUses(SDValue(N, i),
1423           CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1424 
1425   // Update the chain
1426   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1427   CurDAG->RemoveDeadNode(N);
1428 }
1429 
1430 /// Optimize \param OldBase and \param OldOffset selecting the best addressing
1431 /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the
1432 /// new Base and an SDValue representing the new offset.
1433 std::tuple<unsigned, SDValue, SDValue>
1434 AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
1435                                               unsigned Opc_ri,
1436                                               const SDValue &OldBase,
1437                                               const SDValue &OldOffset,
1438                                               unsigned Scale) {
1439   SDValue NewBase = OldBase;
1440   SDValue NewOffset = OldOffset;
1441   // Detect a possible Reg+Imm addressing mode.
1442   const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
1443       N, OldBase, NewBase, NewOffset);
1444 
1445   // Detect a possible reg+reg addressing mode, but only if we haven't already
1446   // detected a Reg+Imm one.
1447   const bool IsRegReg =
1448       !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);
1449 
1450   // Select the instruction.
1451   return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
1452 }
1453 
1454 void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
1455                                                unsigned Scale, unsigned Opc_ri,
1456                                                unsigned Opc_rr) {
1457   assert(Scale < 4 && "Invalid scaling value.");
1458   SDLoc DL(N);
1459   EVT VT = N->getValueType(0);
1460   SDValue Chain = N->getOperand(0);
1461 
1462   // Optimize addressing mode.
1463   SDValue Base, Offset;
1464   unsigned Opc;
1465   std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1466       N, Opc_rr, Opc_ri, N->getOperand(2),
1467       CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
1468 
1469   SDValue Ops[] = {N->getOperand(1), // Predicate
1470                    Base,             // Memory operand
1471                    Offset, Chain};
1472 
1473   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1474 
1475   SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
1476   SDValue SuperReg = SDValue(Load, 0);
1477   for (unsigned i = 0; i < NumVecs; ++i)
1478     ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
1479                                    AArch64::zsub0 + i, DL, VT, SuperReg));
1480 
1481   // Copy chain
1482   unsigned ChainIdx = NumVecs;
1483   ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
1484   CurDAG->RemoveDeadNode(N);
1485 }
1486 
1487 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
1488                                       unsigned Opc) {
1489   SDLoc dl(N);
1490   EVT VT = N->getOperand(2)->getValueType(0);
1491 
1492   // Form a REG_SEQUENCE to force register allocation.
1493   bool Is128Bit = VT.getSizeInBits() == 128;
1494   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1495   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1496 
1497   SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
1498   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1499 
1500   // Transfer memoperands.
1501   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1502   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1503 
1504   ReplaceNode(N, St);
1505 }
1506 
1507 void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
1508                                                 unsigned Scale, unsigned Opc_rr,
1509                                                 unsigned Opc_ri) {
1510   SDLoc dl(N);
1511 
1512   // Form a REG_SEQUENCE to force register allocation.
1513   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1514   SDValue RegSeq = createZTuple(Regs);
1515 
1516   // Optimize addressing mode.
1517   unsigned Opc;
1518   SDValue Offset, Base;
1519   std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1520       N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
1521       CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);
1522 
1523   SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
1524                    Base,                               // address
1525                    Offset,                             // offset
1526                    N->getOperand(0)};                  // chain
1527   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1528 
1529   ReplaceNode(N, St);
1530 }
1531 
1532 bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
1533                                                       SDValue &OffImm) {
1534   SDLoc dl(N);
1535   const DataLayout &DL = CurDAG->getDataLayout();
1536   const TargetLowering *TLI = getTargetLowering();
1537 
1538   // Try to match it for the frame address
1539   if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
1540     int FI = FINode->getIndex();
1541     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
1542     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1543     return true;
1544   }
1545 
1546   return false;
1547 }
1548 
1549 void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
1550                                           unsigned Opc) {
1551   SDLoc dl(N);
1552   EVT VT = N->getOperand(2)->getValueType(0);
1553   const EVT ResTys[] = {MVT::i64,    // Type of the write back register
1554                         MVT::Other}; // Type for the Chain
1555 
1556   // Form a REG_SEQUENCE to force register allocation.
1557   bool Is128Bit = VT.getSizeInBits() == 128;
1558   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1559   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1560 
1561   SDValue Ops[] = {RegSeq,
1562                    N->getOperand(NumVecs + 1), // base register
1563                    N->getOperand(NumVecs + 2), // Incremental
1564                    N->getOperand(0)};          // Chain
1565   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1566 
1567   ReplaceNode(N, St);
1568 }
1569 
1570 namespace {
1571 /// WidenVector - Given a value in the V64 register class, produce the
1572 /// equivalent value in the V128 register class.
1573 class WidenVector {
1574   SelectionDAG &DAG;
1575 
1576 public:
1577   WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
1578 
1579   SDValue operator()(SDValue V64Reg) {
1580     EVT VT = V64Reg.getValueType();
1581     unsigned NarrowSize = VT.getVectorNumElements();
1582     MVT EltTy = VT.getVectorElementType().getSimpleVT();
1583     MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
1584     SDLoc DL(V64Reg);
1585 
1586     SDValue Undef =
1587         SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
1588     return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
1589   }
1590 };
1591 } // namespace
1592 
1593 /// NarrowVector - Given a value in the V128 register class, produce the
1594 /// equivalent value in the V64 register class.
1595 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
1596   EVT VT = V128Reg.getValueType();
1597   unsigned WideSize = VT.getVectorNumElements();
1598   MVT EltTy = VT.getVectorElementType().getSimpleVT();
1599   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
1600 
1601   return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
1602                                     V128Reg);
1603 }
1604 
1605 void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
1606                                          unsigned Opc) {
1607   SDLoc dl(N);
1608   EVT VT = N->getValueType(0);
1609   bool Narrow = VT.getSizeInBits() == 64;
1610 
1611   // Form a REG_SEQUENCE to force register allocation.
1612   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1613 
1614   if (Narrow)
1615     transform(Regs, Regs.begin(),
1616                    WidenVector(*CurDAG));
1617 
1618   SDValue RegSeq = createQTuple(Regs);
1619 
1620   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1621 
1622   unsigned LaneNo =
1623       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1624 
1625   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1626                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1627   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1628   SDValue SuperReg = SDValue(Ld, 0);
1629 
1630   EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1631   static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1632                                     AArch64::qsub2, AArch64::qsub3 };
1633   for (unsigned i = 0; i < NumVecs; ++i) {
1634     SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
1635     if (Narrow)
1636       NV = NarrowVector(NV, *CurDAG);
1637     ReplaceUses(SDValue(N, i), NV);
1638   }
1639 
1640   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1641   CurDAG->RemoveDeadNode(N);
1642 }
1643 
1644 void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
1645                                              unsigned Opc) {
1646   SDLoc dl(N);
1647   EVT VT = N->getValueType(0);
1648   bool Narrow = VT.getSizeInBits() == 64;
1649 
1650   // Form a REG_SEQUENCE to force register allocation.
1651   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1652 
1653   if (Narrow)
1654     transform(Regs, Regs.begin(),
1655                    WidenVector(*CurDAG));
1656 
1657   SDValue RegSeq = createQTuple(Regs);
1658 
1659   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1660                         RegSeq->getValueType(0), MVT::Other};
1661 
1662   unsigned LaneNo =
1663       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1664 
1665   SDValue Ops[] = {RegSeq,
1666                    CurDAG->getTargetConstant(LaneNo, dl,
1667                                              MVT::i64),         // Lane Number
1668                    N->getOperand(NumVecs + 2),                  // Base register
1669                    N->getOperand(NumVecs + 3),                  // Incremental
1670                    N->getOperand(0)};
1671   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1672 
1673   // Update uses of the write back register
1674   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1675 
1676   // Update uses of the vector list
1677   SDValue SuperReg = SDValue(Ld, 1);
1678   if (NumVecs == 1) {
1679     ReplaceUses(SDValue(N, 0),
1680                 Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
1681   } else {
1682     EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1683     static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1684                                       AArch64::qsub2, AArch64::qsub3 };
1685     for (unsigned i = 0; i < NumVecs; ++i) {
1686       SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
1687                                                   SuperReg);
1688       if (Narrow)
1689         NV = NarrowVector(NV, *CurDAG);
1690       ReplaceUses(SDValue(N, i), NV);
1691     }
1692   }
1693 
1694   // Update the Chain
1695   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1696   CurDAG->RemoveDeadNode(N);
1697 }
1698 
1699 void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
1700                                           unsigned Opc) {
1701   SDLoc dl(N);
1702   EVT VT = N->getOperand(2)->getValueType(0);
1703   bool Narrow = VT.getSizeInBits() == 64;
1704 
1705   // Form a REG_SEQUENCE to force register allocation.
1706   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1707 
1708   if (Narrow)
1709     transform(Regs, Regs.begin(),
1710                    WidenVector(*CurDAG));
1711 
1712   SDValue RegSeq = createQTuple(Regs);
1713 
1714   unsigned LaneNo =
1715       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1716 
1717   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1718                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1719   SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
1720 
1721   // Transfer memoperands.
1722   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1723   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1724 
1725   ReplaceNode(N, St);
1726 }
1727 
1728 void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
1729                                               unsigned Opc) {
1730   SDLoc dl(N);
1731   EVT VT = N->getOperand(2)->getValueType(0);
1732   bool Narrow = VT.getSizeInBits() == 64;
1733 
1734   // Form a REG_SEQUENCE to force register allocation.
1735   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1736 
1737   if (Narrow)
1738     transform(Regs, Regs.begin(),
1739                    WidenVector(*CurDAG));
1740 
1741   SDValue RegSeq = createQTuple(Regs);
1742 
1743   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1744                         MVT::Other};
1745 
1746   unsigned LaneNo =
1747       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1748 
1749   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1750                    N->getOperand(NumVecs + 2), // Base Register
1751                    N->getOperand(NumVecs + 3), // Incremental
1752                    N->getOperand(0)};
1753   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1754 
1755   // Transfer memoperands.
1756   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1757   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1758 
1759   ReplaceNode(N, St);
1760 }
1761 
1762 static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
1763                                        unsigned &Opc, SDValue &Opd0,
1764                                        unsigned &LSB, unsigned &MSB,
1765                                        unsigned NumberOfIgnoredLowBits,
1766                                        bool BiggerPattern) {
1767   assert(N->getOpcode() == ISD::AND &&
1768          "N must be a AND operation to call this function");
1769 
1770   EVT VT = N->getValueType(0);
1771 
1772   // Here we could test the type of VT and return false when the type does
1773   // not match, but since that check is done prior to this call in the current
1774   // context, we turned it into an assert to avoid redundant code.
1775   assert((VT == MVT::i32 || VT == MVT::i64) &&
1776          "Type checking must have been done before calling this function");
1777 
1778   // FIXME: simplify-demanded-bits in DAGCombine will probably have
1779   // changed the AND node to a 32-bit mask operation. We'll have to
1780   // undo that as part of the transform here if we want to catch all
1781   // the opportunities.
1782   // Currently the NumberOfIgnoredLowBits argument helps to recover
1783   // from these situations when matching the bigger pattern (bitfield insert).
1784 
1785   // For unsigned extracts, check for a shift right and mask
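  // For example, (and (srl X, 3), 0x1f) is such an unsigned extract of a 5-bit
  // field; it maps to UBFM X, #3, #7 below (LSB = 3, MSB = 3 + 5 - 1).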
1786   uint64_t AndImm = 0;
1787   if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
1788     return false;
1789 
1790   const SDNode *Op0 = N->getOperand(0).getNode();
1791 
1792   // Because of simplify-demanded-bits in DAGCombine, the mask may have been
1793   // simplified. Try to undo that
1794   AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
1795 
1796   // The immediate is a mask of the low bits iff imm & (imm+1) == 0
1797   if (AndImm & (AndImm + 1))
1798     return false;
1799 
1800   bool ClampMSB = false;
1801   uint64_t SrlImm = 0;
1802   // Handle the SRL + ANY_EXTEND case.
1803   if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
1804       isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
1805     // Extend the incoming operand of the SRL to 64-bit.
1806     Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
1807     // Make sure to clamp the MSB so that we preserve the semantics of the
1808     // original operations.
1809     ClampMSB = true;
1810   } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
1811              isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
1812                                    SrlImm)) {
1813     // If the shift result was truncated, we can still combine them.
1814     Opd0 = Op0->getOperand(0).getOperand(0);
1815 
1816     // Use the type of SRL node.
1817     VT = Opd0->getValueType(0);
1818   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
1819     Opd0 = Op0->getOperand(0);
1820   } else if (BiggerPattern) {
1821     // Let's pretend a 0 shift right has been performed.
1822     // The resulting code will be at least as good as the original one
1823     // plus it may expose more opportunities for bitfield insert pattern.
1824     // FIXME: Currently we limit this to the bigger pattern, because
1825     // some optimizations expect AND and not UBFM.
1826     Opd0 = N->getOperand(0);
1827   } else
1828     return false;
1829 
1830   // Bail out on large immediates. This happens when no proper
1831   // combining/constant folding was performed.
1832   if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
1833     LLVM_DEBUG(
1834         (dbgs() << N
1835                 << ": Found large shift immediate, this should not happen\n"));
1836     return false;
1837   }
1838 
1839   LSB = SrlImm;
1840   MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
1841                                  : countTrailingOnes<uint64_t>(AndImm)) -
1842         1;
1843   if (ClampMSB)
1844     // Since we're moving the extend before the right shift operation, we need
1845     // to clamp the MSB to make sure we don't shift in undefined bits instead of
1846     // the zeros which would get shifted in with the original right shift
1847     // operation.
1848     MSB = MSB > 31 ? 31 : MSB;
1849 
1850   Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
1851   return true;
1852 }
1853 
1854 static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
1855                                              SDValue &Opd0, unsigned &Immr,
1856                                              unsigned &Imms) {
1857   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
1858 
1859   EVT VT = N->getValueType(0);
1860   unsigned BitWidth = VT.getSizeInBits();
1861   assert((VT == MVT::i32 || VT == MVT::i64) &&
1862          "Type checking must have been done before calling this function");
1863 
1864   SDValue Op = N->getOperand(0);
1865   if (Op->getOpcode() == ISD::TRUNCATE) {
1866     Op = Op->getOperand(0);
1867     VT = Op->getValueType(0);
1868     BitWidth = VT.getSizeInBits();
1869   }
1870 
1871   uint64_t ShiftImm;
1872   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
1873       !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1874     return false;
1875 
1876   unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
1877   if (ShiftImm + Width > BitWidth)
1878     return false;
1879 
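  // Illustrative example: sign_extend_inreg (srl X, 4), i8 on an i32 value
  // gives Immr = 4 and Imms = 4 + 8 - 1 = 11, i.e. SBFMWri X, #4, #11.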
1880   Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
1881   Opd0 = Op.getOperand(0);
1882   Immr = ShiftImm;
1883   Imms = ShiftImm + Width - 1;
1884   return true;
1885 }
1886 
1887 static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
1888                                           SDValue &Opd0, unsigned &LSB,
1889                                           unsigned &MSB) {
1890   // We are looking for the following pattern which basically extracts several
1891   // contiguous bits from the source value and places them at the LSB of the
1892   // destination value; all other bits of the destination value are set to zero:
1893   //
1894   // Value2 = AND Value, MaskImm
1895   // SRL Value2, ShiftImm
1896   //
1897   // where MaskImm >> ShiftImm (a mask of low bits) gives the bit width.
1898   //
1899   // This gets selected into a single UBFM:
1900   //
1901   // UBFM Value, ShiftImm, BitWide + SrlImm -1
1902   //
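  // An illustrative instance: (srl (and X, 0xff0), 4) extracts the 8-bit field
  // at bit 4 of X and would be selected as UBFM X, #4, #11
  // (LSB = 4, MSB = 8 + 4 - 1).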
1903 
1904   if (N->getOpcode() != ISD::SRL)
1905     return false;
1906 
1907   uint64_t AndMask = 0;
1908   if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
1909     return false;
1910 
1911   Opd0 = N->getOperand(0).getOperand(0);
1912 
1913   uint64_t SrlImm = 0;
1914   if (!isIntImmediate(N->getOperand(1), SrlImm))
1915     return false;
1916 
1917   // Check whether we really have several bits extract here.
1918   unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
1919   if (BitWide && isMask_64(AndMask >> SrlImm)) {
1920     if (N->getValueType(0) == MVT::i32)
1921       Opc = AArch64::UBFMWri;
1922     else
1923       Opc = AArch64::UBFMXri;
1924 
1925     LSB = SrlImm;
1926     MSB = BitWide + SrlImm - 1;
1927     return true;
1928   }
1929 
1930   return false;
1931 }
1932 
1933 static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
1934                                        unsigned &Immr, unsigned &Imms,
1935                                        bool BiggerPattern) {
1936   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
1937          "N must be a SHR/SRA operation to call this function");
1938 
1939   EVT VT = N->getValueType(0);
1940 
1941   // Here we could test the type of VT and return false when the type does
1942   // not match, but since that check is done prior to this call in the current
1943   // context, we turned it into an assert to avoid redundant code.
1944   assert((VT == MVT::i32 || VT == MVT::i64) &&
1945          "Type checking must have been done before calling this function");
1946 
1947   // Check for AND + SRL doing several bits extract.
1948   if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
1949     return true;
1950 
1951   // We're looking for a shift of a shift.
1952   uint64_t ShlImm = 0;
1953   uint64_t TruncBits = 0;
1954   if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
1955     Opd0 = N->getOperand(0).getOperand(0);
1956   } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
1957              N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
1958     // We are looking for a shift of a truncate. A truncate from i64 to i32
1959     // can be considered as setting the high 32 bits to zero. Our strategy here
1960     // is to always generate a 64-bit UBFM. This consistency will help the CSE
1961     // pass later find more redundancy.
1962     Opd0 = N->getOperand(0).getOperand(0);
1963     TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
1964     VT = Opd0.getValueType();
1965     assert(VT == MVT::i64 && "the promoted type should be i64");
1966   } else if (BiggerPattern) {
1967     // Let's pretend a 0 shift left has been performed.
1968     // FIXME: Currently we limit this to the bigger pattern case,
1969     // because some optimizations expect AND and not UBFM
1970     Opd0 = N->getOperand(0);
1971   } else
1972     return false;
1973 
1974   // Missing combines/constant folding may have left us with strange
1975   // constants.
1976   if (ShlImm >= VT.getSizeInBits()) {
1977     LLVM_DEBUG(
1978         (dbgs() << N
1979                 << ": Found large shift immediate, this should not happen\n"));
1980     return false;
1981   }
1982 
1983   uint64_t SrlImm = 0;
1984   if (!isIntImmediate(N->getOperand(1), SrlImm))
1985     return false;
1986 
1987   assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
1988          "bad amount in shift node!");
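  // Illustrative example: (srl (shl X, 24), 28) on i32 yields
  // ImmR = 28 - 24 = 4 and ImmS = 32 - 24 - 1 = 7, i.e. UBFMWri X, #4, #7,
  // which extracts the 4-bit field starting at bit 4 of X.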
1989   int immr = SrlImm - ShlImm;
1990   Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
1991   Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
1992   // SRA requires a signed extraction
1993   if (VT == MVT::i32)
1994     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
1995   else
1996     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
1997   return true;
1998 }
1999 
2000 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
2001   assert(N->getOpcode() == ISD::SIGN_EXTEND);
2002 
2003   EVT VT = N->getValueType(0);
2004   EVT NarrowVT = N->getOperand(0)->getValueType(0);
2005   if (VT != MVT::i64 || NarrowVT != MVT::i32)
2006     return false;
2007 
2008   uint64_t ShiftImm;
2009   SDValue Op = N->getOperand(0);
2010   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
2011     return false;
2012 
2013   SDLoc dl(N);
2014   // Extend the incoming operand of the shift to 64-bits.
2015   SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
2016   unsigned Immr = ShiftImm;
2017   unsigned Imms = NarrowVT.getSizeInBits() - 1;
2018   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2019                    CurDAG->getTargetConstant(Imms, dl, VT)};
2020   CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
2021   return true;
2022 }
2023 
2024 /// Try to form fcvtl2 instructions from a floating-point extend of a high-half
2025 /// extract of a subvector.
2026 bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
2027   assert(N->getOpcode() == ISD::FP_EXTEND);
2028 
2029   // There are 2 forms of fcvtl2 - extend to double or extend to float.
2030   SDValue Extract = N->getOperand(0);
2031   EVT VT = N->getValueType(0);
2032   EVT NarrowVT = Extract.getValueType();
2033   if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
2034       (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
2035     return false;
2036 
2037   // Optionally look past a bitcast.
2038   Extract = peekThroughBitcasts(Extract);
2039   if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
2040     return false;
2041 
2042   // Match extract from start of high half index.
2043   // Example: v8i16 -> v4i16 means the extract must begin at index 4.
2044   unsigned ExtractIndex = Extract.getConstantOperandVal(1);
2045   if (ExtractIndex != Extract.getValueType().getVectorNumElements())
2046     return false;
2047 
2048   auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
2049   CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
2050   return true;
2051 }
2052 
2053 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
2054                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
2055                                 unsigned NumberOfIgnoredLowBits = 0,
2056                                 bool BiggerPattern = false) {
2057   if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
2058     return false;
2059 
2060   switch (N->getOpcode()) {
2061   default:
2062     if (!N->isMachineOpcode())
2063       return false;
2064     break;
2065   case ISD::AND:
2066     return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
2067                                       NumberOfIgnoredLowBits, BiggerPattern);
2068   case ISD::SRL:
2069   case ISD::SRA:
2070     return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
2071 
2072   case ISD::SIGN_EXTEND_INREG:
2073     return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
2074   }
2075 
2076   unsigned NOpc = N->getMachineOpcode();
2077   switch (NOpc) {
2078   default:
2079     return false;
2080   case AArch64::SBFMWri:
2081   case AArch64::UBFMWri:
2082   case AArch64::SBFMXri:
2083   case AArch64::UBFMXri:
2084     Opc = NOpc;
2085     Opd0 = N->getOperand(0);
2086     Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
2087     Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
2088     return true;
2089   }
2090   // Unreachable
2091   return false;
2092 }
2093 
2094 bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
2095   unsigned Opc, Immr, Imms;
2096   SDValue Opd0;
2097   if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
2098     return false;
2099 
2100   EVT VT = N->getValueType(0);
2101   SDLoc dl(N);
2102 
2103   // If the bit extract operation is 64bit but the original type is 32bit, we
2104   // need to add one EXTRACT_SUBREG.
2105   if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
2106     SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
2107                        CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
2108 
2109     SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
2110     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
2111     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
2112                                           MVT::i32, SDValue(BFM, 0), SubReg));
2113     return true;
2114   }
2115 
2116   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2117                    CurDAG->getTargetConstant(Imms, dl, VT)};
2118   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2119   return true;
2120 }
2121 
2122 /// Does DstMask form a complementary pair with the mask provided by
2123 /// BitsToBeInserted, suitable for use in a BFI instruction? Roughly speaking,
2124 /// this asks whether DstMask zeroes precisely those bits that will be set by
2125 /// the other half.
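/// For instance, with no ignored high bits, DstMask == 0xFFFF0000 forms such a
/// pair with BitsToBeInserted == 0x0000FFFF.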
2126 static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
2127                               unsigned NumberOfIgnoredHighBits, EVT VT) {
2128   assert((VT == MVT::i32 || VT == MVT::i64) &&
2129          "i32 or i64 mask type expected!");
2130   unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
2131 
2132   APInt SignificantDstMask = APInt(BitWidth, DstMask);
2133   APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
2134 
2135   return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
2136          (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
2137 }
2138 
2139 // Look for bits that will be useful for later uses.
2140 // A bit is considered useless as soon as it is dropped and never used
2141 // before it has been dropped.
2142 // E.g., looking for the useful bits of x:
2143 // 1. y = x & 0x7
2144 // 2. z = y >> 2
2145 // After #1, the useful bits of x are 0x7; these useful bits of x live through
2146 // y.
2147 // After #2, the useful bits of x are 0x4.
2148 // However, if x is used in an unpredictable instruction, then all its bits
2149 // are useful.
2150 // E.g.
2151 // 1. y = x & 0x7
2152 // 2. z = y >> 2
2153 // 3. str x, [@x]
2154 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
2155 
2156 static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
2157                                               unsigned Depth) {
2158   uint64_t Imm =
2159       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2160   Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
2161   UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
2162   getUsefulBits(Op, UsefulBits, Depth + 1);
2163 }
2164 
2165 static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
2166                                              uint64_t Imm, uint64_t MSB,
2167                                              unsigned Depth) {
2168   // inherit the bitwidth value
2169   APInt OpUsefulBits(UsefulBits);
2170   OpUsefulBits = 1;
2171 
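  // The two cases below reflect the bitfield-move semantics: when MSB >= Imm
  // (an extract), result bit i comes from source bit i + Imm; when MSB < Imm
  // (an insert), result bit i + (RegWidth - Imm) comes from source bit i,
  // where RegWidth is OpUsefulBits.getBitWidth().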
2172   if (MSB >= Imm) {
2173     OpUsefulBits <<= MSB - Imm + 1;
2174     --OpUsefulBits;
2175     // The interesting part will be in the lower part of the result
2176     getUsefulBits(Op, OpUsefulBits, Depth + 1);
2177     // The interesting part was starting at Imm in the argument
2178     OpUsefulBits <<= Imm;
2179   } else {
2180     OpUsefulBits <<= MSB + 1;
2181     --OpUsefulBits;
2182     // The interesting part will be shifted in the result
2183     OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
2184     getUsefulBits(Op, OpUsefulBits, Depth + 1);
2185     // The interesting part was at zero in the argument
2186     OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
2187   }
2188 
2189   UsefulBits &= OpUsefulBits;
2190 }
2191 
2192 static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
2193                                   unsigned Depth) {
2194   uint64_t Imm =
2195       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2196   uint64_t MSB =
2197       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2198 
2199   getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
2200 }
2201 
2202 static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
2203                                               unsigned Depth) {
2204   uint64_t ShiftTypeAndValue =
2205       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2206   APInt Mask(UsefulBits);
2207   Mask.clearAllBits();
2208   Mask.flipAllBits();
2209 
2210   if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
2211     // Shift Left
2212     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2213     Mask <<= ShiftAmt;
2214     getUsefulBits(Op, Mask, Depth + 1);
2215     Mask.lshrInPlace(ShiftAmt);
2216   } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
2217     // Shift Right
2218     // We do not handle AArch64_AM::ASR, because the sign will change the
2219     // number of useful bits
2220     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2221     Mask.lshrInPlace(ShiftAmt);
2222     getUsefulBits(Op, Mask, Depth + 1);
2223     Mask <<= ShiftAmt;
2224   } else
2225     return;
2226 
2227   UsefulBits &= Mask;
2228 }
2229 
2230 static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
2231                                  unsigned Depth) {
2232   uint64_t Imm =
2233       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2234   uint64_t MSB =
2235       cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
2236 
2237   APInt OpUsefulBits(UsefulBits);
2238   OpUsefulBits = 1;
2239 
2240   APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
2241   ResultUsefulBits.flipAllBits();
2242   APInt Mask(UsefulBits.getBitWidth(), 0);
2243 
2244   getUsefulBits(Op, ResultUsefulBits, Depth + 1);
2245 
2246   if (MSB >= Imm) {
2247     // The instruction is a BFXIL.
2248     uint64_t Width = MSB - Imm + 1;
2249     uint64_t LSB = Imm;
2250 
2251     OpUsefulBits <<= Width;
2252     --OpUsefulBits;
2253 
2254     if (Op.getOperand(1) == Orig) {
2255       // Copy the low bits from the result to bits starting from LSB.
2256       Mask = ResultUsefulBits & OpUsefulBits;
2257       Mask <<= LSB;
2258     }
2259 
2260     if (Op.getOperand(0) == Orig)
2261       // Bits starting from LSB in the input contribute to the result.
2262       Mask |= (ResultUsefulBits & ~OpUsefulBits);
2263   } else {
2264     // The instruction is a BFI.
2265     uint64_t Width = MSB + 1;
2266     uint64_t LSB = UsefulBits.getBitWidth() - Imm;
2267 
2268     OpUsefulBits <<= Width;
2269     --OpUsefulBits;
2270     OpUsefulBits <<= LSB;
2271 
2272     if (Op.getOperand(1) == Orig) {
2273       // Copy the bits from the result to the zero bits.
2274       Mask = ResultUsefulBits & OpUsefulBits;
2275       Mask.lshrInPlace(LSB);
2276     }
2277 
2278     if (Op.getOperand(0) == Orig)
2279       Mask |= (ResultUsefulBits & ~OpUsefulBits);
2280   }
2281 
2282   UsefulBits &= Mask;
2283 }
2284 
2285 static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
2286                                 SDValue Orig, unsigned Depth) {
2287 
2288   // Users of this node should have already been instruction selected
2289   // FIXME: Can we turn that into an assert?
2290   if (!UserNode->isMachineOpcode())
2291     return;
2292 
2293   switch (UserNode->getMachineOpcode()) {
2294   default:
2295     return;
2296   case AArch64::ANDSWri:
2297   case AArch64::ANDSXri:
2298   case AArch64::ANDWri:
2299   case AArch64::ANDXri:
2300     // We increment Depth only when we call getUsefulBits.
2301     return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
2302                                              Depth);
2303   case AArch64::UBFMWri:
2304   case AArch64::UBFMXri:
2305     return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
2306 
2307   case AArch64::ORRWrs:
2308   case AArch64::ORRXrs:
2309     if (UserNode->getOperand(1) != Orig)
2310       return;
2311     return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
2312                                              Depth);
2313   case AArch64::BFMWri:
2314   case AArch64::BFMXri:
2315     return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
2316 
2317   case AArch64::STRBBui:
2318   case AArch64::STURBBi:
2319     if (UserNode->getOperand(0) != Orig)
2320       return;
2321     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
2322     return;
2323 
2324   case AArch64::STRHHui:
2325   case AArch64::STURHHi:
2326     if (UserNode->getOperand(0) != Orig)
2327       return;
2328     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
2329     return;
2330   }
2331 }
2332 
2333 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
2334   if (Depth >= SelectionDAG::MaxRecursionDepth)
2335     return;
2336   // Initialize UsefulBits
2337   if (!Depth) {
2338     unsigned Bitwidth = Op.getScalarValueSizeInBits();
2339     // At the beginning, assume every produced bits is useful
2340     // At the beginning, assume every produced bit is useful.
2341     UsefulBits.flipAllBits();
2342   }
2343   APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
2344 
2345   for (SDNode *Node : Op.getNode()->uses()) {
2346     // A use cannot produce useful bits
2347     APInt UsefulBitsForUse = APInt(UsefulBits);
2348     getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
2349     UsersUsefulBits |= UsefulBitsForUse;
2350   }
2351   // UsefulBits contains the produced bits that are meaningful for the
2352   // current definition, thus a user cannot make a bit meaningful at
2353   // this point
2354   UsefulBits &= UsersUsefulBits;
2355 }
2356 
2357 /// Create a machine node performing a notional SHL of Op by ShlAmount. If
2358 /// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
2359 /// 0, return Op unchanged.
2360 static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
2361   if (ShlAmount == 0)
2362     return Op;
2363 
2364   EVT VT = Op.getValueType();
2365   SDLoc dl(Op);
2366   unsigned BitWidth = VT.getSizeInBits();
2367   unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
2368 
2369   SDNode *ShiftNode;
2370   if (ShlAmount > 0) {
2371     // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
2372     ShiftNode = CurDAG->getMachineNode(
2373         UBFMOpc, dl, VT, Op,
2374         CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
2375         CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
2376   } else {
2377     // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
2378     assert(ShlAmount < 0 && "expected right shift");
2379     int ShrAmount = -ShlAmount;
2380     ShiftNode = CurDAG->getMachineNode(
2381         UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
2382         CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
2383   }
2384 
2385   return SDValue(ShiftNode, 0);
2386 }
2387 
2388 /// Does this tree qualify as an attempt to move a bitfield into position,
2389 /// essentially "(and (shl VAL, N), Mask)".
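/// An illustrative match: (and (shl X, 2), 0xFC) positions a 6-bit field of X
/// at bit 2, giving Src = X, ShiftAmount = 2 and MaskWidth = 6.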
2390 static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
2391                                     bool BiggerPattern,
2392                                     SDValue &Src, int &ShiftAmount,
2393                                     int &MaskWidth) {
2394   EVT VT = Op.getValueType();
2395   unsigned BitWidth = VT.getSizeInBits();
2396   (void)BitWidth;
2397   assert(BitWidth == 32 || BitWidth == 64);
2398 
2399   KnownBits Known = CurDAG->computeKnownBits(Op);
2400 
2401   // Non-zero in the sense that they're not provably zero, which is the key
2402   // point if we want to use this value
2403   uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
2404 
2405   // Discard a constant AND mask if present. It's safe because the node will
2406   // already have been factored into the computeKnownBits calculation above.
2407   uint64_t AndImm;
2408   if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
2409     assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
2410     Op = Op.getOperand(0);
2411   }
2412 
2413   // Don't match if the SHL has more than one use, since then we'll end up
2414   // generating SHL+UBFIZ instead of just keeping SHL+AND.
2415   if (!BiggerPattern && !Op.hasOneUse())
2416     return false;
2417 
2418   uint64_t ShlImm;
2419   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
2420     return false;
2421   Op = Op.getOperand(0);
2422 
2423   if (!isShiftedMask_64(NonZeroBits))
2424     return false;
2425 
2426   ShiftAmount = countTrailingZeros(NonZeroBits);
2427   MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
2428 
2429   // BFI encompasses sufficiently many nodes that it's worth inserting an extra
2430   // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
2431   // amount.  BiggerPattern is true when this pattern is being matched for BFI,
2432   // BiggerPattern is false when this pattern is being matched for UBFIZ, in
2433   // which case it is not profitable to insert an extra shift.
2434   if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
2435     return false;
2436   Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
2437 
2438   return true;
2439 }
2440 
2441 static bool isShiftedMask(uint64_t Mask, EVT VT) {
2442   assert(VT == MVT::i32 || VT == MVT::i64);
2443   if (VT == MVT::i32)
2444     return isShiftedMask_32(Mask);
2445   return isShiftedMask_64(Mask);
2446 }
2447 
2448 // Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
2449 // inserted only sets known zero bits.
2450 static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
2451   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2452 
2453   EVT VT = N->getValueType(0);
2454   if (VT != MVT::i32 && VT != MVT::i64)
2455     return false;
2456 
2457   unsigned BitWidth = VT.getSizeInBits();
2458 
2459   uint64_t OrImm;
2460   if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
2461     return false;
2462 
2463   // Skip this transformation if the OR immediate can be encoded directly in an ORR.
2464   // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
2465   // performance neutral.
2466   if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
2467     return false;
2468 
2469   uint64_t MaskImm;
2470   SDValue And = N->getOperand(0);
2471   // Must be a single use AND with an immediate operand.
2472   if (!And.hasOneUse() ||
2473       !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
2474     return false;
2475 
2476   // Compute the Known Zero for the AND as this allows us to catch more general
2477   // cases than just looking for AND with imm.
2478   KnownBits Known = CurDAG->computeKnownBits(And);
2479 
2480   // Non-zero in the sense that they're not provably zero, which is the key
2481   // point if we want to use this value.
2482   uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
2483 
2484   // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
2485   if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
2486     return false;
2487 
2488   // The bits being inserted must only set those bits that are known to be zero.
2489   if ((OrImm & NotKnownZero) != 0) {
2490     // FIXME:  It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
2491     // currently handle this case.
2492     return false;
2493   }
2494 
2495   // BFI/BFXIL dst, src, #lsb, #width.
2496   int LSB = countTrailingOnes(NotKnownZero);
2497   int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
2498 
2499   // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
2500   unsigned ImmR = (BitWidth - LSB) % BitWidth;
2501   unsigned ImmS = Width - 1;
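  // Illustrative example (i32, assuming nothing more is known about the AND's
  // first operand): or (and X, 0xFFFFFF0F), 0x50 gives LSB = 4 and Width = 4,
  // hence ImmR = (32 - 4) % 32 = 28 and ImmS = 3; the inserted value
  // 0x50 >> 4 = 5 is materialized below and combined with BFMWri
  // (the BFI #4, #4 alias).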
2502 
2503   // If we're creating a BFI instruction, avoid cases where we need more
2504   // instructions to materialize the BFI constant as compared to the original
2505   // ORR.  A BFXIL will use the same constant as the original ORR, so the code
2506   // should be no worse in this case.
2507   bool IsBFI = LSB != 0;
2508   uint64_t BFIImm = OrImm >> LSB;
2509   if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
2510     // We have a BFI instruction and we know the constant can't be materialized
2511     // with a ORR-immediate with the zero register.
2512     unsigned OrChunks = 0, BFIChunks = 0;
2513     for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
2514       if (((OrImm >> Shift) & 0xFFFF) != 0)
2515         ++OrChunks;
2516       if (((BFIImm >> Shift) & 0xFFFF) != 0)
2517         ++BFIChunks;
2518     }
2519     if (BFIChunks > OrChunks)
2520       return false;
2521   }
2522 
2523   // Materialize the constant to be inserted.
2524   SDLoc DL(N);
2525   unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
2526   SDNode *MOVI = CurDAG->getMachineNode(
2527       MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
2528 
2529   // Create the BFI/BFXIL instruction.
2530   SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
2531                    CurDAG->getTargetConstant(ImmR, DL, VT),
2532                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2533   unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2534   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2535   return true;
2536 }
2537 
2538 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
2539                                       SelectionDAG *CurDAG) {
2540   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2541 
2542   EVT VT = N->getValueType(0);
2543   if (VT != MVT::i32 && VT != MVT::i64)
2544     return false;
2545 
2546   unsigned BitWidth = VT.getSizeInBits();
2547 
2548   // Because of simplify-demanded-bits in DAGCombine, involved masks may not
2549   // have the expected shape. Try to undo that.
2550 
2551   unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
2552   unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
2553 
2554   // Given an OR operation, check if we have the following pattern
2555   // ubfm c, b, imm, imm2 (or something that does the same job, see
2556   //                       isBitfieldExtractOp)
2557   // d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
2558   //                 countTrailingZeros(mask2) == imm2 - imm + 1
2559   // f = d | c
2560   // if yes, replace the OR instruction with:
2561   // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
2562 
2563   // OR is commutative, check all combinations of operand order and values of
2564   // BiggerPattern, i.e.
2565   //     Opd0, Opd1, BiggerPattern=false
2566   //     Opd1, Opd0, BiggerPattern=false
2567   //     Opd0, Opd1, BiggerPattern=true
2568   //     Opd1, Opd0, BiggerPattern=true
2569   // Several of these combinations may match, so check with BiggerPattern=false
2570   // first since that will produce better results by matching more instructions
2571   // and/or inserting fewer extra instructions.
2572   for (int I = 0; I < 4; ++I) {
2573 
2574     SDValue Dst, Src;
2575     unsigned ImmR, ImmS;
2576     bool BiggerPattern = I / 2;
2577     SDValue OrOpd0Val = N->getOperand(I % 2);
2578     SDNode *OrOpd0 = OrOpd0Val.getNode();
2579     SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
2580     SDNode *OrOpd1 = OrOpd1Val.getNode();
2581 
2582     unsigned BFXOpc;
2583     int DstLSB, Width;
2584     if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
2585                             NumberOfIgnoredLowBits, BiggerPattern)) {
2586       // Check that the returned opcode is compatible with the pattern,
2587       // i.e., same type and zero extended (U and not S)
2588       if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
2589           (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
2590         continue;
2591 
2592       // Compute the width of the bitfield insertion
2593       DstLSB = 0;
2594       Width = ImmS - ImmR + 1;
2595       // FIXME: This constraint is to catch bitfield insertion only; we may
2596       // want to widen the pattern if we want to grab the general bitfield
2597       // move case.
2598       if (Width <= 0)
2599         continue;
2600 
2601       // If the mask on the insertee is correct, we have a BFXIL operation. We
2602       // can share the ImmR and ImmS values from the already-computed UBFM.
2603     } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
2604                                        BiggerPattern,
2605                                        Src, DstLSB, Width)) {
2606       ImmR = (BitWidth - DstLSB) % BitWidth;
2607       ImmS = Width - 1;
2608     } else
2609       continue;
2610 
2611     // Check the second part of the pattern
2612     EVT VT = OrOpd1Val.getValueType();
2613     assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
2614 
2615     // Compute the Known Zero for the candidate of the first operand.
2616     // This allows us to catch more general cases than just looking for
2617     // an AND with an immediate. Indeed, simplify-demanded-bits may have
2618     // removed the AND instruction because it proved it was useless.
2619     KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
2620 
2621     // Check if there is enough room for the second operand to appear
2622     // in the first one
2623     APInt BitsToBeInserted =
2624         APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
2625 
2626     if ((BitsToBeInserted & ~Known.Zero) != 0)
2627       continue;
2628 
2629     // Set the first operand
2630     uint64_t Imm;
2631     if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
2632         isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
2633       // In that case, we can eliminate the AND
2634       Dst = OrOpd1->getOperand(0);
2635     else
2636       // Maybe the AND has been removed by simplify-demanded-bits
2637       // or is useful because it discards more bits
2638       Dst = OrOpd1Val;
2639 
2640     // Both parts match.
2641     SDLoc DL(N);
2642     SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
2643                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2644     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2645     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2646     return true;
2647   }
2648 
2649   // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
2650   // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
2651   // mask (e.g., 0x000ffff0).
2652   uint64_t Mask0Imm, Mask1Imm;
2653   SDValue And0 = N->getOperand(0);
2654   SDValue And1 = N->getOperand(1);
2655   if (And0.hasOneUse() && And1.hasOneUse() &&
2656       isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
2657       isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
2658       APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
2659       (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
2660 
2661     // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
2662     // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
2663     // bits to be inserted.
2664     if (isShiftedMask(Mask0Imm, VT)) {
2665       std::swap(And0, And1);
2666       std::swap(Mask0Imm, Mask1Imm);
2667     }
2668 
2669     SDValue Src = And1->getOperand(0);
2670     SDValue Dst = And0->getOperand(0);
2671     unsigned LSB = countTrailingZeros(Mask1Imm);
2672     int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
2673 
2674     // The BFXIL inserts the low-order bits from a source register, so right
2675     // shift the needed bits into place.
2676     SDLoc DL(N);
2677     unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2678     SDNode *LSR = CurDAG->getMachineNode(
2679         ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
2680         CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
2681 
2682     // BFXIL is an alias of BFM, so translate to BFM operands.
2683     unsigned ImmR = (BitWidth - LSB) % BitWidth;
2684     unsigned ImmS = Width - 1;
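    // For example (illustrative): with Mask1Imm = 0x0000fff0 and
    // Mask0Imm = 0xffff000f on i32, LSB = 4 and Width = 12, so Src is first
    // shifted right by 4 and then inserted with ImmR = 28 and ImmS = 11.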
2685 
2686     // Create the BFXIL instruction.
2687     SDValue Ops[] = {Dst, SDValue(LSR, 0),
2688                      CurDAG->getTargetConstant(ImmR, DL, VT),
2689                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2690     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2691     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2692     return true;
2693   }
2694 
2695   return false;
2696 }
2697 
2698 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
2699   if (N->getOpcode() != ISD::OR)
2700     return false;
2701 
2702   APInt NUsefulBits;
2703   getUsefulBits(SDValue(N, 0), NUsefulBits);
2704 
2705   // If none of the bits are useful, just return UNDEF.
2706   if (!NUsefulBits) {
2707     CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
2708     return true;
2709   }
2710 
2711   if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
2712     return true;
2713 
2714   return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
2715 }
2716 
2717 /// tryBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
2718 /// equivalent of a left shift by a constant amount followed by an and masking
2719 /// out a contiguous set of bits.
2720 bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
2721   if (N->getOpcode() != ISD::AND)
2722     return false;
2723 
2724   EVT VT = N->getValueType(0);
2725   if (VT != MVT::i32 && VT != MVT::i64)
2726     return false;
2727 
2728   SDValue Op0;
2729   int DstLSB, Width;
2730   if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
2731                                Op0, DstLSB, Width))
2732     return false;
2733 
2734   // ImmR is the rotate right amount.
2735   unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
2736   // ImmS is the most significant bit of the source to be moved.
2737   unsigned ImmS = Width - 1;
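  // For example (illustrative): (and (shl x, 4), 0xff0) on i32 gives
  // DstLSB = 4 and Width = 8, i.e. UBFM x, #28, #7, which is the alias
  // UBFIZ x, #4, #8.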
2738 
2739   SDLoc DL(N);
2740   SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
2741                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2742   unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2743   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2744   return true;
2745 }
2746 
2747 /// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
2748 /// variable shift/rotate instructions.
2749 bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
2750   EVT VT = N->getValueType(0);
2751 
2752   unsigned Opc;
2753   switch (N->getOpcode()) {
2754   case ISD::ROTR:
2755     Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
2756     break;
2757   case ISD::SHL:
2758     Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
2759     break;
2760   case ISD::SRL:
2761     Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
2762     break;
2763   case ISD::SRA:
2764     Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
2765     break;
2766   default:
2767     return false;
2768   }
2769 
2770   uint64_t Size;
2771   uint64_t Bits;
2772   if (VT == MVT::i32) {
2773     Bits = 5;
2774     Size = 32;
2775   } else if (VT == MVT::i64) {
2776     Bits = 6;
2777     Size = 64;
2778   } else
2779     return false;
2780 
2781   SDValue ShiftAmt = N->getOperand(1);
2782   SDLoc DL(N);
2783   SDValue NewShiftAmt;
2784 
2785   // Skip over an extend of the shift amount.
2786   if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
2787       ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
2788     ShiftAmt = ShiftAmt->getOperand(0);
2789 
2790   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
2791     SDValue Add0 = ShiftAmt->getOperand(0);
2792     SDValue Add1 = ShiftAmt->getOperand(1);
2793     uint64_t Add0Imm;
2794     uint64_t Add1Imm;
2795     // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
2796     // to avoid the ADD/SUB.
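    // (For instance, an i64 shift by (add x, 64) can simply shift by x, since
    // the hardware already takes the shift amount modulo 64.)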
2797     if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
2798       NewShiftAmt = Add0;
2799     // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
2800     // generate a NEG instead of a SUB of a constant.
2801     else if (ShiftAmt->getOpcode() == ISD::SUB &&
2802              isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
2803              (Add0Imm % Size == 0)) {
2804       unsigned NegOpc;
2805       unsigned ZeroReg;
2806       EVT SubVT = ShiftAmt->getValueType(0);
2807       if (SubVT == MVT::i32) {
2808         NegOpc = AArch64::SUBWrr;
2809         ZeroReg = AArch64::WZR;
2810       } else {
2811         assert(SubVT == MVT::i64);
2812         NegOpc = AArch64::SUBXrr;
2813         ZeroReg = AArch64::XZR;
2814       }
2815       SDValue Zero =
2816           CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
2817       MachineSDNode *Neg =
2818           CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
2819       NewShiftAmt = SDValue(Neg, 0);
2820     } else
2821       return false;
2822   } else {
2823     // If the shift amount is masked with an AND, check that the mask covers the
2824     // bits that are implicitly ANDed off by the above opcodes and if so, skip
2825     // the AND.
2826     uint64_t MaskImm;
2827     if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
2828         !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
2829       return false;
2830 
2831     if (countTrailingOnes(MaskImm) < Bits)
2832       return false;
2833 
2834     NewShiftAmt = ShiftAmt->getOperand(0);
2835   }
2836 
2837   // Narrow/widen the shift amount to match the size of the shift operation.
2838   if (VT == MVT::i32)
2839     NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
2840   else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
2841     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
2842     MachineSDNode *Ext = CurDAG->getMachineNode(
2843         AArch64::SUBREG_TO_REG, DL, VT,
2844         CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
2845     NewShiftAmt = SDValue(Ext, 0);
2846   }
2847 
2848   SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
2849   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2850   return true;
2851 }
2852 
2853 bool
2854 AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
2855                                               unsigned RegWidth) {
2856   APFloat FVal(0.0);
2857   if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
2858     FVal = CN->getValueAPF();
2859   else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
2860     // Some otherwise illegal constants are allowed in this case.
2861     if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
2862         !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
2863       return false;
2864 
2865     ConstantPoolSDNode *CN =
2866         dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
2867     FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
2868   } else
2869     return false;
2870 
2871   // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
2872   // is between 1 and 32 for a destination w-register, or 1 and 64 for an
2873   // x-register.
2874   //
2875   // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
2876   // want THIS_NODE to be 2^fbits. This is much easier to deal with using
2877   // integers.
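  //
  // For example (illustrative): (fp_to_sint (fmul x, 65536.0)) targeting a
  // w-register gives FBits = 16, i.e. a fixed-point conversion with 16
  // fractional bits.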
2878   bool IsExact;
2879 
2880   // fbits is between 1 and 64 in the worst-case, which means the fmul
2881   // could have 2^64 as an actual operand. Need 65 bits of precision.
2882   APSInt IntVal(65, true);
2883   FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
2884 
2885   // N.b. isPowerOf2 also checks for > 0.
2886   if (!IsExact || !IntVal.isPowerOf2()) return false;
2887   unsigned FBits = IntVal.logBase2();
2888 
2889   // Checks above should have guaranteed that we haven't lost information in
2890   // finding FBits, but it must still be in range.
2891   if (FBits == 0 || FBits > RegWidth) return false;
2892 
2893   FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
2894   return true;
2895 }
2896 
2897 // Inspects a register string of the form o0:op1:CRn:CRm:op2, gets the fields
2898 // of the string, obtains the integer values from them, and combines these
2899 // into a single value to be used in the MRS/MSR instruction.
2900 static int getIntOperandFromRegisterString(StringRef RegString) {
2901   SmallVector<StringRef, 5> Fields;
2902   RegString.split(Fields, ':');
2903 
2904   if (Fields.size() == 1)
2905     return -1;
2906 
2907   assert(Fields.size() == 5
2908             && "Invalid number of fields in read register string");
2909 
2910   SmallVector<int, 5> Ops;
2911   bool AllIntFields = true;
2912 
2913   for (StringRef Field : Fields) {
2914     unsigned IntField;
2915     AllIntFields &= !Field.getAsInteger(10, IntField);
2916     Ops.push_back(IntField);
2917   }
2918 
2919   assert(AllIntFields &&
2920           "Unexpected non-integer value in special register string.");
2921 
2922   // Need to combine the integer fields of the string into a single value
2923   // based on the bit encoding of MRS/MSR instruction.
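  // For example (illustrative): the string "3:0:0:0:5" packs to
  // (3 << 14) | (0 << 11) | (0 << 7) | (0 << 3) | 5 = 0xC005.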
2924   return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
2925          (Ops[3] << 3) | (Ops[4]);
2926 }
2927 
2928 // Lower the read_register intrinsic to an MRS instruction node if the special
2929 // register string argument is either of the form detailed in the ACLE (the
2930 // form described in getIntOperandFromRegisterString) or is a named register
2931 // known by the MRS SysReg mapper.
2932 bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
2933   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2934   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2935   SDLoc DL(N);
2936 
2937   int Reg = getIntOperandFromRegisterString(RegString->getString());
2938   if (Reg != -1) {
2939     ReplaceNode(N, CurDAG->getMachineNode(
2940                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2941                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2942                        N->getOperand(0)));
2943     return true;
2944   }
2945 
2946   // Use the sysreg mapper to map the remaining possible strings to the
2947   // value for the register to be used for the instruction operand.
2948   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2949   if (TheReg && TheReg->Readable &&
2950       TheReg->haveFeatures(Subtarget->getFeatureBits()))
2951     Reg = TheReg->Encoding;
2952   else
2953     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2954 
2955   if (Reg != -1) {
2956     ReplaceNode(N, CurDAG->getMachineNode(
2957                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2958                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2959                        N->getOperand(0)));
2960     return true;
2961   }
2962 
2963   if (RegString->getString() == "pc") {
2964     ReplaceNode(N, CurDAG->getMachineNode(
2965                        AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
2966                        CurDAG->getTargetConstant(0, DL, MVT::i32),
2967                        N->getOperand(0)));
2968     return true;
2969   }
2970 
2971   return false;
2972 }
2973 
2974 // Lower the write_register intrinsic to an MSR instruction node if the special
2975 // register string argument is either of the form detailed in the ACLE (the
2976 // form described in getIntOperandFromRegisterString) or is a named register
2977 // known by the MSR SysReg mapper.
2978 bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
2979   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2980   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2981   SDLoc DL(N);
2982 
2983   int Reg = getIntOperandFromRegisterString(RegString->getString());
2984   if (Reg != -1) {
2985     ReplaceNode(
2986         N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
2987                                   CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2988                                   N->getOperand(2), N->getOperand(0)));
2989     return true;
2990   }
2991 
2992   // Check if the register was one of those allowed as the pstatefield value in
2993   // the MSR (immediate) instruction. To accept the values allowed in the
2994   // pstatefield for the MSR (immediate) instruction, we also require that an
2995   // immediate value has been provided as an argument; we know that this is
2996   // the case, as it has been ensured by semantic checking.
2997   auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
2998   if (PMapper) {
2999     assert (isa<ConstantSDNode>(N->getOperand(2))
3000               && "Expected a constant integer expression.");
3001     unsigned Reg = PMapper->Encoding;
3002     uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
3003     unsigned State;
3004     if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
3005       assert(Immed < 2 && "Bad imm");
3006       State = AArch64::MSRpstateImm1;
3007     } else {
3008       assert(Immed < 16 && "Bad imm");
3009       State = AArch64::MSRpstateImm4;
3010     }
3011     ReplaceNode(N, CurDAG->getMachineNode(
3012                        State, DL, MVT::Other,
3013                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3014                        CurDAG->getTargetConstant(Immed, DL, MVT::i16),
3015                        N->getOperand(0)));
3016     return true;
3017   }
3018 
3019   // Use the sysreg mapper to attempt to map the remaining possible strings
3020   // to the value for the register to be used for the MSR (register)
3021   // instruction operand.
3022   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
3023   if (TheReg && TheReg->Writeable &&
3024       TheReg->haveFeatures(Subtarget->getFeatureBits()))
3025     Reg = TheReg->Encoding;
3026   else
3027     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
3028   if (Reg != -1) {
3029     ReplaceNode(N, CurDAG->getMachineNode(
3030                        AArch64::MSR, DL, MVT::Other,
3031                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3032                        N->getOperand(2), N->getOperand(0)));
3033     return true;
3034   }
3035 
3036   return false;
3037 }
3038 
3039 /// We've got special pseudo-instructions for these
3040 bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
3041   unsigned Opcode;
3042   EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
3043 
3044   // Leave IR for LSE if subtarget supports it.
3045   if (Subtarget->hasLSE()) return false;
3046 
3047   if (MemTy == MVT::i8)
3048     Opcode = AArch64::CMP_SWAP_8;
3049   else if (MemTy == MVT::i16)
3050     Opcode = AArch64::CMP_SWAP_16;
3051   else if (MemTy == MVT::i32)
3052     Opcode = AArch64::CMP_SWAP_32;
3053   else if (MemTy == MVT::i64)
3054     Opcode = AArch64::CMP_SWAP_64;
3055   else
3056     llvm_unreachable("Unknown AtomicCmpSwap type");
3057 
3058   MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
3059   SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
3060                    N->getOperand(0)};
3061   SDNode *CmpSwap = CurDAG->getMachineNode(
3062       Opcode, SDLoc(N),
3063       CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
3064 
3065   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
3066   CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
3067 
3068   ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
3069   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
3070   CurDAG->RemoveDeadNode(N);
3071 
3072   return true;
3073 }
3074 
3075 bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
3076                                                   SDValue &Offset) {
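  // Match either a plain signed 8-bit immediate (Offset 0) or a multiple of
  // 256 in the range [-32768, 32512], encoded as Base = Imm / 256 with an
  // LSL #8 (Offset 8).  E.g. (illustrative) 0x1200 -> Base 0x12, Offset 8.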
3077   auto C = dyn_cast<ConstantSDNode>(N);
3078   if (!C)
3079     return false;
3080 
3081   auto Ty = N->getValueType(0);
3082 
3083   int64_t Imm = C->getSExtValue();
3084   SDLoc DL(N);
3085 
3086   if ((Imm >= -128) && (Imm <= 127)) {
3087     Base = CurDAG->getTargetConstant(Imm, DL, Ty);
3088     Offset = CurDAG->getTargetConstant(0, DL, Ty);
3089     return true;
3090   }
3091 
3092   if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
3093     Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
3094     Offset = CurDAG->getTargetConstant(8, DL, Ty);
3095     return true;
3096   }
3097 
3098   return false;
3099 }
3100 
3101 bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
3102   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3103     const int64_t ImmVal = CNode->getSExtValue();
3104     SDLoc DL(N);
3105 
3106     switch (VT.SimpleTy) {
3107     case MVT::i8:
3108       // Can always select i8s: no shift is needed, just mask the immediate
3109       // value to deal with the sign-extended value from lowering.
3110       Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3111       Imm = CurDAG->getTargetConstant(ImmVal & 0xFF, DL, MVT::i32);
3112       return true;
3113     case MVT::i16:
3114       // i16 values get sign-extended to 32 bits during lowering.
3115       if ((ImmVal & 0xFF) == ImmVal) {
3116         Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3117         Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3118         return true;
3119       } else if ((ImmVal & 0xFF) == 0) {
3120         assert((ImmVal >= -32768) && (ImmVal <= 32512));
3121         Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
3122         Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32);
3123         return true;
3124       }
3125       break;
3126     case MVT::i32:
3127     case MVT::i64:
3128       // The immediate range won't trigger signedness problems for 32/64-bit.
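      // For example (illustrative): 0x3F00 is encoded as immediate 0x3F with
      // shift LSL #8, while 0x3F is encoded directly with no shift.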
3129       if ((ImmVal & 0xFF) == ImmVal) {
3130         Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3131         Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3132         return true;
3133       } else if ((ImmVal & 0xFF00) == ImmVal) {
3134         Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
3135         Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
3136         return true;
3137       }
3138       break;
3139     default:
3140       break;
3141     }
3142   }
3143 
3144   return false;
3145 }
3146 
3147 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
3148   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3149     int64_t ImmVal = CNode->getSExtValue();
3150     SDLoc DL(N);
3151     if (ImmVal >= -128 && ImmVal < 128) {
3152       Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3153       return true;
3154     }
3155   }
3156   return false;
3157 }
3158 
3159 bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) {
3160   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3161     uint64_t ImmVal = CNode->getZExtValue();
3162 
3163     switch (VT.SimpleTy) {
3164     case MVT::i8:
3165       ImmVal &= 0xFF;
3166       break;
3167     case MVT::i16:
3168       ImmVal &= 0xFFFF;
3169       break;
3170     case MVT::i32:
3171       ImmVal &= 0xFFFFFFFF;
3172       break;
3173     case MVT::i64:
3174       break;
3175     default:
3176       llvm_unreachable("Unexpected type");
3177     }
3178 
3179     if (ImmVal < 256) {
3180       Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
3181       return true;
3182     }
3183   }
3184   return false;
3185 }
3186 
3187 bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm,
3188                                               bool Invert) {
3189   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3190     uint64_t ImmVal = CNode->getZExtValue();
3191     SDLoc DL(N);
3192 
3193     if (Invert)
3194       ImmVal = ~ImmVal;
3195 
3196     // Mask the immediate and replicate it across 64 bits by element size.
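    // For example (illustrative): for i16 elements, 0x00F0 is replicated to
    // 0x00F000F000F000F0 before the 64-bit logical-immediate check.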
3197     switch (VT.SimpleTy) {
3198     case MVT::i8:
3199       ImmVal &= 0xFF;
3200       ImmVal |= ImmVal << 8;
3201       ImmVal |= ImmVal << 16;
3202       ImmVal |= ImmVal << 32;
3203       break;
3204     case MVT::i16:
3205       ImmVal &= 0xFFFF;
3206       ImmVal |= ImmVal << 16;
3207       ImmVal |= ImmVal << 32;
3208       break;
3209     case MVT::i32:
3210       ImmVal &= 0xFFFFFFFF;
3211       ImmVal |= ImmVal << 32;
3212       break;
3213     case MVT::i64:
3214       break;
3215     default:
3216       llvm_unreachable("Unexpected type");
3217     }
3218 
3219     uint64_t encoding;
3220     if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
3221       Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
3222       return true;
3223     }
3224   }
3225   return false;
3226 }
3227 
3228 // SVE shift intrinsics allow shift amounts larger than the element's bitwidth.
3229 // Rather than attempt to normalise everything we can sometimes saturate the
3230 // shift amount during selection. This function also allows for consistent
3231 // isel patterns by ensuring the resulting "Imm" node is of the i32 type
3232 // required by the instructions.
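// For example (illustrative): with Low = 1 and High = 32, a shift amount of
// 40 is rejected unless AllowSaturation is set, in which case it is clamped
// to 32.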
3233 bool AArch64DAGToDAGISel::SelectSVEShiftImm(SDValue N, uint64_t Low,
3234                                             uint64_t High, bool AllowSaturation,
3235                                             SDValue &Imm) {
3236   if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
3237     uint64_t ImmVal = CN->getZExtValue();
3238 
3239     // Reject shift amounts that are too small.
3240     if (ImmVal < Low)
3241       return false;
3242 
3243     // Reject or saturate shift amounts that are too big.
3244     if (ImmVal > High) {
3245       if (!AllowSaturation)
3246         return false;
3247       ImmVal = High;
3248     }
3249 
3250     Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
3251     return true;
3252   }
3253 
3254   return false;
3255 }
3256 
3257 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
3258   // tagp(FrameIndex, IRGstack, tag_offset):
3259   // since the offset between FrameIndex and IRGstack is a compile-time
3260   // constant, this can be lowered to a single ADDG instruction.
3261   if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
3262     return false;
3263   }
3264 
3265   SDValue IRG_SP = N->getOperand(2);
3266   if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
3267       cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
3268           Intrinsic::aarch64_irg_sp) {
3269     return false;
3270   }
3271 
3272   const TargetLowering *TLI = getTargetLowering();
3273   SDLoc DL(N);
3274   int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
3275   SDValue FiOp = CurDAG->getTargetFrameIndex(
3276       FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3277   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3278 
3279   SDNode *Out = CurDAG->getMachineNode(
3280       AArch64::TAGPstack, DL, MVT::i64,
3281       {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
3282        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3283   ReplaceNode(N, Out);
3284   return true;
3285 }
3286 
3287 void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
3288   assert(isa<ConstantSDNode>(N->getOperand(3)) &&
3289          "llvm.aarch64.tagp third argument must be an immediate");
3290   if (trySelectStackSlotTagP(N))
3291     return;
3292   // FIXME: the above applies whenever the offset between Op1 and Op2 is a
3293   // compile-time constant, not just for stack allocations.
3294 
3295   // General case for unrelated pointers in Op1 and Op2.
3296   SDLoc DL(N);
3297   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3298   SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
3299                                       {N->getOperand(1), N->getOperand(2)});
3300   SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
3301                                       {SDValue(N1, 0), N->getOperand(2)});
3302   SDNode *N3 = CurDAG->getMachineNode(
3303       AArch64::ADDG, DL, MVT::i64,
3304       {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
3305        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3306   ReplaceNode(N, N3);
3307 }
3308 
3309 // NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
3310 // vector types larger than NEON don't have a matching SubRegIndex.
3311 static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3312   assert(V.getValueType().isScalableVector() &&
3313          V.getValueType().getSizeInBits().getKnownMinSize() ==
3314              AArch64::SVEBitsPerBlock &&
3315          "Expected to extract from a packed scalable vector!");
3316   assert(VT.isFixedLengthVector() &&
3317          "Expected to extract a fixed length vector!");
3318 
3319   SDLoc DL(V);
3320   switch (VT.getSizeInBits()) {
3321   case 64: {
3322     auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3323     return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3324   }
3325   case 128: {
3326     auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3327     return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3328   }
3329   default: {
3330     auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3331     return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3332   }
3333   }
3334 }
3335 
3336 // NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
3337 // vector types larger than NEON don't have a matching SubRegIndex.
3338 static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3339   assert(VT.isScalableVector() &&
3340          VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
3341          "Expected to insert into a packed scalable vector!");
3342   assert(V.getValueType().isFixedLengthVector() &&
3343          "Expected to insert a fixed length vector!");
3344 
3345   SDLoc DL(V);
3346   switch (V.getValueType().getSizeInBits()) {
3347   case 64: {
3348     auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3349     auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3350     return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3351                                SDValue(Container, 0), V, SubReg);
3352   }
3353   case 128: {
3354     auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3355     auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3356     return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3357                                SDValue(Container, 0), V, SubReg);
3358   }
3359   default: {
3360     auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3361     return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3362   }
3363   }
3364 }
3365 
3366 void AArch64DAGToDAGISel::Select(SDNode *Node) {
3367   // If we have a custom node, it has already been selected!
3368   if (Node->isMachineOpcode()) {
3369     LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
3370     Node->setNodeId(-1);
3371     return;
3372   }
3373 
3374   // A few custom selection cases.
3375   EVT VT = Node->getValueType(0);
3376 
3377   switch (Node->getOpcode()) {
3378   default:
3379     break;
3380 
3381   case ISD::ATOMIC_CMP_SWAP:
3382     if (SelectCMP_SWAP(Node))
3383       return;
3384     break;
3385 
3386   case ISD::READ_REGISTER:
3387     if (tryReadRegister(Node))
3388       return;
3389     break;
3390 
3391   case ISD::WRITE_REGISTER:
3392     if (tryWriteRegister(Node))
3393       return;
3394     break;
3395 
3396   case ISD::ADD:
3397     if (tryMLAV64LaneV128(Node))
3398       return;
3399     break;
3400 
3401   case ISD::LOAD: {
3402     // Try to select as an indexed load. Fall through to normal processing
3403     // if we can't.
3404     if (tryIndexedLoad(Node))
3405       return;
3406     break;
3407   }
3408 
3409   case ISD::SRL:
3410   case ISD::AND:
3411   case ISD::SRA:
3412   case ISD::SIGN_EXTEND_INREG:
3413     if (tryBitfieldExtractOp(Node))
3414       return;
3415     if (tryBitfieldInsertInZeroOp(Node))
3416       return;
3417     LLVM_FALLTHROUGH;
3418   case ISD::ROTR:
3419   case ISD::SHL:
3420     if (tryShiftAmountMod(Node))
3421       return;
3422     break;
3423 
3424   case ISD::SIGN_EXTEND:
3425     if (tryBitfieldExtractOpFromSExt(Node))
3426       return;
3427     break;
3428 
3429   case ISD::FP_EXTEND:
3430     if (tryHighFPExt(Node))
3431       return;
3432     break;
3433 
3434   case ISD::OR:
3435     if (tryBitfieldInsertOp(Node))
3436       return;
3437     break;
3438 
3439   case ISD::EXTRACT_SUBVECTOR: {
3440     // Bail when not a "cast" like extract_subvector.
3441     if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
3442       break;
3443 
3444     // Bail when normal isel can do the job.
3445     EVT InVT = Node->getOperand(0).getValueType();
3446     if (VT.isScalableVector() || InVT.isFixedLengthVector())
3447       break;
3448 
3449     // NOTE: We can only get here when doing fixed length SVE code generation.
3450     // We do manual selection because the types involved are not linked to real
3451     // registers (despite being legal) and must be coerced into SVE registers.
3452     //
3453     // NOTE: If the above changes, be aware that selection will still not work
3454     // because the td definition of extract_vector does not support extracting
3455     // a fixed length vector from a scalable vector.
3456 
3457     ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
3458     return;
3459   }
3460 
3461   case ISD::INSERT_SUBVECTOR: {
3462     // Bail when not a "cast" like insert_subvector.
3463     if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
3464       break;
3465     if (!Node->getOperand(0).isUndef())
3466       break;
3467 
3468     // Bail when normal isel should do the job.
3469     EVT InVT = Node->getOperand(1).getValueType();
3470     if (VT.isFixedLengthVector() || InVT.isScalableVector())
3471       break;
3472 
3473     // NOTE: We can only get here when doing fixed length SVE code generation.
3474     // We do manual selection because the types involved are not linked to real
3475     // registers (despite being legal) and must be coerced into SVE registers.
3476     //
3477     // NOTE: If the above changes, be aware that selection will still not work
3478     // because the td definition of insert_vector does not support inserting a
3479     // fixed length vector into a scalable vector.
3480 
3481     ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
3482     return;
3483   }
3484 
3485   case ISD::Constant: {
3486     // Materialize zero constants as copies from WZR/XZR.  This allows
3487     // the coalescer to propagate these into other instructions.
3488     ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
3489     if (ConstNode->isNullValue()) {
3490       if (VT == MVT::i32) {
3491         SDValue New = CurDAG->getCopyFromReg(
3492             CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
3493         ReplaceNode(Node, New.getNode());
3494         return;
3495       } else if (VT == MVT::i64) {
3496         SDValue New = CurDAG->getCopyFromReg(
3497             CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
3498         ReplaceNode(Node, New.getNode());
3499         return;
3500       }
3501     }
3502     break;
3503   }
3504 
3505   case ISD::FrameIndex: {
3506     // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
3507     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
3508     unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
3509     const TargetLowering *TLI = getTargetLowering();
3510     SDValue TFI = CurDAG->getTargetFrameIndex(
3511         FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3512     SDLoc DL(Node);
3513     SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
3514                       CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
3515     CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
3516     return;
3517   }
3518   case ISD::INTRINSIC_W_CHAIN: {
3519     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3520     switch (IntNo) {
3521     default:
3522       break;
3523     case Intrinsic::aarch64_ldaxp:
3524     case Intrinsic::aarch64_ldxp: {
3525       unsigned Op =
3526           IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
3527       SDValue MemAddr = Node->getOperand(2);
3528       SDLoc DL(Node);
3529       SDValue Chain = Node->getOperand(0);
3530 
3531       SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
3532                                           MVT::Other, MemAddr, Chain);
3533 
3534       // Transfer memoperands.
3535       MachineMemOperand *MemOp =
3536           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3537       CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
3538       ReplaceNode(Node, Ld);
3539       return;
3540     }
3541     case Intrinsic::aarch64_stlxp:
3542     case Intrinsic::aarch64_stxp: {
3543       unsigned Op =
3544           IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
3545       SDLoc DL(Node);
3546       SDValue Chain = Node->getOperand(0);
3547       SDValue ValLo = Node->getOperand(2);
3548       SDValue ValHi = Node->getOperand(3);
3549       SDValue MemAddr = Node->getOperand(4);
3550 
3551       // Place arguments in the right order.
3552       SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
3553 
3554       SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
3555       // Transfer memoperands.
3556       MachineMemOperand *MemOp =
3557           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3558       CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
3559 
3560       ReplaceNode(Node, St);
3561       return;
3562     }
3563     case Intrinsic::aarch64_neon_ld1x2:
3564       if (VT == MVT::v8i8) {
3565         SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
3566         return;
3567       } else if (VT == MVT::v16i8) {
3568         SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
3569         return;
3570       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3571         SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
3572         return;
3573       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3574         SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
3575         return;
3576       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3577         SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
3578         return;
3579       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3580         SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
3581         return;
3582       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3583         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3584         return;
3585       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3586         SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
3587         return;
3588       }
3589       break;
3590     case Intrinsic::aarch64_neon_ld1x3:
3591       if (VT == MVT::v8i8) {
3592         SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
3593         return;
3594       } else if (VT == MVT::v16i8) {
3595         SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
3596         return;
3597       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3598         SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
3599         return;
3600       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3601         SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
3602         return;
3603       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3604         SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
3605         return;
3606       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3607         SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
3608         return;
3609       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3610         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3611         return;
3612       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3613         SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
3614         return;
3615       }
3616       break;
3617     case Intrinsic::aarch64_neon_ld1x4:
3618       if (VT == MVT::v8i8) {
3619         SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
3620         return;
3621       } else if (VT == MVT::v16i8) {
3622         SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
3623         return;
3624       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3625         SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
3626         return;
3627       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3628         SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
3629         return;
3630       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3631         SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
3632         return;
3633       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3634         SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
3635         return;
3636       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3637         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3638         return;
3639       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3640         SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
3641         return;
3642       }
3643       break;
3644     case Intrinsic::aarch64_neon_ld2:
3645       if (VT == MVT::v8i8) {
3646         SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
3647         return;
3648       } else if (VT == MVT::v16i8) {
3649         SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
3650         return;
3651       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3652         SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
3653         return;
3654       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3655         SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
3656         return;
3657       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3658         SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
3659         return;
3660       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3661         SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
3662         return;
3663       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3664         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3665         return;
3666       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3667         SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
3668         return;
3669       }
3670       break;
3671     case Intrinsic::aarch64_neon_ld3:
3672       if (VT == MVT::v8i8) {
3673         SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
3674         return;
3675       } else if (VT == MVT::v16i8) {
3676         SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
3677         return;
3678       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3679         SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
3680         return;
3681       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3682         SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
3683         return;
3684       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3685         SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
3686         return;
3687       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3688         SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
3689         return;
3690       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3691         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3692         return;
3693       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3694         SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
3695         return;
3696       }
3697       break;
3698     case Intrinsic::aarch64_neon_ld4:
3699       if (VT == MVT::v8i8) {
3700         SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
3701         return;
3702       } else if (VT == MVT::v16i8) {
3703         SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
3704         return;
3705       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3706         SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
3707         return;
3708       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3709         SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
3710         return;
3711       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3712         SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
3713         return;
3714       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3715         SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
3716         return;
3717       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3718         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3719         return;
3720       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3721         SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
3722         return;
3723       }
3724       break;
3725     case Intrinsic::aarch64_neon_ld2r:
3726       if (VT == MVT::v8i8) {
3727         SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
3728         return;
3729       } else if (VT == MVT::v16i8) {
3730         SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
3731         return;
3732       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3733         SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
3734         return;
3735       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3736         SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
3737         return;
3738       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3739         SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
3740         return;
3741       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3742         SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
3743         return;
3744       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3745         SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
3746         return;
3747       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3748         SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
3749         return;
3750       }
3751       break;
3752     case Intrinsic::aarch64_neon_ld3r:
3753       if (VT == MVT::v8i8) {
3754         SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
3755         return;
3756       } else if (VT == MVT::v16i8) {
3757         SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
3758         return;
3759       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3760         SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
3761         return;
3762       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3763         SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
3764         return;
3765       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3766         SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
3767         return;
3768       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3769         SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
3770         return;
3771       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3772         SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
3773         return;
3774       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3775         SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
3776         return;
3777       }
3778       break;
3779     case Intrinsic::aarch64_neon_ld4r:
3780       if (VT == MVT::v8i8) {
3781         SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
3782         return;
3783       } else if (VT == MVT::v16i8) {
3784         SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
3785         return;
3786       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3787         SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
3788         return;
3789       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3790         SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
3791         return;
3792       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3793         SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
3794         return;
3795       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3796         SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
3797         return;
3798       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3799         SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
3800         return;
3801       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3802         SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
3803         return;
3804       }
3805       break;
3806     case Intrinsic::aarch64_neon_ld2lane:
3807       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3808         SelectLoadLane(Node, 2, AArch64::LD2i8);
3809         return;
3810       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3811                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3812         SelectLoadLane(Node, 2, AArch64::LD2i16);
3813         return;
3814       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3815                  VT == MVT::v2f32) {
3816         SelectLoadLane(Node, 2, AArch64::LD2i32);
3817         return;
3818       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3819                  VT == MVT::v1f64) {
3820         SelectLoadLane(Node, 2, AArch64::LD2i64);
3821         return;
3822       }
3823       break;
3824     case Intrinsic::aarch64_neon_ld3lane:
3825       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3826         SelectLoadLane(Node, 3, AArch64::LD3i8);
3827         return;
3828       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3829                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3830         SelectLoadLane(Node, 3, AArch64::LD3i16);
3831         return;
3832       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3833                  VT == MVT::v2f32) {
3834         SelectLoadLane(Node, 3, AArch64::LD3i32);
3835         return;
3836       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3837                  VT == MVT::v1f64) {
3838         SelectLoadLane(Node, 3, AArch64::LD3i64);
3839         return;
3840       }
3841       break;
3842     case Intrinsic::aarch64_neon_ld4lane:
3843       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3844         SelectLoadLane(Node, 4, AArch64::LD4i8);
3845         return;
3846       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3847                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3848         SelectLoadLane(Node, 4, AArch64::LD4i16);
3849         return;
3850       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3851                  VT == MVT::v2f32) {
3852         SelectLoadLane(Node, 4, AArch64::LD4i32);
3853         return;
3854       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3855                  VT == MVT::v1f64) {
3856         SelectLoadLane(Node, 4, AArch64::LD4i64);
3857         return;
3858       }
3859       break;
3860     case Intrinsic::aarch64_ld64b:
3861       SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
3862       return;
3863     }
3864   } break;
3865   case ISD::INTRINSIC_WO_CHAIN: {
3866     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
3867     switch (IntNo) {
3868     default:
3869       break;
3870     case Intrinsic::aarch64_tagp:
3871       SelectTagP(Node);
3872       return;
3873     case Intrinsic::aarch64_neon_tbl2:
3874       SelectTable(Node, 2,
3875                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
3876                   false);
3877       return;
3878     case Intrinsic::aarch64_neon_tbl3:
3879       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
3880                                            : AArch64::TBLv16i8Three,
3881                   false);
3882       return;
3883     case Intrinsic::aarch64_neon_tbl4:
3884       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
3885                                            : AArch64::TBLv16i8Four,
3886                   false);
3887       return;
3888     case Intrinsic::aarch64_neon_tbx2:
3889       SelectTable(Node, 2,
3890                   VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
3891                   true);
3892       return;
3893     case Intrinsic::aarch64_neon_tbx3:
3894       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
3895                                            : AArch64::TBXv16i8Three,
3896                   true);
3897       return;
3898     case Intrinsic::aarch64_neon_tbx4:
3899       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
3900                                            : AArch64::TBXv16i8Four,
3901                   true);
3902       return;
3903     case Intrinsic::aarch64_neon_smull:
3904     case Intrinsic::aarch64_neon_umull:
3905       if (tryMULLV64LaneV128(IntNo, Node))
3906         return;
3907       break;
3908     case Intrinsic::swift_async_context_addr: {
3909       SDLoc DL(Node);
3910       CurDAG->SelectNodeTo(Node, AArch64::SUBXri, MVT::i64,
3911                            CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
3912                                                   AArch64::FP, MVT::i64),
3913                            CurDAG->getTargetConstant(8, DL, MVT::i32),
3914                            CurDAG->getTargetConstant(0, DL, MVT::i32));
3915       auto &MF = CurDAG->getMachineFunction();
3916       MF.getFrameInfo().setFrameAddressIsTaken(true);
3917       MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
3918       return;
3919     }
3920     }
3921     break;
3922   }
3923   case ISD::INTRINSIC_VOID: {
3924     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3925     if (Node->getNumOperands() >= 3)
3926       VT = Node->getOperand(2)->getValueType(0);
3927     switch (IntNo) {
3928     default:
3929       break;
3930     case Intrinsic::aarch64_neon_st1x2: {
3931       if (VT == MVT::v8i8) {
3932         SelectStore(Node, 2, AArch64::ST1Twov8b);
3933         return;
3934       } else if (VT == MVT::v16i8) {
3935         SelectStore(Node, 2, AArch64::ST1Twov16b);
3936         return;
3937       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3938                  VT == MVT::v4bf16) {
3939         SelectStore(Node, 2, AArch64::ST1Twov4h);
3940         return;
3941       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3942                  VT == MVT::v8bf16) {
3943         SelectStore(Node, 2, AArch64::ST1Twov8h);
3944         return;
3945       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3946         SelectStore(Node, 2, AArch64::ST1Twov2s);
3947         return;
3948       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3949         SelectStore(Node, 2, AArch64::ST1Twov4s);
3950         return;
3951       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3952         SelectStore(Node, 2, AArch64::ST1Twov2d);
3953         return;
3954       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3955         SelectStore(Node, 2, AArch64::ST1Twov1d);
3956         return;
3957       }
3958       break;
3959     }
3960     case Intrinsic::aarch64_neon_st1x3: {
3961       if (VT == MVT::v8i8) {
3962         SelectStore(Node, 3, AArch64::ST1Threev8b);
3963         return;
3964       } else if (VT == MVT::v16i8) {
3965         SelectStore(Node, 3, AArch64::ST1Threev16b);
3966         return;
3967       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3968                  VT == MVT::v4bf16) {
3969         SelectStore(Node, 3, AArch64::ST1Threev4h);
3970         return;
3971       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3972                  VT == MVT::v8bf16) {
3973         SelectStore(Node, 3, AArch64::ST1Threev8h);
3974         return;
3975       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3976         SelectStore(Node, 3, AArch64::ST1Threev2s);
3977         return;
3978       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3979         SelectStore(Node, 3, AArch64::ST1Threev4s);
3980         return;
3981       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3982         SelectStore(Node, 3, AArch64::ST1Threev2d);
3983         return;
3984       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3985         SelectStore(Node, 3, AArch64::ST1Threev1d);
3986         return;
3987       }
3988       break;
3989     }
3990     case Intrinsic::aarch64_neon_st1x4: {
3991       if (VT == MVT::v8i8) {
3992         SelectStore(Node, 4, AArch64::ST1Fourv8b);
3993         return;
3994       } else if (VT == MVT::v16i8) {
3995         SelectStore(Node, 4, AArch64::ST1Fourv16b);
3996         return;
3997       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3998                  VT == MVT::v4bf16) {
3999         SelectStore(Node, 4, AArch64::ST1Fourv4h);
4000         return;
4001       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4002                  VT == MVT::v8bf16) {
4003         SelectStore(Node, 4, AArch64::ST1Fourv8h);
4004         return;
4005       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4006         SelectStore(Node, 4, AArch64::ST1Fourv2s);
4007         return;
4008       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4009         SelectStore(Node, 4, AArch64::ST1Fourv4s);
4010         return;
4011       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4012         SelectStore(Node, 4, AArch64::ST1Fourv2d);
4013         return;
4014       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4015         SelectStore(Node, 4, AArch64::ST1Fourv1d);
4016         return;
4017       }
4018       break;
4019     }
4020     case Intrinsic::aarch64_neon_st2: {
4021       if (VT == MVT::v8i8) {
4022         SelectStore(Node, 2, AArch64::ST2Twov8b);
4023         return;
4024       } else if (VT == MVT::v16i8) {
4025         SelectStore(Node, 2, AArch64::ST2Twov16b);
4026         return;
4027       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4028                  VT == MVT::v4bf16) {
4029         SelectStore(Node, 2, AArch64::ST2Twov4h);
4030         return;
4031       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4032                  VT == MVT::v8bf16) {
4033         SelectStore(Node, 2, AArch64::ST2Twov8h);
4034         return;
4035       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4036         SelectStore(Node, 2, AArch64::ST2Twov2s);
4037         return;
4038       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4039         SelectStore(Node, 2, AArch64::ST2Twov4s);
4040         return;
4041       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4042         SelectStore(Node, 2, AArch64::ST2Twov2d);
4043         return;
4044       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4045         SelectStore(Node, 2, AArch64::ST1Twov1d);
4046         return;
4047       }
4048       break;
4049     }
4050     case Intrinsic::aarch64_neon_st3: {
4051       if (VT == MVT::v8i8) {
4052         SelectStore(Node, 3, AArch64::ST3Threev8b);
4053         return;
4054       } else if (VT == MVT::v16i8) {
4055         SelectStore(Node, 3, AArch64::ST3Threev16b);
4056         return;
4057       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4058                  VT == MVT::v4bf16) {
4059         SelectStore(Node, 3, AArch64::ST3Threev4h);
4060         return;
4061       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4062                  VT == MVT::v8bf16) {
4063         SelectStore(Node, 3, AArch64::ST3Threev8h);
4064         return;
4065       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4066         SelectStore(Node, 3, AArch64::ST3Threev2s);
4067         return;
4068       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4069         SelectStore(Node, 3, AArch64::ST3Threev4s);
4070         return;
4071       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4072         SelectStore(Node, 3, AArch64::ST3Threev2d);
4073         return;
4074       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4075         SelectStore(Node, 3, AArch64::ST1Threev1d);
4076         return;
4077       }
4078       break;
4079     }
4080     case Intrinsic::aarch64_neon_st4: {
4081       if (VT == MVT::v8i8) {
4082         SelectStore(Node, 4, AArch64::ST4Fourv8b);
4083         return;
4084       } else if (VT == MVT::v16i8) {
4085         SelectStore(Node, 4, AArch64::ST4Fourv16b);
4086         return;
4087       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4088                  VT == MVT::v4bf16) {
4089         SelectStore(Node, 4, AArch64::ST4Fourv4h);
4090         return;
4091       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4092                  VT == MVT::v8bf16) {
4093         SelectStore(Node, 4, AArch64::ST4Fourv8h);
4094         return;
4095       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4096         SelectStore(Node, 4, AArch64::ST4Fourv2s);
4097         return;
4098       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4099         SelectStore(Node, 4, AArch64::ST4Fourv4s);
4100         return;
4101       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4102         SelectStore(Node, 4, AArch64::ST4Fourv2d);
4103         return;
4104       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4105         SelectStore(Node, 4, AArch64::ST1Fourv1d);
4106         return;
4107       }
4108       break;
4109     }
4110     case Intrinsic::aarch64_neon_st2lane: {
4111       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4112         SelectStoreLane(Node, 2, AArch64::ST2i8);
4113         return;
4114       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4115                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4116         SelectStoreLane(Node, 2, AArch64::ST2i16);
4117         return;
4118       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4119                  VT == MVT::v2f32) {
4120         SelectStoreLane(Node, 2, AArch64::ST2i32);
4121         return;
4122       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4123                  VT == MVT::v1f64) {
4124         SelectStoreLane(Node, 2, AArch64::ST2i64);
4125         return;
4126       }
4127       break;
4128     }
4129     case Intrinsic::aarch64_neon_st3lane: {
4130       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4131         SelectStoreLane(Node, 3, AArch64::ST3i8);
4132         return;
4133       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4134                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4135         SelectStoreLane(Node, 3, AArch64::ST3i16);
4136         return;
4137       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4138                  VT == MVT::v2f32) {
4139         SelectStoreLane(Node, 3, AArch64::ST3i32);
4140         return;
4141       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4142                  VT == MVT::v1f64) {
4143         SelectStoreLane(Node, 3, AArch64::ST3i64);
4144         return;
4145       }
4146       break;
4147     }
4148     case Intrinsic::aarch64_neon_st4lane: {
4149       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4150         SelectStoreLane(Node, 4, AArch64::ST4i8);
4151         return;
4152       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4153                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4154         SelectStoreLane(Node, 4, AArch64::ST4i16);
4155         return;
4156       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4157                  VT == MVT::v2f32) {
4158         SelectStoreLane(Node, 4, AArch64::ST4i32);
4159         return;
4160       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4161                  VT == MVT::v1f64) {
4162         SelectStoreLane(Node, 4, AArch64::ST4i64);
4163         return;
4164       }
4165       break;
4166     }
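    // Note for the SVE structured stores below (assumed convention): the
    // third argument to SelectPredicatedStore is the log2 element size in
    // bytes (0 = .b, 1 = .h, 2 = .w, 3 = .d), presumably used to scale the
    // offset when choosing between the reg+reg and _IMM addressing forms.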
4167     case Intrinsic::aarch64_sve_st2: {
4168       if (VT == MVT::nxv16i8) {
4169         SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
4170         return;
4171       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4172                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4173         SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
4174         return;
4175       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4176         SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);
4177         return;
4178       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4179         SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);
4180         return;
4181       }
4182       break;
4183     }
4184     case Intrinsic::aarch64_sve_st3: {
4185       if (VT == MVT::nxv16i8) {
4186         SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
4187         return;
4188       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4189                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4190         SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
4191         return;
4192       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4193         SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);
4194         return;
4195       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4196         SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);
4197         return;
4198       }
4199       break;
4200     }
4201     case Intrinsic::aarch64_sve_st4: {
4202       if (VT == MVT::nxv16i8) {
4203         SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
4204         return;
4205       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4206                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4207         SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
4208         return;
4209       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4210         SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);
4211         return;
4212       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4213         SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);
4214         return;
4215       }
4216       break;
4217     }
4218     }
4219     break;
4220   }
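  // The AArch64ISD::*post cases below select the post-indexed (writeback)
  // variants of the structured NEON loads and stores: each _POST opcode also
  // produces the incremented base address. As above, v1i64/v1f64 fall back to
  // the LD1/ST1 multi-register forms because the de-interleaving LD2-LD4 and
  // interleaving ST2-ST4 instructions have no .1d arrangement.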
4221   case AArch64ISD::LD2post: {
4222     if (VT == MVT::v8i8) {
4223       SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
4224       return;
4225     } else if (VT == MVT::v16i8) {
4226       SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
4227       return;
4228     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4229       SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
4230       return;
4231     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4232       SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
4233       return;
4234     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4235       SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
4236       return;
4237     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4238       SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
4239       return;
4240     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4241       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
4242       return;
4243     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4244       SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
4245       return;
4246     }
4247     break;
4248   }
4249   case AArch64ISD::LD3post: {
4250     if (VT == MVT::v8i8) {
4251       SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
4252       return;
4253     } else if (VT == MVT::v16i8) {
4254       SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
4255       return;
4256     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4257       SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
4258       return;
4259     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4260       SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
4261       return;
4262     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4263       SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
4264       return;
4265     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4266       SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
4267       return;
4268     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4269       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
4270       return;
4271     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4272       SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
4273       return;
4274     }
4275     break;
4276   }
4277   case AArch64ISD::LD4post: {
4278     if (VT == MVT::v8i8) {
4279       SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
4280       return;
4281     } else if (VT == MVT::v16i8) {
4282       SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
4283       return;
4284     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4285       SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
4286       return;
4287     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4288       SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
4289       return;
4290     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4291       SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
4292       return;
4293     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4294       SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
4295       return;
4296     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4297       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
4298       return;
4299     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4300       SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
4301       return;
4302     }
4303     break;
4304   }
4305   case AArch64ISD::LD1x2post: {
4306     if (VT == MVT::v8i8) {
4307       SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
4308       return;
4309     } else if (VT == MVT::v16i8) {
4310       SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
4311       return;
4312     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4313       SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
4314       return;
4315     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4316       SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
4317       return;
4318     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4319       SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
4320       return;
4321     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4322       SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
4323       return;
4324     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4325       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
4326       return;
4327     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4328       SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
4329       return;
4330     }
4331     break;
4332   }
4333   case AArch64ISD::LD1x3post: {
4334     if (VT == MVT::v8i8) {
4335       SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
4336       return;
4337     } else if (VT == MVT::v16i8) {
4338       SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
4339       return;
4340     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4341       SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
4342       return;
4343     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4344       SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
4345       return;
4346     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4347       SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
4348       return;
4349     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4350       SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
4351       return;
4352     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4353       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
4354       return;
4355     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4356       SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
4357       return;
4358     }
4359     break;
4360   }
4361   case AArch64ISD::LD1x4post: {
4362     if (VT == MVT::v8i8) {
4363       SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
4364       return;
4365     } else if (VT == MVT::v16i8) {
4366       SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
4367       return;
4368     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4369       SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
4370       return;
4371     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4372       SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
4373       return;
4374     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4375       SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
4376       return;
4377     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4378       SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
4379       return;
4380     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4381       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
4382       return;
4383     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4384       SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
4385       return;
4386     }
4387     break;
4388   }
4389   case AArch64ISD::LD1DUPpost: {
4390     if (VT == MVT::v8i8) {
4391       SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
4392       return;
4393     } else if (VT == MVT::v16i8) {
4394       SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
4395       return;
4396     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4397       SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
4398       return;
4399     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4400       SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
4401       return;
4402     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4403       SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
4404       return;
4405     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4406       SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
4407       return;
4408     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4409       SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
4410       return;
4411     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4412       SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
4413       return;
4414     }
4415     break;
4416   }
4417   case AArch64ISD::LD2DUPpost: {
4418     if (VT == MVT::v8i8) {
4419       SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
4420       return;
4421     } else if (VT == MVT::v16i8) {
4422       SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
4423       return;
4424     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4425       SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
4426       return;
4427     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4428       SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
4429       return;
4430     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4431       SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
4432       return;
4433     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4434       SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
4435       return;
4436     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4437       SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
4438       return;
4439     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4440       SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
4441       return;
4442     }
4443     break;
4444   }
4445   case AArch64ISD::LD3DUPpost: {
4446     if (VT == MVT::v8i8) {
4447       SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
4448       return;
4449     } else if (VT == MVT::v16i8) {
4450       SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
4451       return;
4452     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4453       SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
4454       return;
4455     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4456       SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
4457       return;
4458     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4459       SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
4460       return;
4461     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4462       SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
4463       return;
4464     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4465       SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
4466       return;
4467     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4468       SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
4469       return;
4470     }
4471     break;
4472   }
4473   case AArch64ISD::LD4DUPpost: {
4474     if (VT == MVT::v8i8) {
4475       SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
4476       return;
4477     } else if (VT == MVT::v16i8) {
4478       SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
4479       return;
4480     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4481       SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
4482       return;
4483     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4484       SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
4485       return;
4486     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4487       SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
4488       return;
4489     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4490       SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
4491       return;
4492     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4493       SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
4494       return;
4495     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4496       SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
4497       return;
4498     }
4499     break;
4500   }
4501   case AArch64ISD::LD1LANEpost: {
4502     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4503       SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
4504       return;
4505     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4506                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4507       SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
4508       return;
4509     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4510                VT == MVT::v2f32) {
4511       SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
4512       return;
4513     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4514                VT == MVT::v1f64) {
4515       SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
4516       return;
4517     }
4518     break;
4519   }
4520   case AArch64ISD::LD2LANEpost: {
4521     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4522       SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
4523       return;
4524     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4525                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4526       SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
4527       return;
4528     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4529                VT == MVT::v2f32) {
4530       SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
4531       return;
4532     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4533                VT == MVT::v1f64) {
4534       SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
4535       return;
4536     }
4537     break;
4538   }
4539   case AArch64ISD::LD3LANEpost: {
4540     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4541       SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
4542       return;
4543     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4544                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4545       SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
4546       return;
4547     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4548                VT == MVT::v2f32) {
4549       SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
4550       return;
4551     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4552                VT == MVT::v1f64) {
4553       SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
4554       return;
4555     }
4556     break;
4557   }
4558   case AArch64ISD::LD4LANEpost: {
4559     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4560       SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
4561       return;
4562     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4563                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4564       SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
4565       return;
4566     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4567                VT == MVT::v2f32) {
4568       SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
4569       return;
4570     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4571                VT == MVT::v1f64) {
4572       SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
4573       return;
4574     }
4575     break;
4576   }
4577   case AArch64ISD::ST2post: {
4578     VT = Node->getOperand(1).getValueType();
4579     if (VT == MVT::v8i8) {
4580       SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
4581       return;
4582     } else if (VT == MVT::v16i8) {
4583       SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
4584       return;
4585     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4586       SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
4587       return;
4588     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4589       SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
4590       return;
4591     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4592       SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
4593       return;
4594     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4595       SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
4596       return;
4597     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4598       SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
4599       return;
4600     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4601       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4602       return;
4603     }
4604     break;
4605   }
4606   case AArch64ISD::ST3post: {
4607     VT = Node->getOperand(1).getValueType();
4608     if (VT == MVT::v8i8) {
4609       SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
4610       return;
4611     } else if (VT == MVT::v16i8) {
4612       SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
4613       return;
4614     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4615       SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
4616       return;
4617     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4618       SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
4619       return;
4620     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4621       SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
4622       return;
4623     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4624       SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
4625       return;
4626     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4627       SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
4628       return;
4629     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4630       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4631       return;
4632     }
4633     break;
4634   }
4635   case AArch64ISD::ST4post: {
4636     VT = Node->getOperand(1).getValueType();
4637     if (VT == MVT::v8i8) {
4638       SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
4639       return;
4640     } else if (VT == MVT::v16i8) {
4641       SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
4642       return;
4643     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4644       SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
4645       return;
4646     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4647       SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
4648       return;
4649     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4650       SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
4651       return;
4652     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4653       SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
4654       return;
4655     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4656       SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
4657       return;
4658     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4659       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4660       return;
4661     }
4662     break;
4663   }
4664   case AArch64ISD::ST1x2post: {
4665     VT = Node->getOperand(1).getValueType();
4666     if (VT == MVT::v8i8) {
4667       SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
4668       return;
4669     } else if (VT == MVT::v16i8) {
4670       SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
4671       return;
4672     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4673       SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
4674       return;
4675     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4676       SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
4677       return;
4678     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4679       SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
4680       return;
4681     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4682       SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
4683       return;
4684     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4685       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4686       return;
4687     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4688       SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
4689       return;
4690     }
4691     break;
4692   }
4693   case AArch64ISD::ST1x3post: {
4694     VT = Node->getOperand(1).getValueType();
4695     if (VT == MVT::v8i8) {
4696       SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
4697       return;
4698     } else if (VT == MVT::v16i8) {
4699       SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
4700       return;
4701     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4702       SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
4703       return;
4704     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4705       SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
4706       return;
4707     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4708       SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
4709       return;
4710     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4711       SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
4712       return;
4713     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4714       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4715       return;
4716     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4717       SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
4718       return;
4719     }
4720     break;
4721   }
4722   case AArch64ISD::ST1x4post: {
4723     VT = Node->getOperand(1).getValueType();
4724     if (VT == MVT::v8i8) {
4725       SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
4726       return;
4727     } else if (VT == MVT::v16i8) {
4728       SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
4729       return;
4730     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4731       SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
4732       return;
4733     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4734       SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
4735       return;
4736     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4737       SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
4738       return;
4739     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4740       SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
4741       return;
4742     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4743       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4744       return;
4745     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4746       SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
4747       return;
4748     }
4749     break;
4750   }
4751   case AArch64ISD::ST2LANEpost: {
4752     VT = Node->getOperand(1).getValueType();
4753     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4754       SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
4755       return;
4756     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4757                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4758       SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
4759       return;
4760     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4761                VT == MVT::v2f32) {
4762       SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
4763       return;
4764     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4765                VT == MVT::v1f64) {
4766       SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
4767       return;
4768     }
4769     break;
4770   }
4771   case AArch64ISD::ST3LANEpost: {
4772     VT = Node->getOperand(1).getValueType();
4773     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4774       SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
4775       return;
4776     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4777                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4778       SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
4779       return;
4780     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4781                VT == MVT::v2f32) {
4782       SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
4783       return;
4784     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4785                VT == MVT::v1f64) {
4786       SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
4787       return;
4788     }
4789     break;
4790   }
4791   case AArch64ISD::ST4LANEpost: {
4792     VT = Node->getOperand(1).getValueType();
4793     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4794       SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
4795       return;
4796     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4797                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4798       SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
4799       return;
4800     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4801                VT == MVT::v2f32) {
4802       SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
4803       return;
4804     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4805                VT == MVT::v1f64) {
4806       SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
4807       return;
4808     }
4809     break;
4810   }
4811   case AArch64ISD::SVE_LD2_MERGE_ZERO: {
4812     if (VT == MVT::nxv16i8) {
4813       SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);
4814       return;
4815     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4816                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4817       SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);
4818       return;
4819     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4820       SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W);
4821       return;
4822     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4823       SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D);
4824       return;
4825     }
4826     break;
4827   }
4828   case AArch64ISD::SVE_LD3_MERGE_ZERO: {
4829     if (VT == MVT::nxv16i8) {
4830       SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);
4831       return;
4832     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4833                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4834       SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);
4835       return;
4836     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4837       SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W);
4838       return;
4839     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4840       SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D);
4841       return;
4842     }
4843     break;
4844   }
4845   case AArch64ISD::SVE_LD4_MERGE_ZERO: {
4846     if (VT == MVT::nxv16i8) {
4847       SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);
4848       return;
4849     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4850                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4851       SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);
4852       return;
4853     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4854       SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W);
4855       return;
4856     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4857       SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D);
4858       return;
4859     }
4860     break;
4861   }
4862   }
4863 
4864   // Select the default instruction
4865   SelectCode(Node);
4866 }
4867 
4868 /// createAArch64ISelDag - This pass converts a legalized DAG into an
4869 /// AArch64-specific DAG, ready for instruction scheduling.
4870 FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
4871                                          CodeGenOpt::Level OptLevel) {
4872   return new AArch64DAGToDAGISel(TM, OptLevel);
4873 }
4874 
4875 /// When \p PredVT is a scalable vector predicate in the form
4876 /// MVT::nx<M>xi1, it builds the corresponding scalable vector of
4877 /// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting
4878 /// structured vectors (NumVec >1), the output data type is
4879 /// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input
4880 /// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
4881 /// EVT.
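/// For example (illustrative only): PredVT = nxv4i1 gives EC = vscale x 4
/// and ScalarVT = i32 (128 / 4 bits), so NumVec = 1 yields nxv4i32 and
/// NumVec = 3 yields nxv12i32.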
4882 static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
4883                                                 unsigned NumVec) {
4884   assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors.");
4885   if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
4886     return EVT();
4887 
4888   if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
4889       PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
4890     return EVT();
4891 
4892   ElementCount EC = PredVT.getVectorElementCount();
4893   EVT ScalarVT =
4894       EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
4895   EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec);
4896 
4897   return MemVT;
4898 }
4899 
4900 /// Return the EVT of the data associated with a memory operation in \p
4901 /// Root. If such an EVT cannot be retrieved, it returns an invalid EVT.
4902 static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
4903   if (isa<MemSDNode>(Root))
4904     return cast<MemSDNode>(Root)->getMemoryVT();
4905 
4906   if (isa<MemIntrinsicSDNode>(Root))
4907     return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
4908 
4909   const unsigned Opcode = Root->getOpcode();
4910   // For custom ISD nodes, we have to look at them individually to extract the
4911   // type of the data moved to/from memory.
4912   switch (Opcode) {
4913   case AArch64ISD::LD1_MERGE_ZERO:
4914   case AArch64ISD::LD1S_MERGE_ZERO:
4915   case AArch64ISD::LDNF1_MERGE_ZERO:
4916   case AArch64ISD::LDNF1S_MERGE_ZERO:
4917     return cast<VTSDNode>(Root->getOperand(3))->getVT();
4918   case AArch64ISD::ST1_PRED:
4919     return cast<VTSDNode>(Root->getOperand(4))->getVT();
4920   case AArch64ISD::SVE_LD2_MERGE_ZERO:
4921     return getPackedVectorTypeFromPredicateType(
4922         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2);
4923   case AArch64ISD::SVE_LD3_MERGE_ZERO:
4924     return getPackedVectorTypeFromPredicateType(
4925         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3);
4926   case AArch64ISD::SVE_LD4_MERGE_ZERO:
4927     return getPackedVectorTypeFromPredicateType(
4928         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4);
4929   default:
4930     break;
4931   }
4932 
4933   if (Opcode != ISD::INTRINSIC_VOID)
4934     return EVT();
4935 
4936   const unsigned IntNo =
4937       cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
4938   if (IntNo != Intrinsic::aarch64_sve_prf)
4939     return EVT();
4940 
4941   // We are using an SVE prefetch intrinsic. Type must be inferred
4942   // from the width of the predicate.
4943   return getPackedVectorTypeFromPredicateType(
4944       Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);
4945 }
4946 
4947 /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
4948 /// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
4949 /// where Root is the memory access using N for its address.
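/// For example (illustrative only): with MemVT = nxv4i32 the minimum memory
/// width is 16 bytes, so an address of the form ADD(Base, VSCALE(32)) matches
/// with OffImm = 2, provided 2 lies within [Min, Max].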
4950 template <int64_t Min, int64_t Max>
4951 bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
4952                                                    SDValue &Base,
4953                                                    SDValue &OffImm) {
4954   const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
4955 
4956   if (MemVT == EVT())
4957     return false;
4958 
4959   if (N.getOpcode() != ISD::ADD)
4960     return false;
4961 
4962   SDValue VScale = N.getOperand(1);
4963   if (VScale.getOpcode() != ISD::VSCALE)
4964     return false;
4965 
4966   TypeSize TS = MemVT.getSizeInBits();
4967   int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
4968   int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
4969 
4970   if ((MulImm % MemWidthBytes) != 0)
4971     return false;
4972 
4973   int64_t Offset = MulImm / MemWidthBytes;
4974   if (Offset < Min || Offset > Max)
4975     return false;
4976 
4977   Base = N.getOperand(0);
4978   OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
4979   return true;
4980 }
4981 
4982 /// Select register plus register addressing mode for SVE, with scaled
4983 /// offset.
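/// For example (illustrative only): with Scale = 2, ADD(x0, SHL(x1, 2))
/// splits into Base = x0 and Offset = x1, matching the [<Xn>, <Xm>, LSL #2]
/// form used for word-sized elements; with Scale = 0 any ADD is accepted as
/// an unscaled register+register address.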
4984 bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
4985                                                   SDValue &Base,
4986                                                   SDValue &Offset) {
4987   if (N.getOpcode() != ISD::ADD)
4988     return false;
4989 
4990   // Process an ADD node.
4991   const SDValue LHS = N.getOperand(0);
4992   const SDValue RHS = N.getOperand(1);
4993 
4994   // 8 bit data does not come with the SHL node, so it is treated
4995   // separately.
4996   if (Scale == 0) {
4997     Base = LHS;
4998     Offset = RHS;
4999     return true;
5000   }
5001 
5002   // Check if the RHS is a shift node with a constant.
5003   if (RHS.getOpcode() != ISD::SHL)
5004     return false;
5005 
5006   const SDValue ShiftRHS = RHS.getOperand(1);
5007   if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
5008     if (C->getZExtValue() == Scale) {
5009       Base = LHS;
5010       Offset = RHS.getOperand(0);
5011       return true;
5012     }
5013 
5014   return false;
5015 }
5016 
5017 bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
5018   const AArch64TargetLowering *TLI =
5019       static_cast<const AArch64TargetLowering *>(getTargetLowering());
5020 
5021   return TLI->isAllActivePredicate(N);
5022 }
5023