1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "Utils/AArch64SMEAttributes.h"
23 #include "llvm/ADT/APFloat.h"
24 #include "llvm/ADT/APInt.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/STLExtras.h"
27 #include "llvm/ADT/SmallSet.h"
28 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/ADT/SmallVectorExtras.h"
30 #include "llvm/ADT/Statistic.h"
31 #include "llvm/ADT/StringRef.h"
32 #include "llvm/ADT/Twine.h"
33 #include "llvm/Analysis/LoopInfo.h"
34 #include "llvm/Analysis/MemoryLocation.h"
35 #include "llvm/Analysis/ObjCARCUtil.h"
36 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
37 #include "llvm/Analysis/TargetTransformInfo.h"
38 #include "llvm/Analysis/ValueTracking.h"
39 #include "llvm/Analysis/VectorUtils.h"
40 #include "llvm/CodeGen/Analysis.h"
41 #include "llvm/CodeGen/CallingConvLower.h"
42 #include "llvm/CodeGen/ComplexDeinterleavingPass.h"
43 #include "llvm/CodeGen/GlobalISel/Utils.h"
44 #include "llvm/CodeGen/ISDOpcodes.h"
45 #include "llvm/CodeGen/MachineBasicBlock.h"
46 #include "llvm/CodeGen/MachineFrameInfo.h"
47 #include "llvm/CodeGen/MachineFunction.h"
48 #include "llvm/CodeGen/MachineInstr.h"
49 #include "llvm/CodeGen/MachineInstrBuilder.h"
50 #include "llvm/CodeGen/MachineMemOperand.h"
51 #include "llvm/CodeGen/MachineRegisterInfo.h"
52 #include "llvm/CodeGen/SelectionDAG.h"
53 #include "llvm/CodeGen/SelectionDAGNodes.h"
54 #include "llvm/CodeGen/TargetCallingConv.h"
55 #include "llvm/CodeGen/TargetInstrInfo.h"
56 #include "llvm/CodeGen/TargetOpcodes.h"
57 #include "llvm/CodeGen/ValueTypes.h"
58 #include "llvm/CodeGenTypes/MachineValueType.h"
59 #include "llvm/IR/Attributes.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DebugLoc.h"
63 #include "llvm/IR/DerivedTypes.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/GetElementPtrTypeIterator.h"
66 #include "llvm/IR/GlobalValue.h"
67 #include "llvm/IR/IRBuilder.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/IntrinsicInst.h"
71 #include "llvm/IR/Intrinsics.h"
72 #include "llvm/IR/IntrinsicsAArch64.h"
73 #include "llvm/IR/Module.h"
74 #include "llvm/IR/PatternMatch.h"
75 #include "llvm/IR/Type.h"
76 #include "llvm/IR/Use.h"
77 #include "llvm/IR/Value.h"
78 #include "llvm/Support/AtomicOrdering.h"
79 #include "llvm/Support/Casting.h"
80 #include "llvm/Support/CodeGen.h"
81 #include "llvm/Support/CommandLine.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/ErrorHandling.h"
84 #include "llvm/Support/InstructionCost.h"
85 #include "llvm/Support/KnownBits.h"
86 #include "llvm/Support/MathExtras.h"
87 #include "llvm/Support/SipHash.h"
88 #include "llvm/Support/raw_ostream.h"
89 #include "llvm/Target/TargetMachine.h"
90 #include "llvm/Target/TargetOptions.h"
91 #include "llvm/TargetParser/Triple.h"
92 #include <algorithm>
93 #include <bitset>
94 #include <cassert>
95 #include <cctype>
96 #include <cstdint>
97 #include <cstdlib>
98 #include <iterator>
99 #include <limits>
100 #include <optional>
101 #include <tuple>
102 #include <utility>
103 #include <vector>
104 
105 using namespace llvm;
106 using namespace llvm::PatternMatch;
107 
108 #define DEBUG_TYPE "aarch64-lower"
109 
110 STATISTIC(NumTailCalls, "Number of tail calls");
111 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113 
114 // FIXME: The necessary dtprel relocations don't seem to be supported
115 // well in the GNU bfd and gold linkers at the moment. Therefore, by
116 // default, for now, fall back to GeneralDynamic code generation.
117 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
118     "aarch64-elf-ldtls-generation", cl::Hidden,
119     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120     cl::init(false));
121 
122 static cl::opt<bool>
123 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124                          cl::desc("Enable AArch64 logical imm instruction "
125                                   "optimization"),
126                          cl::init(true));
127 
128 // Temporary option added for the purpose of testing functionality added
129 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
130 // in the future once both implementations are based on MGATHER rather
131 // than the GLD1 nodes added for the SVE gather load intrinsics.
132 static cl::opt<bool>
133 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134                                 cl::desc("Combine extends of AArch64 masked "
135                                          "gather intrinsics"),
136                                 cl::init(true));
137 
138 static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139                                     cl::desc("Combine ext and trunc to TBL"),
140                                     cl::init(true));
141 
142 // The XOR, OR and CMP nodes all use ALU ports, and the data dependency becomes
143 // the bottleneck after this transform on high-end CPUs. This limit on the
144 // maximum number of XOR leaf nodes keeps the cmp+ccmp transform profitable.
145 static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146                                  cl::desc("Maximum of xors"));
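// Illustrative example: this bounds combines that rewrite an OR-of-XORs
// comparison against zero, e.g. ((a ^ b) | (c ^ d)) == 0, into a CMP followed
// by CCMPs instead of materialising the whole logical tree; the rewrite is
// skipped once the number of XOR leaves exceeds MaxXors.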
147 
148 // When this option is enabled, we do not fall back to DAG ISel when
149 // encountering scalable vector types for any instruction, even if SVE is not
150 // yet supported for some instructions.
151 // See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
152 cl::opt<bool> EnableSVEGISel(
153     "aarch64-enable-gisel-sve", cl::Hidden,
154     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155     cl::init(false));
156 
157 /// Value type used for condition codes.
158 static const MVT MVT_CC = MVT::i32;
159 
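// The first eight integer (X0-X7) and FP/SIMD (Q0-Q7) argument registers of
// the AAPCS64 procedure call standard.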
160 static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161                                        AArch64::X3, AArch64::X4, AArch64::X5,
162                                        AArch64::X6, AArch64::X7};
163 static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164                                        AArch64::Q3, AArch64::Q4, AArch64::Q5,
165                                        AArch64::Q6, AArch64::Q7};
166 
167 ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
168 
169 ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
170 
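// Maps a scalar element type to its fully packed SVE container, e.g. MVT::f16
// maps to nxv8f16 (eight f16 elements per 128-bit granule).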
171 static inline EVT getPackedSVEVectorVT(EVT VT) {
172   switch (VT.getSimpleVT().SimpleTy) {
173   default:
174     llvm_unreachable("unexpected element type for vector");
175   case MVT::i8:
176     return MVT::nxv16i8;
177   case MVT::i16:
178     return MVT::nxv8i16;
179   case MVT::i32:
180     return MVT::nxv4i32;
181   case MVT::i64:
182     return MVT::nxv2i64;
183   case MVT::f16:
184     return MVT::nxv8f16;
185   case MVT::f32:
186     return MVT::nxv4f32;
187   case MVT::f64:
188     return MVT::nxv2f64;
189   case MVT::bf16:
190     return MVT::nxv8bf16;
191   }
192 }
193 
194 // NOTE: Currently there's only a need to return integer vector types. If this
195 // changes then just add an extra "type" parameter.
196 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
197   switch (EC.getKnownMinValue()) {
198   default:
199     llvm_unreachable("unexpected element count for vector");
200   case 16:
201     return MVT::nxv16i8;
202   case 8:
203     return MVT::nxv8i16;
204   case 4:
205     return MVT::nxv4i32;
206   case 2:
207     return MVT::nxv2i64;
208   }
209 }
210 
211 static inline EVT getPromotedVTForPredicate(EVT VT) {
212   assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213          "Expected scalable predicate vector type!");
214   switch (VT.getVectorMinNumElements()) {
215   default:
216     llvm_unreachable("unexpected element count for vector");
217   case 2:
218     return MVT::nxv2i64;
219   case 4:
220     return MVT::nxv4i32;
221   case 8:
222     return MVT::nxv8i16;
223   case 16:
224     return MVT::nxv16i8;
225   }
226 }
227 
228 /// Returns true if VT's elements occupy the lowest bit positions of its
229 /// associated register class without any intervening space.
230 ///
231 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232 /// same register class, but only nxv8f16 can be treated as a packed vector.
233 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
234   assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
235          "Expected legal vector type!");
236   return VT.isFixedLengthVector() ||
237          VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
238 }
239 
240 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241 // predicate and end with a passthru value matching the result type.
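// Illustrative shape: OPCODE_MERGE_PASSTHRU(Pg, Src..., Passthru), where lanes
// selected by the predicate Pg receive the operation's result and the remaining
// lanes are taken from Passthru.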
242 static bool isMergePassthruOpcode(unsigned Opc) {
243   switch (Opc) {
244   default:
245     return false;
246   case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
247   case AArch64ISD::BSWAP_MERGE_PASSTHRU:
248   case AArch64ISD::REVH_MERGE_PASSTHRU:
249   case AArch64ISD::REVW_MERGE_PASSTHRU:
250   case AArch64ISD::REVD_MERGE_PASSTHRU:
251   case AArch64ISD::CTLZ_MERGE_PASSTHRU:
252   case AArch64ISD::CTPOP_MERGE_PASSTHRU:
253   case AArch64ISD::DUP_MERGE_PASSTHRU:
254   case AArch64ISD::ABS_MERGE_PASSTHRU:
255   case AArch64ISD::NEG_MERGE_PASSTHRU:
256   case AArch64ISD::FNEG_MERGE_PASSTHRU:
257   case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
258   case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
259   case AArch64ISD::FCEIL_MERGE_PASSTHRU:
260   case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
261   case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
262   case AArch64ISD::FRINT_MERGE_PASSTHRU:
263   case AArch64ISD::FROUND_MERGE_PASSTHRU:
264   case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
265   case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
266   case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
267   case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
268   case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
269   case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
270   case AArch64ISD::FCVTX_MERGE_PASSTHRU:
271   case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
272   case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
273   case AArch64ISD::FSQRT_MERGE_PASSTHRU:
274   case AArch64ISD::FRECPX_MERGE_PASSTHRU:
275   case AArch64ISD::FABS_MERGE_PASSTHRU:
276     return true;
277   }
278 }
279 
280 // Returns true if inactive lanes are known to be zeroed by construction.
281 static bool isZeroingInactiveLanes(SDValue Op) {
282   switch (Op.getOpcode()) {
283   default:
284     return false;
285   // We guarantee that i1 splat_vectors zero the inactive lanes.
286   case ISD::SPLAT_VECTOR:
287   case AArch64ISD::PTRUE:
288   case AArch64ISD::SETCC_MERGE_ZERO:
289     return true;
290   case ISD::INTRINSIC_WO_CHAIN:
291     switch (Op.getConstantOperandVal(0)) {
292     default:
293       return false;
294     case Intrinsic::aarch64_sve_ptrue:
295     case Intrinsic::aarch64_sve_pnext:
296     case Intrinsic::aarch64_sve_cmpeq:
297     case Intrinsic::aarch64_sve_cmpne:
298     case Intrinsic::aarch64_sve_cmpge:
299     case Intrinsic::aarch64_sve_cmpgt:
300     case Intrinsic::aarch64_sve_cmphs:
301     case Intrinsic::aarch64_sve_cmphi:
302     case Intrinsic::aarch64_sve_cmpeq_wide:
303     case Intrinsic::aarch64_sve_cmpne_wide:
304     case Intrinsic::aarch64_sve_cmpge_wide:
305     case Intrinsic::aarch64_sve_cmpgt_wide:
306     case Intrinsic::aarch64_sve_cmplt_wide:
307     case Intrinsic::aarch64_sve_cmple_wide:
308     case Intrinsic::aarch64_sve_cmphs_wide:
309     case Intrinsic::aarch64_sve_cmphi_wide:
310     case Intrinsic::aarch64_sve_cmplo_wide:
311     case Intrinsic::aarch64_sve_cmpls_wide:
312     case Intrinsic::aarch64_sve_fcmpeq:
313     case Intrinsic::aarch64_sve_fcmpne:
314     case Intrinsic::aarch64_sve_fcmpge:
315     case Intrinsic::aarch64_sve_fcmpgt:
316     case Intrinsic::aarch64_sve_fcmpuo:
317     case Intrinsic::aarch64_sve_facgt:
318     case Intrinsic::aarch64_sve_facge:
319     case Intrinsic::aarch64_sve_whilege:
320     case Intrinsic::aarch64_sve_whilegt:
321     case Intrinsic::aarch64_sve_whilehi:
322     case Intrinsic::aarch64_sve_whilehs:
323     case Intrinsic::aarch64_sve_whilele:
324     case Intrinsic::aarch64_sve_whilelo:
325     case Intrinsic::aarch64_sve_whilels:
326     case Intrinsic::aarch64_sve_whilelt:
327     case Intrinsic::aarch64_sve_match:
328     case Intrinsic::aarch64_sve_nmatch:
329     case Intrinsic::aarch64_sve_whilege_x2:
330     case Intrinsic::aarch64_sve_whilegt_x2:
331     case Intrinsic::aarch64_sve_whilehi_x2:
332     case Intrinsic::aarch64_sve_whilehs_x2:
333     case Intrinsic::aarch64_sve_whilele_x2:
334     case Intrinsic::aarch64_sve_whilelo_x2:
335     case Intrinsic::aarch64_sve_whilels_x2:
336     case Intrinsic::aarch64_sve_whilelt_x2:
337       return true;
338     }
339   }
340 }
341 
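// Illustrative examples: for a discriminator built with
// @llvm.ptrauth.blend(%addr, 1234) this returns (TargetConstant 1234, %addr);
// for a plain constant discriminator that fits in 16 bits it returns the
// constant together with a NoRegister (later XZR) address discriminator; in all
// other cases it returns (TargetConstant 0, Disc) so the discriminator is
// computed separately.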
342 static std::tuple<SDValue, SDValue>
343 extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
344   SDLoc DL(Disc);
345   SDValue AddrDisc;
346   SDValue ConstDisc;
347 
348   // If this is a blend, remember the constant and address discriminators.
349   // Otherwise, it's either a constant discriminator, or a non-blended
350   // address discriminator.
351   if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352       Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353     AddrDisc = Disc->getOperand(1);
354     ConstDisc = Disc->getOperand(2);
355   } else {
356     ConstDisc = Disc;
357   }
358 
359   // If the constant discriminator (either the blend RHS, or the entire
360   // discriminator value) isn't a 16-bit constant, bail out, and let the
361   // discriminator be computed separately.
362   const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363   if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364     return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365 
366   // If there's no address discriminator, use NoRegister, which we'll later
367   // replace with XZR, or directly use a Z variant of the inst. when available.
368   if (!AddrDisc)
369     AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370 
371   return std::make_tuple(
372       DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373       AddrDisc);
374 }
375 
376 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
377                                              const AArch64Subtarget &STI)
378     : TargetLowering(TM), Subtarget(&STI) {
379   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380   // we have to make something up. Arbitrarily, choose ZeroOrOne.
381   setBooleanContents(ZeroOrOneBooleanContent);
382   // When comparing vectors the result sets the different elements in the
383   // vector to all-one or all-zero.
384   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
385 
386   // Set up the register classes.
387   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389 
390   if (Subtarget->hasLS64()) {
391     addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392     setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
393     setOperationAction(ISD::STORE, MVT::i64x8, Custom);
394   }
395 
396   if (Subtarget->hasFPARMv8()) {
397     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398     addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402   }
403 
404   if (Subtarget->hasNEON()) {
405     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407 
408     addDRType(MVT::v2f32);
409     addDRType(MVT::v8i8);
410     addDRType(MVT::v4i16);
411     addDRType(MVT::v2i32);
412     addDRType(MVT::v1i64);
413     addDRType(MVT::v1f64);
414     addDRType(MVT::v4f16);
415     addDRType(MVT::v4bf16);
416 
417     addQRType(MVT::v4f32);
418     addQRType(MVT::v2f64);
419     addQRType(MVT::v16i8);
420     addQRType(MVT::v8i16);
421     addQRType(MVT::v4i32);
422     addQRType(MVT::v2i64);
423     addQRType(MVT::v8f16);
424     addQRType(MVT::v8bf16);
425   }
426 
427   if (Subtarget->isSVEorStreamingSVEAvailable()) {
428     // Add legal sve predicate types
429     addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430     addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431     addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432     addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433     addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434 
435     // Add legal sve data types
436     addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437     addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438     addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439     addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440 
441     addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442     addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443     addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444     addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445     addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446     addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447 
448     addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449     addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450     addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451 
452     if (Subtarget->useSVEForFixedLengthVectors()) {
453       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
454         if (useSVEForFixedLengthVectorVT(VT))
455           addRegisterClass(VT, &AArch64::ZPRRegClass);
456 
457       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
458         if (useSVEForFixedLengthVectorVT(VT))
459           addRegisterClass(VT, &AArch64::ZPRRegClass);
460     }
461   }
462 
463   if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464     addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465     setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466     setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467 
468     setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469     setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470   }
471 
472   // Compute derived properties from the register classes
473   computeRegisterProperties(Subtarget->getRegisterInfo());
474 
475   // Provide all sorts of operation actions
476   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
477   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
478   setOperationAction(ISD::SETCC, MVT::i32, Custom);
479   setOperationAction(ISD::SETCC, MVT::i64, Custom);
480   setOperationAction(ISD::SETCC, MVT::bf16, Custom);
481   setOperationAction(ISD::SETCC, MVT::f16, Custom);
482   setOperationAction(ISD::SETCC, MVT::f32, Custom);
483   setOperationAction(ISD::SETCC, MVT::f64, Custom);
484   setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
485   setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
486   setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
487   setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
488   setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
489   setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
490   setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
491   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
492   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
493   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
494   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
495   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
496   setOperationAction(ISD::BR_CC, MVT::f16, Custom);
497   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
498   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
499   setOperationAction(ISD::SELECT, MVT::i32, Custom);
500   setOperationAction(ISD::SELECT, MVT::i64, Custom);
501   setOperationAction(ISD::SELECT, MVT::f16, Custom);
502   setOperationAction(ISD::SELECT, MVT::bf16, Custom);
503   setOperationAction(ISD::SELECT, MVT::f32, Custom);
504   setOperationAction(ISD::SELECT, MVT::f64, Custom);
505   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
506   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
507   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
508   setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
509   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
510   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
511   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
512   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
513   setOperationAction(ISD::BRIND, MVT::Other, Custom);
514   setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
515 
516   setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);
517 
518   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
519   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
520   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
521 
522   setOperationAction(ISD::FREM, MVT::f32, Expand);
523   setOperationAction(ISD::FREM, MVT::f64, Expand);
524   setOperationAction(ISD::FREM, MVT::f80, Expand);
525 
526   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
527 
528   // Custom lowering hooks are needed for XOR
529   // to fold it into CSINC/CSINV.
530   setOperationAction(ISD::XOR, MVT::i32, Custom);
531   setOperationAction(ISD::XOR, MVT::i64, Custom);
532 
533   // Virtually no operation on f128 is legal, but LLVM can't expand them when
534   // there's a valid register class, so we need custom operations in most cases.
535   setOperationAction(ISD::FABS, MVT::f128, Expand);
536   setOperationAction(ISD::FADD, MVT::f128, LibCall);
537   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
538   setOperationAction(ISD::FCOS, MVT::f128, Expand);
539   setOperationAction(ISD::FDIV, MVT::f128, LibCall);
540   setOperationAction(ISD::FMA, MVT::f128, Expand);
541   setOperationAction(ISD::FMUL, MVT::f128, LibCall);
542   setOperationAction(ISD::FNEG, MVT::f128, Expand);
543   setOperationAction(ISD::FPOW, MVT::f128, Expand);
544   setOperationAction(ISD::FREM, MVT::f128, Expand);
545   setOperationAction(ISD::FRINT, MVT::f128, Expand);
546   setOperationAction(ISD::FSIN, MVT::f128, Expand);
547   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
548   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
549   setOperationAction(ISD::FSUB, MVT::f128, LibCall);
550   setOperationAction(ISD::FTAN, MVT::f128, Expand);
551   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
552   setOperationAction(ISD::SETCC, MVT::f128, Custom);
553   setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
554   setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
555   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
556   setOperationAction(ISD::SELECT, MVT::f128, Custom);
557   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
558   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
559   // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560   // aren't handled.
561 
562   // Lowering for many of the conversions is actually specified by the non-f128
563   // type. The LowerXXX function will be trivial when f128 isn't involved.
564   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
565   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
566   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
567   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
568   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
569   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
570   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
571   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
572   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
573   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
574   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
575   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
576   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
577   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
578   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
579   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
580   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
581   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
582   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
583   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
584   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
585   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
586   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
587   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
588   if (Subtarget->hasFPARMv8()) {
589     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
590     setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
591   }
592   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
593   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
594   if (Subtarget->hasFPARMv8()) {
595     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
596     setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
597   }
598   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
599   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
600 
601   setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
602   setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
603   setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
604   setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
605 
606   // Variable arguments.
607   setOperationAction(ISD::VASTART, MVT::Other, Custom);
608   setOperationAction(ISD::VAARG, MVT::Other, Custom);
609   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
610   setOperationAction(ISD::VAEND, MVT::Other, Expand);
611 
612   // Variable-sized objects.
613   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
614   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
615 
616   // Lowering Funnel Shifts to EXTR
617   setOperationAction(ISD::FSHR, MVT::i32, Custom);
618   setOperationAction(ISD::FSHR, MVT::i64, Custom);
619   setOperationAction(ISD::FSHL, MVT::i32, Custom);
620   setOperationAction(ISD::FSHL, MVT::i64, Custom);
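  // Illustrative example: with a constant amount, an i32 funnel shift such as
  // fshr(a, b, 5) can be selected as a single EXTR Wd, Wa, Wb, #5, extracting
  // 32 bits starting at bit 5 of the 64-bit concatenation a:b.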
621 
622   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
623 
624   // Constant pool entries
625   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
626 
627   // BlockAddress
628   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
629 
630   // AArch64 lacks both left-rotate and popcount instructions.
631   setOperationAction(ISD::ROTL, MVT::i32, Expand);
632   setOperationAction(ISD::ROTL, MVT::i64, Expand);
633   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
634     setOperationAction(ISD::ROTL, VT, Expand);
635     setOperationAction(ISD::ROTR, VT, Expand);
636   }
637 
638   // AArch64 doesn't have i32 MULH{S|U}.
639   setOperationAction(ISD::MULHU, MVT::i32, Expand);
640   setOperationAction(ISD::MULHS, MVT::i32, Expand);
641 
642   // AArch64 doesn't have {U|S}MUL_LOHI.
643   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
644   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
645   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
646   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
647 
648   if (Subtarget->hasCSSC()) {
649     setOperationAction(ISD::CTPOP, MVT::i32, Legal);
650     setOperationAction(ISD::CTPOP, MVT::i64, Legal);
651     setOperationAction(ISD::CTPOP, MVT::i128, Expand);
652 
653     setOperationAction(ISD::PARITY, MVT::i128, Expand);
654 
655     setOperationAction(ISD::CTTZ, MVT::i32, Legal);
656     setOperationAction(ISD::CTTZ, MVT::i64, Legal);
657     setOperationAction(ISD::CTTZ, MVT::i128, Expand);
658 
659     setOperationAction(ISD::ABS, MVT::i32, Legal);
660     setOperationAction(ISD::ABS, MVT::i64, Legal);
661 
662     setOperationAction(ISD::SMAX, MVT::i32, Legal);
663     setOperationAction(ISD::SMAX, MVT::i64, Legal);
664     setOperationAction(ISD::UMAX, MVT::i32, Legal);
665     setOperationAction(ISD::UMAX, MVT::i64, Legal);
666 
667     setOperationAction(ISD::SMIN, MVT::i32, Legal);
668     setOperationAction(ISD::SMIN, MVT::i64, Legal);
669     setOperationAction(ISD::UMIN, MVT::i32, Legal);
670     setOperationAction(ISD::UMIN, MVT::i64, Legal);
671   } else {
672     setOperationAction(ISD::CTPOP, MVT::i32, Custom);
673     setOperationAction(ISD::CTPOP, MVT::i64, Custom);
674     setOperationAction(ISD::CTPOP, MVT::i128, Custom);
675 
676     setOperationAction(ISD::PARITY, MVT::i64, Custom);
677     setOperationAction(ISD::PARITY, MVT::i128, Custom);
678 
679     setOperationAction(ISD::ABS, MVT::i32, Custom);
680     setOperationAction(ISD::ABS, MVT::i64, Custom);
681   }
682 
683   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
684   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
685   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
686     setOperationAction(ISD::SDIVREM, VT, Expand);
687     setOperationAction(ISD::UDIVREM, VT, Expand);
688   }
689   setOperationAction(ISD::SREM, MVT::i32, Expand);
690   setOperationAction(ISD::SREM, MVT::i64, Expand);
691   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
692   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
693   setOperationAction(ISD::UREM, MVT::i32, Expand);
694   setOperationAction(ISD::UREM, MVT::i64, Expand);
695 
696   // Custom lower Add/Sub/Mul with overflow.
697   setOperationAction(ISD::SADDO, MVT::i32, Custom);
698   setOperationAction(ISD::SADDO, MVT::i64, Custom);
699   setOperationAction(ISD::UADDO, MVT::i32, Custom);
700   setOperationAction(ISD::UADDO, MVT::i64, Custom);
701   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
702   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
703   setOperationAction(ISD::USUBO, MVT::i32, Custom);
704   setOperationAction(ISD::USUBO, MVT::i64, Custom);
705   setOperationAction(ISD::SMULO, MVT::i32, Custom);
706   setOperationAction(ISD::SMULO, MVT::i64, Custom);
707   setOperationAction(ISD::UMULO, MVT::i32, Custom);
708   setOperationAction(ISD::UMULO, MVT::i64, Custom);
709 
710   setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
711   setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
712   setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
713   setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
714   setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
715   setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
716   setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
717   setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
718 
719   setOperationAction(ISD::FSIN, MVT::f32, Expand);
720   setOperationAction(ISD::FSIN, MVT::f64, Expand);
721   setOperationAction(ISD::FCOS, MVT::f32, Expand);
722   setOperationAction(ISD::FCOS, MVT::f64, Expand);
723   setOperationAction(ISD::FPOW, MVT::f32, Expand);
724   setOperationAction(ISD::FPOW, MVT::f64, Expand);
725   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
726   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
727   if (Subtarget->hasFullFP16()) {
728     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
729     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
730   } else {
731     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
732     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
733   }
734 
735   for (auto Op : {ISD::FREM,          ISD::FPOW,          ISD::FPOWI,
736                   ISD::FCOS,          ISD::FSIN,          ISD::FSINCOS,
737                   ISD::FACOS,         ISD::FASIN,         ISD::FATAN,
738                   ISD::FATAN2,        ISD::FCOSH,         ISD::FSINH,
739                   ISD::FTANH,         ISD::FTAN,          ISD::FEXP,
740                   ISD::FEXP2,         ISD::FEXP10,        ISD::FLOG,
741                   ISD::FLOG2,         ISD::FLOG10,        ISD::STRICT_FREM,
742                   ISD::STRICT_FPOW,   ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,
743                   ISD::STRICT_FSIN,   ISD::STRICT_FACOS,  ISD::STRICT_FASIN,
744                   ISD::STRICT_FATAN,  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH,
745                   ISD::STRICT_FSINH,  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,
746                   ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2,
747                   ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
748     setOperationAction(Op, MVT::f16, Promote);
749     setOperationAction(Op, MVT::v4f16, Expand);
750     setOperationAction(Op, MVT::v8f16, Expand);
751     setOperationAction(Op, MVT::bf16, Promote);
752     setOperationAction(Op, MVT::v4bf16, Expand);
753     setOperationAction(Op, MVT::v8bf16, Expand);
754   }
755 
756   // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
757   setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
758   setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
759   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom);
760   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
761   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
762   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom);
763 
764   auto LegalizeNarrowFP = [this](MVT ScalarVT) {
765     for (auto Op : {
766              ISD::SETCC,
767              ISD::SELECT_CC,
768              ISD::BR_CC,
769              ISD::FADD,
770              ISD::FSUB,
771              ISD::FMUL,
772              ISD::FDIV,
773              ISD::FMA,
774              ISD::FCEIL,
775              ISD::FSQRT,
776              ISD::FFLOOR,
777              ISD::FNEARBYINT,
778              ISD::FRINT,
779              ISD::FROUND,
780              ISD::FROUNDEVEN,
781              ISD::FTRUNC,
782              ISD::FMINNUM,
783              ISD::FMAXNUM,
784              ISD::FMINIMUM,
785              ISD::FMAXIMUM,
786              ISD::FCANONICALIZE,
787              ISD::STRICT_FADD,
788              ISD::STRICT_FSUB,
789              ISD::STRICT_FMUL,
790              ISD::STRICT_FDIV,
791              ISD::STRICT_FMA,
792              ISD::STRICT_FCEIL,
793              ISD::STRICT_FFLOOR,
794              ISD::STRICT_FSQRT,
795              ISD::STRICT_FRINT,
796              ISD::STRICT_FNEARBYINT,
797              ISD::STRICT_FROUND,
798              ISD::STRICT_FTRUNC,
799              ISD::STRICT_FROUNDEVEN,
800              ISD::STRICT_FMINNUM,
801              ISD::STRICT_FMAXNUM,
802              ISD::STRICT_FMINIMUM,
803              ISD::STRICT_FMAXIMUM,
804          })
805       setOperationAction(Op, ScalarVT, Promote);
806 
807     for (auto Op : {ISD::FNEG, ISD::FABS})
808       setOperationAction(Op, ScalarVT, Legal);
809 
810     // The round-to-integer operations need custom lowering for fp16, as Promote
811     // doesn't work because the result type is integer.
812     for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
813                     ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
814                     ISD::STRICT_LLRINT})
815       setOperationAction(Op, ScalarVT, Custom);
816 
817     // Promote the 4-element narrow vector to v4f32 when that is known to be safe.
818     auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
819     setOperationPromotedToType(ISD::FADD,       V4Narrow, MVT::v4f32);
820     setOperationPromotedToType(ISD::FSUB,       V4Narrow, MVT::v4f32);
821     setOperationPromotedToType(ISD::FMUL,       V4Narrow, MVT::v4f32);
822     setOperationPromotedToType(ISD::FDIV,       V4Narrow, MVT::v4f32);
823     setOperationPromotedToType(ISD::FCEIL,      V4Narrow, MVT::v4f32);
824     setOperationPromotedToType(ISD::FFLOOR,     V4Narrow, MVT::v4f32);
825     setOperationPromotedToType(ISD::FROUND,     V4Narrow, MVT::v4f32);
826     setOperationPromotedToType(ISD::FTRUNC,     V4Narrow, MVT::v4f32);
827     setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
828     setOperationPromotedToType(ISD::FRINT,      V4Narrow, MVT::v4f32);
829     setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
830     setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
831 
832     setOperationAction(ISD::FABS,        V4Narrow, Legal);
833     setOperationAction(ISD::FNEG,        V4Narrow, Legal);
834     setOperationAction(ISD::FMA,         V4Narrow, Expand);
835     setOperationAction(ISD::SETCC,       V4Narrow, Custom);
836     setOperationAction(ISD::BR_CC,       V4Narrow, Expand);
837     setOperationAction(ISD::SELECT,      V4Narrow, Expand);
838     setOperationAction(ISD::SELECT_CC,   V4Narrow, Expand);
839     setOperationAction(ISD::FCOPYSIGN,   V4Narrow, Custom);
840     setOperationAction(ISD::FSQRT,       V4Narrow, Expand);
841 
842     auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
843     setOperationAction(ISD::FABS,        V8Narrow, Legal);
844     setOperationAction(ISD::FADD,        V8Narrow, Legal);
845     setOperationAction(ISD::FCEIL,       V8Narrow, Legal);
846     setOperationAction(ISD::FCOPYSIGN,   V8Narrow, Custom);
847     setOperationAction(ISD::FDIV,        V8Narrow, Legal);
848     setOperationAction(ISD::FFLOOR,      V8Narrow, Legal);
849     setOperationAction(ISD::FMA,         V8Narrow, Expand);
850     setOperationAction(ISD::FMUL,        V8Narrow, Legal);
851     setOperationAction(ISD::FNEARBYINT,  V8Narrow, Legal);
852     setOperationAction(ISD::FNEG,        V8Narrow, Legal);
853     setOperationAction(ISD::FROUND,      V8Narrow, Legal);
854     setOperationAction(ISD::FROUNDEVEN,  V8Narrow, Legal);
855     setOperationAction(ISD::FRINT,       V8Narrow, Legal);
856     setOperationAction(ISD::FSQRT,       V8Narrow, Expand);
857     setOperationAction(ISD::FSUB,        V8Narrow, Legal);
858     setOperationAction(ISD::FTRUNC,      V8Narrow, Legal);
859     setOperationAction(ISD::SETCC,       V8Narrow, Expand);
860     setOperationAction(ISD::BR_CC,       V8Narrow, Expand);
861     setOperationAction(ISD::SELECT,      V8Narrow, Expand);
862     setOperationAction(ISD::SELECT_CC,   V8Narrow, Expand);
863     setOperationAction(ISD::FP_EXTEND,   V8Narrow, Expand);
864     setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
865   };
866 
867   if (!Subtarget->hasFullFP16()) {
868     LegalizeNarrowFP(MVT::f16);
869   }
870   LegalizeNarrowFP(MVT::bf16);
871   setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
872   setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);
873 
874   // AArch64 has implementations of a lot of rounding-like FP operations.
875   // clang-format off
876   for (auto Op :
877        {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
878         ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
879         ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
880         ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
881         ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
882         ISD::FMINNUM_IEEE,    ISD::FMAXNUM_IEEE,
883         ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
884         ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
885         ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
886         ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
887         ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
888     for (MVT Ty : {MVT::f32, MVT::f64})
889       setOperationAction(Op, Ty, Legal);
890     if (Subtarget->hasFullFP16())
891       setOperationAction(Op, MVT::f16, Legal);
892   }
893   // clang-format on
894 
895   // Basic strict FP operations are legal
896   for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
897                   ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
898     for (MVT Ty : {MVT::f32, MVT::f64})
899       setOperationAction(Op, Ty, Legal);
900     if (Subtarget->hasFullFP16())
901       setOperationAction(Op, MVT::f16, Legal);
902   }
903 
904   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
905 
906   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
907   setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
908   setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
909   setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
910   setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
911 
912   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
913   if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
914     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
915     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
916   } else {
917     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
918     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
919   }
920   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
921   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
922 
923   // Generate outline atomics library calls only if LSE was not specified for
924   // the subtarget.
925   if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
926     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
927     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
928     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
929     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
930     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
931     setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
932     setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
933     setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
934     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
935     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
936     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
937     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
938     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
939     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
940     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
941     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
942     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
943     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
944     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
945     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
946     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
947     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
948     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
949     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
950     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
951 #define LCALLNAMES(A, B, N)                                                    \
952   setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
953   setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
954   setLibcallName(A##N##_REL, #B #N "_rel");                                    \
955   setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
956 #define LCALLNAME4(A, B)                                                       \
957   LCALLNAMES(A, B, 1)                                                          \
958   LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
959 #define LCALLNAME5(A, B)                                                       \
960   LCALLNAMES(A, B, 1)                                                          \
961   LCALLNAMES(A, B, 2)                                                          \
962   LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
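    // Illustrative expansion: LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD,
    // __aarch64_ldadd) registers the libcall names
    // __aarch64_ldadd{1,2,4,8}_relax/_acq/_rel/_acq_rel for the corresponding
    // RTLIB::OUTLINE_ATOMIC_LDADD* entries.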
963     LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
964     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
965     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
966     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
967     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
968     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
969 #undef LCALLNAMES
970 #undef LCALLNAME4
971 #undef LCALLNAME5
972   }
973 
974   if (Subtarget->hasLSE128()) {
975     // Custom lowering because i128 is not legal. Must be replaced by 2x64
976     // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
977     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
978     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
979     setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
980   }
981 
982   // 128-bit loads and stores can be done without expanding
983   setOperationAction(ISD::LOAD, MVT::i128, Custom);
984   setOperationAction(ISD::STORE, MVT::i128, Custom);
985 
986   // Aligned 128-bit loads and stores are single-copy atomic according to the
987   // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
988   if (Subtarget->hasLSE2()) {
989     setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
990     setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
991   }
992 
993   // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
994   // custom lowering, as there are no un-paired non-temporal stores and
995   // legalization will break up 256 bit inputs.
996   setOperationAction(ISD::STORE, MVT::v32i8, Custom);
997   setOperationAction(ISD::STORE, MVT::v16i16, Custom);
998   setOperationAction(ISD::STORE, MVT::v16f16, Custom);
999   setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1000   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1001   setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1002   setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1003   setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1004 
1005   // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1006   // custom lowering, as there are no un-paired non-temporal loads and
1007   // legalization will break up 256 bit inputs.
1008   setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1009   setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1010   setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1011   setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1012   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1013   setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1014   setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1015   setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1016 
1017   // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1018   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1019 
1020   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1021       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1022     // Issue __sincos_stret if available.
1023     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1024     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1025   } else {
1026     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1027     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1028   }
1029 
1030   // Make floating-point constants legal for the large code model, so they don't
1031   // become loads from the constant pool.
1032   if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1033     setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
1034     setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
1035   }
1036 
1037   // AArch64 does not have floating-point extending loads, i1 sign-extending
1038   // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1039   for (MVT VT : MVT::fp_valuetypes()) {
1040     setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1041     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1042     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1043     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1044     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1045   }
1046   for (MVT VT : MVT::integer_valuetypes())
1047     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1048 
1049   for (MVT WideVT : MVT::fp_valuetypes()) {
1050     for (MVT NarrowVT : MVT::fp_valuetypes()) {
1051       if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1052         setTruncStoreAction(WideVT, NarrowVT, Expand);
1053       }
1054     }
1055   }
1056 
1057   if (Subtarget->hasFPARMv8()) {
1058     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1059     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1060     setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1061   }
1062 
1063   // Indexed loads and stores are supported.
1064   for (unsigned im = (unsigned)ISD::PRE_INC;
1065        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1066     setIndexedLoadAction(im, MVT::i8, Legal);
1067     setIndexedLoadAction(im, MVT::i16, Legal);
1068     setIndexedLoadAction(im, MVT::i32, Legal);
1069     setIndexedLoadAction(im, MVT::i64, Legal);
1070     setIndexedLoadAction(im, MVT::f64, Legal);
1071     setIndexedLoadAction(im, MVT::f32, Legal);
1072     setIndexedLoadAction(im, MVT::f16, Legal);
1073     setIndexedLoadAction(im, MVT::bf16, Legal);
1074     setIndexedStoreAction(im, MVT::i8, Legal);
1075     setIndexedStoreAction(im, MVT::i16, Legal);
1076     setIndexedStoreAction(im, MVT::i32, Legal);
1077     setIndexedStoreAction(im, MVT::i64, Legal);
1078     setIndexedStoreAction(im, MVT::f64, Legal);
1079     setIndexedStoreAction(im, MVT::f32, Legal);
1080     setIndexedStoreAction(im, MVT::f16, Legal);
1081     setIndexedStoreAction(im, MVT::bf16, Legal);
1082   }
1083 
1084   // Trap.
1085   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1086   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1087   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1088 
1089   // We combine OR nodes for bitfield operations.
1090   setTargetDAGCombine(ISD::OR);
1091   // Try to create BICs for vector ANDs.
1092   setTargetDAGCombine(ISD::AND);
1093 
1094   // llvm.init.trampoline and llvm.adjust.trampoline
1095   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1096   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1097 
1098   // Vector add and sub nodes may conceal a high-half opportunity.
1099   // Also, try to fold ADD into CSINC/CSINV.
1100   setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1101                        ISD::UINT_TO_FP});
1102 
1103   setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1104                        ISD::FP_TO_UINT_SAT, ISD::FADD});
1105 
1106   // Try and combine setcc with csel
1107   setTargetDAGCombine(ISD::SETCC);
1108 
1109   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1110 
1111   setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1112                        ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
1113                        ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
1114                        ISD::STORE, ISD::BUILD_VECTOR});
1115   setTargetDAGCombine(ISD::TRUNCATE);
1116   setTargetDAGCombine(ISD::LOAD);
1117 
1118   setTargetDAGCombine(ISD::MSTORE);
1119 
1120   setTargetDAGCombine(ISD::MUL);
1121 
1122   setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1123 
1124   setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1125                        ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1126                        ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1127 
1128   setTargetDAGCombine(
1129       {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1130 
1131   setTargetDAGCombine(ISD::FP_EXTEND);
1132 
1133   setTargetDAGCombine(ISD::GlobalAddress);
1134 
1135   setTargetDAGCombine(ISD::CTLZ);
1136 
1137   setTargetDAGCombine(ISD::VECREDUCE_AND);
1138   setTargetDAGCombine(ISD::VECREDUCE_OR);
1139   setTargetDAGCombine(ISD::VECREDUCE_XOR);
1140 
1141   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1142 
1143   setTargetDAGCombine(ISD::SHL);
1144 
1145   // In case of strict alignment, avoid an excessive number of byte wide stores.
1146   MaxStoresPerMemsetOptSize = 8;
1147   MaxStoresPerMemset =
1148       Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1149 
1150   MaxGluedStoresPerMemcpy = 4;
1151   MaxStoresPerMemcpyOptSize = 4;
1152   MaxStoresPerMemcpy =
1153       Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1154 
1155   MaxStoresPerMemmoveOptSize = 4;
1156   MaxStoresPerMemmove =
1157       Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1158 
1159   MaxLoadsPerMemcmpOptSize = 4;
1160   MaxLoadsPerMemcmp =
1161       Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1162 
1163   setStackPointerRegisterToSaveRestore(AArch64::SP);
1164 
1165   setSchedulingPreference(Sched::Hybrid);
1166 
1167   EnableExtLdPromotion = true;
1168 
1169   // Set required alignment.
1170   setMinFunctionAlignment(Align(4));
1171   // Set preferred alignments.
1172 
1173   // Don't align loops on Windows. The SEH unwind info generation needs to
1174   // know the exact length of functions before the alignments have been
1175   // expanded.
1176   if (!Subtarget->isTargetWindows())
1177     setPrefLoopAlignment(STI.getPrefLoopAlignment());
1178   setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1179   setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1180 
1181   // Only change the limit for entries in a jump table if specified by
1182   // the subtarget, but not at the command line.
1183   unsigned MaxJT = STI.getMaximumJumpTableSize();
1184   if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1185     setMaximumJumpTableSize(MaxJT);
1186 
1187   setHasExtractBitsInsn(true);
1188 
1189   setMaxDivRemBitWidthSupported(128);
1190 
1191   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1192   if (Subtarget->hasSME())
1193     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
1194 
1195   if (Subtarget->isNeonAvailable()) {
1196     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1197     // silliness like this:
1198     // clang-format off
1199     for (auto Op :
1200          {ISD::SELECT,            ISD::SELECT_CC,      ISD::FATAN2,
1201           ISD::BR_CC,             ISD::FADD,           ISD::FSUB,
1202           ISD::FMUL,              ISD::FDIV,           ISD::FMA,
1203           ISD::FNEG,              ISD::FABS,           ISD::FCEIL,
1204           ISD::FSQRT,             ISD::FFLOOR,         ISD::FNEARBYINT,
1205           ISD::FSIN,              ISD::FCOS,           ISD::FTAN,
1206           ISD::FASIN,             ISD::FACOS,          ISD::FATAN,
1207           ISD::FSINH,             ISD::FCOSH,          ISD::FTANH,
1208           ISD::FPOW,              ISD::FLOG,           ISD::FLOG2,
1209           ISD::FLOG10,            ISD::FEXP,           ISD::FEXP2,
1210           ISD::FEXP10,            ISD::FRINT,          ISD::FROUND,
1211           ISD::FROUNDEVEN,        ISD::FTRUNC,         ISD::FMINNUM,
1212           ISD::FMAXNUM,           ISD::FMINIMUM,       ISD::FMAXIMUM,
1213           ISD::FMAXNUM_IEEE,      ISD::FMINNUM_IEEE,
1214           ISD::STRICT_FADD,       ISD::STRICT_FSUB,    ISD::STRICT_FMUL,
1215           ISD::STRICT_FDIV,       ISD::STRICT_FMA,     ISD::STRICT_FCEIL,
1216           ISD::STRICT_FFLOOR,     ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,
1217           ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,
1218           ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
1219           ISD::STRICT_FMINIMUM,   ISD::STRICT_FMAXIMUM})
1220       setOperationAction(Op, MVT::v1f64, Expand);
1221     // clang-format on
1222 
1223     for (auto Op :
1224          {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1225           ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1226           ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1227           ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1228       setOperationAction(Op, MVT::v1i64, Expand);
1229 
1230     // AArch64 doesn't have direct vector -> f32 conversion instructions for
1231     // elements smaller than i32, so promote the input to i32 first.
1232     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1233     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1234 
1235     // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1236     // nor a direct i32 -> f16 vector conversion.  Set these to Custom so the
1237     // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1238     for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1239                     ISD::STRICT_UINT_TO_FP})
1240       for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1241         setOperationAction(Op, VT, Custom);
1242 
1243     if (Subtarget->hasFullFP16()) {
1244       setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
1245       setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
1246 
1247       setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
1248       setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1249       setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
1250       setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1251       setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1252       setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1253       setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1254       setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1255     } else {
1256       // When AArch64 doesn't have fullfp16 support, promote the input
1257       // to i32 first.
1258       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1259       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1260       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1261       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1262       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1263       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1264       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1265       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1266     }
1267 
1268     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
1269     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
1270     setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1271     setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1272     setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1273     setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1274     setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1275     setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1276     for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1277       setOperationAction(ISD::UMAX, VT, Custom);
1278       setOperationAction(ISD::SMAX, VT, Custom);
1279       setOperationAction(ISD::UMIN, VT, Custom);
1280       setOperationAction(ISD::SMIN, VT, Custom);
1281     }
1282 
1283     // Custom handling for some quad-vector types to detect MULL.
1284     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1285     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1286     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1287     setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1288     setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1289     setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1290 
1291     // Saturates
1292     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1293                     MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1294       setOperationAction(ISD::SADDSAT, VT, Legal);
1295       setOperationAction(ISD::UADDSAT, VT, Legal);
1296       setOperationAction(ISD::SSUBSAT, VT, Legal);
1297       setOperationAction(ISD::USUBSAT, VT, Legal);
1298     }
1299 
1300     for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1301                    MVT::v4i32}) {
1302       setOperationAction(ISD::AVGFLOORS, VT, Legal);
1303       setOperationAction(ISD::AVGFLOORU, VT, Legal);
1304       setOperationAction(ISD::AVGCEILS, VT, Legal);
1305       setOperationAction(ISD::AVGCEILU, VT, Legal);
1306       setOperationAction(ISD::ABDS, VT, Legal);
1307       setOperationAction(ISD::ABDU, VT, Legal);
1308     }
1309 
1310     // Vector reductions
1311     for (MVT VT : { MVT::v4f16, MVT::v2f32,
1312                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1313       if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1314         setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1315         setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1316         setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1317         setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1318 
1319         setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1320       }
1321     }
1322     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1323                     MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1324       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1325       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1326       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1327       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1328       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1329       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1330       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1331       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1332     }
1333     setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1334     setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1335     setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1336     setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1337 
1338     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1339     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1340     // Likewise, narrowing and extending vector loads/stores aren't handled
1341     // directly.
1342     for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1343       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1344 
1345       if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1346         setOperationAction(ISD::MULHS, VT, Legal);
1347         setOperationAction(ISD::MULHU, VT, Legal);
1348       } else {
1349         setOperationAction(ISD::MULHS, VT, Expand);
1350         setOperationAction(ISD::MULHU, VT, Expand);
1351       }
1352       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1353       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1354 
1355       setOperationAction(ISD::BSWAP, VT, Expand);
1356       setOperationAction(ISD::CTTZ, VT, Expand);
1357 
1358       for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1359         setTruncStoreAction(VT, InnerVT, Expand);
1360         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1361         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1362         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1363       }
1364     }
1365 
1366     for (auto Op :
1367          {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1368           ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1369           ISD::STRICT_FFLOOR, ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL,
1370           ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUND,
1371           ISD::STRICT_FROUNDEVEN}) {
1372       for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1373         setOperationAction(Op, Ty, Legal);
1374       if (Subtarget->hasFullFP16())
1375         for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1376           setOperationAction(Op, Ty, Legal);
1377     }
1378 
1379     // LRINT and LLRINT.
1380     for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1381       for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1382         setOperationAction(Op, Ty, Custom);
1383       if (Subtarget->hasFullFP16())
1384         for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1385           setOperationAction(Op, Ty, Custom);
1386     }
1387 
1388     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1389 
1390     setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1391     setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1392     setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1393     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1394 
1395     setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1396     setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1397     setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1398 
1399     setLoadExtAction(ISD::EXTLOAD,  MVT::v4i16, MVT::v4i8, Custom);
1400     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1401     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1402     setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
1403     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1404     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1405 
1406     // ADDP custom lowering
1407     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1408       setOperationAction(ISD::ADD, VT, Custom);
1409     // FADDP custom lowering
1410     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1411       setOperationAction(ISD::FADD, VT, Custom);
1412   } else /* !isNeonAvailable */ {
1413     for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1414       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1415         setOperationAction(Op, VT, Expand);
1416 
1417       if (VT.is128BitVector() || VT.is64BitVector()) {
1418         setOperationAction(ISD::LOAD, VT, Legal);
1419         setOperationAction(ISD::STORE, VT, Legal);
1420         setOperationAction(ISD::BITCAST, VT,
1421                            Subtarget->isLittleEndian() ? Legal : Expand);
1422       }
1423       for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1424         setTruncStoreAction(VT, InnerVT, Expand);
1425         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1426         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1427         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1428       }
1429     }
1430   }
1431 
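       // Saturating vector narrowing (in the style of SQXTN/SQXTUN/UQXTN) is
       // natively supported for these types.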
1432   for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1433     setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Legal);
1434     setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Legal);
1435     setOperationAction(ISD::TRUNCATE_USAT_U, VT, Legal);
1436   }
1437 
1438   if (Subtarget->hasSME()) {
1439     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1440   }
1441 
1442   // FIXME: Move lowering for more nodes here if those are common between
1443   // SVE and SME.
1444   if (Subtarget->isSVEorStreamingSVEAvailable()) {
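         // Scalable predicate vectors need custom handling for splats, element
         // extraction and vector (de)interleaving.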
1445     for (auto VT :
1446          {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1447       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1448       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1449       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1450       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1451     }
1452   }
1453 
1454   if (Subtarget->isSVEorStreamingSVEAvailable()) {
1455     for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1456       setOperationAction(ISD::BITREVERSE, VT, Custom);
1457       setOperationAction(ISD::BSWAP, VT, Custom);
1458       setOperationAction(ISD::CTLZ, VT, Custom);
1459       setOperationAction(ISD::CTPOP, VT, Custom);
1460       setOperationAction(ISD::CTTZ, VT, Custom);
1461       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1462       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1463       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1464       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1465       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1466       setOperationAction(ISD::MLOAD, VT, Custom);
1467       setOperationAction(ISD::MUL, VT, Custom);
1468       setOperationAction(ISD::MULHS, VT, Custom);
1469       setOperationAction(ISD::MULHU, VT, Custom);
1470       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1471       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1472       setOperationAction(ISD::SELECT, VT, Custom);
1473       setOperationAction(ISD::SETCC, VT, Custom);
1474       setOperationAction(ISD::SDIV, VT, Custom);
1475       setOperationAction(ISD::UDIV, VT, Custom);
1476       setOperationAction(ISD::SMIN, VT, Custom);
1477       setOperationAction(ISD::UMIN, VT, Custom);
1478       setOperationAction(ISD::SMAX, VT, Custom);
1479       setOperationAction(ISD::UMAX, VT, Custom);
1480       setOperationAction(ISD::SHL, VT, Custom);
1481       setOperationAction(ISD::SRL, VT, Custom);
1482       setOperationAction(ISD::SRA, VT, Custom);
1483       setOperationAction(ISD::ABS, VT, Custom);
1484       setOperationAction(ISD::ABDS, VT, Custom);
1485       setOperationAction(ISD::ABDU, VT, Custom);
1486       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1487       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1488       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1489       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1490       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1491       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1492       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1493       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1494       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1495       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1496 
1497       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1498       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1499       setOperationAction(ISD::SELECT_CC, VT, Expand);
1500       setOperationAction(ISD::ROTL, VT, Expand);
1501       setOperationAction(ISD::ROTR, VT, Expand);
1502 
1503       setOperationAction(ISD::SADDSAT, VT, Legal);
1504       setOperationAction(ISD::UADDSAT, VT, Legal);
1505       setOperationAction(ISD::SSUBSAT, VT, Legal);
1506       setOperationAction(ISD::USUBSAT, VT, Legal);
1507       setOperationAction(ISD::UREM, VT, Expand);
1508       setOperationAction(ISD::SREM, VT, Expand);
1509       setOperationAction(ISD::SDIVREM, VT, Expand);
1510       setOperationAction(ISD::UDIVREM, VT, Expand);
1511 
1512       setOperationAction(ISD::AVGFLOORS, VT, Custom);
1513       setOperationAction(ISD::AVGFLOORU, VT, Custom);
1514       setOperationAction(ISD::AVGCEILS, VT, Custom);
1515       setOperationAction(ISD::AVGCEILU, VT, Custom);
1516 
1517       if (!Subtarget->isLittleEndian())
1518         setOperationAction(ISD::BITCAST, VT, Custom);
1519 
1520       if (Subtarget->hasSVE2() ||
1521           (Subtarget->hasSME() && Subtarget->isStreaming()))
1522         // For SLI/SRI.
1523         setOperationAction(ISD::OR, VT, Custom);
1524     }
1525 
1526     // Illegal unpacked integer vector types.
1527     for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1528       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1529       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1530     }
1531 
1532     // Type legalize unpacked bitcasts.
1533     for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1534       setOperationAction(ISD::BITCAST, VT, Custom);
1535 
1536     for (auto VT :
1537          { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1538            MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1539       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1540 
1541     for (auto VT :
1542          {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1543       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1544       setOperationAction(ISD::SELECT, VT, Custom);
1545       setOperationAction(ISD::SETCC, VT, Custom);
1546       setOperationAction(ISD::TRUNCATE, VT, Custom);
1547       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1548       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1549       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1550 
1551       setOperationAction(ISD::SELECT_CC, VT, Expand);
1552       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1553       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1554 
1555       // There are no legal MVT::nxv16f## based types.
1556       if (VT != MVT::nxv16i1) {
1557         setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1558         setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1559       }
1560     }
1561 
1562     // NEON doesn't support masked loads/stores, but SME and SVE do.
1563     for (auto VT :
1564          {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1565           MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1566           MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1567       setOperationAction(ISD::MLOAD, VT, Custom);
1568       setOperationAction(ISD::MSTORE, VT, Custom);
1569     }
1570 
1571     // First, mark all scalable-vector extending loads and truncating stores
1572     // as Expand, covering both integer and floating-point scalable vectors.
1573     for (MVT VT : MVT::scalable_vector_valuetypes()) {
1574       for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1575         setTruncStoreAction(VT, InnerVT, Expand);
1576         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1577         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1578         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1579       }
1580     }
1581 
1582     // Then, selectively enable those which we directly support.
1583     setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1584     setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1585     setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1586     setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1587     setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1588     setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1589     for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1590       setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1591       setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1592       setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1593       setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1594       setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1595       setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1596     }
1597 
1598     // SVE supports truncating stores of 64-bit and 128-bit vectors.
1599     setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1600     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1601     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1602     setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1603     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1604 
1605     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1606                     MVT::nxv4f32, MVT::nxv2f64}) {
1607       setOperationAction(ISD::BITCAST, VT, Custom);
1608       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1609       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1610       setOperationAction(ISD::MLOAD, VT, Custom);
1611       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1612       setOperationAction(ISD::SELECT, VT, Custom);
1613       setOperationAction(ISD::SETCC, VT, Custom);
1614       setOperationAction(ISD::FADD, VT, Custom);
1615       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1616       setOperationAction(ISD::FDIV, VT, Custom);
1617       setOperationAction(ISD::FMA, VT, Custom);
1618       setOperationAction(ISD::FMAXIMUM, VT, Custom);
1619       setOperationAction(ISD::FMAXNUM, VT, Custom);
1620       setOperationAction(ISD::FMINIMUM, VT, Custom);
1621       setOperationAction(ISD::FMINNUM, VT, Custom);
1622       setOperationAction(ISD::FMUL, VT, Custom);
1623       setOperationAction(ISD::FNEG, VT, Custom);
1624       setOperationAction(ISD::FSUB, VT, Custom);
1625       setOperationAction(ISD::FCEIL, VT, Custom);
1626       setOperationAction(ISD::FFLOOR, VT, Custom);
1627       setOperationAction(ISD::FNEARBYINT, VT, Custom);
1628       setOperationAction(ISD::FRINT, VT, Custom);
1629       setOperationAction(ISD::LRINT, VT, Custom);
1630       setOperationAction(ISD::LLRINT, VT, Custom);
1631       setOperationAction(ISD::FROUND, VT, Custom);
1632       setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1633       setOperationAction(ISD::FTRUNC, VT, Custom);
1634       setOperationAction(ISD::FSQRT, VT, Custom);
1635       setOperationAction(ISD::FABS, VT, Custom);
1636       setOperationAction(ISD::FP_EXTEND, VT, Custom);
1637       setOperationAction(ISD::FP_ROUND, VT, Custom);
1638       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1639       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1640       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1641       setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1642       setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1643       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1644       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1645       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1646 
1647       setOperationAction(ISD::SELECT_CC, VT, Expand);
1648       setOperationAction(ISD::FREM, VT, Expand);
1649       setOperationAction(ISD::FPOW, VT, Expand);
1650       setOperationAction(ISD::FPOWI, VT, Expand);
1651       setOperationAction(ISD::FCOS, VT, Expand);
1652       setOperationAction(ISD::FSIN, VT, Expand);
1653       setOperationAction(ISD::FSINCOS, VT, Expand);
1654       setOperationAction(ISD::FTAN, VT, Expand);
1655       setOperationAction(ISD::FACOS, VT, Expand);
1656       setOperationAction(ISD::FASIN, VT, Expand);
1657       setOperationAction(ISD::FATAN, VT, Expand);
1658       setOperationAction(ISD::FATAN2, VT, Expand);
1659       setOperationAction(ISD::FCOSH, VT, Expand);
1660       setOperationAction(ISD::FSINH, VT, Expand);
1661       setOperationAction(ISD::FTANH, VT, Expand);
1662       setOperationAction(ISD::FEXP, VT, Expand);
1663       setOperationAction(ISD::FEXP2, VT, Expand);
1664       setOperationAction(ISD::FEXP10, VT, Expand);
1665       setOperationAction(ISD::FLOG, VT, Expand);
1666       setOperationAction(ISD::FLOG2, VT, Expand);
1667       setOperationAction(ISD::FLOG10, VT, Expand);
1668 
1669       setCondCodeAction(ISD::SETO, VT, Expand);
1670       setCondCodeAction(ISD::SETOLT, VT, Expand);
1671       setCondCodeAction(ISD::SETLT, VT, Expand);
1672       setCondCodeAction(ISD::SETOLE, VT, Expand);
1673       setCondCodeAction(ISD::SETLE, VT, Expand);
1674       setCondCodeAction(ISD::SETULT, VT, Expand);
1675       setCondCodeAction(ISD::SETULE, VT, Expand);
1676       setCondCodeAction(ISD::SETUGE, VT, Expand);
1677       setCondCodeAction(ISD::SETUGT, VT, Expand);
1678       setCondCodeAction(ISD::SETUEQ, VT, Expand);
1679       setCondCodeAction(ISD::SETONE, VT, Expand);
1680     }
1681 
1682     for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1683       setOperationAction(ISD::BITCAST, VT, Custom);
1684       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1685       setOperationAction(ISD::FABS, VT, Legal);
1686       setOperationAction(ISD::FNEG, VT, Legal);
1687       setOperationAction(ISD::FP_EXTEND, VT, Custom);
1688       setOperationAction(ISD::FP_ROUND, VT, Custom);
1689       setOperationAction(ISD::MLOAD, VT, Custom);
1690       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1691       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1692       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1693 
1694       if (Subtarget->hasSVEB16B16()) {
1695         setOperationAction(ISD::FADD, VT, Legal);
1696         setOperationAction(ISD::FMA, VT, Custom);
1697         setOperationAction(ISD::FMAXIMUM, VT, Custom);
1698         setOperationAction(ISD::FMAXNUM, VT, Custom);
1699         setOperationAction(ISD::FMINIMUM, VT, Custom);
1700         setOperationAction(ISD::FMINNUM, VT, Custom);
1701         setOperationAction(ISD::FMUL, VT, Legal);
1702         setOperationAction(ISD::FSUB, VT, Legal);
1703       }
1704     }
1705 
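         // These operations have no bf16 vector form, so promote the unpacked
         // bf16 types to f32 and expand the packed nxv8bf16 type.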
1706     for (auto Opcode :
1707          {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1708           ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC}) {
1709       setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1710       setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1711       setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1712     }
1713 
1714     if (!Subtarget->hasSVEB16B16()) {
1715       for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1716                           ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1717         setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1718         setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1719         setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1720       }
1721     }
1722 
1723     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1724     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1725 
1726     // NEON doesn't support integer divides, but SVE does
1727     for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1728                     MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1729       setOperationAction(ISD::SDIV, VT, Custom);
1730       setOperationAction(ISD::UDIV, VT, Custom);
1731     }
1732 
1733     // NEON doesn't support 64-bit vector integer muls, but SVE does.
1734     setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1735     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1736 
1737     // NOTE: Currently this has to happen after computeRegisterProperties rather
1738     // than the preferred option of combining it with the addRegisterClass call.
1739     if (Subtarget->useSVEForFixedLengthVectors()) {
1740       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1741         if (useSVEForFixedLengthVectorVT(
1742                 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1743           addTypeForFixedLengthSVE(VT);
1744       }
1745       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1746         if (useSVEForFixedLengthVectorVT(
1747                 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1748           addTypeForFixedLengthSVE(VT);
1749       }
1750 
1751       // 64-bit results can come from an input wider than a NEON vector.
1752       for (auto VT : {MVT::v8i8, MVT::v4i16})
1753         setOperationAction(ISD::TRUNCATE, VT, Custom);
1754       setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1755 
1756       // 128-bit results imply an input wider than a NEON vector.
1757       for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1758         setOperationAction(ISD::TRUNCATE, VT, Custom);
1759       for (auto VT : {MVT::v8f16, MVT::v4f32})
1760         setOperationAction(ISD::FP_ROUND, VT, Custom);
1761 
1762       // These operations are not supported on NEON but SVE can do them.
1763       setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1764       setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1765       setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1766       setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1767       setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1768       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1769       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1770       setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1771       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1772       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1773       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1774       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1775       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1776       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1777       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1778       setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1779       setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1780       setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1781       setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1782       setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1783 
1784       // Int operations with no NEON support.
1785       for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1786                       MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1787         setOperationAction(ISD::BITREVERSE, VT, Custom);
1788         setOperationAction(ISD::CTTZ, VT, Custom);
1789         setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1790         setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1791         setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1792         setOperationAction(ISD::MULHS, VT, Custom);
1793         setOperationAction(ISD::MULHU, VT, Custom);
1794       }
1795 
1796       // Use SVE for FADD reductions of vectors with more than 2 elements.
1797       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1798         setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1799     }
1800 
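         // Splices of predicate vectors are performed by promoting to an
         // integer vector with the same element count.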
1801     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1802     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1803     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1804     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1805 
1806     setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1807 
1808     for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1809       setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
1810   }
1811 
1812   // Handle operations that are only available in non-streaming SVE mode.
1813   if (Subtarget->isSVEAvailable()) {
1814     for (auto VT : {MVT::nxv16i8,  MVT::nxv8i16, MVT::nxv4i32,  MVT::nxv2i64,
1815                     MVT::nxv2f16,  MVT::nxv4f16, MVT::nxv8f16,  MVT::nxv2f32,
1816                     MVT::nxv4f32,  MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1817                     MVT::nxv8bf16, MVT::v4f16,   MVT::v8f16,    MVT::v2f32,
1818                     MVT::v4f32,    MVT::v1f64,   MVT::v2f64,    MVT::v8i8,
1819                     MVT::v16i8,    MVT::v4i16,   MVT::v8i16,    MVT::v2i32,
1820                     MVT::v4i32,    MVT::v1i64,   MVT::v2i64}) {
1821       setOperationAction(ISD::MGATHER, VT, Custom);
1822       setOperationAction(ISD::MSCATTER, VT, Custom);
1823     }
1824 
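         // Ordered (sequential) floating-point reductions are only handled
         // here, where full SVE (e.g. FADDA) is available.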
1825     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1826                     MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1827                     MVT::v2f32, MVT::v4f32, MVT::v2f64})
1828       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1829 
1830     // We can lower types that have <vscale x {2|4}> elements to COMPACT.
1831     for (auto VT :
1832          {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1833           MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1834       setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
1835 
1836     // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1837     // NEON vectors in the lowest bits of the SVE register.
1838     for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1839                     MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1840       setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
1841 
1842     // HISTCNT is SVE2-only.
1843     if (Subtarget->hasSVE2()) {
1844       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1845                          Custom);
1846       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1847                          Custom);
1848     }
1849   }
1850 
1851 
1852   if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1853     // Only required for llvm.aarch64.mops.memset.tag
1854     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1855   }
1856 
1857   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1858 
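       // With SVE available, scalar ldexp can be lowered using the FSCALE
       // instruction.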
1859   if (Subtarget->hasSVE()) {
1860     setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1861     setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1862     setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1863     setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1864   }
1865 
1866   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1867 
1868   IsStrictFPEnabled = true;
1869   setMaxAtomicSizeInBitsSupported(128);
1870 
1871   // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined.  MinGW has
1872   // it, but it's just a wrapper around ldexp.
1873   if (Subtarget->isTargetWindows()) {
1874     for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1875       if (isOperationExpand(Op, MVT::f32))
1876         setOperationAction(Op, MVT::f32, Promote);
1877   }
1878 
1879   // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1880   // isn't legal.
1881   for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1882     if (isOperationExpand(Op, MVT::f16))
1883       setOperationAction(Op, MVT::f16, Promote);
1884 
1885   if (Subtarget->isWindowsArm64EC()) {
1886     // FIXME: are there intrinsics we need to exclude from this?
1887     for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1888       auto code = static_cast<RTLIB::Libcall>(i);
1889       auto libcallName = getLibcallName(code);
1890       if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1891         setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1892       }
1893     }
1894   }
1895 }
1896 
1897 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1898   assert(VT.isVector() && "VT should be a vector type");
1899 
1900   if (VT.isFloatingPoint()) {
1901     MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1902     setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1903     setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1904   }
1905 
1906   // Mark vector float intrinsics as expand.
1907   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1908     setOperationAction(ISD::FSIN, VT, Expand);
1909     setOperationAction(ISD::FCOS, VT, Expand);
1910     setOperationAction(ISD::FTAN, VT, Expand);
1911     setOperationAction(ISD::FASIN, VT, Expand);
1912     setOperationAction(ISD::FACOS, VT, Expand);
1913     setOperationAction(ISD::FATAN, VT, Expand);
1914     setOperationAction(ISD::FATAN2, VT, Expand);
1915     setOperationAction(ISD::FSINH, VT, Expand);
1916     setOperationAction(ISD::FCOSH, VT, Expand);
1917     setOperationAction(ISD::FTANH, VT, Expand);
1918     setOperationAction(ISD::FPOW, VT, Expand);
1919     setOperationAction(ISD::FLOG, VT, Expand);
1920     setOperationAction(ISD::FLOG2, VT, Expand);
1921     setOperationAction(ISD::FLOG10, VT, Expand);
1922     setOperationAction(ISD::FEXP, VT, Expand);
1923     setOperationAction(ISD::FEXP2, VT, Expand);
1924     setOperationAction(ISD::FEXP10, VT, Expand);
1925   }
1926 
1927   // But we do support custom-lowering for FCOPYSIGN.
1928   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1929       ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1930         VT == MVT::v8f16) &&
1931        Subtarget->hasFullFP16()))
1932     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1933 
1934   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1935   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1936   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1937   setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1938   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1939   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1940   setOperationAction(ISD::SRA, VT, Custom);
1941   setOperationAction(ISD::SRL, VT, Custom);
1942   setOperationAction(ISD::SHL, VT, Custom);
1943   setOperationAction(ISD::OR, VT, Custom);
1944   setOperationAction(ISD::SETCC, VT, Custom);
1945   setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1946 
1947   setOperationAction(ISD::SELECT, VT, Expand);
1948   setOperationAction(ISD::SELECT_CC, VT, Expand);
1949   setOperationAction(ISD::VSELECT, VT, Expand);
1950   for (MVT InnerVT : MVT::all_valuetypes())
1951     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1952 
1953   // CNT supports only B element sizes; wider elements use UADDLP to widen.
1954   if (VT != MVT::v8i8 && VT != MVT::v16i8)
1955     setOperationAction(ISD::CTPOP, VT, Custom);
1956 
1957   setOperationAction(ISD::UDIV, VT, Expand);
1958   setOperationAction(ISD::SDIV, VT, Expand);
1959   setOperationAction(ISD::UREM, VT, Expand);
1960   setOperationAction(ISD::SREM, VT, Expand);
1961   setOperationAction(ISD::FREM, VT, Expand);
1962 
1963   for (unsigned Opcode :
1964        {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1965         ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1966     setOperationAction(Opcode, VT, Custom);
1967 
1968   if (!VT.isFloatingPoint())
1969     setOperationAction(ISD::ABS, VT, Legal);
1970 
1971   // [SU][MIN|MAX] are available for all NEON types apart from i64.
1972   if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1973     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1974       setOperationAction(Opcode, VT, Legal);
1975 
1976   // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1977   // NEON types.
1978   if (VT.isFloatingPoint() &&
1979       VT.getVectorElementType() != MVT::bf16 &&
1980       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1981     for (unsigned Opcode :
1982          {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1983           ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
1984           ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
1985           ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
1986           ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
1987       setOperationAction(Opcode, VT, Legal);
1988 
1989   // Strict fp extend and trunc are legal
1990   if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1991     setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1992   if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1993     setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1994 
1995   // FIXME: We could potentially make use of the vector comparison instructions
1996   // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1997   // complications:
1998   //  * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1999   //    so we would need to expand when the condition code doesn't match the
2000   //    kind of comparison.
2001   //  * Some kinds of comparison require more than one FCMXY instruction so
2002   //    would need to be expanded instead.
2003   //  * The lowering of the non-strict versions involves target-specific ISD
2004   //    nodes so we would likely need to add strict versions of all of them and
2005   //    handle them appropriately.
2006   setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
2007   setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
2008 
2009   if (Subtarget->isLittleEndian()) {
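       // Pre- and post-indexed addressing for vector loads and stores is only
       // used on little-endian targets.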
2010     for (unsigned im = (unsigned)ISD::PRE_INC;
2011          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2012       setIndexedLoadAction(im, VT, Legal);
2013       setIndexedStoreAction(im, VT, Legal);
2014     }
2015   }
2016 
2017   if (Subtarget->hasD128()) {
2018     setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
2019     setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
2020   }
2021 }
2022 
2023 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
2024                                                           EVT OpVT) const {
2025   // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2026   if (!Subtarget->hasSVE())
2027     return true;
2028 
2029   // We can only support legal predicate result types. We can use the SVE
2030   // whilelo instruction for generating fixed-width predicates too.
2031   if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2032       ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2033       ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2034     return true;
2035 
2036   // The whilelo instruction only works with i32 or i64 scalar inputs.
2037   if (OpVT != MVT::i32 && OpVT != MVT::i64)
2038     return true;
2039 
2040   return false;
2041 }
2042 
2043 bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
2044     const IntrinsicInst *I) const {
2045   if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2046     return true;
2047 
2048   EVT VT = EVT::getEVT(I->getType());
2049   auto Op1 = I->getOperand(1);
2050   EVT Op1VT = EVT::getEVT(Op1->getType());
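       // Only handle cases where the input vector has the same element type as
       // the result and either 2x or 4x as many elements.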
2051   if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2052       (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2053        VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2054     return false;
2055   return true;
2056 }
2057 
2058 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
2059   if (!Subtarget->isSVEorStreamingSVEAvailable())
2060     return true;
2061 
2062   // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2063   // also support fixed-width predicates.
2064   return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2065          VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2066          VT != MVT::v4i1 && VT != MVT::v2i1;
2067 }
2068 
2069 bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
2070                                                     unsigned SearchSize) const {
2071   // MATCH is SVE2 and only available in non-streaming mode.
2072   if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2073     return true;
2074   // Furthermore, we can only use it for 8-bit or 16-bit elements.
2075   if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2076     return SearchSize != 8;
2077   if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2078     return SearchSize != 8 && SearchSize != 16;
2079   return true;
2080 }
2081 
2082 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2083   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2084 
2085   // By default everything must be expanded.
2086   for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2087     setOperationAction(Op, VT, Expand);
2088 
2089   if (VT.isFloatingPoint()) {
2090     setCondCodeAction(ISD::SETO, VT, Expand);
2091     setCondCodeAction(ISD::SETOLT, VT, Expand);
2092     setCondCodeAction(ISD::SETOLE, VT, Expand);
2093     setCondCodeAction(ISD::SETULT, VT, Expand);
2094     setCondCodeAction(ISD::SETULE, VT, Expand);
2095     setCondCodeAction(ISD::SETUGE, VT, Expand);
2096     setCondCodeAction(ISD::SETUGT, VT, Expand);
2097     setCondCodeAction(ISD::SETUEQ, VT, Expand);
2098     setCondCodeAction(ISD::SETONE, VT, Expand);
2099   }
2100 
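       // Most operations below are custom lowered to their scalable-vector
       // equivalents; v1f64 is the exception and is simply expanded.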
2101   TargetLoweringBase::LegalizeAction Default =
2102       VT == MVT::v1f64 ? Expand : Custom;
2103 
2104   // Mark integer truncating stores/extending loads as having custom lowering
2105   if (VT.isInteger()) {
2106     MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2107     while (InnerVT != VT) {
2108       setTruncStoreAction(VT, InnerVT, Default);
2109       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2110       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2111       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2112       InnerVT = InnerVT.changeVectorElementType(
2113           MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2114     }
2115   }
2116 
2117   // Mark floating-point truncating stores/extending loads as having custom
2118   // lowering
2119   if (VT.isFloatingPoint()) {
2120     MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2121     while (InnerVT != VT) {
2122       setTruncStoreAction(VT, InnerVT, Custom);
2123       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2124       InnerVT = InnerVT.changeVectorElementType(
2125           MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
2126     }
2127   }
2128 
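       // NEON-sized (64/128-bit) vectors keep some operations Legal below,
       // while gathers, scatters and ordered reductions require full SVE.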
2129   bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2130   bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2131 
2132   // Lower fixed length vector operations to scalable equivalents.
2133   setOperationAction(ISD::ABDS, VT, Default);
2134   setOperationAction(ISD::ABDU, VT, Default);
2135   setOperationAction(ISD::ABS, VT, Default);
2136   setOperationAction(ISD::ADD, VT, Default);
2137   setOperationAction(ISD::AND, VT, Default);
2138   setOperationAction(ISD::ANY_EXTEND, VT, Default);
2139   setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2140   setOperationAction(ISD::BITREVERSE, VT, Default);
2141   setOperationAction(ISD::BSWAP, VT, Default);
2142   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2143   setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
2144   setOperationAction(ISD::CTLZ, VT, Default);
2145   setOperationAction(ISD::CTPOP, VT, Default);
2146   setOperationAction(ISD::CTTZ, VT, Default);
2147   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Default);
2148   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Default);
2149   setOperationAction(ISD::FABS, VT, Default);
2150   setOperationAction(ISD::FADD, VT, Default);
2151   setOperationAction(ISD::FCEIL, VT, Default);
2152   setOperationAction(ISD::FCOPYSIGN, VT, Default);
2153   setOperationAction(ISD::FDIV, VT, Default);
2154   setOperationAction(ISD::FFLOOR, VT, Default);
2155   setOperationAction(ISD::FMA, VT, Default);
2156   setOperationAction(ISD::FMAXIMUM, VT, Default);
2157   setOperationAction(ISD::FMAXNUM, VT, Default);
2158   setOperationAction(ISD::FMINIMUM, VT, Default);
2159   setOperationAction(ISD::FMINNUM, VT, Default);
2160   setOperationAction(ISD::FMUL, VT, Default);
2161   setOperationAction(ISD::FNEARBYINT, VT, Default);
2162   setOperationAction(ISD::FNEG, VT, Default);
2163   setOperationAction(ISD::FP_EXTEND, VT, Default);
2164   setOperationAction(ISD::FP_ROUND, VT, Default);
2165   setOperationAction(ISD::FP_TO_SINT, VT, Default);
2166   setOperationAction(ISD::FP_TO_UINT, VT, Default);
2167   setOperationAction(ISD::FRINT, VT, Default);
2168   setOperationAction(ISD::LRINT, VT, Default);
2169   setOperationAction(ISD::LLRINT, VT, Default);
2170   setOperationAction(ISD::FROUND, VT, Default);
2171   setOperationAction(ISD::FROUNDEVEN, VT, Default);
2172   setOperationAction(ISD::FSQRT, VT, Default);
2173   setOperationAction(ISD::FSUB, VT, Default);
2174   setOperationAction(ISD::FTRUNC, VT, Default);
2175   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Default);
2176   setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2177   setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2178   setOperationAction(ISD::MLOAD, VT, Default);
2179   setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2180   setOperationAction(ISD::MSTORE, VT, Default);
2181   setOperationAction(ISD::MUL, VT, Default);
2182   setOperationAction(ISD::MULHS, VT, Default);
2183   setOperationAction(ISD::MULHU, VT, Default);
2184   setOperationAction(ISD::OR, VT, Default);
2185   setOperationAction(ISD::SCALAR_TO_VECTOR, VT, PreferNEON ? Legal : Expand);
2186   setOperationAction(ISD::SDIV, VT, Default);
2187   setOperationAction(ISD::SELECT, VT, Default);
2188   setOperationAction(ISD::SETCC, VT, Default);
2189   setOperationAction(ISD::SHL, VT, Default);
2190   setOperationAction(ISD::SIGN_EXTEND, VT, Default);
2191   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
2192   setOperationAction(ISD::SINT_TO_FP, VT, Default);
2193   setOperationAction(ISD::SMAX, VT, Default);
2194   setOperationAction(ISD::SMIN, VT, Default);
2195   setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
2196   setOperationAction(ISD::SRA, VT, Default);
2197   setOperationAction(ISD::SRL, VT, Default);
2198   setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2199   setOperationAction(ISD::SUB, VT, Default);
2200   setOperationAction(ISD::TRUNCATE, VT, Default);
2201   setOperationAction(ISD::UDIV, VT, Default);
2202   setOperationAction(ISD::UINT_TO_FP, VT, Default);
2203   setOperationAction(ISD::UMAX, VT, Default);
2204   setOperationAction(ISD::UMIN, VT, Default);
2205   setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2206   setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2207   setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2208   setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2209   setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2210   setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2211   setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2212   setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2213   setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2214   setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2215   setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2216   setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2217   setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2218   setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2219   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Default);
2220   setOperationAction(ISD::VECTOR_SPLICE, VT, Default);
2221   setOperationAction(ISD::VSELECT, VT, Default);
2222   setOperationAction(ISD::XOR, VT, Default);
2223   setOperationAction(ISD::ZERO_EXTEND, VT, Default);
2224 }
2225 
2226 void AArch64TargetLowering::addDRType(MVT VT) {
2227   addRegisterClass(VT, &AArch64::FPR64RegClass);
2228   if (Subtarget->isNeonAvailable())
2229     addTypeForNEON(VT);
2230 }
2231 
2232 void AArch64TargetLowering::addQRType(MVT VT) {
2233   addRegisterClass(VT, &AArch64::FPR128RegClass);
2234   if (Subtarget->isNeonAvailable())
2235     addTypeForNEON(VT);
2236 }
2237 
2238 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2239                                               LLVMContext &C, EVT VT) const {
2240   if (!VT.isVector())
2241     return MVT::i32;
2242   if (VT.isScalableVector())
2243     return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2244   return VT.changeVectorElementTypeToInteger();
2245 }
2246 
2247 // isIntImmediate - This method tests to see if the node is a constant
2248 // operand. If so Imm will receive the value.
2249 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2250   if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2251     Imm = C->getZExtValue();
2252     return true;
2253   }
2254   return false;
2255 }
2256 
2257 // isOpcWithIntImmediate - This method tests to see if the node is a specific
2258 // opcode and that it has an immediate integer right operand.
2259 // If so Imm will receive the value.
2260 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2261                                   uint64_t &Imm) {
2262   return N->getOpcode() == Opc &&
2263          isIntImmediate(N->getOperand(1).getNode(), Imm);
2264 }
2265 
2266 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2267                                const APInt &Demanded,
2268                                TargetLowering::TargetLoweringOpt &TLO,
2269                                unsigned NewOpc) {
2270   uint64_t OldImm = Imm, NewImm, Enc;
2271   uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2272 
2273   // Return if the immediate is already all zeros, all ones, a bimm32 or a
2274   // bimm64.
2275   if (Imm == 0 || Imm == Mask ||
2276       AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
2277     return false;
2278 
2279   unsigned EltSize = Size;
2280   uint64_t DemandedBits = Demanded.getZExtValue();
2281 
2282   // Clear bits that are not demanded.
2283   Imm &= DemandedBits;
2284 
2285   while (true) {
2286     // The goal here is to set the non-demanded bits in a way that minimizes
2287     // the number of transitions between 0 and 1. To achieve this goal,
2288     // we set the non-demanded bits to the value of the preceding demanded bits.
2289     // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2290     // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2291     // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2292     // The final result is 0b11000011.
2293     uint64_t NonDemandedBits = ~DemandedBits;
2294     uint64_t InvertedImm = ~Imm & DemandedBits;
2295     uint64_t RotatedImm =
2296         ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2297         NonDemandedBits;
2298     uint64_t Sum = RotatedImm + NonDemandedBits;
2299     bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2300     uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2301     NewImm = (Imm | Ones) & Mask;
2302 
2303     // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2304     // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2305     // we halve the element size and continue the search.
2306     if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2307       break;
2308 
2309     // We cannot shrink the element size any further if it is 2-bits.
2310     if (EltSize == 2)
2311       return false;
2312 
2313     EltSize /= 2;
2314     Mask >>= EltSize;
2315     uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2316 
2317     // Return if any of the demanded bits of Imm and Hi disagree.
2318     if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2319       return false;
2320 
2321     // Merge the upper and lower halves of Imm and DemandedBits.
2322     Imm |= Hi;
2323     DemandedBits |= DemandedBitsHi;
2324   }
2325 
2326   ++NumOptimizedImms;
2327 
2328   // Replicate the element across the register width.
2329   while (EltSize < Size) {
2330     NewImm |= NewImm << EltSize;
2331     EltSize *= 2;
2332   }
2333 
2334   (void)OldImm;
2335   assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2336          "demanded bits should never be altered");
2337   assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2338 
2339   // Create the new constant immediate node.
2340   EVT VT = Op.getValueType();
2341   SDLoc DL(Op);
2342   SDValue New;
2343 
2344   // If the new constant immediate is all-zeros or all-ones, let the target
2345   // independent DAG combine optimize this node.
2346   if (NewImm == 0 || NewImm == OrigMask) {
2347     New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2348                           TLO.DAG.getConstant(NewImm, DL, VT));
2349   // Otherwise, create a machine node so that target independent DAG combine
2350   // doesn't undo this optimization.
2351   } else {
2352     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2353     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2354     New = SDValue(
2355         TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2356   }
2357 
2358   return TLO.CombineTo(Op, New);
2359 }
2360 
2361 bool AArch64TargetLowering::targetShrinkDemandedConstant(
2362     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2363     TargetLoweringOpt &TLO) const {
2364   // Delay this optimization to as late as possible.
2365   if (!TLO.LegalOps)
2366     return false;
2367 
2368   if (!EnableOptimizeLogicalImm)
2369     return false;
2370 
2371   EVT VT = Op.getValueType();
2372   if (VT.isVector())
2373     return false;
2374 
2375   unsigned Size = VT.getSizeInBits();
2376 
2377   if (Size != 32 && Size != 64)
2378     return false;
2379 
2380   // Exit early if we demand all bits.
2381   if (DemandedBits.popcount() == Size)
2382     return false;
2383 
2384   unsigned NewOpc;
2385   switch (Op.getOpcode()) {
2386   default:
2387     return false;
2388   case ISD::AND:
2389     NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2390     break;
2391   case ISD::OR:
2392     NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2393     break;
2394   case ISD::XOR:
2395     NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2396     break;
2397   }
2398   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2399   if (!C)
2400     return false;
2401   uint64_t Imm = C->getZExtValue();
2402   return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2403 }
2404 
2405 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
2406 /// Mask are known to be either zero or one and return them Known.
2407 void AArch64TargetLowering::computeKnownBitsForTargetNode(
2408     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2409     const SelectionDAG &DAG, unsigned Depth) const {
2410   switch (Op.getOpcode()) {
2411   default:
2412     break;
2413   case AArch64ISD::DUP: {
2414     SDValue SrcOp = Op.getOperand(0);
2415     Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2416     if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2417       assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2418              "Expected DUP implicit truncation");
2419       Known = Known.trunc(Op.getScalarValueSizeInBits());
2420     }
2421     break;
2422   }
2423   case AArch64ISD::CSEL: {
2424     KnownBits Known2;
2425     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2426     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2427     Known = Known.intersectWith(Known2);
2428     break;
2429   }
2430   case AArch64ISD::BICi: {
2431     // Compute the bit cleared value.
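         // For example, (BICi X, 0xff, 8) clears bits [15:8] of every lane, so
         // the mask computed below is ~(0xff << 8) truncated to the lane width.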
2432     APInt Mask =
2433         ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2434              .trunc(Known.getBitWidth());
2435     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2436     Known &= KnownBits::makeConstant(Mask);
2437     break;
2438   }
2439   case AArch64ISD::VLSHR: {
2440     KnownBits Known2;
2441     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2442     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2443     Known = KnownBits::lshr(Known, Known2);
2444     break;
2445   }
2446   case AArch64ISD::VASHR: {
2447     KnownBits Known2;
2448     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2449     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2450     Known = KnownBits::ashr(Known, Known2);
2451     break;
2452   }
2453   case AArch64ISD::VSHL: {
2454     KnownBits Known2;
2455     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2456     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2457     Known = KnownBits::shl(Known, Known2);
2458     break;
2459   }
2460   case AArch64ISD::MOVI: {
2461     Known = KnownBits::makeConstant(
2462         APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2463     break;
2464   }
2465   case AArch64ISD::LOADgot:
2466   case AArch64ISD::ADDlow: {
2467     if (!Subtarget->isTargetILP32())
2468       break;
2469     // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2470     Known.Zero = APInt::getHighBitsSet(64, 32);
2471     break;
2472   }
2473   case AArch64ISD::ASSERT_ZEXT_BOOL: {
2474     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2475     Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2476     break;
2477   }
2478   case ISD::INTRINSIC_W_CHAIN: {
2479     Intrinsic::ID IntID =
2480         static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2481     switch (IntID) {
2482     default: return;
2483     case Intrinsic::aarch64_ldaxr:
2484     case Intrinsic::aarch64_ldxr: {
2485       unsigned BitWidth = Known.getBitWidth();
2486       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2487       unsigned MemBits = VT.getScalarSizeInBits();
2488       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2489       return;
2490     }
2491     }
2492     break;
2493   }
2494   case ISD::INTRINSIC_WO_CHAIN:
2495   case ISD::INTRINSIC_VOID: {
2496     unsigned IntNo = Op.getConstantOperandVal(0);
2497     switch (IntNo) {
2498     default:
2499       break;
2500     case Intrinsic::aarch64_neon_uaddlv: {
2501       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2502       unsigned BitWidth = Known.getBitWidth();
2503       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
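             // The unsigned sum of 8 (or 16) byte elements is at most 8 * 255 =
             // 2040 (or 16 * 255 = 4080), which fits in 11 (respectively 12) bits.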
2504         unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2505         assert(BitWidth >= Bound && "Unexpected width!");
2506         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2507         Known.Zero |= Mask;
2508       }
2509       break;
2510     }
2511     case Intrinsic::aarch64_neon_umaxv:
2512     case Intrinsic::aarch64_neon_uminv: {
2513       // Figure out the datatype of the vector operand. The UMINV instruction
2514       // will zero extend the result, so we can mark as known zero all the
2515       // bits larger than the element datatype. 32-bit or larger doesn't need
2516       // this as those are legal types and will be handled by isel directly.
2517       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2518       unsigned BitWidth = Known.getBitWidth();
2519       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2520         assert(BitWidth >= 8 && "Unexpected width!");
2521         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2522         Known.Zero |= Mask;
2523       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2524         assert(BitWidth >= 16 && "Unexpected width!");
2525         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2526         Known.Zero |= Mask;
2527       }
2528       break;
2529     }
2530     }
2531   }
2532   }
2533 }
2534 
2535 unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2536     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2537     unsigned Depth) const {
2538   EVT VT = Op.getValueType();
2539   unsigned VTBits = VT.getScalarSizeInBits();
2540   unsigned Opcode = Op.getOpcode();
2541   switch (Opcode) {
2542     case AArch64ISD::CMEQ:
2543     case AArch64ISD::CMGE:
2544     case AArch64ISD::CMGT:
2545     case AArch64ISD::CMHI:
2546     case AArch64ISD::CMHS:
2547     case AArch64ISD::FCMEQ:
2548     case AArch64ISD::FCMGE:
2549     case AArch64ISD::FCMGT:
2550     case AArch64ISD::CMEQz:
2551     case AArch64ISD::CMGEz:
2552     case AArch64ISD::CMGTz:
2553     case AArch64ISD::CMLEz:
2554     case AArch64ISD::CMLTz:
2555     case AArch64ISD::FCMEQz:
2556     case AArch64ISD::FCMGEz:
2557     case AArch64ISD::FCMGTz:
2558     case AArch64ISD::FCMLEz:
2559     case AArch64ISD::FCMLTz:
2560       // Compares return either 0 or all-ones
2561       return VTBits;
2562     case AArch64ISD::VASHR: {
2563       unsigned Tmp =
2564           DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2565       return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2566     }
2567   }
2568 
2569   return 1;
2570 }
2571 
2572 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2573                                                   EVT) const {
2574   return MVT::i64;
2575 }
2576 
2577 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2578     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2579     unsigned *Fast) const {
2580 
2581   // Allow SVE loads/stores where the alignment >= the size of the element type,
2582   // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2583   // for stores that come from IR, only require element-size alignment (even if
2584   // unaligned accesses are disabled). Without this, these will be forced to
2585   // have 16-byte alignment with +strict-align (and fail to lower as we don't
2586   // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2587   if (VT.isScalableVector()) {
2588     unsigned ElementSizeBits = VT.getScalarSizeInBits();
2589     if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2590       return true;
2591   }
2592 
2593   if (Subtarget->requiresStrictAlign())
2594     return false;
2595 
2596   if (Fast) {
2597     // Some CPUs are fine with unaligned stores except for 128-bit ones.
2598     *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2599             // See comments in performSTORECombine() for more details about
2600             // these conditions.
2601 
2602             // Code that uses clang vector extensions can mark that it
2603             // wants unaligned accesses to be treated as fast by
2604             // underspecifying alignment to be 1 or 2.
2605             Alignment <= 2 ||
2606 
2607             // Disregard v2i64. Memcpy lowering produces those and splitting
2608             // them regresses performance on micro-benchmarks and olden/bh.
2609             VT == MVT::v2i64;
2610   }
2611   return true;
2612 }
2613 
2614 // Same as above but handling LLTs instead.
2615 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2616     LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2617     unsigned *Fast) const {
2618   if (Subtarget->requiresStrictAlign())
2619     return false;
2620 
2621   if (Fast) {
2622     // Some CPUs are fine with unaligned stores except for 128-bit ones.
2623     *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2624             Ty.getSizeInBytes() != 16 ||
2625             // See comments in performSTORECombine() for more details about
2626             // these conditions.
2627 
2628             // Code that uses clang vector extensions can mark that it
2629             // wants unaligned accesses to be treated as fast by
2630             // underspecifying alignment to be 1 or 2.
2631             Alignment <= 2 ||
2632 
2633             // Disregard v2i64. Memcpy lowering produces those and splitting
2634             // them regresses performance on micro-benchmarks and olden/bh.
2635             Ty == LLT::fixed_vector(2, 64);
2636   }
2637   return true;
2638 }
2639 
2640 FastISel *
2641 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2642                                       const TargetLibraryInfo *libInfo) const {
2643   return AArch64::createFastISel(funcInfo, libInfo);
2644 }
2645 
2646 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2647 #define MAKE_CASE(V)                                                           \
2648   case V:                                                                      \
2649     return #V;
2650   switch ((AArch64ISD::NodeType)Opcode) {
2651   case AArch64ISD::FIRST_NUMBER:
2652     break;
2653     MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER)
2654     MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
2655     MAKE_CASE(AArch64ISD::GET_SME_SAVE_SIZE)
2656     MAKE_CASE(AArch64ISD::ALLOC_SME_SAVE_BUFFER)
2657     MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2658     MAKE_CASE(AArch64ISD::VG_SAVE)
2659     MAKE_CASE(AArch64ISD::VG_RESTORE)
2660     MAKE_CASE(AArch64ISD::SMSTART)
2661     MAKE_CASE(AArch64ISD::SMSTOP)
2662     MAKE_CASE(AArch64ISD::RESTORE_ZA)
2663     MAKE_CASE(AArch64ISD::RESTORE_ZT)
2664     MAKE_CASE(AArch64ISD::SAVE_ZT)
2665     MAKE_CASE(AArch64ISD::CALL)
2666     MAKE_CASE(AArch64ISD::ADRP)
2667     MAKE_CASE(AArch64ISD::ADR)
2668     MAKE_CASE(AArch64ISD::ADDlow)
2669     MAKE_CASE(AArch64ISD::AUTH_CALL)
2670     MAKE_CASE(AArch64ISD::AUTH_TC_RETURN)
2671     MAKE_CASE(AArch64ISD::AUTH_CALL_RVMARKER)
2672     MAKE_CASE(AArch64ISD::LOADgot)
2673     MAKE_CASE(AArch64ISD::RET_GLUE)
2674     MAKE_CASE(AArch64ISD::BRCOND)
2675     MAKE_CASE(AArch64ISD::CSEL)
2676     MAKE_CASE(AArch64ISD::CSINV)
2677     MAKE_CASE(AArch64ISD::CSNEG)
2678     MAKE_CASE(AArch64ISD::CSINC)
2679     MAKE_CASE(AArch64ISD::THREAD_POINTER)
2680     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2681     MAKE_CASE(AArch64ISD::TLSDESC_AUTH_CALLSEQ)
2682     MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
2683     MAKE_CASE(AArch64ISD::ABDS_PRED)
2684     MAKE_CASE(AArch64ISD::ABDU_PRED)
2685     MAKE_CASE(AArch64ISD::HADDS_PRED)
2686     MAKE_CASE(AArch64ISD::HADDU_PRED)
2687     MAKE_CASE(AArch64ISD::MUL_PRED)
2688     MAKE_CASE(AArch64ISD::MULHS_PRED)
2689     MAKE_CASE(AArch64ISD::MULHU_PRED)
2690     MAKE_CASE(AArch64ISD::RHADDS_PRED)
2691     MAKE_CASE(AArch64ISD::RHADDU_PRED)
2692     MAKE_CASE(AArch64ISD::SDIV_PRED)
2693     MAKE_CASE(AArch64ISD::SHL_PRED)
2694     MAKE_CASE(AArch64ISD::SMAX_PRED)
2695     MAKE_CASE(AArch64ISD::SMIN_PRED)
2696     MAKE_CASE(AArch64ISD::SRA_PRED)
2697     MAKE_CASE(AArch64ISD::SRL_PRED)
2698     MAKE_CASE(AArch64ISD::UDIV_PRED)
2699     MAKE_CASE(AArch64ISD::UMAX_PRED)
2700     MAKE_CASE(AArch64ISD::UMIN_PRED)
2701     MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2702     MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2703     MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2704     MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2705     MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2706     MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2707     MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2708     MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2709     MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2710     MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2711     MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2712     MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2713     MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2714     MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2715     MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2716     MAKE_CASE(AArch64ISD::FCVTX_MERGE_PASSTHRU)
2717     MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2718     MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2719     MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2720     MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2721     MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2722     MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2723     MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2724     MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2725     MAKE_CASE(AArch64ISD::ADC)
2726     MAKE_CASE(AArch64ISD::SBC)
2727     MAKE_CASE(AArch64ISD::ADDS)
2728     MAKE_CASE(AArch64ISD::SUBS)
2729     MAKE_CASE(AArch64ISD::ADCS)
2730     MAKE_CASE(AArch64ISD::SBCS)
2731     MAKE_CASE(AArch64ISD::ANDS)
2732     MAKE_CASE(AArch64ISD::CCMP)
2733     MAKE_CASE(AArch64ISD::CCMN)
2734     MAKE_CASE(AArch64ISD::FCCMP)
2735     MAKE_CASE(AArch64ISD::FCMP)
2736     MAKE_CASE(AArch64ISD::STRICT_FCMP)
2737     MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2738     MAKE_CASE(AArch64ISD::FCVTXN)
2739     MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2740     MAKE_CASE(AArch64ISD::SME_ZA_STR)
2741     MAKE_CASE(AArch64ISD::DUP)
2742     MAKE_CASE(AArch64ISD::DUPLANE8)
2743     MAKE_CASE(AArch64ISD::DUPLANE16)
2744     MAKE_CASE(AArch64ISD::DUPLANE32)
2745     MAKE_CASE(AArch64ISD::DUPLANE64)
2746     MAKE_CASE(AArch64ISD::DUPLANE128)
2747     MAKE_CASE(AArch64ISD::MOVI)
2748     MAKE_CASE(AArch64ISD::MOVIshift)
2749     MAKE_CASE(AArch64ISD::MOVIedit)
2750     MAKE_CASE(AArch64ISD::MOVImsl)
2751     MAKE_CASE(AArch64ISD::FMOV)
2752     MAKE_CASE(AArch64ISD::MVNIshift)
2753     MAKE_CASE(AArch64ISD::MVNImsl)
2754     MAKE_CASE(AArch64ISD::BICi)
2755     MAKE_CASE(AArch64ISD::ORRi)
2756     MAKE_CASE(AArch64ISD::BSP)
2757     MAKE_CASE(AArch64ISD::ZIP1)
2758     MAKE_CASE(AArch64ISD::ZIP2)
2759     MAKE_CASE(AArch64ISD::UZP1)
2760     MAKE_CASE(AArch64ISD::UZP2)
2761     MAKE_CASE(AArch64ISD::TRN1)
2762     MAKE_CASE(AArch64ISD::TRN2)
2763     MAKE_CASE(AArch64ISD::REV16)
2764     MAKE_CASE(AArch64ISD::REV32)
2765     MAKE_CASE(AArch64ISD::REV64)
2766     MAKE_CASE(AArch64ISD::EXT)
2767     MAKE_CASE(AArch64ISD::SPLICE)
2768     MAKE_CASE(AArch64ISD::VSHL)
2769     MAKE_CASE(AArch64ISD::VLSHR)
2770     MAKE_CASE(AArch64ISD::VASHR)
2771     MAKE_CASE(AArch64ISD::VSLI)
2772     MAKE_CASE(AArch64ISD::VSRI)
2773     MAKE_CASE(AArch64ISD::CMEQ)
2774     MAKE_CASE(AArch64ISD::CMGE)
2775     MAKE_CASE(AArch64ISD::CMGT)
2776     MAKE_CASE(AArch64ISD::CMHI)
2777     MAKE_CASE(AArch64ISD::CMHS)
2778     MAKE_CASE(AArch64ISD::FCMEQ)
2779     MAKE_CASE(AArch64ISD::FCMGE)
2780     MAKE_CASE(AArch64ISD::FCMGT)
2781     MAKE_CASE(AArch64ISD::CMEQz)
2782     MAKE_CASE(AArch64ISD::CMGEz)
2783     MAKE_CASE(AArch64ISD::CMGTz)
2784     MAKE_CASE(AArch64ISD::CMLEz)
2785     MAKE_CASE(AArch64ISD::CMLTz)
2786     MAKE_CASE(AArch64ISD::FCMEQz)
2787     MAKE_CASE(AArch64ISD::FCMGEz)
2788     MAKE_CASE(AArch64ISD::FCMGTz)
2789     MAKE_CASE(AArch64ISD::FCMLEz)
2790     MAKE_CASE(AArch64ISD::FCMLTz)
2791     MAKE_CASE(AArch64ISD::SADDV)
2792     MAKE_CASE(AArch64ISD::UADDV)
2793     MAKE_CASE(AArch64ISD::UADDLV)
2794     MAKE_CASE(AArch64ISD::SADDLV)
2795     MAKE_CASE(AArch64ISD::SADDWT)
2796     MAKE_CASE(AArch64ISD::SADDWB)
2797     MAKE_CASE(AArch64ISD::UADDWT)
2798     MAKE_CASE(AArch64ISD::UADDWB)
2799     MAKE_CASE(AArch64ISD::SDOT)
2800     MAKE_CASE(AArch64ISD::UDOT)
2801     MAKE_CASE(AArch64ISD::USDOT)
2802     MAKE_CASE(AArch64ISD::SMINV)
2803     MAKE_CASE(AArch64ISD::UMINV)
2804     MAKE_CASE(AArch64ISD::SMAXV)
2805     MAKE_CASE(AArch64ISD::UMAXV)
2806     MAKE_CASE(AArch64ISD::SADDV_PRED)
2807     MAKE_CASE(AArch64ISD::UADDV_PRED)
2808     MAKE_CASE(AArch64ISD::SMAXV_PRED)
2809     MAKE_CASE(AArch64ISD::UMAXV_PRED)
2810     MAKE_CASE(AArch64ISD::SMINV_PRED)
2811     MAKE_CASE(AArch64ISD::UMINV_PRED)
2812     MAKE_CASE(AArch64ISD::ORV_PRED)
2813     MAKE_CASE(AArch64ISD::EORV_PRED)
2814     MAKE_CASE(AArch64ISD::ANDV_PRED)
2815     MAKE_CASE(AArch64ISD::CLASTA_N)
2816     MAKE_CASE(AArch64ISD::CLASTB_N)
2817     MAKE_CASE(AArch64ISD::LASTA)
2818     MAKE_CASE(AArch64ISD::LASTB)
2819     MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2820     MAKE_CASE(AArch64ISD::LS64_BUILD)
2821     MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2822     MAKE_CASE(AArch64ISD::TBL)
2823     MAKE_CASE(AArch64ISD::FADD_PRED)
2824     MAKE_CASE(AArch64ISD::FADDA_PRED)
2825     MAKE_CASE(AArch64ISD::FADDV_PRED)
2826     MAKE_CASE(AArch64ISD::FDIV_PRED)
2827     MAKE_CASE(AArch64ISD::FMA_PRED)
2828     MAKE_CASE(AArch64ISD::FMAX_PRED)
2829     MAKE_CASE(AArch64ISD::FMAXV_PRED)
2830     MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2831     MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2832     MAKE_CASE(AArch64ISD::FMIN_PRED)
2833     MAKE_CASE(AArch64ISD::FMINV_PRED)
2834     MAKE_CASE(AArch64ISD::FMINNM_PRED)
2835     MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2836     MAKE_CASE(AArch64ISD::FMUL_PRED)
2837     MAKE_CASE(AArch64ISD::FSUB_PRED)
2838     MAKE_CASE(AArch64ISD::RDSVL)
2839     MAKE_CASE(AArch64ISD::BIC)
2840     MAKE_CASE(AArch64ISD::CBZ)
2841     MAKE_CASE(AArch64ISD::CBNZ)
2842     MAKE_CASE(AArch64ISD::TBZ)
2843     MAKE_CASE(AArch64ISD::TBNZ)
2844     MAKE_CASE(AArch64ISD::TC_RETURN)
2845     MAKE_CASE(AArch64ISD::PREFETCH)
2846     MAKE_CASE(AArch64ISD::SITOF)
2847     MAKE_CASE(AArch64ISD::UITOF)
2848     MAKE_CASE(AArch64ISD::NVCAST)
2849     MAKE_CASE(AArch64ISD::MRS)
2850     MAKE_CASE(AArch64ISD::SQSHL_I)
2851     MAKE_CASE(AArch64ISD::UQSHL_I)
2852     MAKE_CASE(AArch64ISD::SRSHR_I)
2853     MAKE_CASE(AArch64ISD::URSHR_I)
2854     MAKE_CASE(AArch64ISD::SQSHLU_I)
2855     MAKE_CASE(AArch64ISD::WrapperLarge)
2856     MAKE_CASE(AArch64ISD::LD2post)
2857     MAKE_CASE(AArch64ISD::LD3post)
2858     MAKE_CASE(AArch64ISD::LD4post)
2859     MAKE_CASE(AArch64ISD::ST2post)
2860     MAKE_CASE(AArch64ISD::ST3post)
2861     MAKE_CASE(AArch64ISD::ST4post)
2862     MAKE_CASE(AArch64ISD::LD1x2post)
2863     MAKE_CASE(AArch64ISD::LD1x3post)
2864     MAKE_CASE(AArch64ISD::LD1x4post)
2865     MAKE_CASE(AArch64ISD::ST1x2post)
2866     MAKE_CASE(AArch64ISD::ST1x3post)
2867     MAKE_CASE(AArch64ISD::ST1x4post)
2868     MAKE_CASE(AArch64ISD::LD1DUPpost)
2869     MAKE_CASE(AArch64ISD::LD2DUPpost)
2870     MAKE_CASE(AArch64ISD::LD3DUPpost)
2871     MAKE_CASE(AArch64ISD::LD4DUPpost)
2872     MAKE_CASE(AArch64ISD::LD1LANEpost)
2873     MAKE_CASE(AArch64ISD::LD2LANEpost)
2874     MAKE_CASE(AArch64ISD::LD3LANEpost)
2875     MAKE_CASE(AArch64ISD::LD4LANEpost)
2876     MAKE_CASE(AArch64ISD::ST2LANEpost)
2877     MAKE_CASE(AArch64ISD::ST3LANEpost)
2878     MAKE_CASE(AArch64ISD::ST4LANEpost)
2879     MAKE_CASE(AArch64ISD::SMULL)
2880     MAKE_CASE(AArch64ISD::UMULL)
2881     MAKE_CASE(AArch64ISD::PMULL)
2882     MAKE_CASE(AArch64ISD::FRECPE)
2883     MAKE_CASE(AArch64ISD::FRECPS)
2884     MAKE_CASE(AArch64ISD::FRSQRTE)
2885     MAKE_CASE(AArch64ISD::FRSQRTS)
2886     MAKE_CASE(AArch64ISD::STG)
2887     MAKE_CASE(AArch64ISD::STZG)
2888     MAKE_CASE(AArch64ISD::ST2G)
2889     MAKE_CASE(AArch64ISD::STZ2G)
2890     MAKE_CASE(AArch64ISD::SUNPKHI)
2891     MAKE_CASE(AArch64ISD::SUNPKLO)
2892     MAKE_CASE(AArch64ISD::UUNPKHI)
2893     MAKE_CASE(AArch64ISD::UUNPKLO)
2894     MAKE_CASE(AArch64ISD::INSR)
2895     MAKE_CASE(AArch64ISD::PTEST)
2896     MAKE_CASE(AArch64ISD::PTEST_ANY)
2897     MAKE_CASE(AArch64ISD::PTRUE)
2898     MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2899     MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2900     MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2901     MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2902     MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2903     MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2904     MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2905     MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2906     MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2907     MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2908     MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2909     MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2910     MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2911     MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2912     MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2913     MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2914     MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2915     MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2916     MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
2917     MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
2918     MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2919     MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2920     MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2921     MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2922     MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2923     MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2924     MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2925     MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2926     MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2927     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2928     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2929     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2930     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2931     MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2932     MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2933     MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2934     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2935     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2936     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2937     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2938     MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2939     MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2940     MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2941     MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2942     MAKE_CASE(AArch64ISD::SST1Q_PRED)
2943     MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
2944     MAKE_CASE(AArch64ISD::ST1_PRED)
2945     MAKE_CASE(AArch64ISD::SST1_PRED)
2946     MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2947     MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2948     MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2949     MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2950     MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2951     MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2952     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2953     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2954     MAKE_CASE(AArch64ISD::LDP)
2955     MAKE_CASE(AArch64ISD::LDIAPP)
2956     MAKE_CASE(AArch64ISD::LDNP)
2957     MAKE_CASE(AArch64ISD::STP)
2958     MAKE_CASE(AArch64ISD::STILP)
2959     MAKE_CASE(AArch64ISD::STNP)
2960     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2961     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2962     MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2963     MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2964     MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2965     MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2966     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2967     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2968     MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2969     MAKE_CASE(AArch64ISD::ADDP)
2970     MAKE_CASE(AArch64ISD::SADDLP)
2971     MAKE_CASE(AArch64ISD::UADDLP)
2972     MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2973     MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2974     MAKE_CASE(AArch64ISD::CALL_BTI)
2975     MAKE_CASE(AArch64ISD::MRRS)
2976     MAKE_CASE(AArch64ISD::MSRR)
2977     MAKE_CASE(AArch64ISD::RSHRNB_I)
2978     MAKE_CASE(AArch64ISD::CTTZ_ELTS)
2979     MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
2980     MAKE_CASE(AArch64ISD::URSHR_I_PRED)
2981   }
2982 #undef MAKE_CASE
2983   return nullptr;
2984 }
2985 
2986 MachineBasicBlock *
2987 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2988                                     MachineBasicBlock *MBB) const {
2989   // We materialise the F128CSEL pseudo-instruction as some control flow and a
2990   // phi node:
2991 
2992   // OrigBB:
2993   //     [... previous instrs leading to comparison ...]
2994   //     b.ne TrueBB
2995   //     b EndBB
2996   // TrueBB:
2997   //     ; Fallthrough
2998   // EndBB:
2999   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
3000 
3001   MachineFunction *MF = MBB->getParent();
3002   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3003   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3004   DebugLoc DL = MI.getDebugLoc();
3005   MachineFunction::iterator It = ++MBB->getIterator();
3006 
3007   Register DestReg = MI.getOperand(0).getReg();
3008   Register IfTrueReg = MI.getOperand(1).getReg();
3009   Register IfFalseReg = MI.getOperand(2).getReg();
3010   unsigned CondCode = MI.getOperand(3).getImm();
3011   bool NZCVKilled = MI.getOperand(4).isKill();
3012 
3013   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3014   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3015   MF->insert(It, TrueBB);
3016   MF->insert(It, EndBB);
3017 
3018   // Transfer the rest of the current basic block to EndBB
3019   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3020                 MBB->end());
3021   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3022 
3023   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3024   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3025   MBB->addSuccessor(TrueBB);
3026   MBB->addSuccessor(EndBB);
3027 
3028   // TrueBB falls through to the end.
3029   TrueBB->addSuccessor(EndBB);
3030 
3031   if (!NZCVKilled) {
3032     TrueBB->addLiveIn(AArch64::NZCV);
3033     EndBB->addLiveIn(AArch64::NZCV);
3034   }
3035 
3036   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3037       .addReg(IfTrueReg)
3038       .addMBB(TrueBB)
3039       .addReg(IfFalseReg)
3040       .addMBB(MBB);
3041 
3042   MI.eraseFromParent();
3043   return EndBB;
3044 }
3045 
3046 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
3047        MachineInstr &MI, MachineBasicBlock *BB) const {
3048   assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3049              BB->getParent()->getFunction().getPersonalityFn())) &&
3050          "SEH does not use catchret!");
3051   return BB;
3052 }
3053 
3054 MachineBasicBlock *
3055 AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3056                                               MachineBasicBlock *MBB) const {
3057   MachineFunction &MF = *MBB->getParent();
3058   MachineBasicBlock::iterator MBBI = MI.getIterator();
3059   DebugLoc DL = MBB->findDebugLoc(MBBI);
3060   const AArch64InstrInfo &TII =
3061       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3062   Register TargetReg = MI.getOperand(0).getReg();
3063   MachineBasicBlock::iterator NextInst =
3064       TII.probedStackAlloc(MBBI, TargetReg, false);
3065 
3066   MI.eraseFromParent();
3067   return NextInst->getParent();
3068 }
3069 
3070 MachineBasicBlock *
3071 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3072                                     MachineInstr &MI,
3073                                     MachineBasicBlock *BB) const {
3074   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3075   MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3076 
3077   MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3078   MIB.add(MI.getOperand(1)); // slice index register
3079   MIB.add(MI.getOperand(2)); // slice index offset
3080   MIB.add(MI.getOperand(3)); // pg
3081   MIB.add(MI.getOperand(4)); // base
3082   MIB.add(MI.getOperand(5)); // offset
3083 
3084   MI.eraseFromParent(); // The pseudo is gone now.
3085   return BB;
3086 }
3087 
3088 MachineBasicBlock *
3089 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
3090   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3091   MachineInstrBuilder MIB =
3092       BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3093 
3094   MIB.addReg(AArch64::ZA, RegState::Define);
3095   MIB.add(MI.getOperand(0)); // Vector select register
3096   MIB.add(MI.getOperand(1)); // Vector select offset
3097   MIB.add(MI.getOperand(2)); // Base
3098   MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3099 
3100   MI.eraseFromParent(); // The pseudo is gone now.
3101   return BB;
3102 }
3103 
3104 MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
3105                                                       MachineBasicBlock *BB,
3106                                                       unsigned Opcode,
3107                                                       bool Op0IsDef) const {
3108   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3109   MachineInstrBuilder MIB;
3110 
3111   MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3112             .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3113   for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3114     MIB.add(MI.getOperand(I));
3115 
3116   MI.eraseFromParent(); // The pseudo is gone now.
3117   return BB;
3118 }
3119 
3120 MachineBasicBlock *
3121 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3122                                    MachineInstr &MI,
3123                                    MachineBasicBlock *BB) const {
3124   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3125   MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3126   unsigned StartIdx = 0;
3127 
3128   bool HasTile = BaseReg != AArch64::ZA;
3129   bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3130   if (HasZPROut) {
3131     MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3132     ++StartIdx;
3133   }
3134   if (HasTile) {
3135     MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3136                RegState::Define);                           // Output ZA Tile
3137     MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3138     StartIdx++;
3139   } else {
3140     // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3141     if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3142       MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3143       ++StartIdx;
3144     }
3145     MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3146   }
3147   for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3148     MIB.add(MI.getOperand(I));
3149 
3150   MI.eraseFromParent(); // The pseudo is gone now.
3151   return BB;
3152 }
3153 
3154 MachineBasicBlock *
3155 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
3156   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3157   MachineInstrBuilder MIB =
3158       BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3159   MIB.add(MI.getOperand(0)); // Mask
3160 
3161   unsigned Mask = MI.getOperand(0).getImm();
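       // Each set bit I of the mask marks the 64-bit tile ZAD<I> as implicitly
       // defined (zeroed) by the ZERO instruction.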
3162   for (unsigned I = 0; I < 8; I++) {
3163     if (Mask & (1 << I))
3164       MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3165   }
3166 
3167   MI.eraseFromParent(); // The pseudo is gone now.
3168   return BB;
3169 }
3170 
3171 MachineBasicBlock *
3172 AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3173                                             MachineBasicBlock *BB) const {
3174   MachineFunction *MF = BB->getParent();
3175   MachineFrameInfo &MFI = MF->getFrameInfo();
3176   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3177   TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3178   if (TPIDR2.Uses > 0) {
3179     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3180     // Store the buffer pointer to the TPIDR2 stack object.
3181     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3182         .addReg(MI.getOperand(0).getReg())
3183         .addFrameIndex(TPIDR2.FrameIndex)
3184         .addImm(0);
3185     // Set the reserved bytes (10-15) to zero
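         // The store immediates are scaled by the access size: the halfword
         // store at #5 zeroes bytes 10-11 and the word store at #3 zeroes
         // bytes 12-15.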
3186     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3187         .addReg(AArch64::WZR)
3188         .addFrameIndex(TPIDR2.FrameIndex)
3189         .addImm(5);
3190     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3191         .addReg(AArch64::WZR)
3192         .addFrameIndex(TPIDR2.FrameIndex)
3193         .addImm(3);
3194   } else
3195     MFI.RemoveStackObject(TPIDR2.FrameIndex);
3196 
3197   BB->remove_instr(&MI);
3198   return BB;
3199 }
3200 
3201 MachineBasicBlock *
3202 AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3203                                             MachineBasicBlock *BB) const {
3204   MachineFunction *MF = BB->getParent();
3205   MachineFrameInfo &MFI = MF->getFrameInfo();
3206   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3207   // TODO This function grows the stack with a subtraction, which doesn't work
3208   // on Windows. Some refactoring to share the functionality in
3209   // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3210   // supports SME
3211   assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3212          "Lazy ZA save is not yet supported on Windows");
3213 
3214   TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3215 
3216   if (TPIDR2.Uses > 0) {
3217     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3218     MachineRegisterInfo &MRI = MF->getRegInfo();
3219 
3220     // The SUBXrs below won't always be emitted in a form that accepts SP
3221     // directly
3222     Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3223     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3224         .addReg(AArch64::SP);
3225 
3226     // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
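         // MSUBXrrr computes Dest = SP - Size * Size, so the stack is grown by
         // the whole buffer in one instruction and Dest points at its base.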
3227     auto Size = MI.getOperand(1).getReg();
3228     auto Dest = MI.getOperand(0).getReg();
3229     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3230         .addReg(Size)
3231         .addReg(Size)
3232         .addReg(SP);
3233     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3234             AArch64::SP)
3235         .addReg(Dest);
3236 
3237     // We have just allocated a variable sized object, tell this to PEI.
3238     MFI.CreateVariableSizedObject(Align(16), nullptr);
3239   }
3240 
3241   BB->remove_instr(&MI);
3242   return BB;
3243 }
3244 
3245 // TODO: Find a way to merge this with EmitAllocateZABuffer.
3246 MachineBasicBlock *
3247 AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3248                                                  MachineBasicBlock *BB) const {
3249   MachineFunction *MF = BB->getParent();
3250   MachineFrameInfo &MFI = MF->getFrameInfo();
3251   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3252   assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3253          "Lazy ZA save is not yet supported on Windows");
3254 
3255   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3256   if (FuncInfo->isSMESaveBufferUsed()) {
3257     // Allocate a buffer object of the size given by MI.getOperand(1).
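         // SUBXrx64 is the extended-register form of SUB, the variant that
         // accepts SP as both source and destination, so this decrements SP by
         // Size bytes.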
3258     auto Size = MI.getOperand(1).getReg();
3259     auto Dest = MI.getOperand(0).getReg();
3260     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3261         .addReg(AArch64::SP)
3262         .addReg(Size)
3263         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3264     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3265         .addReg(AArch64::SP);
3266 
3267     // We have just allocated a variable sized object, tell this to PEI.
3268     MFI.CreateVariableSizedObject(Align(16), nullptr);
3269   } else
3270     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3271             MI.getOperand(0).getReg());
3272 
3273   BB->remove_instr(&MI);
3274   return BB;
3275 }
3276 
3277 MachineBasicBlock *
3278 AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3279                                           MachineBasicBlock *BB) const {
3280   // If the buffer is used, emit a call to __arm_sme_state_size()
3281   MachineFunction *MF = BB->getParent();
3282   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3283   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3284   if (FuncInfo->isSMESaveBufferUsed()) {
3285     const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3286     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3287         .addExternalSymbol("__arm_sme_state_size")
3288         .addReg(AArch64::X0, RegState::ImplicitDefine)
3289         .addRegMask(TRI->getCallPreservedMask(
3290             *MF, CallingConv::
3291                      AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
3292     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3293             MI.getOperand(0).getReg())
3294         .addReg(AArch64::X0);
3295   } else
3296     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3297             MI.getOperand(0).getReg())
3298         .addReg(AArch64::XZR);
3299   BB->remove_instr(&MI);
3300   return BB;
3301 }
3302 
3303 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3304     MachineInstr &MI, MachineBasicBlock *BB) const {
3305 
3306   int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3307   if (SMEOrigInstr != -1) {
3308     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3309     uint64_t SMEMatrixType =
3310         TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3311     switch (SMEMatrixType) {
3312     case (AArch64::SMEMatrixArray):
3313       return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3314     case (AArch64::SMEMatrixTileB):
3315       return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3316     case (AArch64::SMEMatrixTileH):
3317       return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3318     case (AArch64::SMEMatrixTileS):
3319       return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3320     case (AArch64::SMEMatrixTileD):
3321       return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3322     case (AArch64::SMEMatrixTileQ):
3323       return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3324     }
3325   }
3326 
3327   switch (MI.getOpcode()) {
3328   default:
3329 #ifndef NDEBUG
3330     MI.dump();
3331 #endif
3332     llvm_unreachable("Unexpected instruction for custom inserter!");
3333   case AArch64::InitTPIDR2Obj:
3334     return EmitInitTPIDR2Object(MI, BB);
3335   case AArch64::AllocateZABuffer:
3336     return EmitAllocateZABuffer(MI, BB);
3337   case AArch64::AllocateSMESaveBuffer:
3338     return EmitAllocateSMESaveBuffer(MI, BB);
3339   case AArch64::GetSMESaveSize:
3340     return EmitGetSMESaveSize(MI, BB);
3341   case AArch64::F128CSEL:
3342     return EmitF128CSEL(MI, BB);
3343   case TargetOpcode::STATEPOINT:
3344     // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3345     // while bl call instruction (where statepoint will be lowered at the end)
3346     // has implicit def. This def is early-clobber as it will be set at
3347     // the moment of the call and earlier than any use is read.
3348     // Add this implicit dead def here as a workaround.
3349     MI.addOperand(*MI.getMF(),
3350                   MachineOperand::CreateReg(
3351                       AArch64::LR, /*isDef*/ true,
3352                       /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3353                       /*isUndef*/ false, /*isEarlyClobber*/ true));
3354     [[fallthrough]];
3355   case TargetOpcode::STACKMAP:
3356   case TargetOpcode::PATCHPOINT:
3357     return emitPatchPoint(MI, BB);
3358 
3359   case TargetOpcode::PATCHABLE_EVENT_CALL:
3360   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3361     return BB;
3362 
3363   case AArch64::CATCHRET:
3364     return EmitLoweredCatchRet(MI, BB);
3365 
3366   case AArch64::PROBED_STACKALLOC_DYN:
3367     return EmitDynamicProbedAlloc(MI, BB);
3368 
3369   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3370     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3371   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3372     return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3373   case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3374     return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3375   case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3376     return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3377   case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3378     return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3379   case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3380     return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3381   case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3382     return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3383   case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3384     return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3385   case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3386     return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3387   case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3388     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3389   case AArch64::LDR_ZA_PSEUDO:
3390     return EmitFill(MI, BB);
3391   case AArch64::LDR_TX_PSEUDO:
3392     return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3393   case AArch64::STR_TX_PSEUDO:
3394     return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3395   case AArch64::ZERO_M_PSEUDO:
3396     return EmitZero(MI, BB);
3397   case AArch64::ZERO_T_PSEUDO:
3398     return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3399   case AArch64::MOVT_TIZ_PSEUDO:
3400     return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3401   }
3402 }
3403 
3404 //===----------------------------------------------------------------------===//
3405 // AArch64 Lowering private implementation.
3406 //===----------------------------------------------------------------------===//
3407 
3408 //===----------------------------------------------------------------------===//
3409 // Lowering Code
3410 //===----------------------------------------------------------------------===//
3411 
3412 // Forward declarations of SVE fixed length lowering helpers
3413 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3414 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3415 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3416 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3417                                                 SelectionDAG &DAG);
3418 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
3419 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
3420                                              EVT VT);
3421 
3422 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
3423 static bool isZerosVector(const SDNode *N) {
3424   // Look through a bit convert.
3425   while (N->getOpcode() == ISD::BITCAST)
3426     N = N->getOperand(0).getNode();
3427 
3428   if (ISD::isConstantSplatVectorAllZeros(N))
3429     return true;
3430 
3431   if (N->getOpcode() != AArch64ISD::DUP)
3432     return false;
3433 
3434   auto Opnd0 = N->getOperand(0);
3435   return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3436 }
3437 
3438 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3439 /// CC
3440 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3441   switch (CC) {
3442   default:
3443     llvm_unreachable("Unknown condition code!");
3444   case ISD::SETNE:
3445     return AArch64CC::NE;
3446   case ISD::SETEQ:
3447     return AArch64CC::EQ;
3448   case ISD::SETGT:
3449     return AArch64CC::GT;
3450   case ISD::SETGE:
3451     return AArch64CC::GE;
3452   case ISD::SETLT:
3453     return AArch64CC::LT;
3454   case ISD::SETLE:
3455     return AArch64CC::LE;
3456   case ISD::SETUGT:
3457     return AArch64CC::HI;
3458   case ISD::SETUGE:
3459     return AArch64CC::HS;
3460   case ISD::SETULT:
3461     return AArch64CC::LO;
3462   case ISD::SETULE:
3463     return AArch64CC::LS;
3464   }
3465 }
3466 
3467 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3468 static void changeFPCCToAArch64CC(ISD::CondCode CC,
3469                                   AArch64CC::CondCode &CondCode,
3470                                   AArch64CC::CondCode &CondCode2) {
3471   CondCode2 = AArch64CC::AL;
3472   switch (CC) {
3473   default:
3474     llvm_unreachable("Unknown FP condition!");
3475   case ISD::SETEQ:
3476   case ISD::SETOEQ:
3477     CondCode = AArch64CC::EQ;
3478     break;
3479   case ISD::SETGT:
3480   case ISD::SETOGT:
3481     CondCode = AArch64CC::GT;
3482     break;
3483   case ISD::SETGE:
3484   case ISD::SETOGE:
3485     CondCode = AArch64CC::GE;
3486     break;
3487   case ISD::SETOLT:
3488     CondCode = AArch64CC::MI;
3489     break;
3490   case ISD::SETOLE:
3491     CondCode = AArch64CC::LS;
3492     break;
3493   case ISD::SETONE:
3494     CondCode = AArch64CC::MI;
3495     CondCode2 = AArch64CC::GT;
3496     break;
3497   case ISD::SETO:
3498     CondCode = AArch64CC::VC;
3499     break;
3500   case ISD::SETUO:
3501     CondCode = AArch64CC::VS;
3502     break;
3503   case ISD::SETUEQ:
3504     CondCode = AArch64CC::EQ;
3505     CondCode2 = AArch64CC::VS;
3506     break;
3507   case ISD::SETUGT:
3508     CondCode = AArch64CC::HI;
3509     break;
3510   case ISD::SETUGE:
3511     CondCode = AArch64CC::PL;
3512     break;
3513   case ISD::SETLT:
3514   case ISD::SETULT:
3515     CondCode = AArch64CC::LT;
3516     break;
3517   case ISD::SETLE:
3518   case ISD::SETULE:
3519     CondCode = AArch64CC::LE;
3520     break;
3521   case ISD::SETNE:
3522   case ISD::SETUNE:
3523     CondCode = AArch64CC::NE;
3524     break;
3525   }
3526 }
3527 
3528 /// Convert a DAG fp condition code to an AArch64 CC.
3529 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3530 /// should be AND'ed instead of OR'ed.
3531 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3532                                      AArch64CC::CondCode &CondCode,
3533                                      AArch64CC::CondCode &CondCode2) {
3534   CondCode2 = AArch64CC::AL;
3535   switch (CC) {
3536   default:
3537     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3538     assert(CondCode2 == AArch64CC::AL);
3539     break;
3540   case ISD::SETONE:
3541     // (a one b)
3542     // == ((a olt b) || (a ogt b))
3543     // == ((a ord b) && (a une b))
3544     CondCode = AArch64CC::VC;
3545     CondCode2 = AArch64CC::NE;
3546     break;
3547   case ISD::SETUEQ:
3548     // (a ueq b)
3549     // == ((a uno b) || (a oeq b))
3550     // == ((a ule b) && (a uge b))
3551     CondCode = AArch64CC::PL;
3552     CondCode2 = AArch64CC::LE;
3553     break;
3554   }
3555 }
3556 
3557 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3558 /// CC usable with the vector instructions. Fewer operations are available
3559 /// without a real NZCV register, so we have to use less efficient combinations
3560 /// to get the same effect.
3561 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3562                                         AArch64CC::CondCode &CondCode,
3563                                         AArch64CC::CondCode &CondCode2,
3564                                         bool &Invert) {
3565   Invert = false;
3566   switch (CC) {
3567   default:
3568     // Mostly the scalar mappings work fine.
3569     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3570     break;
3571   case ISD::SETUO:
3572     Invert = true;
3573     [[fallthrough]];
3574   case ISD::SETO:
3575     CondCode = AArch64CC::MI;
3576     CondCode2 = AArch64CC::GE;
3577     break;
3578   case ISD::SETUEQ:
3579   case ISD::SETULT:
3580   case ISD::SETULE:
3581   case ISD::SETUGT:
3582   case ISD::SETUGE:
3583     // All of the compare-mask comparisons are ordered, but we can switch
3584     // between the two by a double inversion. E.g. ULE == !OGT.
3585     Invert = true;
3586     changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3587                           CondCode, CondCode2);
3588     break;
3589   }
3590 }
3591 
3592 static bool isLegalArithImmed(uint64_t C) {
3593   // Matches AArch64DAGToDAGISel::SelectArithImmed().
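       // That is, a 12-bit unsigned immediate, optionally shifted left by 12
       // bits; e.g. 0xfff and 0xfff000 are legal, 0x1001 is not.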
3594   bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3595   LLVM_DEBUG(dbgs() << "Is imm " << C
3596                     << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3597   return IsLegal;
3598 }
3599 
3600 static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3601   KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3602   return !KnownSrc.getSignedMinValue().isMinSignedValue();
3603 }
3604 
3605 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3606 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3607 // can be set differently by this operation. It comes down to whether
3608 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal,
3609 // everything is fine; if not, the optimization is wrong. Thus general
3610 // comparisons are only valid if op2 != 0.
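     // For example, with op2 == 0, "cmp op1, #0" (SUBS) never borrows and so
     // always sets C, while "cmn op1, #0" (ADDS) never carries and so always
     // clears C; this is why the unsigned case below additionally requires op2
     // to be known non-zero.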
3611 //
3612 // So, finally, the only LLVM-native comparisons that don't mention C or V
3613 // are the ones that aren't unsigned comparisons. They're the only ones we can
3614 // safely use CMN for in the absence of information about op2.
3615 static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3616   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3617          (isIntEqualitySetCC(CC) ||
3618           (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3619           (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3620 }
3621 
3622 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3623                                       SelectionDAG &DAG, SDValue Chain,
3624                                       bool IsSignaling) {
3625   EVT VT = LHS.getValueType();
3626   assert(VT != MVT::f128);
3627 
3628   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3629 
3630   if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3631     LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3632                       {Chain, LHS});
3633     RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3634                       {LHS.getValue(1), RHS});
3635     Chain = RHS.getValue(1);
3636   }
3637   unsigned Opcode =
3638       IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3639   return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
3640 }
3641 
3642 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3643                               const SDLoc &dl, SelectionDAG &DAG) {
3644   EVT VT = LHS.getValueType();
3645   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3646 
3647   if (VT.isFloatingPoint()) {
3648     assert(VT != MVT::f128);
3649     if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3650       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3651       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3652     }
3653     return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
3654   }
3655 
3656   // The CMP instruction is just an alias for SUBS, and representing it as
3657   // SUBS means that it's possible to get CSE with subtract operations.
3658   // A later phase can perform the optimization of setting the destination
3659   // register to WZR/XZR if it ends up being unused.
3660   unsigned Opcode = AArch64ISD::SUBS;
3661 
3662   if (isCMN(RHS, CC, DAG)) {
3663     // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3664     Opcode = AArch64ISD::ADDS;
3665     RHS = RHS.getOperand(1);
3666   } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3667              isIntEqualitySetCC(CC)) {
3668     // As we are looking for EQ/NE compares, the operands can be commuted; can
3669     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3670     Opcode = AArch64ISD::ADDS;
3671     LHS = LHS.getOperand(1);
3672   } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3673     if (LHS.getOpcode() == ISD::AND) {
3674       // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3675       // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3676       // of the signed comparisons.
3677       const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3678                                            DAG.getVTList(VT, MVT_CC),
3679                                            LHS.getOperand(0),
3680                                            LHS.getOperand(1));
3681       // Replace all users of (and X, Y) with newly generated (ands X, Y)
3682       DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3683       return ANDSNode.getValue(1);
3684     } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3685       // Use result of ANDS
3686       return LHS.getValue(1);
3687     }
3688   }
3689 
3690   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3691       .getValue(1);
3692 }
3693 
3694 /// \defgroup AArch64CCMP CMP;CCMP matching
3695 ///
3696 /// These functions deal with the formation of CMP;CCMP;... sequences.
3697 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3698 /// a comparison. They set the NZCV flags to a predefined value if their
3699 /// predicate is false. This allows expressing arbitrary conjunctions, for
3700 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3701 /// expressed as:
3702 ///   cmp A
3703 ///   ccmp B, inv(CB), CA
3704 ///   check for CB flags
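     ///
     /// As a concrete illustration (register and immediate choices are only an
     /// example, actual codegen may differ), a C expression such as
     /// "a == 0 && b == c" typically becomes:
     ///   cmp  w0, #0
     ///   ccmp w1, w2, #0, eq
     ///   cset w0, eq
     /// where the nzcv immediate #0 supplies the flags used when the first
     /// compare fails, chosen so that the final "eq" test is then false.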
3705 ///
3706 /// This naturally lets us implement chains of AND operations with SETCC
3707 /// operands. And we can even implement some other situations by transforming
3708 /// them:
3709 ///   - We can implement (NEG SETCC) i.e. negating a single comparison by
3710 ///     negating the flags used in a CCMP/FCCMP operations.
3711 ///     negating the flags used in CCMP/FCCMP operations.
3712 ///     by negating the flags we test for afterwards, i.e.
3713 ///     NEG (CMP CCMP CCMP ...) can be implemented.
3714 ///   - Note that we can only ever negate all previously processed results.
3715 ///     What we cannot implement by flipping the flags to test is a negation
3716 ///     of two sub-trees (because the negation affects all sub-trees emitted so
3717 ///     far, so the 2nd sub-tree we emit would also affect the first).
3718 /// With those tools we can implement some OR operations:
3719 ///   - (OR (SETCC A) (SETCC B)) can be implemented via:
3720 ///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3721 ///   - After transforming OR to NEG/AND combinations we may be able to use NEG
3722 ///     elimination rules from earlier to implement the whole thing as a
3723 ///     CCMP/FCCMP chain.
3724 ///
3725 /// As complete example:
3726 ///     or (or (setCA (cmp A)) (setCB (cmp B)))
3727 ///        (and (setCC (cmp C)) (setCD (cmp D)))
3728 /// can be reassociated to:
3729 ///     or (and (setCC (cmp C)) (setCD (cmp D)))
3730 ///        (or (setCA (cmp A)) (setCB (cmp B)))
3731 /// can be transformed to:
3732 ///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3733 ///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3734 /// which can be implemented as:
3735 ///   cmp C
3736 ///   ccmp D, inv(CD), CC
3737 ///   ccmp A, CA, inv(CD)
3738 ///   ccmp B, CB, inv(CA)
3739 ///   check for CB flags
3740 ///
3741 /// A counterexample is "or (and A B) (and C D)" which translates to
3742 /// not (and (not (and A B)) (not (and C D))); we can only implement one of
3743 /// the two inner (not) operations, not both!
3744 /// @{
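     //
     // Illustrative end-to-end sketch (informal): C source such as
     // "if (a == 0 && b == 5)" would typically be selected to something along
     // the lines of:
     //   cmp  w0, #0            ; first compare, establishes CA = EQ
     //   ccmp w1, #5, #0, eq    ; if EQ holds test b, else force NZCV = 0 (NE)
     //   b.eq ...               ; a single flag check decides the whole &&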
3745 
3746 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3747 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3748                                          ISD::CondCode CC, SDValue CCOp,
3749                                          AArch64CC::CondCode Predicate,
3750                                          AArch64CC::CondCode OutCC,
3751                                          const SDLoc &DL, SelectionDAG &DAG) {
3752   unsigned Opcode = 0;
3753   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3754 
3755   if (LHS.getValueType().isFloatingPoint()) {
3756     assert(LHS.getValueType() != MVT::f128);
3757     if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3758         LHS.getValueType() == MVT::bf16) {
3759       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3760       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3761     }
3762     Opcode = AArch64ISD::FCCMP;
3763   } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3764     APInt Imm = Const->getAPIntValue();
3765     if (Imm.isNegative() && Imm.sgt(-32)) {
3766       Opcode = AArch64ISD::CCMN;
3767       RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3768     }
3769   } else if (isCMN(RHS, CC, DAG)) {
3770     Opcode = AArch64ISD::CCMN;
3771     RHS = RHS.getOperand(1);
3772   } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3773              isIntEqualitySetCC(CC)) {
3774     // As we are looking for EQ/NE compares, the operands can be commuted; can
3775     // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3776     Opcode = AArch64ISD::CCMN;
3777     LHS = LHS.getOperand(1);
3778   }
3779   if (Opcode == 0)
3780     Opcode = AArch64ISD::CCMP;
3781 
3782   SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3783   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3784   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3785   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3786   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3787 }
3788 
3789 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3790 /// expressed as a conjunction. See \ref AArch64CCMP.
3791 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
3792 ///                     changing the conditions on the SETCC tests.
3793 ///                     (this means we can call emitConjunctionRec() with
3794 ///                      Negate==true on this sub-tree)
3795 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
3796 ///                     cannot do the negation naturally. We are required to
3797 ///                     emit the subtree first in this case.
3798 /// \param WillNegate   Is true if we are called when the result of this
3799 ///                     subexpression must be negated. This happens when the
3800 ///                     outer expression is an OR. We can use this fact to know
3801 ///                     that we have a double negation (or (or ...) ...) that
3802 ///                     can be implemented for free.
3803 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3804                                bool &MustBeFirst, bool WillNegate,
3805                                unsigned Depth = 0) {
3806   if (!Val.hasOneUse())
3807     return false;
3808   unsigned Opcode = Val->getOpcode();
3809   if (Opcode == ISD::SETCC) {
3810     if (Val->getOperand(0).getValueType() == MVT::f128)
3811       return false;
3812     CanNegate = true;
3813     MustBeFirst = false;
3814     return true;
3815   }
3816   // Protect against exponential runtime and stack overflow.
3817   if (Depth > 6)
3818     return false;
3819   if (Opcode == ISD::AND || Opcode == ISD::OR) {
3820     bool IsOR = Opcode == ISD::OR;
3821     SDValue O0 = Val->getOperand(0);
3822     SDValue O1 = Val->getOperand(1);
3823     bool CanNegateL;
3824     bool MustBeFirstL;
3825     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3826       return false;
3827     bool CanNegateR;
3828     bool MustBeFirstR;
3829     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3830       return false;
3831 
3832     if (MustBeFirstL && MustBeFirstR)
3833       return false;
3834 
3835     if (IsOR) {
3836       // For an OR expression we need to be able to naturally negate at least
3837       // one side or we cannot do the transformation at all.
3838       if (!CanNegateL && !CanNegateR)
3839         return false;
3840       // If the result of the OR will be negated and we can naturally negate
3841       // the leaves, then this sub-tree as a whole negates naturally.
3842       CanNegate = WillNegate && CanNegateL && CanNegateR;
3843       // If we cannot naturally negate the whole sub-tree, then this must be
3844       // emitted first.
3845       MustBeFirst = !CanNegate;
3846     } else {
3847       assert(Opcode == ISD::AND && "Must be OR or AND");
3848       // We cannot naturally negate an AND operation.
3849       CanNegate = false;
3850       MustBeFirst = MustBeFirstL || MustBeFirstR;
3851     }
3852     return true;
3853   }
3854   return false;
3855 }
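
     // Worked example (illustrative): for Val = (or (setcc A) (and (setcc B)
     // (setcc C))) with WillNegate == false, the AND side reports CanNegate ==
     // false, so the OR as a whole cannot be negated naturally and is reported
     // with MustBeFirst == true; emitConjunctionRec() then has to emit this
     // sub-tree before a sibling that still relies on natural negation.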
3856 
3857 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3858 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3859 /// Tries to transform the given i1 producing node @p Val into a series of
3860 /// compare and conditional compare operations. @returns an NZCV flags
3861 /// producing node and sets @p OutCC to the flags that should be tested, or
3862 /// returns SDValue() if the transformation was not possible.
3863 /// \p Negate is true if we want this sub-tree to be negated just by changing
3864 /// SETCC conditions.
3865 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3866     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3867     AArch64CC::CondCode Predicate) {
3868   // We're at a tree leaf, produce a conditional comparison operation.
3869   unsigned Opcode = Val->getOpcode();
3870   if (Opcode == ISD::SETCC) {
3871     SDValue LHS = Val->getOperand(0);
3872     SDValue RHS = Val->getOperand(1);
3873     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3874     bool isInteger = LHS.getValueType().isInteger();
3875     if (Negate)
3876       CC = getSetCCInverse(CC, LHS.getValueType());
3877     SDLoc DL(Val);
3878     // Determine OutCC and handle FP special case.
3879     if (isInteger) {
3880       OutCC = changeIntCCToAArch64CC(CC);
3881     } else {
3882       assert(LHS.getValueType().isFloatingPoint());
3883       AArch64CC::CondCode ExtraCC;
3884       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3885       // Some floating point conditions can't be tested with a single condition
3886       // code. Construct an additional comparison in this case.
3887       if (ExtraCC != AArch64CC::AL) {
3888         SDValue ExtraCmp;
3889         if (!CCOp.getNode())
3890           ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3891         else
3892           ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3893                                                ExtraCC, DL, DAG);
3894         CCOp = ExtraCmp;
3895         Predicate = ExtraCC;
3896       }
3897     }
3898 
3899     // Produce a normal comparison if we are first in the chain
3900     if (!CCOp)
3901       return emitComparison(LHS, RHS, CC, DL, DAG);
3902     // Otherwise produce a ccmp.
3903     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3904                                      DAG);
3905   }
3906   assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3907 
3908   bool IsOR = Opcode == ISD::OR;
3909 
3910   SDValue LHS = Val->getOperand(0);
3911   bool CanNegateL;
3912   bool MustBeFirstL;
3913   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3914   assert(ValidL && "Valid conjunction/disjunction tree");
3915   (void)ValidL;
3916 
3917   SDValue RHS = Val->getOperand(1);
3918   bool CanNegateR;
3919   bool MustBeFirstR;
3920   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3921   assert(ValidR && "Valid conjunction/disjunction tree");
3922   (void)ValidR;
3923 
3924   // Swap sub-tree that must come first to the right side.
3925   if (MustBeFirstL) {
3926     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3927     std::swap(LHS, RHS);
3928     std::swap(CanNegateL, CanNegateR);
3929     std::swap(MustBeFirstL, MustBeFirstR);
3930   }
3931 
3932   bool NegateR;
3933   bool NegateAfterR;
3934   bool NegateL;
3935   bool NegateAfterAll;
3936   if (Opcode == ISD::OR) {
3937     // Swap the sub-tree that we can negate naturally to the left.
3938     if (!CanNegateL) {
3939       assert(CanNegateR && "at least one side must be negatable");
3940       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3941       assert(!Negate);
3942       std::swap(LHS, RHS);
3943       NegateR = false;
3944       NegateAfterR = true;
3945     } else {
3946       // Negate the right sub-tree if possible, otherwise negate its result.
3947       NegateR = CanNegateR;
3948       NegateAfterR = !CanNegateR;
3949     }
3950     NegateL = true;
3951     NegateAfterAll = !Negate;
3952   } else {
3953     assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3954     assert(!Negate && "Valid conjunction/disjunction tree");
3955 
3956     NegateL = false;
3957     NegateR = false;
3958     NegateAfterR = false;
3959     NegateAfterAll = false;
3960   }
3961 
3962   // Emit sub-trees.
3963   AArch64CC::CondCode RHSCC;
3964   SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3965   if (NegateAfterR)
3966     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3967   SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3968   if (NegateAfterAll)
3969     OutCC = AArch64CC::getInvertedCondCode(OutCC);
3970   return CmpL;
3971 }
3972 
3973 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3974 /// In some cases this is even possible with OR operations in the expression.
3975 /// See \ref AArch64CCMP.
3976 /// \see emitConjunctionRec().
3977 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3978                                AArch64CC::CondCode &OutCC) {
3979   bool DummyCanNegate;
3980   bool DummyMustBeFirst;
3981   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3982     return SDValue();
3983 
3984   return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3985 }
3986 
3987 /// @}
3988 
3989 /// Returns how profitable it is to fold a comparison's operand's shift and/or
3990 /// extension operations.
3991 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3992   auto isSupportedExtend = [&](SDValue V) {
3993     if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3994       return true;
3995 
3996     if (V.getOpcode() == ISD::AND)
3997       if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3998         uint64_t Mask = MaskCst->getZExtValue();
3999         return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4000       }
4001 
4002     return false;
4003   };
4004 
4005   if (!Op.hasOneUse())
4006     return 0;
4007 
4008   if (isSupportedExtend(Op))
4009     return 1;
4010 
4011   unsigned Opc = Op.getOpcode();
4012   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4013     if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4014       uint64_t Shift = ShiftCst->getZExtValue();
4015       if (isSupportedExtend(Op.getOperand(0)))
4016         return (Shift <= 4) ? 2 : 1;
4017       EVT VT = Op.getValueType();
4018       if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4019         return 1;
4020     }
4021 
4022   return 0;
4023 }
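
     // Illustrative scoring (informal): with the rules above, a supported
     // extend such as (sext_inreg x, i16) scores 1, (shl (sext_inreg x, i16), 2)
     // scores 2 because both the extend and a shift amount <= 4 fold into a
     // single "cmp wN, wM, sxth #2" operand, and a bare (shl x, 3) scores 1
     // since only the shift itself can be folded.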
4024 
4025 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4026                              SDValue &AArch64cc, SelectionDAG &DAG,
4027                              const SDLoc &dl) {
4028   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4029     EVT VT = RHS.getValueType();
4030     uint64_t C = RHSC->getZExtValue();
4031     if (!isLegalArithImmed(C)) {
4032       // Constant does not fit, try adjusting it by one?
4033       switch (CC) {
4034       default:
4035         break;
4036       case ISD::SETLT:
4037       case ISD::SETGE:
4038         if ((VT == MVT::i32 && C != 0x80000000 &&
4039              isLegalArithImmed((uint32_t)(C - 1))) ||
4040             (VT == MVT::i64 && C != 0x80000000ULL &&
4041              isLegalArithImmed(C - 1ULL))) {
4042           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4043           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4044           RHS = DAG.getConstant(C, dl, VT);
4045         }
4046         break;
4047       case ISD::SETULT:
4048       case ISD::SETUGE:
4049         if ((VT == MVT::i32 && C != 0 &&
4050              isLegalArithImmed((uint32_t)(C - 1))) ||
4051             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
4052           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4053           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4054           RHS = DAG.getConstant(C, dl, VT);
4055         }
4056         break;
4057       case ISD::SETLE:
4058       case ISD::SETGT:
4059         if ((VT == MVT::i32 && C != INT32_MAX &&
4060              isLegalArithImmed((uint32_t)(C + 1))) ||
4061             (VT == MVT::i64 && C != INT64_MAX &&
4062              isLegalArithImmed(C + 1ULL))) {
4063           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4064           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4065           RHS = DAG.getConstant(C, dl, VT);
4066         }
4067         break;
4068       case ISD::SETULE:
4069       case ISD::SETUGT:
4070         if ((VT == MVT::i32 && C != UINT32_MAX &&
4071              isLegalArithImmed((uint32_t)(C + 1))) ||
4072             (VT == MVT::i64 && C != UINT64_MAX &&
4073              isLegalArithImmed(C + 1ULL))) {
4074           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4075           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4076           RHS = DAG.getConstant(C, dl, VT);
4077         }
4078         break;
4079       }
4080     }
4081   }
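
       // Illustrative example (informal): "x s< 0x100001" has an RHS that is
       // not a valid arithmetic immediate, but the rewrite above turns it into
       // "x s<= 0x100000", and 0x100000 (0x100 << 12) is encodable, so a
       // single CMP still suffices.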
4082 
4083   // Comparisons are canonicalized so that the RHS operand is simpler than the
4084   // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4085   // can fold some shift+extend operations on the RHS operand, so swap the
4086   // operands if that can be done.
4087   //
4088   // For example:
4089   //    lsl     w13, w11, #1
4090   //    cmp     w13, w12
4091   // can be turned into:
4092   //    cmp     w12, w11, lsl #1
4093   if (!isa<ConstantSDNode>(RHS) ||
4094       !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
4095     bool LHSIsCMN = isCMN(LHS, CC, DAG);
4096     bool RHSIsCMN = isCMN(RHS, CC, DAG);
4097     SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4098     SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4099 
4100     if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4101         getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4102       std::swap(LHS, RHS);
4103       CC = ISD::getSetCCSwappedOperands(CC);
4104     }
4105   }
4106 
4107   SDValue Cmp;
4108   AArch64CC::CondCode AArch64CC;
4109   if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4110     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4111 
4112     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4113     // For the i8 operand, the largest immediate is 255, so this can be easily
4114     // encoded in the compare instruction. For the i16 operand, however, the
4115     // largest immediate cannot be encoded in the compare.
4116     // Therefore, use a sign extending load and cmn to avoid materializing the
4117     // -1 constant. For example,
4118     // movz w1, #65535
4119     // ldrh w0, [x0, #0]
4120     // cmp w0, w1
4121     // >
4122     // ldrsh w0, [x0, #0]
4123     // cmn w0, #1
4124     // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4125     // if and only if (sext LHS) == (sext RHS). The checks are in place to
4126     // ensure both the LHS and RHS are truly zero extended and to make sure the
4127     // transformation is profitable.
4128     if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4129         cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4130         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4131         LHS.getNode()->hasNUsesOfValue(1, 0)) {
4132       int16_t ValueofRHS = RHS->getAsZExtVal();
4133       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4134         SDValue SExt =
4135             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
4136                         DAG.getValueType(MVT::i16));
4137         Cmp = emitComparison(
4138             SExt, DAG.getSignedConstant(ValueofRHS, dl, RHS.getValueType()), CC,
4139             dl, DAG);
4140         AArch64CC = changeIntCCToAArch64CC(CC);
4141       }
4142     }
4143 
4144     if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4145       if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4146         if ((CC == ISD::SETNE) ^ RHSC->isZero())
4147           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4148       }
4149     }
4150   }
4151 
4152   if (!Cmp) {
4153     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4154     AArch64CC = changeIntCCToAArch64CC(CC);
4155   }
4156   AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
4157   return Cmp;
4158 }
4159 
4160 static std::pair<SDValue, SDValue>
4161 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4162   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4163          "Unsupported value type");
4164   SDValue Value, Overflow;
4165   SDLoc DL(Op);
4166   SDValue LHS = Op.getOperand(0);
4167   SDValue RHS = Op.getOperand(1);
4168   unsigned Opc = 0;
4169   switch (Op.getOpcode()) {
4170   default:
4171     llvm_unreachable("Unknown overflow instruction!");
4172   case ISD::SADDO:
4173     Opc = AArch64ISD::ADDS;
4174     CC = AArch64CC::VS;
4175     break;
4176   case ISD::UADDO:
4177     Opc = AArch64ISD::ADDS;
4178     CC = AArch64CC::HS;
4179     break;
4180   case ISD::SSUBO:
4181     Opc = AArch64ISD::SUBS;
4182     CC = AArch64CC::VS;
4183     break;
4184   case ISD::USUBO:
4185     Opc = AArch64ISD::SUBS;
4186     CC = AArch64CC::LO;
4187     break;
4188   // Multiply needs a little bit of extra work.
4189   case ISD::SMULO:
4190   case ISD::UMULO: {
4191     CC = AArch64CC::NE;
4192     bool IsSigned = Op.getOpcode() == ISD::SMULO;
4193     if (Op.getValueType() == MVT::i32) {
4194       // Extend to 64-bits, then perform a 64-bit multiply.
4195       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4196       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4197       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4198       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4199       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4200 
4201       // Check that the result fits into a 32-bit integer.
4202       SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4203       if (IsSigned) {
4204         // cmp xreg, wreg, sxtw
4205         SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4206         Overflow =
4207             DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4208       } else {
4209         // tst xreg, #0xffffffff00000000
4210         SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4211         Overflow =
4212             DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4213       }
4214       break;
4215     }
4216     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4217     // For the 64-bit multiply, check the high half of the full product.
4218     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4219     if (IsSigned) {
4220       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4221       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4222                                       DAG.getConstant(63, DL, MVT::i64));
4223       // It is important that LowerBits is last, otherwise the arithmetic
4224       // shift will not be folded into the compare (SUBS).
4225       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4226       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4227                      .getValue(1);
4228     } else {
4229       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4230       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4231       Overflow =
4232           DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4233                       DAG.getConstant(0, DL, MVT::i64),
4234                       UpperBits).getValue(1);
4235     }
4236     break;
4237   }
4238   } // switch (...)
4239 
4240   if (Opc) {
4241     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4242 
4243     // Emit the AArch64 operation with overflow check.
4244     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4245     Overflow = Value.getValue(1);
4246   }
4247   return std::make_pair(Value, Overflow);
4248 }
4249 
4250 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4251   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4252                                    !Subtarget->isNeonAvailable()))
4253     return LowerToScalableOp(Op, DAG);
4254 
4255   SDValue Sel = Op.getOperand(0);
4256   SDValue Other = Op.getOperand(1);
4257   SDLoc dl(Sel);
4258 
4259   // If the operand is an overflow checking operation, invert the condition
4260   // code and kill the Not operation. I.e., transform:
4261   // (xor overflow_op_bool, 1)
4262   //   -->
4263   // (csel 1, 0, invert(cc), overflow_op_bool)
4264   // ... which later gets transformed to just a cset instruction with an
4265   // inverted condition code, rather than a cset + eor sequence.
4266   if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
4267     // Only lower legal XALUO ops.
4268     if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4269       return SDValue();
4270 
4271     SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4272     SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4273     AArch64CC::CondCode CC;
4274     SDValue Value, Overflow;
4275     std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4276     SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4277     return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4278                        CCVal, Overflow);
4279   }
4280   // If neither operand is a SELECT_CC, give up.
4281   if (Sel.getOpcode() != ISD::SELECT_CC)
4282     std::swap(Sel, Other);
4283   if (Sel.getOpcode() != ISD::SELECT_CC)
4284     return Op;
4285 
4286   // The folding we want to perform is:
4287   // (xor x, (select_cc a, b, cc, 0, -1) )
4288   //   -->
4289   // (csel x, (xor x, -1), cc ...)
4290   //
4291   // The latter will get matched to a CSINV instruction.
4292 
4293   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4294   SDValue LHS = Sel.getOperand(0);
4295   SDValue RHS = Sel.getOperand(1);
4296   SDValue TVal = Sel.getOperand(2);
4297   SDValue FVal = Sel.getOperand(3);
4298 
4299   // FIXME: This could be generalized to non-integer comparisons.
4300   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4301     return Op;
4302 
4303   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4304   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4305 
4306   // The values aren't constants, this isn't the pattern we're looking for.
4307   if (!CFVal || !CTVal)
4308     return Op;
4309 
4310   // We can commute the SELECT_CC by inverting the condition.  This
4311   // might be needed to make this fit into a CSINV pattern.
4312   if (CTVal->isAllOnes() && CFVal->isZero()) {
4313     std::swap(TVal, FVal);
4314     std::swap(CTVal, CFVal);
4315     CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4316   }
4317 
4318   // If the constants line up, perform the transform!
4319   if (CTVal->isZero() && CFVal->isAllOnes()) {
4320     SDValue CCVal;
4321     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4322 
4323     FVal = Other;
4324     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4325                        DAG.getAllOnesConstant(dl, Other.getValueType()));
4326 
4327     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4328                        CCVal, Cmp);
4329   }
4330 
4331   return Op;
4332 }
4333 
4334 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4335 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4336 // sets 'C' bit to 0.
4337 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4338   SDLoc DL(Value);
4339   EVT VT = Value.getValueType();
4340   SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4341   SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4342   SDValue Cmp =
4343       DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4344   return Cmp.getValue(1);
4345 }
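
     // Quick sanity sketch (illustrative): with Invert == false this emits
     // SUBS Value, #1, so C is set exactly when Value >= 1 (unsigned), i.e.
     // when the incoming 0/1 carry value is 1; with Invert == true it emits
     // SUBS #0, Value, so C is set only when Value == 0.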
4346 
4347 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4348 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4349 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4350                                 bool Invert) {
4351   assert(Glue.getResNo() == 1);
4352   SDLoc DL(Glue);
4353   SDValue Zero = DAG.getConstant(0, DL, VT);
4354   SDValue One = DAG.getConstant(1, DL, VT);
4355   unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4356   SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4357   return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4358 }
4359 
4360 // Value is 1 if 'V' bit of NZCV is 1, else 0
4361 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4362   assert(Glue.getResNo() == 1);
4363   SDLoc DL(Glue);
4364   SDValue Zero = DAG.getConstant(0, DL, VT);
4365   SDValue One = DAG.getConstant(1, DL, VT);
4366   SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4367   return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4368 }
4369 
4370 // This lowering is inefficient, but it will get cleaned up by
4371 // `foldOverflowCheck`
4372 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4373                                   unsigned Opcode, bool IsSigned) {
4374   EVT VT0 = Op.getValue(0).getValueType();
4375   EVT VT1 = Op.getValue(1).getValueType();
4376 
4377   if (VT0 != MVT::i32 && VT0 != MVT::i64)
4378     return SDValue();
4379 
4380   bool InvertCarry = Opcode == AArch64ISD::SBCS;
4381   SDValue OpLHS = Op.getOperand(0);
4382   SDValue OpRHS = Op.getOperand(1);
4383   SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4384 
4385   SDLoc DL(Op);
4386   SDVTList VTs = DAG.getVTList(VT0, VT1);
4387 
4388   SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4389                             OpRHS, OpCarryIn);
4390 
4391   SDValue OutFlag =
4392       IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4393                : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4394 
4395   return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4396 }
4397 
4398 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4399   // Let legalize expand this if it isn't a legal type yet.
4400   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4401     return SDValue();
4402 
4403   SDLoc dl(Op);
4404   AArch64CC::CondCode CC;
4405   // The actual operation that sets the overflow or carry flag.
4406   SDValue Value, Overflow;
4407   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4408 
4409   // We use 0 and 1 as false and true values.
4410   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4411   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4412 
4413   // We use an inverted condition, because the conditional select is inverted
4414   // too. This will allow it to be selected to a single instruction:
4415   // CSINC Wd, WZR, WZR, invert(cond).
4416   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4417   Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4418                          CCVal, Overflow);
4419 
4420   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4421   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4422 }
4423 
4424 // Prefetch operands are:
4425 // 1: Address to prefetch
4426 // 2: bool isWrite
4427 // 3: int locality (0 = no locality ... 3 = extreme locality)
4428 // 4: bool isDataCache
4429 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4430   SDLoc DL(Op);
4431   unsigned IsWrite = Op.getConstantOperandVal(2);
4432   unsigned Locality = Op.getConstantOperandVal(3);
4433   unsigned IsData = Op.getConstantOperandVal(4);
4434 
4435   bool IsStream = !Locality;
4436   // When the locality number is set
4437   if (Locality) {
4438     // The front-end should have filtered out the out-of-range values
4439     assert(Locality <= 3 && "Prefetch locality out-of-range");
4440     // The locality degree counts the opposite way from the target cache
4441     // level, so invert the number; the instruction encoding starts at 0
4442     // for the L1 cache.
4443     Locality = 3 - Locality;
4444   }
4445 
4446   // Build the mask value encoding the expected behavior.
4447   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
4448                    (!IsData << 3) |     // IsDataCache bit
4449                    (Locality << 1) |    // Cache level bits
4450                    (unsigned)IsStream;  // Stream bit
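       // Worked example (illustrative): a read data prefetch with locality 3
       // gives IsWrite = 0, Locality = 3 - 3 = 0, IsStream = 0 and !IsData = 0,
       // so PrfOp == 0b00000, i.e. the PLDL1KEEP hint.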
4451   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4452                      DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4453                      Op.getOperand(1));
4454 }
4455 
4456 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4457 // is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4458 // SUBS (AND X Y) Z, which produces a better result with emitComparison.
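     // Worked example (illustrative): "(X & 0xFF) u< 16" becomes
     // "(X & 0xF0) == 0"; the low four bits can never push the masked value up
     // to 16, and the equality-against-zero form then folds into ANDS/TST.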
4459 static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4460                                 SelectionDAG &DAG, const SDLoc dl) {
4461   if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4462     ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4463     ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4464     if (LHSConstOp && RHSConst) {
4465       uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4466       uint64_t RHSConstant = RHSConst->getZExtValue();
4467       if (isPowerOf2_64(RHSConstant)) {
4468         uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4469         LHS =
4470             DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4471                         DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4472         RHS = DAG.getConstant(0, dl, RHS.getValueType());
4473         CC = ISD::SETEQ;
4474       }
4475     }
4476   }
4477 }
4478 
4479 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4480                                               SelectionDAG &DAG) const {
4481   EVT VT = Op.getValueType();
4482   if (VT.isScalableVector()) {
4483     SDValue SrcVal = Op.getOperand(0);
4484 
4485     if (SrcVal.getValueType().getScalarType() == MVT::bf16) {
4486       // bf16 and f32 share the same exponent range so the conversion requires
4487       // them to be aligned with the new mantissa bits zero'd. This is just a
4488       // left shift that is best to isel directly.
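           // Worked bit-pattern sketch (illustrative): bf16 0x3FC0 (1.5)
           // shifted left by 16 bits is 0x3FC00000, the f32 encoding of 1.5,
           // so no rounding or exponent adjustment is needed.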
4489       if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32)
4490         return Op;
4491 
4492       if (VT != MVT::nxv2f64)
4493         return SDValue();
4494 
4495       // Break other conversions in two with the first part converting to f32
4496       // and the second using native f32->VT instructions.
4497       SDLoc DL(Op);
4498       return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4499                          DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4500     }
4501 
4502     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4503   }
4504 
4505   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4506     return LowerFixedLengthFPExtendToSVE(Op, DAG);
4507 
4508   bool IsStrict = Op->isStrictFPOpcode();
4509   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4510   EVT Op0VT = Op0.getValueType();
4511   if (VT == MVT::f64) {
4512     // FP32->FP64 and FP16->FP64 extends are legal.
4513     if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4514       return Op;
4515     // Split bf16->f64 extends into two fpextends.
4516     if (Op0VT == MVT::bf16 && IsStrict) {
4517       SDValue Ext1 =
4518           DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4519                       {Op0, Op.getOperand(0)});
4520       return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4521                          {Ext1, Ext1.getValue(1)});
4522     }
4523     if (Op0VT == MVT::bf16)
4524       return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4525                          DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4526     return SDValue();
4527   }
4528 
4529   if (VT.getScalarType() == MVT::f32) {
4530     // FP16->FP32 extends are legal.
4531     if (Op0VT.getScalarType() == MVT::f16)
4532       return Op;
4533     if (Op0VT.getScalarType() == MVT::bf16) {
4534       SDLoc DL(Op);
4535       EVT IVT = VT.changeTypeToInteger();
4536       if (!Op0VT.isVector()) {
4537         Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
4538         IVT = MVT::v4i32;
4539       }
4540 
4541       EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
4542       SDValue Ext =
4543           DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
4544       SDValue Shift =
4545           DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
4546       if (!Op0VT.isVector())
4547         Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
4548                             DAG.getConstant(0, DL, MVT::i64));
4549       Shift = DAG.getBitcast(VT, Shift);
4550       return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
4551                       : Shift;
4552     }
4553     return SDValue();
4554   }
4555 
4556   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4557   return SDValue();
4558 }
4559 
4560 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4561                                              SelectionDAG &DAG) const {
4562   EVT VT = Op.getValueType();
4563   bool IsStrict = Op->isStrictFPOpcode();
4564   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4565   EVT SrcVT = SrcVal.getValueType();
4566   bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4567 
4568   if (VT.isScalableVector()) {
4569     if (VT.getScalarType() != MVT::bf16)
4570       return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4571 
4572     SDLoc DL(Op);
4573     constexpr EVT I32 = MVT::nxv4i32;
4574     auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4575 
4576     SDValue NaN;
4577     SDValue Narrow;
4578 
4579     if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4580       if (Subtarget->hasBF16())
4581         return LowerToPredicatedOp(Op, DAG,
4582                                    AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4583 
4584       Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4585 
4586       // Set the quiet bit.
4587       if (!DAG.isKnownNeverSNaN(SrcVal))
4588         NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4589     } else if (SrcVT == MVT::nxv2f64 &&
4590                (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4591       // Round to float without introducing rounding errors and try again.
4592       SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4593       Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4594                            Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4595 
4596       SmallVector<SDValue, 3> NewOps;
4597       if (IsStrict)
4598         NewOps.push_back(Op.getOperand(0));
4599       NewOps.push_back(Narrow);
4600       NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4601       return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4602     } else
4603       return SDValue();
4604 
4605     if (!Trunc) {
4606       SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4607       Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4608       SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4609       Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4610     }
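         // Worked example of the bias (illustrative): when the discarded low
         // 16 bits are exactly 0x8000 (a tie), Lsb == 0 adds 0x7fff and the
         // value truncates down, while Lsb == 1 adds 0x8000 and carries up,
         // i.e. round-to-nearest-even on the retained bf16 mantissa.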
4611 
4612     // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4613     // 0x80000000.
4614     if (NaN) {
4615       EVT I1 = I32.changeElementType(MVT::i1);
4616       EVT CondVT = VT.changeElementType(MVT::i1);
4617       SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4618       IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4619       Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4620     }
4621 
4622     // Now that we have rounded, shift the bits into position.
4623     Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4624     return getSVESafeBitCast(VT, Narrow, DAG);
4625   }
4626 
4627   if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4628     return LowerFixedLengthFPRoundToSVE(Op, DAG);
4629 
4630   // Expand cases where the result type is BF16 but we don't have hardware
4631   // instructions to lower it.
4632   if (VT.getScalarType() == MVT::bf16 &&
4633       !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4634         Subtarget->hasBF16())) {
4635     SDLoc dl(Op);
4636     SDValue Narrow = SrcVal;
4637     SDValue NaN;
4638     EVT I32 = SrcVT.changeElementType(MVT::i32);
4639     EVT F32 = SrcVT.changeElementType(MVT::f32);
4640     if (SrcVT.getScalarType() == MVT::f32) {
4641       bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4642       Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4643       if (!NeverSNaN) {
4644         // Set the quiet bit.
4645         NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4646                           DAG.getConstant(0x400000, dl, I32));
4647       }
4648     } else if (SrcVT.getScalarType() == MVT::f64) {
4649       Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4650       Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4651     } else {
4652       return SDValue();
4653     }
4654     if (!Trunc) {
4655       SDValue One = DAG.getConstant(1, dl, I32);
4656       SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4657                                 DAG.getShiftAmountConstant(16, I32, dl));
4658       Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4659       SDValue RoundingBias =
4660           DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4661       Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4662     }
4663 
4664     // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4665     // 0x80000000.
4666     if (NaN) {
4667       SDValue IsNaN = DAG.getSetCC(
4668           dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4669           SrcVal, SrcVal, ISD::SETUO);
4670       Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4671     }
4672 
4673     // Now that we have rounded, shift the bits into position.
4674     Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4675                          DAG.getShiftAmountConstant(16, I32, dl));
4676     if (VT.isVector()) {
4677       EVT I16 = I32.changeVectorElementType(MVT::i16);
4678       Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4679       return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4680     }
4681     Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4682     SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4683     return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4684                     : Result;
4685   }
4686 
4687   if (SrcVT != MVT::f128) {
4688     // Expand cases where the input is a vector bigger than NEON.
4689     if (useSVEForFixedLengthVectorVT(SrcVT))
4690       return SDValue();
4691 
4692     // It's legal except when f128 is involved
4693     return Op;
4694   }
4695 
4696   return SDValue();
4697 }
4698 
4699 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4700                                                     SelectionDAG &DAG) const {
4701   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4702   // Any additional optimization in this function should be recorded
4703   // in the cost tables.
4704   bool IsStrict = Op->isStrictFPOpcode();
4705   EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4706   EVT VT = Op.getValueType();
4707 
4708   if (VT.isScalableVector()) {
4709     unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4710                           ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4711                           : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4712     return LowerToPredicatedOp(Op, DAG, Opcode);
4713   }
4714 
4715   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4716       useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4717     return LowerFixedLengthFPToIntToSVE(Op, DAG);
4718 
4719   unsigned NumElts = InVT.getVectorNumElements();
4720 
4721   // f16 conversions are promoted to f32 when full fp16 is not supported.
4722   if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4723       InVT.getVectorElementType() == MVT::bf16) {
4724     MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4725     SDLoc dl(Op);
4726     if (IsStrict) {
4727       SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4728                                 {Op.getOperand(0), Op.getOperand(1)});
4729       return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4730                          {Ext.getValue(1), Ext.getValue(0)});
4731     }
4732     return DAG.getNode(
4733         Op.getOpcode(), dl, Op.getValueType(),
4734         DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4735   }
4736 
4737   uint64_t VTSize = VT.getFixedSizeInBits();
4738   uint64_t InVTSize = InVT.getFixedSizeInBits();
4739   if (VTSize < InVTSize) {
4740     SDLoc dl(Op);
4741     if (IsStrict) {
4742       InVT = InVT.changeVectorElementTypeToInteger();
4743       SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4744                                {Op.getOperand(0), Op.getOperand(1)});
4745       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4746       return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4747     }
4748     SDValue Cv =
4749         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4750                     Op.getOperand(0));
4751     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4752   }
4753 
4754   if (VTSize > InVTSize) {
4755     SDLoc dl(Op);
4756     MVT ExtVT =
4757         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4758                          VT.getVectorNumElements());
4759     if (IsStrict) {
4760       SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4761                                 {Op.getOperand(0), Op.getOperand(1)});
4762       return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4763                          {Ext.getValue(1), Ext.getValue(0)});
4764     }
4765     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4766     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4767   }
4768 
4769   // Use a scalar operation for conversions between single-element vectors of
4770   // the same size.
4771   if (NumElts == 1) {
4772     SDLoc dl(Op);
4773     SDValue Extract = DAG.getNode(
4774         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4775         Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4776     EVT ScalarVT = VT.getScalarType();
4777     if (IsStrict)
4778       return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4779                          {Op.getOperand(0), Extract});
4780     return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4781   }
4782 
4783   // Type changing conversions are illegal.
4784   return Op;
4785 }
4786 
4787 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4788                                               SelectionDAG &DAG) const {
4789   bool IsStrict = Op->isStrictFPOpcode();
4790   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4791 
4792   if (SrcVal.getValueType().isVector())
4793     return LowerVectorFP_TO_INT(Op, DAG);
4794 
4795   // f16 conversions are promoted to f32 when full fp16 is not supported.
4796   if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4797       SrcVal.getValueType() == MVT::bf16) {
4798     SDLoc dl(Op);
4799     if (IsStrict) {
4800       SDValue Ext =
4801           DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4802                       {Op.getOperand(0), SrcVal});
4803       return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4804                          {Ext.getValue(1), Ext.getValue(0)});
4805     }
4806     return DAG.getNode(
4807         Op.getOpcode(), dl, Op.getValueType(),
4808         DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4809   }
4810 
4811   if (SrcVal.getValueType() != MVT::f128) {
4812     // It's legal except when f128 is involved
4813     return Op;
4814   }
4815 
4816   return SDValue();
4817 }
4818 
4819 SDValue
4820 AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4821                                                 SelectionDAG &DAG) const {
4822   // AArch64 FP-to-int conversions saturate to the destination element size, so
4823   // we can lower common saturating conversions to simple instructions.
4824   SDValue SrcVal = Op.getOperand(0);
4825   EVT SrcVT = SrcVal.getValueType();
4826   EVT DstVT = Op.getValueType();
4827   EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4828 
4829   uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4830   uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4831   uint64_t SatWidth = SatVT.getScalarSizeInBits();
4832   assert(SatWidth <= DstElementWidth &&
4833          "Saturation width cannot exceed result width");
4834 
4835   // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4836   // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4837   // types, so this is hard to reach.
4838   if (DstVT.isScalableVector())
4839     return SDValue();
4840 
4841   EVT SrcElementVT = SrcVT.getVectorElementType();
4842 
4843   // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4844   SDLoc DL(Op);
4845   SDValue SrcVal2;
4846   if ((SrcElementVT == MVT::f16 &&
4847        (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4848       SrcElementVT == MVT::bf16) {
4849     MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4850     SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4851     // If we are extending to a v8f32, split into two v4f32 to produce legal
4852     // types.
4853     if (F32VT.getSizeInBits() > 128) {
4854       std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4855       F32VT = F32VT.getHalfNumVectorElementsVT();
4856     }
4857     SrcVT = F32VT;
4858     SrcElementVT = MVT::f32;
4859     SrcElementWidth = 32;
4860   } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4861              SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4862     return SDValue();
4863 
4864   // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4865   // width and produce a fcvtzu.
4866   if (SatWidth == 64 && SrcElementWidth < 64) {
4867     MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4868     SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4869     SrcVT = F64VT;
4870     SrcElementVT = MVT::f64;
4871     SrcElementWidth = 64;
4872   }
4873   // Cases that we can emit directly.
4874   if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4875     SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4876                               DAG.getValueType(DstVT.getScalarType()));
4877     if (SrcVal2) {
4878       SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4879                                  DAG.getValueType(DstVT.getScalarType()));
4880       return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4881     }
4882     return Res;
4883   }
4884 
4885   // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4886   // result. This is only valid if the legal cvt is larger than the saturate
4887   // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4888   // (at least until sqxtn is selected).
4889   if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4890     return SDValue();
4891 
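       // Worked example (illustrative): an FP_TO_SINT_SAT from v4f32 with a
       // 16-bit saturation width and a v4i16 result uses the native FCVTZS to
       // v4i32 (already clamped to the i32 range), then the SMIN/SMAX below
       // clamp to [-32768, 32767] before the final truncate to v4i16.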
4892   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4893   SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4894                                   DAG.getValueType(IntVT.getScalarType()));
4895   SDValue NativeCvt2 =
4896       SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4897                             DAG.getValueType(IntVT.getScalarType()))
4898               : SDValue();
4899   SDValue Sat, Sat2;
4900   if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4901     SDValue MinC = DAG.getConstant(
4902         APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4903     SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4904     SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4905     SDValue MaxC = DAG.getConstant(
4906         APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4907     Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4908     Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4909   } else {
4910     SDValue MinC = DAG.getConstant(
4911         APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4912     Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4913     Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4914   }
4915 
4916   if (SrcVal2)
4917     Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4918                       IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4919                       Sat, Sat2);
4920 
4921   return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4922 }
4923 
4924 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4925                                                   SelectionDAG &DAG) const {
4926   // AArch64 FP-to-int conversions saturate to the destination register size, so
4927   // we can lower common saturating conversions to simple instructions.
4928   SDValue SrcVal = Op.getOperand(0);
4929   EVT SrcVT = SrcVal.getValueType();
4930 
4931   if (SrcVT.isVector())
4932     return LowerVectorFP_TO_INT_SAT(Op, DAG);
4933 
4934   EVT DstVT = Op.getValueType();
4935   EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4936   uint64_t SatWidth = SatVT.getScalarSizeInBits();
4937   uint64_t DstWidth = DstVT.getScalarSizeInBits();
4938   assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4939 
4940   // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4941   if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4942     SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4943     SrcVT = MVT::f32;
4944   } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4945              SrcVT != MVT::bf16)
4946     return SDValue();
4947 
4948   SDLoc DL(Op);
4949   // Cases that we can emit directly.
4950   if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4951        (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4952       DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4953     return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4954                        DAG.getValueType(DstVT));
4955 
4956   // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4957   // result. This is only valid if the legal cvt is larger than the saturate
4958   // width.
4959   if (DstWidth < SatWidth)
4960     return SDValue();
4961 
4962   SDValue NativeCvt =
4963       DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4964   SDValue Sat;
4965   if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4966     SDValue MinC = DAG.getConstant(
4967         APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4968     SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4969     SDValue MaxC = DAG.getConstant(
4970         APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4971     Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4972   } else {
4973     SDValue MinC = DAG.getConstant(
4974         APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4975     Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4976   }
4977 
4978   return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4979 }
4980 
4981 SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4982                                                 SelectionDAG &DAG) const {
4983   EVT VT = Op.getValueType();
4984   SDValue Src = Op.getOperand(0);
4985   SDLoc DL(Op);
4986 
4987   assert(VT.isVector() && "Expected vector type");
4988 
4989   EVT CastVT =
4990       VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4991 
4992   // Round the floating-point value to an integral value, staying in a
4993   // floating-point register and using the current rounding mode.
4994   SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4995 
4996   // Truncate the rounded floating point to an integer.
4997   return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4998                      DAG.getValueType(VT.getVectorElementType()));
4999 }
5000 
5001 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5002                                                     SelectionDAG &DAG) const {
5003   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5004   // Any additional optimization in this function should be recorded
5005   // in the cost tables.
5006   bool IsStrict = Op->isStrictFPOpcode();
5007   EVT VT = Op.getValueType();
5008   SDLoc dl(Op);
5009   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5010   EVT InVT = In.getValueType();
5011   unsigned Opc = Op.getOpcode();
5012   bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5013 
5014   if (VT.isScalableVector()) {
5015     if (InVT.getVectorElementType() == MVT::i1) {
5016       // We can't convert an SVE predicate directly to FP; extend it first.
5017       unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5018       EVT CastVT = getPromotedVTForPredicate(InVT);
5019       In = DAG.getNode(CastOpc, dl, CastVT, In);
5020       return DAG.getNode(Opc, dl, VT, In);
5021     }
5022 
5023     unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5024                                : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5025     return LowerToPredicatedOp(Op, DAG, Opcode);
5026   }
5027 
5028   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5029       useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5030     return LowerFixedLengthIntToFPToSVE(Op, DAG);
5031 
5032   // Promote bf16 conversions to f32.
5033   if (VT.getVectorElementType() == MVT::bf16) {
5034     EVT F32 = VT.changeElementType(MVT::f32);
5035     if (IsStrict) {
5036       SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
5037                                 {Op.getOperand(0), In});
5038       return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5039                          {Op.getValueType(), MVT::Other},
5040                          {Val.getValue(1), Val.getValue(0),
5041                           DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5042     }
5043     return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5044                        DAG.getNode(Op.getOpcode(), dl, F32, In),
5045                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5046   }
5047 
5048   uint64_t VTSize = VT.getFixedSizeInBits();
5049   uint64_t InVTSize = InVT.getFixedSizeInBits();
5050   if (VTSize < InVTSize) {
5051     MVT CastVT =
5052         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5053                          InVT.getVectorNumElements());
5054     if (IsStrict) {
5055       In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
5056                        {Op.getOperand(0), In});
5057       return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
5058                          {In.getValue(1), In.getValue(0),
5059                           DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5060     }
5061     In = DAG.getNode(Opc, dl, CastVT, In);
5062     return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
5063                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5064   }
5065 
5066   if (VTSize > InVTSize) {
5067     unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5068     EVT CastVT = VT.changeVectorElementTypeToInteger();
5069     In = DAG.getNode(CastOpc, dl, CastVT, In);
5070     if (IsStrict)
5071       return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
5072     return DAG.getNode(Opc, dl, VT, In);
5073   }
5074 
5075   // Use a scalar operation for conversions between single-element vectors of
5076   // the same size.
5077   if (VT.getVectorNumElements() == 1) {
5078     SDValue Extract = DAG.getNode(
5079         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
5080         In, DAG.getConstant(0, dl, MVT::i64));
5081     EVT ScalarVT = VT.getScalarType();
5082     if (IsStrict)
5083       return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
5084                          {Op.getOperand(0), Extract});
5085     return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
5086   }
5087 
5088   return Op;
5089 }
5090 
5091 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5092                                             SelectionDAG &DAG) const {
5093   if (Op.getValueType().isVector())
5094     return LowerVectorINT_TO_FP(Op, DAG);
5095 
5096   bool IsStrict = Op->isStrictFPOpcode();
5097   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5098 
5099   bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5100                   Op->getOpcode() == ISD::SINT_TO_FP;
5101 
5102   auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5103     SDLoc dl(Op);
5104     if (IsStrict) {
5105       SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
5106                                 {Op.getOperand(0), SrcVal});
5107       return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5108                          {Op.getValueType(), MVT::Other},
5109                          {Val.getValue(1), Val.getValue(0),
5110                           DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5111     }
5112     return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5113                        DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
5114                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5115   };
5116 
5117   if (Op.getValueType() == MVT::bf16) {
5118     unsigned MaxWidth = IsSigned
5119                             ? DAG.ComputeMaxSignificantBits(SrcVal)
5120                             : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5121     // Promote via f32 when the value fits in f32's 24-bit significand (e.g. i16).
5122     if (MaxWidth <= 24) {
5123       return IntToFpViaPromotion(MVT::f32);
5124     }
5125 
5126     // Promote via f64 when the value fits in f64's 53-bit significand (e.g. i32).
5127     if (MaxWidth <= 53) {
5128       return IntToFpViaPromotion(MVT::f64);
5129     }
5130 
5131     // We need to be careful about i64 -> bf16 because of double rounding.
5132     // Consider the value 22216703.
5133     // It cannot be represented exactly as an f32, so an itofp rounds it to
5134     // 22216704.0, and an fptrunc of that to bf16 then gives 22282240.0.
5135     // However, the correctly rounded bf16 result is 22151168.0.
5136     // We need to use sticky rounding to get this correct.
5137     if (SrcVal.getValueType() == MVT::i64) {
5138       SDLoc DL(Op);
5139       // This algorithm is equivalent to the following:
5140       // uint64_t SrcHi = SrcVal & ~0xfffull;
5141       // uint64_t SrcLo = SrcVal &  0xfffull;
5142       // uint64_t Highest = SrcVal >> 53;
5143       // bool HasHighest = Highest != 0;
5144       // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5145       // double  Rounded = static_cast<double>(ToRound);
5146       // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5147       // uint64_t HasLo = SrcLo != 0;
5148       // bool NeedsAdjustment = HasHighest & HasLo;
5149       // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5150       // double Adjusted = std::bit_cast<double>(AdjustedBits);
5151       // return static_cast<__bf16>(Adjusted);
5152       //
5153       // Essentially, what happens is that SrcVal either fits perfectly in a
5154       // double-precision value or it is too big. If it is sufficiently small,
5155       // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5156       // ensure that u64 -> double has no rounding error by only using the 52
5157       // MSB of the input. The low order bits will get merged into a sticky bit
5158       // which will avoid issues incurred by double rounding.
5159 
5160       // Signed conversion is more or less like so:
5161       // copysign((__bf16)abs(SrcVal), SrcVal)
5162       SDValue SignBit;
5163       if (IsSigned) {
5164         SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5165                               DAG.getConstant(1ull << 63, DL, MVT::i64));
5166         SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5167       }
5168       SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5169                                   DAG.getConstant(~0xfffull, DL, MVT::i64));
5170       SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5171                                   DAG.getConstant(0xfffull, DL, MVT::i64));
5172       SDValue Highest =
5173           DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5174                       DAG.getShiftAmountConstant(53, MVT::i64, DL));
5175       SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5176       SDValue ToRound =
5177           DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5178       SDValue Rounded =
5179           IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5180                                  {Op.getOperand(0), ToRound})
5181                    : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5182 
5183       SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5184       if (SignBit) {
5185         RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5186       }
5187 
5188       SDValue HasHighest = DAG.getSetCC(
5189           DL,
5190           getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5191           Highest, Zero64, ISD::SETNE);
5192 
5193       SDValue HasLo = DAG.getSetCC(
5194           DL,
5195           getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5196           SrcLo, Zero64, ISD::SETNE);
5197 
5198       SDValue NeedsAdjustment =
5199           DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5200       NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5201 
5202       SDValue AdjustedBits =
5203           DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5204       SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5205       return IsStrict
5206                  ? DAG.getNode(
5207                        ISD::STRICT_FP_ROUND, DL,
5208                        {Op.getValueType(), MVT::Other},
5209                        {Rounded.getValue(1), Adjusted,
5210                         DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5211                  : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5212                                DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5213     }
5214   }
5215 
5216   // f16 conversions are promoted to f32 when full fp16 is not supported.
5217   if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5218     return IntToFpViaPromotion(MVT::f32);
5219   }
5220 
5221   // i128 conversions are libcalls.
5222   if (SrcVal.getValueType() == MVT::i128)
5223     return SDValue();
5224 
5225   // Other conversions are legal, unless it's to the completely software-based
5226   // fp128.
5227   if (Op.getValueType() != MVT::f128)
5228     return Op;
5229   return SDValue();
5230 }
5231 
5232 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5233                                             SelectionDAG &DAG) const {
5234   // For iOS, we want to call an alternative entry point: __sincos_stret,
5235   // which returns the values in two S / D registers.
5236   SDLoc dl(Op);
5237   SDValue Arg = Op.getOperand(0);
5238   EVT ArgVT = Arg.getValueType();
5239   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5240 
5241   ArgListTy Args;
5242   ArgListEntry Entry;
5243 
5244   Entry.Node = Arg;
5245   Entry.Ty = ArgTy;
5246   Entry.IsSExt = false;
5247   Entry.IsZExt = false;
5248   Args.push_back(Entry);
5249 
5250   RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5251                                         : RTLIB::SINCOS_STRET_F32;
5252   const char *LibcallName = getLibcallName(LC);
5253   SDValue Callee =
5254       DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5255 
5256   StructType *RetTy = StructType::get(ArgTy, ArgTy);
5257   TargetLowering::CallLoweringInfo CLI(DAG);
5258   CLI.setDebugLoc(dl)
5259       .setChain(DAG.getEntryNode())
5260       .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
5261 
5262   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5263   return CallResult.first;
5264 }
5265 
5266 static MVT getSVEContainerType(EVT ContentTy);
5267 
5268 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5269                                             SelectionDAG &DAG) const {
5270   EVT OpVT = Op.getValueType();
5271   EVT ArgVT = Op.getOperand(0).getValueType();
5272 
5273   if (useSVEForFixedLengthVectorVT(OpVT))
5274     return LowerFixedLengthBitcastToSVE(Op, DAG);
5275 
5276   if (OpVT.isScalableVector()) {
5277     assert(isTypeLegal(OpVT) && "Unexpected result type!");
5278 
5279     // Handle type legalisation first.
5280     if (!isTypeLegal(ArgVT)) {
5281       assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5282              "Expected int->fp bitcast!");
5283 
5284       // Bitcasting between unpacked vector types of different element counts is
5285       // not a NOP because the live elements are laid out differently.
5286       //                01234567
5287       // e.g. nxv2i32 = XX??XX??
5288       //      nxv4f16 = X?X?X?X?
5289       if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5290         return SDValue();
5291 
5292       SDValue ExtResult =
5293           DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5294                       Op.getOperand(0));
5295       return getSVESafeBitCast(OpVT, ExtResult, DAG);
5296     }
5297 
5298     // Bitcasts between legal types with the same element count are legal.
5299     if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5300       return Op;
5301 
5302     // getSVESafeBitCast does not support casting between unpacked types.
5303     if (!isPackedVectorType(OpVT, DAG))
5304       return SDValue();
5305 
5306     return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5307   }
5308 
5309   if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5310     return SDValue();
5311 
5312   // Bitcasts between f16 and bf16 are legal.
5313   if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5314     return Op;
5315 
5316   assert(ArgVT == MVT::i16);
5317   SDLoc DL(Op);
5318 
5319   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5320   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5321   return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5322 }
5323 
5324 // Returns lane if Op extracts from a two-element vector and lane is constant
5325 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5326 static std::optional<uint64_t>
5327 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5328   SDNode *OpNode = Op.getNode();
5329   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5330     return std::nullopt;
5331 
5332   EVT VT = OpNode->getOperand(0).getValueType();
5333   ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5334   if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5335     return std::nullopt;
5336 
5337   return C->getZExtValue();
5338 }
5339 
5340 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5341                                    bool isSigned) {
5342   EVT VT = N.getValueType();
5343 
5344   if (N.getOpcode() != ISD::BUILD_VECTOR)
5345     return false;
5346 
5347   for (const SDValue &Elt : N->op_values()) {
5348     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5349       unsigned EltSize = VT.getScalarSizeInBits();
5350       unsigned HalfSize = EltSize / 2;
5351       if (isSigned) {
5352         if (!isIntN(HalfSize, C->getSExtValue()))
5353           return false;
5354       } else {
5355         if (!isUIntN(HalfSize, C->getZExtValue()))
5356           return false;
5357       }
5358       continue;
5359     }
5360     return false;
5361   }
5362 
5363   return true;
5364 }
5365 
5366 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5367   EVT VT = N.getValueType();
5368   assert(VT.is128BitVector() && "Unexpected vector MULL size");
5369   EVT HalfVT = EVT::getVectorVT(
5370       *DAG.getContext(),
5371       VT.getScalarType().getHalfSizedIntegerVT(*DAG.getContext()),
5372       VT.getVectorElementCount());
5373   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5374 }
5375 
5376 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5377   return N.getOpcode() == ISD::SIGN_EXTEND ||
5378          N.getOpcode() == ISD::ANY_EXTEND ||
5379          isExtendedBUILD_VECTOR(N, DAG, true);
5380 }
5381 
5382 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5383   return N.getOpcode() == ISD::ZERO_EXTEND ||
5384          N.getOpcode() == ISD::ANY_EXTEND ||
5385          isExtendedBUILD_VECTOR(N, DAG, false);
5386 }
5387 
5388 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5389   unsigned Opcode = N.getOpcode();
5390   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5391     SDValue N0 = N.getOperand(0);
5392     SDValue N1 = N.getOperand(1);
5393     return N0->hasOneUse() && N1->hasOneUse() &&
5394       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5395   }
5396   return false;
5397 }
5398 
5399 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5400   unsigned Opcode = N.getOpcode();
5401   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5402     SDValue N0 = N.getOperand(0);
5403     SDValue N1 = N.getOperand(1);
5404     return N0->hasOneUse() && N1->hasOneUse() &&
5405       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5406   }
5407   return false;
5408 }
5409 
5410 SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5411                                                  SelectionDAG &DAG) const {
5412   // The rounding mode is in bits 23:22 of the FPCR.
5413   // The mapping from the AArch64 rounding mode value to FLT_ROUNDS is
5414   // 0->1, 1->2, 2->3, 3->0. We implement this as ((FPCR + (1 << 22)) >> 22) & 3
5415   // so that the shift and AND get folded into a bitfield extract.
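  // For example (illustrative), with FPCR.RMode == 0b10 (round toward minus
  // infinity): ((2 << 22) + (1 << 22)) >> 22 == 3, and 3 & 3 == 3, which is the
  // FLT_ROUNDS value for rounding toward negative infinity.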
5416   SDLoc dl(Op);
5417 
5418   SDValue Chain = Op.getOperand(0);
5419   SDValue FPCR_64 = DAG.getNode(
5420       ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5421       {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5422   Chain = FPCR_64.getValue(1);
5423   SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5424   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5425                                   DAG.getConstant(1U << 22, dl, MVT::i32));
5426   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5427                               DAG.getConstant(22, dl, MVT::i32));
5428   SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5429                             DAG.getConstant(3, dl, MVT::i32));
5430   return DAG.getMergeValues({AND, Chain}, dl);
5431 }
5432 
5433 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5434                                                  SelectionDAG &DAG) const {
5435   SDLoc DL(Op);
5436   SDValue Chain = Op->getOperand(0);
5437   SDValue RMValue = Op->getOperand(1);
5438 
5439   // The rounding mode is in bits 23:22 of the FPCR.
5440   // The mapping from the llvm.set.rounding argument to the rounding mode in
5441   // FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5442   // ((arg - 1) & 3) << 22.
5443   //
5444   // The argument of llvm.set.rounding must be within the range [0, 3], so
5445   // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5446   // code that generates llvm.set.rounding to ensure this condition.
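  //
  // For example (illustrative), llvm.set.rounding(0) ("toward zero") computes
  // ((0 - 1) & 3) == 3, i.e. FPCR.RMode RZ, while llvm.set.rounding(2)
  // ("toward +infinity") computes ((2 - 1) & 3) == 1, i.e. FPCR.RMode RP.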
5447 
5448   // Calculate new value of FPCR[23:22].
5449   RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5450                         DAG.getConstant(1, DL, MVT::i32));
5451   RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5452                         DAG.getConstant(0x3, DL, MVT::i32));
5453   RMValue =
5454       DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5455                   DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5456   RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5457 
5458   // Get current value of FPCR.
5459   SDValue Ops[] = {
5460       Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5461   SDValue FPCR =
5462       DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5463   Chain = FPCR.getValue(1);
5464   FPCR = FPCR.getValue(0);
5465 
5466   // Put the new rounding mode into FPCR[23:22].
5467   const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5468   FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5469                      DAG.getConstant(RMMask, DL, MVT::i64));
5470   FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5471   SDValue Ops2[] = {
5472       Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5473       FPCR};
5474   return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5475 }
5476 
5477 SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5478                                                SelectionDAG &DAG) const {
5479   SDLoc DL(Op);
5480   SDValue Chain = Op->getOperand(0);
5481 
5482   // Get current value of FPCR.
5483   SDValue Ops[] = {
5484       Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5485   SDValue FPCR =
5486       DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5487   Chain = FPCR.getValue(1);
5488   FPCR = FPCR.getValue(0);
5489 
5490   // Truncate FPCR to 32 bits.
5491   SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5492 
5493   return DAG.getMergeValues({Result, Chain}, DL);
5494 }
5495 
5496 SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5497                                                SelectionDAG &DAG) const {
5498   SDLoc DL(Op);
5499   SDValue Chain = Op->getOperand(0);
5500   SDValue Mode = Op->getOperand(1);
5501 
5502   // Extend the specified value to 64 bits.
5503   SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5504 
5505   // Set new value of FPCR.
5506   SDValue Ops2[] = {
5507       Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5508   return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5509 }
5510 
5511 SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5512                                                  SelectionDAG &DAG) const {
5513   SDLoc DL(Op);
5514   SDValue Chain = Op->getOperand(0);
5515 
5516   // Get current value of FPCR.
5517   SDValue Ops[] = {
5518       Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5519   SDValue FPCR =
5520       DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5521   Chain = FPCR.getValue(1);
5522   FPCR = FPCR.getValue(0);
5523 
5524   // Clear bits that are not reserved.
5525   SDValue FPCRMasked = DAG.getNode(
5526       ISD::AND, DL, MVT::i64, FPCR,
5527       DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5528 
5529   // Set new value of FPCR.
5530   SDValue Ops2[] = {Chain,
5531                     DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5532                     FPCRMasked};
5533   return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5534 }
5535 
5536 static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5537                                  SDLoc DL, bool &IsMLA) {
5538   bool IsN0SExt = isSignExtended(N0, DAG);
5539   bool IsN1SExt = isSignExtended(N1, DAG);
5540   if (IsN0SExt && IsN1SExt)
5541     return AArch64ISD::SMULL;
5542 
5543   bool IsN0ZExt = isZeroExtended(N0, DAG);
5544   bool IsN1ZExt = isZeroExtended(N1, DAG);
5545 
5546   if (IsN0ZExt && IsN1ZExt)
5547     return AArch64ISD::UMULL;
5548 
5549   // Select UMULL if one operand is a zext and the other's top half is known zero.
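  // For example (illustrative), a v8i16 multiply of (zext v8i8 %a) by
  // (and %b, splat(0x00ff)) can still use UMULL: the AND is not an extend node,
  // but its top half is known to be zero.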
5550   EVT VT = N0.getValueType();
5551   unsigned EltSize = VT.getScalarSizeInBits();
5552   APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5553   if (IsN0ZExt || IsN1ZExt) {
5554     if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5555       return AArch64ISD::UMULL;
5556   } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5557              DAG.MaskedValueIsZero(N1, Mask)) {
5558     // For v2i64 we look more aggressively at whether both operands have their
5559     // top half known zero, to avoid scalarization.
5560     return AArch64ISD::UMULL;
5561   }
5562 
5563   if (IsN0SExt || IsN1SExt) {
5564     if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5565       return AArch64ISD::SMULL;
5566   } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5567              DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5568     return AArch64ISD::SMULL;
5569   }
5570 
5571   if (!IsN1SExt && !IsN1ZExt)
5572     return 0;
5573 
5574   // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5575   // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5576   if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5577     IsMLA = true;
5578     return AArch64ISD::SMULL;
5579   }
5580   if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5581     IsMLA = true;
5582     return AArch64ISD::UMULL;
5583   }
5584   if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5585     std::swap(N0, N1);
5586     IsMLA = true;
5587     return AArch64ISD::UMULL;
5588   }
5589   return 0;
5590 }
5591 
5592 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5593   EVT VT = Op.getValueType();
5594 
5595   bool OverrideNEON = !Subtarget->isNeonAvailable();
5596   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5597     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5598 
5599   // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5600   // that VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
5601   assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5602          "unexpected type for custom-lowering ISD::MUL");
5603   SDValue N0 = Op.getOperand(0);
5604   SDValue N1 = Op.getOperand(1);
5605   bool isMLA = false;
5606   EVT OVT = VT;
5607   if (VT.is64BitVector()) {
5608     if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5609         isNullConstant(N0.getOperand(1)) &&
5610         N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5611         isNullConstant(N1.getOperand(1))) {
5612       N0 = N0.getOperand(0);
5613       N1 = N1.getOperand(0);
5614       VT = N0.getValueType();
5615     } else {
5616       if (VT == MVT::v1i64) {
5617         if (Subtarget->hasSVE())
5618           return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5619         // Fall through to expand this.  It is not legal.
5620         return SDValue();
5621       } else
5622         // Other vector multiplications are legal.
5623         return Op;
5624     }
5625   }
5626 
5627   SDLoc DL(Op);
5628   unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5629 
5630   if (!NewOpc) {
5631     if (VT.getVectorElementType() == MVT::i64) {
5632       // If SVE is available then i64 vector multiplications can also be made
5633       // legal.
5634       if (Subtarget->hasSVE())
5635         return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5636       // Fall through to expand this.  It is not legal.
5637       return SDValue();
5638     } else
5639       // Other vector multiplications are legal.
5640       return Op;
5641   }
5642 
5643   // Legalize to an S/UMULL instruction.
5644   SDValue Op0;
5645   SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5646   if (!isMLA) {
5647     Op0 = skipExtensionForVectorMULL(N0, DAG);
5648     assert(Op0.getValueType().is64BitVector() &&
5649            Op1.getValueType().is64BitVector() &&
5650            "unexpected types for extended operands to VMULL");
5651     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5652                        DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5653                        DAG.getConstant(0, DL, MVT::i64));
5654   }
5655   // Optimize (zext A + zext B) * C into (S/UMULL A, C) + (S/UMULL B, C) during
5656   // isel lowering to take advantage of back-to-back s/umull + s/umlal with no
5657   // stall. This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
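  // For example (illustrative):
  //   mul (v8i16 (add (zext v8i8 %a), (zext v8i8 %b))), (zext v8i8 %c)
  // becomes add (umull %a, %c), (umull %b, %c), which the selector can then
  // emit as a umull followed by a umlal into the same accumulator.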
5658   SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5659   SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5660   EVT Op1VT = Op1.getValueType();
5661   return DAG.getNode(
5662       ISD::EXTRACT_SUBVECTOR, DL, OVT,
5663       DAG.getNode(N0.getOpcode(), DL, VT,
5664                   DAG.getNode(NewOpc, DL, VT,
5665                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5666                   DAG.getNode(NewOpc, DL, VT,
5667                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5668       DAG.getConstant(0, DL, MVT::i64));
5669 }
5670 
5671 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5672                                int Pattern) {
5673   if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5674     return DAG.getConstant(1, DL, MVT::nxv1i1);
5675   return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5676                      DAG.getTargetConstant(Pattern, DL, MVT::i32));
5677 }
5678 
5679 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5680                                          bool IsSigned, bool IsEqual) {
5681   if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5682       !isa<ConstantSDNode>(Op.getOperand(2)))
5683     return SDValue();
5684 
5685   SDLoc dl(Op);
5686   APInt X = Op.getConstantOperandAPInt(1);
5687   APInt Y = Op.getConstantOperandAPInt(2);
5688 
5689   // When the second operand is the maximum value, comparisons that include
5690   // equality can never fail and thus we can return an all active predicate.
5691   if (IsEqual)
5692     if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5693       return DAG.getConstant(1, dl, Op.getValueType());
5694 
5695   bool Overflow;
5696   APInt NumActiveElems =
5697       IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5698 
5699   if (Overflow)
5700     return SDValue();
5701 
5702   if (IsEqual) {
5703     APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5704     NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5705                               : NumActiveElems.uadd_ov(One, Overflow);
5706     if (Overflow)
5707       return SDValue();
5708   }
5709 
5710   std::optional<unsigned> PredPattern =
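  // For example (illustrative), whilelo(i64 0, i64 4) producing an nxv4i1
  // predicate has NumActiveElems == 4. With the default 128-bit minimum SVE
  // vector length this maps to the VL4 pattern, i.e. "ptrue p.s, vl4".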
5711       getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5712   unsigned MinSVEVectorSize = std::max(
5713       DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5714   unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5715   if (PredPattern != std::nullopt &&
5716       NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5717     return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5718 
5719   return SDValue();
5720 }
5721 
5722 // Returns a safe bitcast between two scalable vector predicates, where
5723 // any newly created lanes from a widening bitcast are defined as zero.
5724 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5725   SDLoc DL(Op);
5726   EVT InVT = Op.getValueType();
5727 
5728   assert(InVT.getVectorElementType() == MVT::i1 &&
5729          VT.getVectorElementType() == MVT::i1 &&
5730          "Expected a predicate-to-predicate bitcast");
5731   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5732          InVT.isScalableVector() &&
5733          DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5734          "Only expect to cast between legal scalable predicate types!");
5735 
5736   // Return the operand if the cast isn't changing type.
5737   if (InVT == VT)
5738     return Op;
5739 
5740   // Look through casts to <vscale x 16 x i1> when their input has more lanes
5741   // than VT. This will increase the chances of removing casts that introduce
5742   // new lanes, which have to be explicitly zero'd.
5743   if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5744       Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5745       Op.getOperand(1).getValueType().bitsGT(VT))
5746     Op = Op.getOperand(1);
5747 
5748   SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5749 
5750   // We only have to zero the lanes if new lanes are being defined, e.g. when
5751   // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5752   // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5753   // we can return here.
5754   if (InVT.bitsGT(VT))
5755     return Reinterpret;
5756 
5757   // Check if the other lanes are already known to be zeroed by
5758   // construction.
5759   if (isZeroingInactiveLanes(Op))
5760     return Reinterpret;
5761 
5762   // Zero the newly introduced lanes.
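  // For example (illustrative), when casting nxv2i1 -> nxv16i1 only every
  // eighth lane of the result corresponds to a source lane; the remaining
  // lanes are newly introduced and are cleared by the AND below.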
5763   SDValue Mask = DAG.getConstant(1, DL, InVT);
5764   Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5765   return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5766 }
5767 
5768 SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5769                                                   SDValue Chain, SDLoc DL,
5770                                                   EVT VT) const {
5771   SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5772                                          getPointerTy(DAG.getDataLayout()));
5773   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5774   Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5775   TargetLowering::CallLoweringInfo CLI(DAG);
5776   ArgListTy Args;
5777   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5778       CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5779       RetTy, Callee, std::move(Args));
5780   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5781   SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5782   return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5783                      Mask);
5784 }
5785 
5786 // Lower an SME LDR/STR ZA intrinsic
5787 // Case 1: If the vector number (vecnum) is an immediate in range, it gets
5788 // folded into the instruction
5789 //    ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5790 // Case 2: If the vecnum is not an immediate, then it is used to modify the base
5791 // and tile slice registers
5792 //    ldr(%tileslice, %ptr, %vecnum)
5793 //    ->
5794 //    %svl = rdsvl
5795 //    %ptr2 = %ptr + %svl * %vecnum
5796 //    %tileslice2 = %tileslice + %vecnum
5797 //    ldr [%tileslice2, 0], [%ptr2, 0]
5798 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5799 // case 2, but the base and slice registers are modified by the largest
5800 // multiple of 16 not exceeding the vecnum, and the remainder is folded into
5801 // the instruction. This means that successive loads and stores that are offset
5802 // from each other can share the same base and slice register updates.
5803 //    ldr(%tileslice, %ptr, 22)
5804 //    ldr(%tileslice, %ptr, 23)
5805 //    ->
5806 //    %svl = rdsvl
5807 //    %ptr2 = %ptr + %svl * 16
5808 //    %tileslice2 = %tileslice + 16
5809 //    ldr [%tileslice2, 6], [%ptr2, 6]
5810 //    ldr [%tileslice2, 7], [%ptr2, 7]
5811 // Case 4: If the vecnum is an add of an immediate, then the non-immediate
5812 // operand and the immediate can be folded into the instruction, like case 2.
5813 //    ldr(%tileslice, %ptr, %vecnum + 7)
5814 //    ldr(%tileslice, %ptr, %vecnum + 8)
5815 //    ->
5816 //    %svl = rdsvl
5817 //    %ptr2 = %ptr + %svl * %vecnum
5818 //    %tileslice2 = %tileslice + %vecnum
5819 //    ldr [%tileslice2, 7], [%ptr2, 7]
5820 //    ldr [%tileslice2, 8], [%ptr2, 8]
5821 // Case 5: The vecnum being an add of an immediate out of range is also handled,
5822 // in which case the same remainder logic as case 3 is used.
5823 SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5824   SDLoc DL(N);
5825 
5826   SDValue TileSlice = N->getOperand(2);
5827   SDValue Base = N->getOperand(3);
5828   SDValue VecNum = N->getOperand(4);
5829   int32_t ConstAddend = 0;
5830   SDValue VarAddend = VecNum;
5831 
5832   // If the vnum is an add of an immediate, we can fold it into the instruction
5833   if (VecNum.getOpcode() == ISD::ADD &&
5834       isa<ConstantSDNode>(VecNum.getOperand(1))) {
5835     ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5836     VarAddend = VecNum.getOperand(0);
5837   } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5838     ConstAddend = ImmNode->getSExtValue();
5839     VarAddend = SDValue();
5840   }
5841 
5842   int32_t ImmAddend = ConstAddend % 16;
5843   if (int32_t C = (ConstAddend - ImmAddend)) {
5844     SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5845     VarAddend = VarAddend
5846                     ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5847                     : CVal;
5848   }
5849 
5850   if (VarAddend) {
5851     // Get the vector length that will be multiplied by vnum
5852     auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5853                            DAG.getConstant(1, DL, MVT::i32));
5854 
5855     // Multiply SVL and vnum then add it to the base
5856     SDValue Mul = DAG.getNode(
5857         ISD::MUL, DL, MVT::i64,
5858         {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5859     Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5860     // Just add vnum to the tileslice
5861     TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5862   }
5863 
5864   return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5865                      DL, MVT::Other,
5866                      {/*Chain=*/N.getOperand(0), TileSlice, Base,
5867                       DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5868 }
5869 
5870 SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5871   SDLoc dl(Op);
5872   SDValue ID =
5873       DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
5874 
5875   auto Op1 = Op.getOperand(1);
5876   auto Op2 = Op.getOperand(2);
5877   auto Mask = Op.getOperand(3);
5878 
5879   EVT Op1VT = Op1.getValueType();
5880   EVT Op2VT = Op2.getValueType();
5881   EVT ResVT = Op.getValueType();
5882 
5883   assert((Op1VT.getVectorElementType() == MVT::i8 ||
5884           Op1VT.getVectorElementType() == MVT::i16) &&
5885          "Expected 8-bit or 16-bit characters.");
5886 
5887   // Scalable vector type used to wrap operands.
5888   // A single container is enough for both operands because ultimately the
5889   // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5890   EVT OpContainerVT = Op1VT.isScalableVector()
5891                           ? Op1VT
5892                           : getContainerForFixedLengthVector(DAG, Op1VT);
5893 
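  // For example (illustrative), a fixed-length match on v16i8 operands is
  // wrapped into nxv16i8 containers, carried out as an SVE MATCH producing an
  // nxv16i1 predicate, and then narrowed back to the fixed-length mask type.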
5894   if (Op2VT.is128BitVector()) {
5895     // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5896     Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5897     // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5898     if (ResVT.isScalableVector())
5899       Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
5900                         DAG.getTargetConstant(0, dl, MVT::i64));
5901   } else {
5902     // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5903     unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5904     MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5905     EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5906     Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5907     Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
5908                       DAG.getConstant(0, dl, MVT::i64));
5909     Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
5910     Op2 = DAG.getBitcast(OpContainerVT, Op2);
5911   }
5912 
5913   // If the result is scalable, we just need to carry out the MATCH.
5914   if (ResVT.isScalableVector())
5915     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
5916 
5917   // If the result is fixed, we can still use MATCH but we need to wrap the
5918   // first operand and the mask in scalable vectors before doing so.
5919 
5920   // Wrap the operands.
5921   Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5922   Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
5923   Mask = convertFixedMaskToScalableVector(Mask, DAG);
5924 
5925   // Carry out the match.
5926   SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
5927                               ID, Mask, Op1, Op2);
5928 
5929   // Extract and promote the match result (nxv16i1/nxv8i1) to the fixed-length
5930   // mask type ResVT (v16i1/v8i1).
5931   Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
5932   Match = convertFromScalableVector(DAG, Op1VT, Match);
5933   return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
5934 }
5935 
5936 SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5937                                                    SelectionDAG &DAG) const {
5938   unsigned IntNo = Op.getConstantOperandVal(1);
5939   SDLoc DL(Op);
5940   switch (IntNo) {
5941   default:
5942     return SDValue(); // Don't custom lower most intrinsics.
5943   case Intrinsic::aarch64_prefetch: {
5944     SDValue Chain = Op.getOperand(0);
5945     SDValue Addr = Op.getOperand(2);
5946 
5947     unsigned IsWrite = Op.getConstantOperandVal(3);
5948     unsigned Locality = Op.getConstantOperandVal(4);
5949     unsigned IsStream = Op.getConstantOperandVal(5);
5950     unsigned IsData = Op.getConstantOperandVal(6);
5951     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
5952                      (!IsData << 3) |    // IsDataCache bit
5953                      (Locality << 1) |   // Cache level bits
5954                      (unsigned)IsStream; // Stream bit
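    // For example (illustrative), a plain data-read prefetch (IsWrite = 0,
    // Locality = 0, IsStream = 0, IsData = 1) encodes as PrfOp == 0b00000,
    // i.e. PLDL1KEEP.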
5955 
5956     return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5957                        DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5958   }
5959   case Intrinsic::aarch64_sme_str:
5960   case Intrinsic::aarch64_sme_ldr: {
5961     return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5962   }
5963   case Intrinsic::aarch64_sme_za_enable:
5964     return DAG.getNode(
5965         AArch64ISD::SMSTART, DL, MVT::Other,
5966         Op->getOperand(0), // Chain
5967         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5968         DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5969   case Intrinsic::aarch64_sme_za_disable:
5970     return DAG.getNode(
5971         AArch64ISD::SMSTOP, DL, MVT::Other,
5972         Op->getOperand(0), // Chain
5973         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5974         DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5975   }
5976 }
5977 
5978 SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5979                                                       SelectionDAG &DAG) const {
5980   unsigned IntNo = Op.getConstantOperandVal(1);
5981   SDLoc DL(Op);
5982   switch (IntNo) {
5983   default:
5984     return SDValue(); // Don't custom lower most intrinsics.
5985   case Intrinsic::aarch64_mops_memset_tag: {
5986     auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5987     SDValue Chain = Node->getChain();
5988     SDValue Dst = Op.getOperand(2);
5989     SDValue Val = Op.getOperand(3);
5990     Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5991     SDValue Size = Op.getOperand(4);
5992     auto Alignment = Node->getMemOperand()->getAlign();
5993     bool IsVol = Node->isVolatile();
5994     auto DstPtrInfo = Node->getPointerInfo();
5995 
5996     const auto &SDI =
5997         static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5998     SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5999                               Chain, Dst, Val, Size, Alignment, IsVol,
6000                               DstPtrInfo, MachinePointerInfo{});
6001 
6002     // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6003     // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6004     // LowerOperationWrapper will complain that the number of results has
6005     // changed.
6006     return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6007   }
6008   }
6009 }
6010 
6011 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6012                                                      SelectionDAG &DAG) const {
6013   unsigned IntNo = Op.getConstantOperandVal(0);
6014   SDLoc dl(Op);
6015   switch (IntNo) {
6016   default: return SDValue();    // Don't custom lower most intrinsics.
6017   case Intrinsic::thread_pointer: {
6018     EVT PtrVT = getPointerTy(DAG.getDataLayout());
6019     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
6020   }
6021   case Intrinsic::aarch64_neon_abs: {
6022     EVT Ty = Op.getValueType();
6023     if (Ty == MVT::i64) {
6024       SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
6025                                    Op.getOperand(1));
6026       Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
6027       return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
6028     } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6029       return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
6030     } else {
6031       report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6032     }
6033   }
6034   case Intrinsic::aarch64_neon_pmull64: {
6035     SDValue LHS = Op.getOperand(1);
6036     SDValue RHS = Op.getOperand(2);
6037 
6038     std::optional<uint64_t> LHSLane =
6039         getConstantLaneNumOfExtractHalfOperand(LHS);
6040     std::optional<uint64_t> RHSLane =
6041         getConstantLaneNumOfExtractHalfOperand(RHS);
6042 
6043     assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6044     assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6045 
6046     // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
6047     // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6048     // which ISel recognizes better. For example, this generates an ldr into d*
6049     // registers rather than a GPR load followed by a fmov.
6050     auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6051                                   std::optional<uint64_t> OtherLane,
6052                                   const SDLoc &dl,
6053                                   SelectionDAG &DAG) -> SDValue {
6054       // If the operand is a higher half itself, rewrite it to
6055       // extract_high_v2i64; this way aarch64_neon_pmull64 can
6056       // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6057       if (NLane && *NLane == 1)
6058         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6059                            N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
6060 
6061       // Operand N is not a higher half but the other operand is.
6062       if (OtherLane && *OtherLane == 1) {
6063         // If this operand is a lower half, rewrite it to
6064         // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6065         // align lanes of two operands. A roundtrip sequence (to move from lane
6066         // 1 to lane 0) is like this:
6067         //   mov x8, v0.d[1]
6068         //   fmov d0, x8
6069         if (NLane && *NLane == 0)
6070           return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6071                              DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
6072                                          N.getOperand(0),
6073                                          DAG.getConstant(0, dl, MVT::i64)),
6074                              DAG.getConstant(1, dl, MVT::i64));
6075 
6076         // Otherwise just dup from main to all lanes.
6077         return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
6078       }
6079 
6080       // Neither operand is an extract of the higher half, so codegen may just use
6081       // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
6082       assert(N.getValueType() == MVT::i64 &&
6083              "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6084       return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
6085     };
6086 
6087     LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
6088     RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
6089 
6090     return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
6091   }
6092   case Intrinsic::aarch64_neon_smax:
6093     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
6094                        Op.getOperand(1), Op.getOperand(2));
6095   case Intrinsic::aarch64_neon_umax:
6096     return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
6097                        Op.getOperand(1), Op.getOperand(2));
6098   case Intrinsic::aarch64_neon_smin:
6099     return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
6100                        Op.getOperand(1), Op.getOperand(2));
6101   case Intrinsic::aarch64_neon_umin:
6102     return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
6103                        Op.getOperand(1), Op.getOperand(2));
6104   case Intrinsic::aarch64_neon_scalar_sqxtn:
6105   case Intrinsic::aarch64_neon_scalar_sqxtun:
6106   case Intrinsic::aarch64_neon_scalar_uqxtn: {
6107     assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6108     if (Op.getValueType() == MVT::i32)
6109       return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6110                          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
6111                                      Op.getOperand(0),
6112                                      DAG.getNode(ISD::BITCAST, dl, MVT::f64,
6113                                                  Op.getOperand(1))));
6114     return SDValue();
6115   }
6116   case Intrinsic::aarch64_neon_sqxtn:
6117     return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6118                        Op.getOperand(1));
6119   case Intrinsic::aarch64_neon_sqxtun:
6120     return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6121                        Op.getOperand(1));
6122   case Intrinsic::aarch64_neon_uqxtn:
6123     return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6124                        Op.getOperand(1));
6125   case Intrinsic::aarch64_neon_sqshrn:
6126     if (Op.getValueType().isVector())
6127       return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6128                          DAG.getNode(AArch64ISD::VASHR, dl,
6129                                      Op.getOperand(1).getValueType(),
6130                                      Op.getOperand(1), Op.getOperand(2)));
6131     return SDValue();
6132   case Intrinsic::aarch64_neon_sqshrun:
6133     if (Op.getValueType().isVector())
6134       return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6135                          DAG.getNode(AArch64ISD::VASHR, dl,
6136                                      Op.getOperand(1).getValueType(),
6137                                      Op.getOperand(1), Op.getOperand(2)));
6138     return SDValue();
6139   case Intrinsic::aarch64_neon_uqshrn:
6140     if (Op.getValueType().isVector())
6141       return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6142                          DAG.getNode(AArch64ISD::VLSHR, dl,
6143                                      Op.getOperand(1).getValueType(),
6144                                      Op.getOperand(1), Op.getOperand(2)));
6145     return SDValue();
6146   case Intrinsic::aarch64_neon_sqrshrn:
6147     if (Op.getValueType().isVector())
6148       return DAG.getNode(
6149           ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6150           DAG.getNode(
6151               AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6152               Op.getOperand(1), Op.getOperand(2)));
6153     return SDValue();
6154   case Intrinsic::aarch64_neon_sqrshrun:
6155     if (Op.getValueType().isVector())
6156       return DAG.getNode(
6157           ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6158           DAG.getNode(
6159               AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6160               Op.getOperand(1), Op.getOperand(2)));
6161     return SDValue();
6162   case Intrinsic::aarch64_neon_uqrshrn:
6163     if (Op.getValueType().isVector())
6164       return DAG.getNode(
6165           ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6166           DAG.getNode(
6167               AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
6168     return SDValue();
6169   case Intrinsic::aarch64_sve_whilelo:
6170     return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6171                                      /*IsEqual=*/false);
6172   case Intrinsic::aarch64_sve_whilelt:
6173     return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6174                                      /*IsEqual=*/false);
6175   case Intrinsic::aarch64_sve_whilels:
6176     return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6177                                      /*IsEqual=*/true);
6178   case Intrinsic::aarch64_sve_whilele:
6179     return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6180                                      /*IsEqual=*/true);
6181   case Intrinsic::aarch64_sve_sunpkhi:
6182     return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
6183                        Op.getOperand(1));
6184   case Intrinsic::aarch64_sve_sunpklo:
6185     return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
6186                        Op.getOperand(1));
6187   case Intrinsic::aarch64_sve_uunpkhi:
6188     return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
6189                        Op.getOperand(1));
6190   case Intrinsic::aarch64_sve_uunpklo:
6191     return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
6192                        Op.getOperand(1));
6193   case Intrinsic::aarch64_sve_clasta_n:
6194     return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
6195                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6196   case Intrinsic::aarch64_sve_clastb_n:
6197     return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
6198                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6199   case Intrinsic::aarch64_sve_lasta:
6200     return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
6201                        Op.getOperand(1), Op.getOperand(2));
6202   case Intrinsic::aarch64_sve_lastb:
6203     return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
6204                        Op.getOperand(1), Op.getOperand(2));
6205   case Intrinsic::aarch64_sve_rev:
6206     return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
6207                        Op.getOperand(1));
6208   case Intrinsic::aarch64_sve_tbl:
6209     return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
6210                        Op.getOperand(1), Op.getOperand(2));
6211   case Intrinsic::aarch64_sve_trn1:
6212     return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
6213                        Op.getOperand(1), Op.getOperand(2));
6214   case Intrinsic::aarch64_sve_trn2:
6215     return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
6216                        Op.getOperand(1), Op.getOperand(2));
6217   case Intrinsic::aarch64_sve_uzp1:
6218     return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
6219                        Op.getOperand(1), Op.getOperand(2));
6220   case Intrinsic::aarch64_sve_uzp2:
6221     return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
6222                        Op.getOperand(1), Op.getOperand(2));
6223   case Intrinsic::aarch64_sve_zip1:
6224     return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
6225                        Op.getOperand(1), Op.getOperand(2));
6226   case Intrinsic::aarch64_sve_zip2:
6227     return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
6228                        Op.getOperand(1), Op.getOperand(2));
6229   case Intrinsic::aarch64_sve_splice:
6230     return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
6231                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6232   case Intrinsic::aarch64_sve_ptrue:
6233     return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
6234   case Intrinsic::aarch64_sve_clz:
6235     return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
6236                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
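       // The SME cntsb/cntsh/cntsw/cntsd intrinsics return the streaming
       // vector length in bytes, halfwords, words and doublewords
       // respectively. RDSVL #1 yields the length in bytes; the narrower
       // counts are derived from it with a right shift by 1, 2 or 3.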
6237   case Intrinsic::aarch64_sme_cntsb:
6238     return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6239                        DAG.getConstant(1, dl, MVT::i32));
6240   case Intrinsic::aarch64_sme_cntsh: {
6241     SDValue One = DAG.getConstant(1, dl, MVT::i32);
6242     SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
6243     return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
6244   }
6245   case Intrinsic::aarch64_sme_cntsw: {
6246     SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6247                                 DAG.getConstant(1, dl, MVT::i32));
6248     return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6249                        DAG.getConstant(2, dl, MVT::i32));
6250   }
6251   case Intrinsic::aarch64_sme_cntsd: {
6252     SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6253                                 DAG.getConstant(1, dl, MVT::i32));
6254     return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6255                        DAG.getConstant(3, dl, MVT::i32));
6256   }
6257   case Intrinsic::aarch64_sve_cnt: {
6258     SDValue Data = Op.getOperand(3);
6259     // CTPOP only supports integer operands.
6260     if (Data.getValueType().isFloatingPoint())
6261       Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
6262     return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
6263                        Op.getOperand(2), Data, Op.getOperand(1));
6264   }
6265   case Intrinsic::aarch64_sve_dupq_lane:
6266     return LowerDUPQLane(Op, DAG);
6267   case Intrinsic::aarch64_sve_convert_from_svbool:
6268     if (Op.getValueType() == MVT::aarch64svcount)
6269       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
6270     return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6271   case Intrinsic::aarch64_sve_convert_to_svbool:
6272     if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6273       return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
6274     return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6275   case Intrinsic::aarch64_sve_fneg:
6276     return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6277                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6278   case Intrinsic::aarch64_sve_frintp:
6279     return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
6280                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6281   case Intrinsic::aarch64_sve_frintm:
6282     return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
6283                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6284   case Intrinsic::aarch64_sve_frinti:
6285     return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6286                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6287   case Intrinsic::aarch64_sve_frintx:
6288     return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6289                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6290   case Intrinsic::aarch64_sve_frinta:
6291     return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
6292                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6293   case Intrinsic::aarch64_sve_frintn:
6294     return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
6295                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6296   case Intrinsic::aarch64_sve_frintz:
6297     return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
6298                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6299   case Intrinsic::aarch64_sve_ucvtf:
6300     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
6301                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6302                        Op.getOperand(1));
6303   case Intrinsic::aarch64_sve_scvtf:
6304     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
6305                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6306                        Op.getOperand(1));
6307   case Intrinsic::aarch64_sve_fcvtzu:
6308     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
6309                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6310                        Op.getOperand(1));
6311   case Intrinsic::aarch64_sve_fcvtzs:
6312     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
6313                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6314                        Op.getOperand(1));
6315   case Intrinsic::aarch64_sve_fsqrt:
6316     return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
6317                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6318   case Intrinsic::aarch64_sve_frecpx:
6319     return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
6320                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6321   case Intrinsic::aarch64_sve_frecpe_x:
6322     return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
6323                        Op.getOperand(1));
6324   case Intrinsic::aarch64_sve_frecps_x:
6325     return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
6326                        Op.getOperand(1), Op.getOperand(2));
6327   case Intrinsic::aarch64_sve_frsqrte_x:
6328     return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
6329                        Op.getOperand(1));
6330   case Intrinsic::aarch64_sve_frsqrts_x:
6331     return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
6332                        Op.getOperand(1), Op.getOperand(2));
6333   case Intrinsic::aarch64_sve_fabs:
6334     return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6335                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6336   case Intrinsic::aarch64_sve_abs:
6337     return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6338                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6339   case Intrinsic::aarch64_sve_neg:
6340     return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6341                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6342   case Intrinsic::aarch64_sve_insr: {
6343     SDValue Scalar = Op.getOperand(2);
6344     EVT ScalarTy = Scalar.getValueType();
6345     if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6346       Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
6347 
6348     return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
6349                        Op.getOperand(1), Scalar);
6350   }
6351   case Intrinsic::aarch64_sve_rbit:
6352     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
6353                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6354                        Op.getOperand(1));
6355   case Intrinsic::aarch64_sve_revb:
6356     return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
6357                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6358   case Intrinsic::aarch64_sve_revh:
6359     return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
6360                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6361   case Intrinsic::aarch64_sve_revw:
6362     return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
6363                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6364   case Intrinsic::aarch64_sve_revd:
6365     return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
6366                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
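       // The SVE sxt/uxt intrinsics are modelled as merging in-register
       // extensions: the extra value-type operand records the narrow element
       // type (i8, i16 or i32) that each lane is extended from.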
6367   case Intrinsic::aarch64_sve_sxtb:
6368     return DAG.getNode(
6369         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6370         Op.getOperand(2), Op.getOperand(3),
6371         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6372         Op.getOperand(1));
6373   case Intrinsic::aarch64_sve_sxth:
6374     return DAG.getNode(
6375         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6376         Op.getOperand(2), Op.getOperand(3),
6377         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6378         Op.getOperand(1));
6379   case Intrinsic::aarch64_sve_sxtw:
6380     return DAG.getNode(
6381         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6382         Op.getOperand(2), Op.getOperand(3),
6383         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6384         Op.getOperand(1));
6385   case Intrinsic::aarch64_sve_uxtb:
6386     return DAG.getNode(
6387         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6388         Op.getOperand(2), Op.getOperand(3),
6389         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6390         Op.getOperand(1));
6391   case Intrinsic::aarch64_sve_uxth:
6392     return DAG.getNode(
6393         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6394         Op.getOperand(2), Op.getOperand(3),
6395         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6396         Op.getOperand(1));
6397   case Intrinsic::aarch64_sve_uxtw:
6398     return DAG.getNode(
6399         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6400         Op.getOperand(2), Op.getOperand(3),
6401         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6402         Op.getOperand(1));
6403   case Intrinsic::localaddress: {
6404     const auto &MF = DAG.getMachineFunction();
6405     const auto *RegInfo = Subtarget->getRegisterInfo();
6406     unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6407     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
6408                               Op.getSimpleValueType());
6409   }
6410 
6411   case Intrinsic::eh_recoverfp: {
6412     // FIXME: This needs to be implemented to correctly handle highly aligned
6413     // stack objects. For now we simply return the incoming FP. Refer to
6414     // D53541 for more details.
6415     SDValue FnOp = Op.getOperand(1);
6416     SDValue IncomingFPOp = Op.getOperand(2);
6417     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6418     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6419     if (!Fn)
6420       report_fatal_error(
6421           "llvm.eh.recoverfp must take a function as the first argument");
6422     return IncomingFPOp;
6423   }
6424 
6425   case Intrinsic::aarch64_neon_vsri:
6426   case Intrinsic::aarch64_neon_vsli:
6427   case Intrinsic::aarch64_sve_sri:
6428   case Intrinsic::aarch64_sve_sli: {
6429     EVT Ty = Op.getValueType();
6430 
6431     if (!Ty.isVector())
6432       report_fatal_error("Unexpected type for aarch64_neon_vsli");
6433 
6434     assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6435 
6436     bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6437                         IntNo == Intrinsic::aarch64_sve_sri;
6438     unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6439     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
6440                        Op.getOperand(3));
6441   }
6442 
6443   case Intrinsic::aarch64_neon_srhadd:
6444   case Intrinsic::aarch64_neon_urhadd:
6445   case Intrinsic::aarch64_neon_shadd:
6446   case Intrinsic::aarch64_neon_uhadd: {
6447     bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6448                         IntNo == Intrinsic::aarch64_neon_shadd);
6449     bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6450                           IntNo == Intrinsic::aarch64_neon_urhadd);
6451     unsigned Opcode = IsSignedAdd
6452                           ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6453                           : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6454     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6455                        Op.getOperand(2));
6456   }
6457   case Intrinsic::aarch64_neon_saddlp:
6458   case Intrinsic::aarch64_neon_uaddlp: {
6459     unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6460                           ? AArch64ISD::UADDLP
6461                           : AArch64ISD::SADDLP;
6462     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6463   }
6464   case Intrinsic::aarch64_neon_sdot:
6465   case Intrinsic::aarch64_neon_udot:
6466   case Intrinsic::aarch64_sve_sdot:
6467   case Intrinsic::aarch64_sve_udot: {
6468     unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6469                        IntNo == Intrinsic::aarch64_sve_udot)
6470                           ? AArch64ISD::UDOT
6471                           : AArch64ISD::SDOT;
6472     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6473                        Op.getOperand(2), Op.getOperand(3));
6474   }
6475   case Intrinsic::aarch64_neon_usdot:
6476   case Intrinsic::aarch64_sve_usdot: {
6477     return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6478                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6479   }
6480   case Intrinsic::get_active_lane_mask: {
6481     SDValue ID =
6482         DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6483 
6484     EVT VT = Op.getValueType();
6485     if (VT.isScalableVector())
6486       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6487                          Op.getOperand(2));
6488 
6489     // We can use the SVE whilelo instruction to lower this intrinsic by
6490     // creating the appropriate sequence of scalable vector operations and
6491     // then extracting a fixed-width subvector from the scalable vector.
6492 
6493     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6494     EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6495 
6496     SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6497                                Op.getOperand(1), Op.getOperand(2));
6498     SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6499     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6500                        DAG.getVectorIdxConstant(0, dl));
6501   }
6502   case Intrinsic::aarch64_neon_saddlv:
6503   case Intrinsic::aarch64_neon_uaddlv: {
6504     EVT OpVT = Op.getOperand(1).getValueType();
6505     EVT ResVT = Op.getValueType();
6506     assert(
6507         ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6508                                 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6509          (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6510         "Unexpected aarch64_neon_u/saddlv type");
6511     (void)OpVT;
6512     // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6513     SDValue ADDLV = DAG.getNode(
6514         IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6515                                                 : AArch64ISD::SADDLV,
6516         dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6517     SDValue EXTRACT_VEC_ELT = DAG.getNode(
6518         ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6519         ADDLV, DAG.getConstant(0, dl, MVT::i64));
6520     return EXTRACT_VEC_ELT;
6521   }
6522   case Intrinsic::experimental_cttz_elts: {
6523     SDValue CttzOp = Op.getOperand(1);
6524     EVT VT = CttzOp.getValueType();
6525     assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6526 
6527     if (VT.isFixedLengthVector()) {
6528       // We can use SVE instructions to lower this intrinsic by first creating
6529       // an SVE predicate register mask from the fixed-width vector.
6530       EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6531       SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6532       CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6533     }
6534 
6535     SDValue NewCttzElts =
6536         DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6537     return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6538   }
6539   case Intrinsic::experimental_vector_match: {
6540     return LowerVectorMatch(Op, DAG);
6541   }
6542   }
6543 }
6544 
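     // Gather/scatter index vectors with i8 or i16 elements are widened to
     // i32 here, since SVE gather/scatter addressing modes only provide
     // 32-bit and 64-bit index element forms.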
6545 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6546   if (VT.getVectorElementType() == MVT::i8 ||
6547       VT.getVectorElementType() == MVT::i16) {
6548     EltTy = MVT::i32;
6549     return true;
6550   }
6551   return false;
6552 }
6553 
6554 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6555                                                           EVT DataVT) const {
6556   const EVT IndexVT = Extend.getOperand(0).getValueType();
6557   // SVE only supports implicit extension of 32-bit indices.
6558   if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6559     return false;
6560 
6561   // Indices cannot be smaller than the main data type.
6562   if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6563     return false;
6564 
6565   // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6566   // element container type, which would violate the previous clause.
6567   return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6568 }
6569 
6570 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6571   EVT ExtVT = ExtVal.getValueType();
6572   if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6573     return false;
6574 
6575   // It may be worth creating extending masked loads if there are multiple
6576   // masked loads using the same predicate. That way we'll end up creating
6577   // extending masked loads that may then get split by the legaliser. This
6578   // results in just one set of predicate unpacks at the start, instead of
6579   // multiple sets of vector unpacks after each load.
6580   if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6581     if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6582       // Disable extending masked loads for fixed-width for now, since the code
6583       // quality doesn't look great.
6584       if (!ExtVT.isScalableVector())
6585         return false;
6586 
6587       unsigned NumExtMaskedLoads = 0;
6588       for (auto *U : Ld->getMask()->users())
6589         if (isa<MaskedLoadSDNode>(U))
6590           NumExtMaskedLoads++;
6591 
6592       if (NumExtMaskedLoads <= 1)
6593         return false;
6594     }
6595   }
6596 
6597   return true;
6598 }
6599 
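     // Map the (scaled, signed, extend) triple describing a gather's
     // addressing mode onto the corresponding GLD1 opcode. For non-extending
     // (64-bit) indices the signedness is irrelevant, which is why the signed
     // and unsigned keys map to the same opcode in those rows.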
6600 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6601   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6602       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6603        AArch64ISD::GLD1_MERGE_ZERO},
6604       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6605        AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6606       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6607        AArch64ISD::GLD1_MERGE_ZERO},
6608       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6609        AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6610       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6611        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6612       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6613        AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6614       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6615        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6616       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6617        AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6618   };
6619   auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6620   return AddrModes.find(Key)->second;
6621 }
6622 
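     // Return the sign-extending (GLD1S) equivalent of a zero-extending GLD1
     // gather opcode, for use when the loaded elements need to be
     // sign-extended rather than zero-extended.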
6623 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6624   switch (Opcode) {
6625   default:
6626     llvm_unreachable("unimplemented opcode");
6627     return Opcode;
6628   case AArch64ISD::GLD1_MERGE_ZERO:
6629     return AArch64ISD::GLD1S_MERGE_ZERO;
6630   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6631     return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6632   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6633     return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6634   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6635     return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6636   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6637     return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6638   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6639     return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6640   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6641     return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6642   }
6643 }
6644 
6645 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6646                                             SelectionDAG &DAG) const {
6647   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6648 
6649   SDLoc DL(Op);
6650   SDValue Chain = MGT->getChain();
6651   SDValue PassThru = MGT->getPassThru();
6652   SDValue Mask = MGT->getMask();
6653   SDValue BasePtr = MGT->getBasePtr();
6654   SDValue Index = MGT->getIndex();
6655   SDValue Scale = MGT->getScale();
6656   EVT VT = Op.getValueType();
6657   EVT MemVT = MGT->getMemoryVT();
6658   ISD::LoadExtType ExtType = MGT->getExtensionType();
6659   ISD::MemIndexType IndexType = MGT->getIndexType();
6660 
6661   // SVE supports zero (and so undef) passthrough values only; everything else
6662   // must be handled manually by an explicit select on the load's output.
6663   if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6664     SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6665     SDValue Load =
6666         DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6667                             MGT->getMemOperand(), IndexType, ExtType);
6668     SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6669     return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6670   }
6671 
6672   bool IsScaled = MGT->isIndexScaled();
6673   bool IsSigned = MGT->isIndexSigned();
6674 
6675   // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6676   // must be calculated beforehand.
6677   uint64_t ScaleVal = Scale->getAsZExtVal();
6678   if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6679     assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6680     EVT IndexVT = Index.getValueType();
6681     Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6682                         DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6683     Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6684 
6685     SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6686     return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6687                                MGT->getMemOperand(), IndexType, ExtType);
6688   }
6689 
6690   // Lower fixed length gather to a scalable equivalent.
6691   if (VT.isFixedLengthVector()) {
6692     assert(Subtarget->useSVEForFixedLengthVectors() &&
6693            "Cannot lower when not using SVE for fixed vectors!");
6694 
6695     // NOTE: Handle floating-point as if integer then bitcast the result.
6696     EVT DataVT = VT.changeVectorElementTypeToInteger();
6697     MemVT = MemVT.changeVectorElementTypeToInteger();
6698 
6699     // Find the smallest integer fixed length vector we can use for the gather.
6700     EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6701     if (DataVT.getVectorElementType() == MVT::i64 ||
6702         Index.getValueType().getVectorElementType() == MVT::i64 ||
6703         Mask.getValueType().getVectorElementType() == MVT::i64)
6704       PromotedVT = VT.changeVectorElementType(MVT::i64);
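         // For example, v4i16 data gathered with v4i32 indices is performed
         // as an nxv4i32 gather and the result truncated back to v4i16 below.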
6705 
6706     // Promote vector operands except for passthrough, which we know is either
6707     // undef or zero, and thus best constructed directly.
6708     unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6709     Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6710     Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6711 
6712     // A promoted result type forces the need for an extending load.
6713     if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6714       ExtType = ISD::EXTLOAD;
6715 
6716     EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6717 
6718     // Convert fixed length vector operands to scalable.
6719     MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6720     Index = convertToScalableVector(DAG, ContainerVT, Index);
6721     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6722     PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6723                                    : DAG.getConstant(0, DL, ContainerVT);
6724 
6725     // Emit equivalent scalable vector gather.
6726     SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6727     SDValue Load =
6728         DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6729                             Ops, MGT->getMemOperand(), IndexType, ExtType);
6730 
6731     // Extract fixed length data then convert to the required result type.
6732     SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6733     Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6734     if (VT.isFloatingPoint())
6735       Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6736 
6737     return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6738   }
6739 
6740   // Everything else is legal.
6741   return Op;
6742 }
6743 
6744 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6745                                              SelectionDAG &DAG) const {
6746   MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6747 
6748   SDLoc DL(Op);
6749   SDValue Chain = MSC->getChain();
6750   SDValue StoreVal = MSC->getValue();
6751   SDValue Mask = MSC->getMask();
6752   SDValue BasePtr = MSC->getBasePtr();
6753   SDValue Index = MSC->getIndex();
6754   SDValue Scale = MSC->getScale();
6755   EVT VT = StoreVal.getValueType();
6756   EVT MemVT = MSC->getMemoryVT();
6757   ISD::MemIndexType IndexType = MSC->getIndexType();
6758   bool Truncating = MSC->isTruncatingStore();
6759 
6760   bool IsScaled = MSC->isIndexScaled();
6761   bool IsSigned = MSC->isIndexSigned();
6762 
6763   // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6764   // must be calculated beforehand.
6765   uint64_t ScaleVal = Scale->getAsZExtVal();
6766   if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6767     assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6768     EVT IndexVT = Index.getValueType();
6769     Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6770                         DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6771     Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6772 
6773     SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6774     return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6775                                 MSC->getMemOperand(), IndexType, Truncating);
6776   }
6777 
6778   // Lower fixed length scatter to a scalable equivalent.
6779   if (VT.isFixedLengthVector()) {
6780     assert(Subtarget->useSVEForFixedLengthVectors() &&
6781            "Cannot lower when not using SVE for fixed vectors!");
6782 
6783     // Once bitcast we treat floating-point scatters as if integer.
6784     if (VT.isFloatingPoint()) {
6785       VT = VT.changeVectorElementTypeToInteger();
6786       MemVT = MemVT.changeVectorElementTypeToInteger();
6787       StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6788     }
6789 
6790     // Find the smallest integer fixed length vector we can use for the scatter.
6791     EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6792     if (VT.getVectorElementType() == MVT::i64 ||
6793         Index.getValueType().getVectorElementType() == MVT::i64 ||
6794         Mask.getValueType().getVectorElementType() == MVT::i64)
6795       PromotedVT = VT.changeVectorElementType(MVT::i64);
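         // For example, a v4i16 scatter is any-extended to nxv4i32 and
         // emitted as a truncating scatter of the original v4i16 memory type.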
6796 
6797     // Promote vector operands.
6798     unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6799     Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6800     Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6801     StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6802 
6803     // A promoted value type forces the need for a truncating store.
6804     if (PromotedVT != VT)
6805       Truncating = true;
6806 
6807     EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6808 
6809     // Convert fixed length vector operands to scalable.
6810     MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6811     Index = convertToScalableVector(DAG, ContainerVT, Index);
6812     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6813     StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6814 
6815     // Emit equivalent scalable vector scatter.
6816     SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6817     return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6818                                 MSC->getMemOperand(), IndexType, Truncating);
6819   }
6820 
6821   // Everything else is legal.
6822   return Op;
6823 }
6824 
6825 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6826   SDLoc DL(Op);
6827   MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6828   assert(LoadNode && "Expected custom lowering of a masked load node");
6829   EVT VT = Op->getValueType(0);
6830 
6831   if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6832     return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6833 
6834   SDValue PassThru = LoadNode->getPassThru();
6835   SDValue Mask = LoadNode->getMask();
6836 
6837   if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6838     return Op;
6839 
6840   SDValue Load = DAG.getMaskedLoad(
6841       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6842       LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6843       LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6844       LoadNode->getExtensionType());
6845 
6846   SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6847 
6848   return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6849 }
6850 
6851 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6852 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6853                                         EVT VT, EVT MemVT,
6854                                         SelectionDAG &DAG) {
6855   assert(VT.isVector() && "VT should be a vector type");
6856   assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6857 
6858   SDValue Value = ST->getValue();
6859 
6860   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6861   // extracts the word lane which represents the v4i8 subvector. It optimizes
6862   // the store to:
6863   //
6864   //   xtn  v0.8b, v0.8h
6865   //   str  s0, [x0]
6866 
6867   SDValue Undef = DAG.getUNDEF(MVT::i16);
6868   SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6869                                         {Undef, Undef, Undef, Undef});
6870 
6871   SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6872                                  Value, UndefVec);
6873   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6874 
6875   Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6876   SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6877                                      Trunc, DAG.getConstant(0, DL, MVT::i64));
6878 
6879   return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6880                       ST->getBasePtr(), ST->getMemOperand());
6881 }
6882 
6883 // Custom lowering for any store, vector or scalar, plain or truncating.
6884 // Currently we only custom lower truncating stores from vector v4i16 to
6885 // v4i8 and volatile stores of i128.
6886 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6887                                           SelectionDAG &DAG) const {
6888   SDLoc Dl(Op);
6889   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6890   assert(StoreNode && "Can only custom lower store nodes");
6891 
6892   SDValue Value = StoreNode->getValue();
6893 
6894   EVT VT = Value.getValueType();
6895   EVT MemVT = StoreNode->getMemoryVT();
6896 
6897   if (VT.isVector()) {
6898     if (useSVEForFixedLengthVectorVT(
6899             VT,
6900             /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6901       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6902 
6903     unsigned AS = StoreNode->getAddressSpace();
6904     Align Alignment = StoreNode->getAlign();
6905     if (Alignment < MemVT.getStoreSize() &&
6906         !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6907                                         StoreNode->getMemOperand()->getFlags(),
6908                                         nullptr)) {
6909       return scalarizeVectorStore(StoreNode, DAG);
6910     }
6911 
6912     if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6913         MemVT == MVT::v4i8) {
6914       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6915     }
6916     // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
6917     // the custom lowering, as there are no unpaired non-temporal stores and
6918     // legalization will break up 256-bit inputs.
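         // The value is split into two half-width subvectors (Lo/Hi) which
         // become the two registers of the STNP pair.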
6919     ElementCount EC = MemVT.getVectorElementCount();
6920     if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6921         EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6922         (MemVT.getScalarSizeInBits() == 8u ||
6923          MemVT.getScalarSizeInBits() == 16u ||
6924          MemVT.getScalarSizeInBits() == 32u ||
6925          MemVT.getScalarSizeInBits() == 64u)) {
6926       SDValue Lo =
6927           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6928                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6929                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6930       SDValue Hi =
6931           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6932                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6933                       StoreNode->getValue(),
6934                       DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6935       SDValue Result = DAG.getMemIntrinsicNode(
6936           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6937           {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6938           StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6939       return Result;
6940     }
6941   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6942     return LowerStore128(Op, DAG);
6943   } else if (MemVT == MVT::i64x8) {
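         // LS64 (i64x8) stores are expanded into eight consecutive i64 stores,
         // extracting each 64-bit part of the value with LS64_EXTRACT.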
6944     SDValue Value = StoreNode->getValue();
6945     assert(Value->getValueType(0) == MVT::i64x8);
6946     SDValue Chain = StoreNode->getChain();
6947     SDValue Base = StoreNode->getBasePtr();
6948     EVT PtrVT = Base.getValueType();
6949     for (unsigned i = 0; i < 8; i++) {
6950       SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6951                                  Value, DAG.getConstant(i, Dl, MVT::i32));
6952       SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6953                                 DAG.getConstant(i * 8, Dl, PtrVT));
6954       Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6955                            StoreNode->getOriginalAlign());
6956     }
6957     return Chain;
6958   }
6959 
6960   return SDValue();
6961 }
6962 
6963 /// Lower atomic or volatile 128-bit stores to a single STP/STILP instruction.
6964 SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6965                                              SelectionDAG &DAG) const {
6966   MemSDNode *StoreNode = cast<MemSDNode>(Op);
6967   assert(StoreNode->getMemoryVT() == MVT::i128);
6968   assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6969 
6970   bool IsStoreRelease =
6971       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6972   if (StoreNode->isAtomic())
6973     assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6974             Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6975            StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6976            StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6977 
6978   SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6979                    StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6980                       ? StoreNode->getOperand(1)
6981                       : StoreNode->getOperand(2);
6982   SDLoc DL(Op);
6983   auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6984   unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6985   if (DAG.getDataLayout().isBigEndian())
6986     std::swap(StoreValue.first, StoreValue.second);
6987   SDValue Result = DAG.getMemIntrinsicNode(
6988       Opcode, DL, DAG.getVTList(MVT::Other),
6989       {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6990        StoreNode->getBasePtr()},
6991       StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6992   return Result;
6993 }
6994 
6995 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6996                                          SelectionDAG &DAG) const {
6997   SDLoc DL(Op);
6998   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6999   assert(LoadNode && "Expected custom lowering of a load node");
7000 
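       // LS64 (i64x8) loads are expanded into eight consecutive i64 loads
       // whose results are recombined with LS64_BUILD.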
7001   if (LoadNode->getMemoryVT() == MVT::i64x8) {
7002     SmallVector<SDValue, 8> Ops;
7003     SDValue Base = LoadNode->getBasePtr();
7004     SDValue Chain = LoadNode->getChain();
7005     EVT PtrVT = Base.getValueType();
7006     for (unsigned i = 0; i < 8; i++) {
7007       SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7008                                 DAG.getConstant(i * 8, DL, PtrVT));
7009       SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
7010                                  LoadNode->getPointerInfo(),
7011                                  LoadNode->getOriginalAlign());
7012       Ops.push_back(Part);
7013       Chain = SDValue(Part.getNode(), 1);
7014     }
7015     SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7016     return DAG.getMergeValues({Loaded, Chain}, DL);
7017   }
7018 
7019   // Custom lowering for extending v4i8 vector loads.
7020   EVT VT = Op->getValueType(0);
7021   assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7022 
7023   if (LoadNode->getMemoryVT() != MVT::v4i8)
7024     return SDValue();
7025 
7026   // Avoid generating unaligned loads.
7027   if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7028     return SDValue();
7029 
7030   unsigned ExtType;
7031   if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7032     ExtType = ISD::SIGN_EXTEND;
7033   else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7034            LoadNode->getExtensionType() == ISD::EXTLOAD)
7035     ExtType = ISD::ZERO_EXTEND;
7036   else
7037     return SDValue();
7038 
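       // Load the four i8 lanes as a single 32-bit scalar (an f32 load), move
       // it into a vector, bitcast to v8i8 and extend; the low v4i16 half of
       // the extended value is the result (extended once more for v4i32).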
7039   SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7040                              LoadNode->getBasePtr(), MachinePointerInfo());
7041   SDValue Chain = Load.getValue(1);
7042   SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7043   SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7044   SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7045   Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7046                     DAG.getConstant(0, DL, MVT::i64));
7047   if (VT == MVT::v4i32)
7048     Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7049   return DAG.getMergeValues({Ext, Chain}, DL);
7050 }
7051 
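     // Lower VECTOR_COMPRESS via the SVE COMPACT instruction. Fixed-length
     // inputs are first inserted into the low part of a scalable container,
     // compacted there, and the fixed-length result is extracted afterwards.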
7052 SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7053                                                     SelectionDAG &DAG) const {
7054   SDLoc DL(Op);
7055   SDValue Vec = Op.getOperand(0);
7056   SDValue Mask = Op.getOperand(1);
7057   SDValue Passthru = Op.getOperand(2);
7058   EVT VecVT = Vec.getValueType();
7059   EVT MaskVT = Mask.getValueType();
7060   EVT ElmtVT = VecVT.getVectorElementType();
7061   const bool IsFixedLength = VecVT.isFixedLengthVector();
7062   const bool HasPassthru = !Passthru.isUndef();
7063   unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7064   EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7065 
7066   assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7067 
7068   if (!Subtarget->isSVEAvailable())
7069     return SDValue();
7070 
7071   if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7072     return SDValue();
7073 
7074   // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7075   if (MinElmts != 2 && MinElmts != 4)
7076     return SDValue();
7077 
7078   // We can use the SVE register containing the NEON vector in its lowest bits.
7079   if (IsFixedLength) {
7080     EVT ScalableVecVT =
7081         MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7082     EVT ScalableMaskVT = MVT::getScalableVectorVT(
7083         MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7084 
7085     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7086                       DAG.getUNDEF(ScalableVecVT), Vec,
7087                       DAG.getConstant(0, DL, MVT::i64));
7088     Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7089                        DAG.getUNDEF(ScalableMaskVT), Mask,
7090                        DAG.getConstant(0, DL, MVT::i64));
7091     Mask = DAG.getNode(ISD::TRUNCATE, DL,
7092                        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7093     Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7094                            DAG.getUNDEF(ScalableVecVT), Passthru,
7095                            DAG.getConstant(0, DL, MVT::i64));
7096 
7097     VecVT = Vec.getValueType();
7098     MaskVT = Mask.getValueType();
7099   }
7100 
7101   // Get a legal type for the compact instruction.
7102   EVT ContainerVT = getSVEContainerType(VecVT);
7103   EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7104 
7105   // Convert to i32 or i64 for smaller types, as these are the only supported
7106   // sizes for compact.
7107   if (ContainerVT != VecVT) {
7108     Vec = DAG.getBitcast(CastVT, Vec);
7109     Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7110   }
7111 
7112   SDValue Compressed = DAG.getNode(
7113       ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7114       DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7115 
7116   // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7117   if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7118     SDValue Offset = DAG.getNode(
7119         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7120         DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7121 
7122     SDValue IndexMask = DAG.getNode(
7123         ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7124         DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7125         DAG.getConstant(0, DL, MVT::i64), Offset);
7126 
7127     Compressed =
7128         DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7129   }
7130 
7131   // Extracting from a legal SVE type before truncating produces better code.
7132   if (IsFixedLength) {
7133     Compressed = DAG.getNode(
7134         ISD::EXTRACT_SUBVECTOR, DL,
7135         FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7136         Compressed, DAG.getConstant(0, DL, MVT::i64));
7137     CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7138     VecVT = FixedVecVT;
7139   }
7140 
7141   // If we changed the element type before, we need to convert it back.
7142   if (ContainerVT != VecVT) {
7143     Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7144     Compressed = DAG.getBitcast(VecVT, Compressed);
7145   }
7146 
7147   return Compressed;
7148 }
7149 
7150 // Generate SUBS and CSEL for integer abs.
7151 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7152   MVT VT = Op.getSimpleValueType();
7153 
7154   if (VT.isVector())
7155     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7156 
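       // For scalar types this emits: Neg = 0 - X, a SUBS to set the flags on
       // X, and a CSEL that selects X when PL (X >= 0) holds and Neg otherwise.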
7157   SDLoc DL(Op);
7158   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7159                             Op.getOperand(0));
7160   // Generate SUBS & CSEL.
7161   SDValue Cmp =
7162       DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7163                   Op.getOperand(0), DAG.getConstant(0, DL, VT));
7164   return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7165                      DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7166                      Cmp.getValue(1));
7167 }
7168 
7169 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7170   SDValue Chain = Op.getOperand(0);
7171   SDValue Cond = Op.getOperand(1);
7172   SDValue Dest = Op.getOperand(2);
7173 
7174   AArch64CC::CondCode CC;
7175   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7176     SDLoc dl(Op);
7177     SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
7178     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7179                        Cmp);
7180   }
7181 
7182   return SDValue();
7183 }
7184 
7185 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7186 // expanded. FSHL is converted to FSHR before deciding what to do with it.
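     // For example, on i64 fshl(a, b, 3) becomes fshr(a, b, 61), since a left
     // funnel shift by N is equivalent to a right funnel shift by width - N.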
7187 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7188   SDValue Shifts = Op.getOperand(2);
7189   // Check if the shift amount is a constant.
7190   // If the opcode is FSHL, convert it to FSHR.
7191   if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7192     SDLoc DL(Op);
7193     MVT VT = Op.getSimpleValueType();
7194 
7195     if (Op.getOpcode() == ISD::FSHL) {
7196       unsigned int NewShiftNo =
7197           VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
7198       return DAG.getNode(
7199           ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7200           DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7201     } else if (Op.getOpcode() == ISD::FSHR) {
7202       return Op;
7203     }
7204   }
7205 
7206   return SDValue();
7207 }
7208 
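     // FLDEXP (ldexp) is lowered by moving the value and exponent into SVE
     // vectors and scaling lane 0 with the SVE FSCALE instruction (via the
     // aarch64_sve_fscale intrinsic), then extracting the scalar result back.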
7209 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7210   SDValue X = Op.getOperand(0);
7211   EVT XScalarTy = X.getValueType();
7212   SDValue Exp = Op.getOperand(1);
7213 
7214   SDLoc DL(Op);
7215   EVT XVT, ExpVT;
7216   switch (Op.getSimpleValueType().SimpleTy) {
7217   default:
7218     return SDValue();
7219   case MVT::bf16:
7220   case MVT::f16:
7221     X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7222     [[fallthrough]];
7223   case MVT::f32:
7224     XVT = MVT::nxv4f32;
7225     ExpVT = MVT::nxv4i32;
7226     break;
7227   case MVT::f64:
7228     XVT = MVT::nxv2f64;
7229     ExpVT = MVT::nxv2i64;
7230     Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7231     break;
7232   }
7233 
7234   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7235   SDValue VX =
7236       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7237   SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7238                              DAG.getUNDEF(ExpVT), Exp, Zero);
7239   SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7240                          AArch64SVEPredPattern::all);
7241   SDValue FScale =
7242       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7243                   DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7244                   VPg, VX, VExp);
7245   SDValue Final =
7246       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7247   if (X.getValueType() != XScalarTy)
7248     Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7249                         DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7250   return Final;
7251 }
7252 
7253 SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7254                                                       SelectionDAG &DAG) const {
7255   // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7256   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7257     report_fatal_error(
7258         "ADJUST_TRAMPOLINE operation is only supported on Linux.");
7259 
7260   return Op.getOperand(0);
7261 }
7262 
7263 SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7264                                                     SelectionDAG &DAG) const {
7265 
7266   // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7267   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7268     report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7269 
7270   SDValue Chain = Op.getOperand(0);
7271   SDValue Trmp = Op.getOperand(1); // trampoline
7272   SDValue FPtr = Op.getOperand(2); // nested function
7273   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7274   SDLoc dl(Op);
7275 
7276   EVT PtrVT = getPointerTy(DAG.getDataLayout());
7277   Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7278 
7279   TargetLowering::ArgListTy Args;
7280   TargetLowering::ArgListEntry Entry;
7281 
7282   Entry.Ty = IntPtrTy;
7283   Entry.Node = Trmp;
7284   Args.push_back(Entry);
7285 
7286   if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7287     MachineFunction &MF = DAG.getMachineFunction();
7288     MachineFrameInfo &MFI = MF.getFrameInfo();
7289     Entry.Node =
7290         DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7291   } else
7292     Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7293 
7294   Args.push_back(Entry);
7295   Entry.Node = FPtr;
7296   Args.push_back(Entry);
7297   Entry.Node = Nest;
7298   Args.push_back(Entry);
7299 
7300   // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7301   TargetLowering::CallLoweringInfo CLI(DAG);
7302   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7303       CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7304       DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7305 
7306   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7307   return CallResult.second;
7308 }
7309 
7310 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7311                                               SelectionDAG &DAG) const {
7312   LLVM_DEBUG(dbgs() << "Custom lowering: ");
7313   LLVM_DEBUG(Op.dump());
7314 
7315   switch (Op.getOpcode()) {
7316   default:
7317     llvm_unreachable("unimplemented operand");
7318     return SDValue();
7319   case ISD::BITCAST:
7320     return LowerBITCAST(Op, DAG);
7321   case ISD::GlobalAddress:
7322     return LowerGlobalAddress(Op, DAG);
7323   case ISD::GlobalTLSAddress:
7324     return LowerGlobalTLSAddress(Op, DAG);
7325   case ISD::PtrAuthGlobalAddress:
7326     return LowerPtrAuthGlobalAddress(Op, DAG);
7327   case ISD::ADJUST_TRAMPOLINE:
7328     return LowerADJUST_TRAMPOLINE(Op, DAG);
7329   case ISD::INIT_TRAMPOLINE:
7330     return LowerINIT_TRAMPOLINE(Op, DAG);
7331   case ISD::SETCC:
7332   case ISD::STRICT_FSETCC:
7333   case ISD::STRICT_FSETCCS:
7334     return LowerSETCC(Op, DAG);
7335   case ISD::SETCCCARRY:
7336     return LowerSETCCCARRY(Op, DAG);
7337   case ISD::BRCOND:
7338     return LowerBRCOND(Op, DAG);
7339   case ISD::BR_CC:
7340     return LowerBR_CC(Op, DAG);
7341   case ISD::SELECT:
7342     return LowerSELECT(Op, DAG);
7343   case ISD::SELECT_CC:
7344     return LowerSELECT_CC(Op, DAG);
7345   case ISD::JumpTable:
7346     return LowerJumpTable(Op, DAG);
7347   case ISD::BR_JT:
7348     return LowerBR_JT(Op, DAG);
7349   case ISD::BRIND:
7350     return LowerBRIND(Op, DAG);
7351   case ISD::ConstantPool:
7352     return LowerConstantPool(Op, DAG);
7353   case ISD::BlockAddress:
7354     return LowerBlockAddress(Op, DAG);
7355   case ISD::VASTART:
7356     return LowerVASTART(Op, DAG);
7357   case ISD::VACOPY:
7358     return LowerVACOPY(Op, DAG);
7359   case ISD::VAARG:
7360     return LowerVAARG(Op, DAG);
7361   case ISD::UADDO_CARRY:
7362     return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7363   case ISD::USUBO_CARRY:
7364     return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7365   case ISD::SADDO_CARRY:
7366     return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7367   case ISD::SSUBO_CARRY:
7368     return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7369   case ISD::SADDO:
7370   case ISD::UADDO:
7371   case ISD::SSUBO:
7372   case ISD::USUBO:
7373   case ISD::SMULO:
7374   case ISD::UMULO:
7375     return LowerXALUO(Op, DAG);
7376   case ISD::FADD:
7377     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7378   case ISD::FSUB:
7379     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7380   case ISD::FMUL:
7381     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7382   case ISD::FMA:
7383     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7384   case ISD::FDIV:
7385     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7386   case ISD::FNEG:
7387     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7388   case ISD::FCEIL:
7389     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7390   case ISD::FFLOOR:
7391     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7392   case ISD::FNEARBYINT:
7393     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7394   case ISD::FRINT:
7395     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7396   case ISD::FROUND:
7397     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7398   case ISD::FROUNDEVEN:
7399     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7400   case ISD::FTRUNC:
7401     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7402   case ISD::FSQRT:
7403     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7404   case ISD::FABS:
7405     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7406   case ISD::FP_ROUND:
7407   case ISD::STRICT_FP_ROUND:
7408     return LowerFP_ROUND(Op, DAG);
7409   case ISD::FP_EXTEND:
7410   case ISD::STRICT_FP_EXTEND:
7411     return LowerFP_EXTEND(Op, DAG);
7412   case ISD::FRAMEADDR:
7413     return LowerFRAMEADDR(Op, DAG);
7414   case ISD::SPONENTRY:
7415     return LowerSPONENTRY(Op, DAG);
7416   case ISD::RETURNADDR:
7417     return LowerRETURNADDR(Op, DAG);
7418   case ISD::ADDROFRETURNADDR:
7419     return LowerADDROFRETURNADDR(Op, DAG);
7420   case ISD::CONCAT_VECTORS:
7421     return LowerCONCAT_VECTORS(Op, DAG);
7422   case ISD::INSERT_VECTOR_ELT:
7423     return LowerINSERT_VECTOR_ELT(Op, DAG);
7424   case ISD::EXTRACT_VECTOR_ELT:
7425     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7426   case ISD::BUILD_VECTOR:
7427     return LowerBUILD_VECTOR(Op, DAG);
7428   case ISD::ZERO_EXTEND_VECTOR_INREG:
7429     return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7430   case ISD::VECTOR_SHUFFLE:
7431     return LowerVECTOR_SHUFFLE(Op, DAG);
7432   case ISD::SPLAT_VECTOR:
7433     return LowerSPLAT_VECTOR(Op, DAG);
7434   case ISD::EXTRACT_SUBVECTOR:
7435     return LowerEXTRACT_SUBVECTOR(Op, DAG);
7436   case ISD::INSERT_SUBVECTOR:
7437     return LowerINSERT_SUBVECTOR(Op, DAG);
7438   case ISD::SDIV:
7439   case ISD::UDIV:
7440     return LowerDIV(Op, DAG);
7441   case ISD::SMIN:
7442   case ISD::UMIN:
7443   case ISD::SMAX:
7444   case ISD::UMAX:
7445     return LowerMinMax(Op, DAG);
7446   case ISD::SRA:
7447   case ISD::SRL:
7448   case ISD::SHL:
7449     return LowerVectorSRA_SRL_SHL(Op, DAG);
7450   case ISD::SHL_PARTS:
7451   case ISD::SRL_PARTS:
7452   case ISD::SRA_PARTS:
7453     return LowerShiftParts(Op, DAG);
7454   case ISD::CTPOP:
7455   case ISD::PARITY:
7456     return LowerCTPOP_PARITY(Op, DAG);
7457   case ISD::FCOPYSIGN:
7458     return LowerFCOPYSIGN(Op, DAG);
7459   case ISD::OR:
7460     return LowerVectorOR(Op, DAG);
7461   case ISD::XOR:
7462     return LowerXOR(Op, DAG);
7463   case ISD::PREFETCH:
7464     return LowerPREFETCH(Op, DAG);
7465   case ISD::SINT_TO_FP:
7466   case ISD::UINT_TO_FP:
7467   case ISD::STRICT_SINT_TO_FP:
7468   case ISD::STRICT_UINT_TO_FP:
7469     return LowerINT_TO_FP(Op, DAG);
7470   case ISD::FP_TO_SINT:
7471   case ISD::FP_TO_UINT:
7472   case ISD::STRICT_FP_TO_SINT:
7473   case ISD::STRICT_FP_TO_UINT:
7474     return LowerFP_TO_INT(Op, DAG);
7475   case ISD::FP_TO_SINT_SAT:
7476   case ISD::FP_TO_UINT_SAT:
7477     return LowerFP_TO_INT_SAT(Op, DAG);
7478   case ISD::FSINCOS:
7479     return LowerFSINCOS(Op, DAG);
7480   case ISD::GET_ROUNDING:
7481     return LowerGET_ROUNDING(Op, DAG);
7482   case ISD::SET_ROUNDING:
7483     return LowerSET_ROUNDING(Op, DAG);
7484   case ISD::GET_FPMODE:
7485     return LowerGET_FPMODE(Op, DAG);
7486   case ISD::SET_FPMODE:
7487     return LowerSET_FPMODE(Op, DAG);
7488   case ISD::RESET_FPMODE:
7489     return LowerRESET_FPMODE(Op, DAG);
7490   case ISD::MUL:
7491     return LowerMUL(Op, DAG);
7492   case ISD::MULHS:
7493     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7494   case ISD::MULHU:
7495     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7496   case ISD::INTRINSIC_W_CHAIN:
7497     return LowerINTRINSIC_W_CHAIN(Op, DAG);
7498   case ISD::INTRINSIC_WO_CHAIN:
7499     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7500   case ISD::INTRINSIC_VOID:
7501     return LowerINTRINSIC_VOID(Op, DAG);
7502   case ISD::ATOMIC_STORE:
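    // 128-bit atomic stores are only custom lowered when the subtarget can
    // perform them as a single atomic store (LSE2/RCPC3); returning SDValue()
    // below falls back to the default expansion.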
7503     if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7504       assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7505       return LowerStore128(Op, DAG);
7506     }
7507     return SDValue();
7508   case ISD::STORE:
7509     return LowerSTORE(Op, DAG);
7510   case ISD::MSTORE:
7511     return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7512   case ISD::MGATHER:
7513     return LowerMGATHER(Op, DAG);
7514   case ISD::MSCATTER:
7515     return LowerMSCATTER(Op, DAG);
7516   case ISD::VECREDUCE_SEQ_FADD:
7517     return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7518   case ISD::VECREDUCE_ADD:
7519   case ISD::VECREDUCE_AND:
7520   case ISD::VECREDUCE_OR:
7521   case ISD::VECREDUCE_XOR:
7522   case ISD::VECREDUCE_SMAX:
7523   case ISD::VECREDUCE_SMIN:
7524   case ISD::VECREDUCE_UMAX:
7525   case ISD::VECREDUCE_UMIN:
7526   case ISD::VECREDUCE_FADD:
7527   case ISD::VECREDUCE_FMAX:
7528   case ISD::VECREDUCE_FMIN:
7529   case ISD::VECREDUCE_FMAXIMUM:
7530   case ISD::VECREDUCE_FMINIMUM:
7531     return LowerVECREDUCE(Op, DAG);
7532   case ISD::ATOMIC_LOAD_AND:
7533     return LowerATOMIC_LOAD_AND(Op, DAG);
7534   case ISD::DYNAMIC_STACKALLOC:
7535     return LowerDYNAMIC_STACKALLOC(Op, DAG);
7536   case ISD::VSCALE:
7537     return LowerVSCALE(Op, DAG);
7538   case ISD::VECTOR_COMPRESS:
7539     return LowerVECTOR_COMPRESS(Op, DAG);
7540   case ISD::ANY_EXTEND:
7541   case ISD::SIGN_EXTEND:
7542   case ISD::ZERO_EXTEND:
7543     return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7544   case ISD::SIGN_EXTEND_INREG: {
7545     // Only custom lower when ExtraVT has a legal byte-based element type.
7546     EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7547     EVT ExtraEltVT = ExtraVT.getVectorElementType();
7548     if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7549         (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7550       return SDValue();
7551 
7552     return LowerToPredicatedOp(Op, DAG,
7553                                AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7554   }
7555   case ISD::TRUNCATE:
7556     return LowerTRUNCATE(Op, DAG);
7557   case ISD::MLOAD:
7558     return LowerMLOAD(Op, DAG);
7559   case ISD::LOAD:
7560     if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7561                                      !Subtarget->isNeonAvailable()))
7562       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7563     return LowerLOAD(Op, DAG);
7564   case ISD::ADD:
7565   case ISD::AND:
7566   case ISD::SUB:
7567     return LowerToScalableOp(Op, DAG);
7568   case ISD::FMAXIMUM:
7569     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7570   case ISD::FMAXNUM:
7571     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7572   case ISD::FMINIMUM:
7573     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7574   case ISD::FMINNUM:
7575     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7576   case ISD::VSELECT:
7577     return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7578   case ISD::ABS:
7579     return LowerABS(Op, DAG);
7580   case ISD::ABDS:
7581     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7582   case ISD::ABDU:
7583     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7584   case ISD::AVGFLOORS:
7585     return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7586   case ISD::AVGFLOORU:
7587     return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7588   case ISD::AVGCEILS:
7589     return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7590   case ISD::AVGCEILU:
7591     return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7592   case ISD::BITREVERSE:
7593     return LowerBitreverse(Op, DAG);
7594   case ISD::BSWAP:
7595     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7596   case ISD::CTLZ:
7597     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7598   case ISD::CTTZ:
7599     return LowerCTTZ(Op, DAG);
7600   case ISD::VECTOR_SPLICE:
7601     return LowerVECTOR_SPLICE(Op, DAG);
7602   case ISD::VECTOR_DEINTERLEAVE:
7603     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7604   case ISD::VECTOR_INTERLEAVE:
7605     return LowerVECTOR_INTERLEAVE(Op, DAG);
7606   case ISD::LRINT:
7607   case ISD::LLRINT:
7608     if (Op.getValueType().isVector())
7609       return LowerVectorXRINT(Op, DAG);
7610     [[fallthrough]];
7611   case ISD::LROUND:
7612   case ISD::LLROUND: {
7613     assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7614             Op.getOperand(0).getValueType() == MVT::bf16) &&
7615            "Expected custom lowering of rounding operations only for f16/bf16");
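    // Promote the (b)f16 operand to f32 and re-issue the operation on the
    // extended value; f32 has a direct lowering for these nodes.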
7616     SDLoc DL(Op);
7617     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7618     return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7619   }
7620   case ISD::STRICT_LROUND:
7621   case ISD::STRICT_LLROUND:
7622   case ISD::STRICT_LRINT:
7623   case ISD::STRICT_LLRINT: {
7624     assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7625             Op.getOperand(1).getValueType() == MVT::bf16) &&
7626            "Expected custom lowering of rounding operations only for f16/bf16");
7627     SDLoc DL(Op);
7628     SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7629                               {Op.getOperand(0), Op.getOperand(1)});
7630     return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7631                        {Ext.getValue(1), Ext.getValue(0)});
7632   }
7633   case ISD::WRITE_REGISTER: {
7634     assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7635            "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7636     SDLoc DL(Op);
7637 
7638     SDValue Chain = Op.getOperand(0);
7639     SDValue SysRegName = Op.getOperand(1);
7640     std::pair<SDValue, SDValue> Pair =
7641         DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7642 
7643     // chain = MSRR(chain, sysregname, lo, hi)
7644     SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7645                                  SysRegName, Pair.first, Pair.second);
7646 
7647     return Result;
7648   }
7649   case ISD::FSHL:
7650   case ISD::FSHR:
7651     return LowerFunnelShift(Op, DAG);
7652   case ISD::FLDEXP:
7653     return LowerFLDEXP(Op, DAG);
7654   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7655     return LowerVECTOR_HISTOGRAM(Op, DAG);
7656   }
7657 }
7658 
7659 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7660   return !Subtarget->useSVEForFixedLengthVectors();
7661 }
7662 
7663 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7664     EVT VT, bool OverrideNEON) const {
7665   if (!VT.isFixedLengthVector() || !VT.isSimple())
7666     return false;
7667 
7668   // Don't use SVE for vectors we cannot scalarize if required.
7669   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7670   // Fixed length predicates should be promoted to i8.
7671   // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) works.
7672   case MVT::i1:
7673   default:
7674     return false;
7675   case MVT::i8:
7676   case MVT::i16:
7677   case MVT::i32:
7678   case MVT::i64:
7679   case MVT::f16:
7680   case MVT::f32:
7681   case MVT::f64:
7682     break;
7683   }
7684 
7685   // NEON-sized vectors can be emulated using SVE instructions.
7686   if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7687     return Subtarget->isSVEorStreamingSVEAvailable();
7688 
7689   // Ensure NEON MVTs only belong to a single register class.
7690   if (VT.getFixedSizeInBits() <= 128)
7691     return false;
7692 
7693   // Ensure wider than NEON code generation is enabled.
7694   if (!Subtarget->useSVEForFixedLengthVectors())
7695     return false;
7696 
7697   // Don't use SVE for types that don't fit.
7698   if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7699     return false;
7700 
7701   // TODO: Perhaps an artificial restriction, but worth having whilst getting
7702   // the base fixed length SVE support in place.
7703   if (!VT.isPow2VectorType())
7704     return false;
7705 
7706   return true;
7707 }
7708 
7709 //===----------------------------------------------------------------------===//
7710 //                      Calling Convention Implementation
7711 //===----------------------------------------------------------------------===//
7712 
7713 static unsigned getIntrinsicID(const SDNode *N) {
7714   unsigned Opcode = N->getOpcode();
7715   switch (Opcode) {
7716   default:
7717     return Intrinsic::not_intrinsic;
7718   case ISD::INTRINSIC_WO_CHAIN: {
7719     unsigned IID = N->getConstantOperandVal(0);
7720     if (IID < Intrinsic::num_intrinsics)
7721       return IID;
7722     return Intrinsic::not_intrinsic;
7723   }
7724   }
7725 }
7726 
7727 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7728                                                 SDValue N1) const {
7729   if (!N0.hasOneUse())
7730     return false;
7731 
7732   unsigned IID = getIntrinsicID(N1.getNode());
7733   // Avoid reassociating expressions that can be lowered to smlal/umlal.
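  // For example, (add x, (umull a, b)) can be selected as a single umlal;
  // reassociating the add away from the widening multiply would lose that.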
7734   if (IID == Intrinsic::aarch64_neon_umull ||
7735       N1.getOpcode() == AArch64ISD::UMULL ||
7736       IID == Intrinsic::aarch64_neon_smull ||
7737       N1.getOpcode() == AArch64ISD::SMULL)
7738     return N0.getOpcode() != ISD::ADD;
7739 
7740   return true;
7741 }
7742 
7743 /// Selects the correct CCAssignFn for a given CallingConvention value.
7744 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7745                                                      bool IsVarArg) const {
7746   switch (CC) {
7747   default:
7748     report_fatal_error("Unsupported calling convention.");
7749   case CallingConv::GHC:
7750     return CC_AArch64_GHC;
7751   case CallingConv::PreserveNone:
7752     // The VarArg implementation makes assumptions about register
7753     // argument passing that do not hold for preserve_none, so we
7754     // instead fall back to C argument passing.
7755     // The non-vararg case is handled in the CC function itself.
7756     if (!IsVarArg)
7757       return CC_AArch64_Preserve_None;
7758     [[fallthrough]];
7759   case CallingConv::C:
7760   case CallingConv::Fast:
7761   case CallingConv::PreserveMost:
7762   case CallingConv::PreserveAll:
7763   case CallingConv::CXX_FAST_TLS:
7764   case CallingConv::Swift:
7765   case CallingConv::SwiftTail:
7766   case CallingConv::Tail:
7767   case CallingConv::GRAAL:
7768     if (Subtarget->isTargetWindows()) {
7769       if (IsVarArg) {
7770         if (Subtarget->isWindowsArm64EC())
7771           return CC_AArch64_Arm64EC_VarArg;
7772         return CC_AArch64_Win64_VarArg;
7773       }
7774       return CC_AArch64_Win64PCS;
7775     }
7776     if (!Subtarget->isTargetDarwin())
7777       return CC_AArch64_AAPCS;
7778     if (!IsVarArg)
7779       return CC_AArch64_DarwinPCS;
7780     return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7781                                       : CC_AArch64_DarwinPCS_VarArg;
7782   case CallingConv::Win64:
7783     if (IsVarArg) {
7784       if (Subtarget->isWindowsArm64EC())
7785         return CC_AArch64_Arm64EC_VarArg;
7786       return CC_AArch64_Win64_VarArg;
7787     }
7788     return CC_AArch64_Win64PCS;
7789   case CallingConv::CFGuard_Check:
7790     if (Subtarget->isWindowsArm64EC())
7791       return CC_AArch64_Arm64EC_CFGuard_Check;
7792     return CC_AArch64_Win64_CFGuard_Check;
7793   case CallingConv::AArch64_VectorCall:
7794   case CallingConv::AArch64_SVE_VectorCall:
7795   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7796   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
7797   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7798     return CC_AArch64_AAPCS;
7799   case CallingConv::ARM64EC_Thunk_X64:
7800     return CC_AArch64_Arm64EC_Thunk;
7801   case CallingConv::ARM64EC_Thunk_Native:
7802     return CC_AArch64_Arm64EC_Thunk_Native;
7803   }
7804 }
7805 
7806 CCAssignFn *
7807 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7808   switch (CC) {
7809   default:
7810     return RetCC_AArch64_AAPCS;
7811   case CallingConv::ARM64EC_Thunk_X64:
7812     return RetCC_AArch64_Arm64EC_Thunk;
7813   case CallingConv::CFGuard_Check:
7814     if (Subtarget->isWindowsArm64EC())
7815       return RetCC_AArch64_Arm64EC_CFGuard_Check;
7816     return RetCC_AArch64_AAPCS;
7817   }
7818 }
7819 
7820 static bool isPassedInFPR(EVT VT) {
7821   return VT.isFixedLengthVector() ||
7822          (VT.isFloatingPoint() && !VT.isScalableVector());
7823 }
7824 
7825 SDValue AArch64TargetLowering::LowerFormalArguments(
7826     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7827     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7828     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7829   MachineFunction &MF = DAG.getMachineFunction();
7830   const Function &F = MF.getFunction();
7831   MachineFrameInfo &MFI = MF.getFrameInfo();
7832   bool IsWin64 =
7833       Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7834   bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7835                     (isVarArg && Subtarget->isWindowsArm64EC());
7836   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7837 
7838   SmallVector<ISD::OutputArg, 4> Outs;
7839   GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7840                 DAG.getTargetLoweringInfo(), MF.getDataLayout());
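  // Returning a value in an SVE register gives the function an SVE signature;
  // record this so the SVE calling convention (which preserves additional
  // registers) is assumed where needed.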
7841   if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7842     FuncInfo->setIsSVECC(true);
7843 
7844   // Assign locations to all of the incoming arguments.
7845   SmallVector<CCValAssign, 16> ArgLocs;
7846   DenseMap<unsigned, SDValue> CopiedRegs;
7847   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7848 
7849   // At this point, Ins[].VT may already be promoted to i32. To correctly
7850   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7851   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7852   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7853   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7854   // LocVT.
7855   unsigned NumArgs = Ins.size();
7856   Function::const_arg_iterator CurOrigArg = F.arg_begin();
7857   unsigned CurArgIdx = 0;
7858   for (unsigned i = 0; i != NumArgs; ++i) {
7859     MVT ValVT = Ins[i].VT;
7860     if (Ins[i].isOrigArg()) {
7861       std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7862       CurArgIdx = Ins[i].getOrigArgIndex();
7863 
7864       // Get type of the original argument.
7865       EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7866                                   /*AllowUnknown*/ true);
7867       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7868       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7869       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7870         ValVT = MVT::i8;
7871       else if (ActualMVT == MVT::i16)
7872         ValVT = MVT::i16;
7873     }
7874     bool UseVarArgCC = false;
7875     if (IsWin64)
7876       UseVarArgCC = isVarArg;
7877     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7878     bool Res =
7879         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7880     assert(!Res && "Call operand has unhandled type");
7881     (void)Res;
7882   }
7883 
7884   SMEAttrs Attrs(MF.getFunction());
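  // A locally streaming function has a non-streaming interface but a
  // streaming body, so streaming mode must be enabled (SMSTART) on entry
  // before the body executes.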
7885   bool IsLocallyStreaming =
7886       !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7887   assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7888   SDValue Glue = Chain.getValue(1);
7889 
7890   SmallVector<SDValue, 16> ArgValues;
7891   unsigned ExtraArgLocs = 0;
7892   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7893     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7894 
7895     if (Ins[i].Flags.isByVal()) {
7896       // Byval is used for HFAs in the PCS, but the system should work in a
7897       // non-compliant manner for larger structs.
7898       EVT PtrVT = getPointerTy(DAG.getDataLayout());
7899       int Size = Ins[i].Flags.getByValSize();
7900       unsigned NumRegs = (Size + 7) / 8;
7901 
7902       // FIXME: This works on big-endian for composite byvals, which are the
7903       // common case. It should also work for fundamental types.
7904       unsigned FrameIdx =
7905         MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7906       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7907       InVals.push_back(FrameIdxN);
7908 
7909       continue;
7910     }
7911 
7912     if (Ins[i].Flags.isSwiftAsync())
7913       MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7914 
7915     SDValue ArgValue;
7916     if (VA.isRegLoc()) {
7917       // Arguments stored in registers.
7918       EVT RegVT = VA.getLocVT();
7919       const TargetRegisterClass *RC;
7920 
7921       if (RegVT == MVT::i32)
7922         RC = &AArch64::GPR32RegClass;
7923       else if (RegVT == MVT::i64)
7924         RC = &AArch64::GPR64RegClass;
7925       else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7926         RC = &AArch64::FPR16RegClass;
7927       else if (RegVT == MVT::f32)
7928         RC = &AArch64::FPR32RegClass;
7929       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7930         RC = &AArch64::FPR64RegClass;
7931       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7932         RC = &AArch64::FPR128RegClass;
7933       else if (RegVT.isScalableVector() &&
7934                RegVT.getVectorElementType() == MVT::i1) {
7935         FuncInfo->setIsSVECC(true);
7936         RC = &AArch64::PPRRegClass;
7937       } else if (RegVT == MVT::aarch64svcount) {
7938         FuncInfo->setIsSVECC(true);
7939         RC = &AArch64::PPRRegClass;
7940       } else if (RegVT.isScalableVector()) {
7941         FuncInfo->setIsSVECC(true);
7942         RC = &AArch64::ZPRRegClass;
7943       } else
7944         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7945 
7946       // Transform the arguments in physical registers into virtual ones.
7947       Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7948 
7949       if (IsLocallyStreaming) {
7950         // LocallyStreamingFunctions must insert the SMSTART in the correct
7951         // position, so we use Glue to ensure no instructions can be scheduled
7952         // between the chain of:
7953         //        t0: ch,glue = EntryNode
7954         //      t1:  res,ch,glue = CopyFromReg
7955         //     ...
7956         //   tn: res,ch,glue = CopyFromReg t(n-1), ..
7957         // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7958         // ^^^^^^
7959         // This will be the new Chain/Root node.
7960         ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7961         Glue = ArgValue.getValue(2);
7962         if (isPassedInFPR(ArgValue.getValueType())) {
7963           ArgValue =
7964               DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7965                           DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7966                           {ArgValue, Glue});
7967           Glue = ArgValue.getValue(1);
7968         }
7969       } else
7970         ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7971 
7972       // If this is an 8, 16 or 32-bit value, it is really passed promoted
7973       // to 64 bits.  Insert an assert[sz]ext to capture this, then
7974       // truncate to the right size.
7975       switch (VA.getLocInfo()) {
7976       default:
7977         llvm_unreachable("Unknown loc info!");
7978       case CCValAssign::Full:
7979         break;
7980       case CCValAssign::Indirect:
7981         assert(
7982             (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7983             "Indirect arguments should be scalable on most subtargets");
7984         break;
7985       case CCValAssign::BCvt:
7986         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7987         break;
7988       case CCValAssign::AExt:
7989       case CCValAssign::SExt:
7990       case CCValAssign::ZExt:
7991         break;
7992       case CCValAssign::AExtUpper:
7993         ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7994                                DAG.getConstant(32, DL, RegVT));
7995         ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7996         break;
7997       }
7998     } else { // VA.isRegLoc()
7999       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8000       unsigned ArgOffset = VA.getLocMemOffset();
8001       unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8002                               ? VA.getLocVT().getSizeInBits()
8003                               : VA.getValVT().getSizeInBits()) / 8;
8004 
8005       uint32_t BEAlign = 0;
8006       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8007           !Ins[i].Flags.isInConsecutiveRegs())
8008         BEAlign = 8 - ArgSize;
8009 
8010       SDValue FIN;
8011       MachinePointerInfo PtrInfo;
8012       if (StackViaX4) {
8013         // In both the ARM64EC varargs convention and the thunk convention,
8014         // arguments on the stack are accessed relative to x4, not sp. In
8015         // the thunk convention, there's an additional offset of 32 bytes
8016         // to account for the shadow store.
8017         unsigned ObjOffset = ArgOffset + BEAlign;
8018         if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8019           ObjOffset += 32;
8020         Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8021         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8022         FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8023                           DAG.getConstant(ObjOffset, DL, MVT::i64));
8024         PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8025       } else {
8026         int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8027 
8028         // Create load nodes to retrieve arguments from the stack.
8029         FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8030         PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8031       }
8032 
8033       // For NON_EXTLOAD, the generic code in getLoad asserts ValVT == MemVT.
8034       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8035       MVT MemVT = VA.getValVT();
8036 
8037       switch (VA.getLocInfo()) {
8038       default:
8039         break;
8040       case CCValAssign::Trunc:
8041       case CCValAssign::BCvt:
8042         MemVT = VA.getLocVT();
8043         break;
8044       case CCValAssign::Indirect:
8045         assert((VA.getValVT().isScalableVector() ||
8046                 Subtarget->isWindowsArm64EC()) &&
8047                "Indirect arguments should be scalable on most subtargets");
8048         MemVT = VA.getLocVT();
8049         break;
8050       case CCValAssign::SExt:
8051         ExtType = ISD::SEXTLOAD;
8052         break;
8053       case CCValAssign::ZExt:
8054         ExtType = ISD::ZEXTLOAD;
8055         break;
8056       case CCValAssign::AExt:
8057         ExtType = ISD::EXTLOAD;
8058         break;
8059       }
8060 
8061       ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8062                                 MemVT);
8063     }
8064 
8065     if (VA.getLocInfo() == CCValAssign::Indirect) {
8066       assert((VA.getValVT().isScalableVT() ||
8067               Subtarget->isWindowsArm64EC()) &&
8068              "Indirect arguments should be scalable on most subtargets");
8069 
8070       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8071       unsigned NumParts = 1;
8072       if (Ins[i].Flags.isInConsecutiveRegs()) {
8073         while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8074           ++NumParts;
8075       }
8076 
8077       MVT PartLoad = VA.getValVT();
8078       SDValue Ptr = ArgValue;
8079 
8080       // Ensure we generate all loads for each tuple part, whilst updating the
8081       // pointer after each load correctly using vscale.
8082       while (NumParts > 0) {
8083         ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8084         InVals.push_back(ArgValue);
8085         NumParts--;
8086         if (NumParts > 0) {
8087           SDValue BytesIncrement;
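          // Scalable parts are 'PartSize * vscale' bytes apart; fixed-length
          // parts use a plain constant byte offset.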
8088           if (PartLoad.isScalableVector()) {
8089             BytesIncrement = DAG.getVScale(
8090                 DL, Ptr.getValueType(),
8091                 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8092           } else {
8093             BytesIncrement = DAG.getConstant(
8094                 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8095                 Ptr.getValueType());
8096           }
8097           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8098                             BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8099           ExtraArgLocs++;
8100           i++;
8101         }
8102       }
8103     } else {
8104       if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8105         ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8106                                ArgValue, DAG.getValueType(MVT::i32));
8107 
8108       // i1 arguments are zero-extended to i8 by the caller. Emit a
8109       // hint to reflect this.
8110       if (Ins[i].isOrigArg()) {
8111         Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8112         if (OrigArg->getType()->isIntegerTy(1)) {
8113           if (!Ins[i].Flags.isZExt()) {
8114             ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8115                                    ArgValue.getValueType(), ArgValue);
8116           }
8117         }
8118       }
8119 
8120       InVals.push_back(ArgValue);
8121     }
8122   }
8123   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8124 
8125   // Insert the SMSTART if this is a locally streaming function and
8126   // make sure it is Glued to the last CopyFromReg value.
8127   if (IsLocallyStreaming) {
8128     SDValue PStateSM;
8129     if (Attrs.hasStreamingCompatibleInterface()) {
8130       PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8131       Register Reg = MF.getRegInfo().createVirtualRegister(
8132           getRegClassFor(PStateSM.getValueType().getSimpleVT()));
8133       FuncInfo->setPStateSMReg(Reg);
8134       Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8135       Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8136                                   AArch64SME::IfCallerIsNonStreaming, PStateSM);
8137     } else
8138       Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8139                                   AArch64SME::Always);
8140 
8141     // Ensure that the SMSTART's chain result is used by re-copying the
8142     // incoming argument values, so their uses are ordered after the SMSTART.
8143     for (unsigned I = 0; I < InVals.size(); ++I) {
8144       Register Reg = MF.getRegInfo().createVirtualRegister(
8145           getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8146       Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8147       InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8148                                      InVals[I].getValueType());
8149     }
8150   }
8151 
8152   // varargs
8153   if (isVarArg) {
8154     if (!Subtarget->isTargetDarwin() || IsWin64) {
8155       // The AAPCS variadic function ABI is identical to the non-variadic
8156       // one. As a result there may be more arguments in registers and we should
8157       // save them for future reference.
8158       // Win64 variadic functions also pass arguments in registers, but all float
8159       // arguments are passed in integer registers.
8160       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8161     }
8162 
8163     // This will point to the next argument passed via stack.
8164     unsigned VarArgsOffset = CCInfo.getStackSize();
8165     // We currently pass all varargs at 8-byte alignment, or 4 bytes for ILP32.
8166     VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8167     FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8168     FuncInfo->setVarArgsStackIndex(
8169         MFI.CreateFixedObject(4, VarArgsOffset, true));
8170 
8171     if (MFI.hasMustTailInVarArgFunc()) {
8172       SmallVector<MVT, 2> RegParmTypes;
8173       RegParmTypes.push_back(MVT::i64);
8174       RegParmTypes.push_back(MVT::f128);
8175       // Compute the set of forwarded registers. The rest are scratch.
8176       SmallVectorImpl<ForwardedRegister> &Forwards =
8177                                        FuncInfo->getForwardedMustTailRegParms();
8178       CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8179                                                CC_AArch64_AAPCS);
8180 
8181       // Conservatively forward X8, since it might be used for aggregate return.
8182       if (!CCInfo.isAllocated(AArch64::X8)) {
8183         Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8184         Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8185       }
8186     }
8187   }
8188 
8189   // On Windows, InReg pointers must be returned, so record the pointer in a
8190   // virtual register at the start of the function so it can be returned in the
8191   // epilogue.
8192   if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8193     for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8194       if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8195            Ins[I].Flags.isInReg()) &&
8196           Ins[I].Flags.isSRet()) {
8197         assert(!FuncInfo->getSRetReturnReg());
8198 
8199         MVT PtrTy = getPointerTy(DAG.getDataLayout());
8200         Register Reg =
8201             MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8202         FuncInfo->setSRetReturnReg(Reg);
8203 
8204         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8205         Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8206         break;
8207       }
8208     }
8209   }
8210 
8211   unsigned StackArgSize = CCInfo.getStackSize();
8212   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8213   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8214     // This is a non-standard ABI so by fiat I say we're allowed to make full
8215     // use of the stack area to be popped, which must be aligned to 16 bytes in
8216     // any case:
8217     StackArgSize = alignTo(StackArgSize, 16);
8218 
8219     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8220     // a multiple of 16.
8221     FuncInfo->setArgumentStackToRestore(StackArgSize);
8222 
8223     // This realignment carries over to the available bytes below. Our own
8224     // callers will guarantee the space is free by giving an aligned value to
8225     // CALLSEQ_START.
8226   }
8227   // Even if we're not expected to free up the space, it's useful to know how
8228   // much is there while considering tail calls (because we can reuse it).
8229   FuncInfo->setBytesInStackArgArea(StackArgSize);
8230 
8231   if (Subtarget->hasCustomCallingConv())
8232     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8233 
8234   // Create a 16-byte TPIDR2 object. The dynamic buffer will be expanded and
8235   // stored in the static object later using a pseudo node.
8236   if (SMEAttrs(MF.getFunction()).hasZAState()) {
8237     TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8238     TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8239     SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8240                               DAG.getConstant(1, DL, MVT::i32));
8241 
8242     SDValue Buffer;
8243     if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8244       Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8245                            DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8246     } else {
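      // On Windows, or when stack probing is required, allocate the ZA save
      // buffer (SVL.B x SVL.B bytes) with a dynamic stack allocation so the
      // allocation can be probed.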
8247       SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8248       Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8249                            DAG.getVTList(MVT::i64, MVT::Other),
8250                            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8251       MFI.CreateVariableSizedObject(Align(16), nullptr);
8252     }
8253     Chain = DAG.getNode(
8254         AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8255         {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8256   } else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
8257     // Call __arm_sme_state_size().
8258     SDValue BufferSize =
8259         DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8260                     DAG.getVTList(MVT::i64, MVT::Other), Chain);
8261     Chain = BufferSize.getValue(1);
8262 
8263     SDValue Buffer;
8264     if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8265       Buffer =
8266           DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8267                       DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
8268     } else {
8269       // Allocate space dynamically.
8270       Buffer = DAG.getNode(
8271           ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8272           {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8273       MFI.CreateVariableSizedObject(Align(16), nullptr);
8274     }
8275 
8276     // Copy the value to a virtual register, and save that in FuncInfo.
8277     Register BufferPtr =
8278         MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8279     FuncInfo->setSMESaveBufferAddr(BufferPtr);
8280     Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8281   }
8282 
8283   if (CallConv == CallingConv::PreserveNone) {
8284     for (const ISD::InputArg &I : Ins) {
8285       if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8286           I.Flags.isSwiftAsync()) {
8287         MachineFunction &MF = DAG.getMachineFunction();
8288         DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8289             MF.getFunction(),
8290             "Swift attributes can't be used with preserve_none",
8291             DL.getDebugLoc()));
8292         break;
8293       }
8294     }
8295   }
8296 
8297   return Chain;
8298 }
8299 
8300 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8301                                                 SelectionDAG &DAG,
8302                                                 const SDLoc &DL,
8303                                                 SDValue &Chain) const {
8304   MachineFunction &MF = DAG.getMachineFunction();
8305   MachineFrameInfo &MFI = MF.getFrameInfo();
8306   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8307   auto PtrVT = getPointerTy(DAG.getDataLayout());
8308   Function &F = MF.getFunction();
8309   bool IsWin64 =
8310       Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8311 
8312   SmallVector<SDValue, 8> MemOps;
8313 
8314   auto GPRArgRegs = AArch64::getGPRArgRegs();
8315   unsigned NumGPRArgRegs = GPRArgRegs.size();
8316   if (Subtarget->isWindowsArm64EC()) {
8317     // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8318     // functions.
8319     NumGPRArgRegs = 4;
8320   }
8321   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8322 
8323   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8324   int GPRIdx = 0;
8325   if (GPRSaveSize != 0) {
8326     if (IsWin64) {
8327       GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8328       if (GPRSaveSize & 15)
8329         // The extra size here, if triggered, will always be 8.
8330         MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8331     } else
8332       GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8333 
8334     SDValue FIN;
8335     if (Subtarget->isWindowsArm64EC()) {
8336       // With the Arm64EC ABI, we reserve the save area as usual, but we
8337       // compute its address relative to x4.  For a normal AArch64->AArch64
8338       // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8339       // different address.
8340       Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8341       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8342       FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8343                         DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8344     } else {
8345       FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8346     }
8347 
8348     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8349       Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8350       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8351       SDValue Store =
8352           DAG.getStore(Val.getValue(1), DL, Val, FIN,
8353                        IsWin64 ? MachinePointerInfo::getFixedStack(
8354                                      MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8355                                : MachinePointerInfo::getStack(MF, i * 8));
8356       MemOps.push_back(Store);
8357       FIN =
8358           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8359     }
8360   }
8361   FuncInfo->setVarArgsGPRIndex(GPRIdx);
8362   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8363 
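  // Win64 varargs pass floating-point arguments in integer registers, so the
  // FPR argument registers only need to be saved for non-Windows targets.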
8364   if (Subtarget->hasFPARMv8() && !IsWin64) {
8365     auto FPRArgRegs = AArch64::getFPRArgRegs();
8366     const unsigned NumFPRArgRegs = FPRArgRegs.size();
8367     unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8368 
8369     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8370     int FPRIdx = 0;
8371     if (FPRSaveSize != 0) {
8372       FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8373 
8374       SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8375 
8376       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8377         Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8378         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8379 
8380         SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8381                                      MachinePointerInfo::getStack(MF, i * 16));
8382         MemOps.push_back(Store);
8383         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8384                           DAG.getConstant(16, DL, PtrVT));
8385       }
8386     }
8387     FuncInfo->setVarArgsFPRIndex(FPRIdx);
8388     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8389   }
8390 
8391   if (!MemOps.empty()) {
8392     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8393   }
8394 }
8395 
8396 /// LowerCallResult - Lower the result values of a call into the
8397 /// appropriate copies out of appropriate physical registers.
8398 SDValue AArch64TargetLowering::LowerCallResult(
8399     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8400     const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8401     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8402     SDValue ThisVal, bool RequiresSMChange) const {
8403   DenseMap<unsigned, SDValue> CopiedRegs;
8404   // Copy all of the result registers out of their specified physreg.
8405   for (unsigned i = 0; i != RVLocs.size(); ++i) {
8406     CCValAssign VA = RVLocs[i];
8407 
8408     // Pass 'this' value directly from the argument to return value, to avoid
8409     // reg unit interference
8410     if (i == 0 && isThisReturn) {
8411       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8412              "unexpected return calling convention register assignment");
8413       InVals.push_back(ThisVal);
8414       continue;
8415     }
8416 
8417     // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8418     // allows one use of a physreg per block.
8419     SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8420     if (!Val) {
8421       Val =
8422           DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8423       Chain = Val.getValue(1);
8424       InGlue = Val.getValue(2);
8425       CopiedRegs[VA.getLocReg()] = Val;
8426     }
8427 
8428     switch (VA.getLocInfo()) {
8429     default:
8430       llvm_unreachable("Unknown loc info!");
8431     case CCValAssign::Full:
8432       break;
8433     case CCValAssign::BCvt:
8434       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8435       break;
8436     case CCValAssign::AExtUpper:
8437       Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8438                         DAG.getConstant(32, DL, VA.getLocVT()));
8439       [[fallthrough]];
8440     case CCValAssign::AExt:
8441       [[fallthrough]];
8442     case CCValAssign::ZExt:
8443       Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8444       break;
8445     }
8446 
8447     if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8448       Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
8449                         Val);
8450 
8451     InVals.push_back(Val);
8452   }
8453 
8454   return Chain;
8455 }
8456 
8457 /// Return true if the calling convention is one that we can guarantee TCO for.
8458 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8459   return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8460          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8461 }
8462 
8463 /// Return true if we might ever do TCO for calls with this calling convention.
8464 static bool mayTailCallThisCC(CallingConv::ID CC) {
8465   switch (CC) {
8466   case CallingConv::C:
8467   case CallingConv::AArch64_SVE_VectorCall:
8468   case CallingConv::PreserveMost:
8469   case CallingConv::PreserveAll:
8470   case CallingConv::PreserveNone:
8471   case CallingConv::Swift:
8472   case CallingConv::SwiftTail:
8473   case CallingConv::Tail:
8474   case CallingConv::Fast:
8475     return true;
8476   default:
8477     return false;
8478   }
8479 }
8480 
8481 /// Return true if the calling convention supports varargs.
8482 /// Currently only conventions that pass varargs the same way as the C
8483 /// calling convention does are eligible.
8484 /// Calling conventions listed in this function must also be properly
8485 /// handled in AArch64Subtarget::isCallingConvWin64.
8486 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8487   switch (CC) {
8488   case CallingConv::C:
8489   case CallingConv::PreserveNone:
8490     return true;
8491   default:
8492     return false;
8493   }
8494 }
8495 
8496 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8497                                 const AArch64Subtarget *Subtarget,
8498                                 const TargetLowering::CallLoweringInfo &CLI,
8499                                 CCState &CCInfo) {
8500   const SelectionDAG &DAG = CLI.DAG;
8501   CallingConv::ID CalleeCC = CLI.CallConv;
8502   bool IsVarArg = CLI.IsVarArg;
8503   const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8504   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8505 
8506   // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8507   // for the shadow store.
8508   if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8509     CCInfo.AllocateStack(32, Align(16));
8510 
8511   unsigned NumArgs = Outs.size();
8512   for (unsigned i = 0; i != NumArgs; ++i) {
8513     MVT ArgVT = Outs[i].VT;
8514     ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8515 
8516     bool UseVarArgCC = false;
8517     if (IsVarArg) {
8518       // On Windows, the fixed arguments in a vararg call are passed in GPRs
8519       // too, so use the vararg CC to force them to integer registers.
8520       if (IsCalleeWin64) {
8521         UseVarArgCC = true;
8522       } else {
8523         UseVarArgCC = !Outs[i].IsFixed;
8524       }
8525     }
8526 
8527     if (!UseVarArgCC) {
8528       // Get type of the original argument.
8529       EVT ActualVT = TLI.getValueType(DAG.getDataLayout(),
8530                                       CLI.Args[Outs[i].OrigArgIndex].Ty,
8531                                       /*AllowUnknown*/ true);
8532       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8533       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8534       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8535         ArgVT = MVT::i8;
8536       else if (ActualMVT == MVT::i16)
8537         ArgVT = MVT::i16;
8538     }
8539 
8540     CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8541     bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8542     assert(!Res && "Call operand has unhandled type");
8543     (void)Res;
8544   }
8545 }
8546 
8547 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8548     const CallLoweringInfo &CLI) const {
8549   CallingConv::ID CalleeCC = CLI.CallConv;
8550   if (!mayTailCallThisCC(CalleeCC))
8551     return false;
8552 
8553   SDValue Callee = CLI.Callee;
8554   bool IsVarArg = CLI.IsVarArg;
8555   const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8556   const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8557   const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8558   const SelectionDAG &DAG = CLI.DAG;
8559   MachineFunction &MF = DAG.getMachineFunction();
8560   const Function &CallerF = MF.getFunction();
8561   CallingConv::ID CallerCC = CallerF.getCallingConv();
8562 
8563   // SME Streaming functions are not eligible for TCO as they may require
8564   // the streaming mode or ZA to be restored after returning from the call.
8565   SMEAttrs CallerAttrs(MF.getFunction());
8566   auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
8567   if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
8568       CallerAttrs.requiresLazySave(CalleeAttrs) ||
8569       CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) ||
8570       CallerAttrs.hasStreamingBody())
8571     return false;
8572 
8573   // Functions using the C or Fast calling convention that have an SVE signature
8574   // preserve more registers and should assume the SVE_VectorCall CC.
8575   // The check for matching callee-saved regs will determine whether it is
8576   // eligible for TCO.
8577   if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8578       MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8579     CallerCC = CallingConv::AArch64_SVE_VectorCall;
8580 
8581   bool CCMatch = CallerCC == CalleeCC;
8582 
8583   // When using the Windows calling convention on a non-windows OS, we want
8584   // to back up and restore X18 in such functions; we can't do a tail call
8585   // from those functions.
8586   if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8587       CalleeCC != CallingConv::Win64)
8588     return false;
8589 
8590   // Byval parameters hand the function a pointer directly into the stack area
8591   // we want to reuse during a tail call. Working around this *is* possible (see
8592   // X86) but less efficient and uglier in LowerCall.
8593   for (Function::const_arg_iterator i = CallerF.arg_begin(),
8594                                     e = CallerF.arg_end();
8595        i != e; ++i) {
8596     if (i->hasByValAttr())
8597       return false;
8598 
8599     // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8600     // In this case, it is necessary to save/restore X0 in the callee. Tail
8601     // call opt interferes with this. So we disable tail call opt when the
8602     // caller has an argument with "inreg" attribute.
8603 
8604     // FIXME: Check whether the callee also has an "inreg" argument.
8605     if (i->hasInRegAttr())
8606       return false;
8607   }
8608 
8609   if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8610     return CCMatch;
8611 
8612   // Externally-defined functions with weak linkage should not be
8613   // tail-called on AArch64 when the OS does not support dynamic
8614   // pre-emption of symbols, as the AAELF spec requires normal calls
8615   // to undefined weak functions to be replaced with a NOP or jump to the
8616   // next instruction. The behaviour of branch instructions in this
8617   // situation (as used for tail calls) is implementation-defined, so we
8618   // cannot rely on the linker replacing the tail call with a return.
8619   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8620     const GlobalValue *GV = G->getGlobal();
8621     const Triple &TT = getTargetMachine().getTargetTriple();
8622     if (GV->hasExternalWeakLinkage() &&
8623         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8624       return false;
8625   }
8626 
8627   // Now we search for cases where we can use a tail call without changing the
8628   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8629   // concept.
8630 
8631   // I want anyone implementing a new calling convention to think long and hard
8632   // about this assert.
8633   if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8634     report_fatal_error("Unsupported variadic calling convention");
8635 
8636   LLVMContext &C = *DAG.getContext();
8637   // Check that the call results are passed in the same way.
8638   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8639                                   CCAssignFnForCall(CalleeCC, IsVarArg),
8640                                   CCAssignFnForCall(CallerCC, IsVarArg)))
8641     return false;
8642   // The callee has to preserve all registers the caller needs to preserve.
8643   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8644   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8645   if (!CCMatch) {
8646     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8647     if (Subtarget->hasCustomCallingConv()) {
8648       TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8649       TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8650     }
8651     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8652       return false;
8653   }
8654 
8655   // Nothing more to check if the callee is taking no arguments
8656   if (Outs.empty())
8657     return true;
8658 
8659   SmallVector<CCValAssign, 16> ArgLocs;
8660   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8661 
8662   analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8663 
8664   if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8665     // Musttail calls have already passed additional checks, so this check can be safely skipped.
8666     // At least two cases here: if caller is fastcc then we can't have any
8667     // memory arguments (we'd be expected to clean up the stack afterwards). If
8668     // caller is C then we could potentially use its argument area.
8669 
8670     // FIXME: for now we take the most conservative of these in both cases:
8671     // disallow all variadic memory operands.
8672     for (const CCValAssign &ArgLoc : ArgLocs)
8673       if (!ArgLoc.isRegLoc())
8674         return false;
8675   }
8676 
8677   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8678 
8679   // If any of the arguments is passed indirectly, it must be SVE, so the
8680   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8681   // allocate space on the stack. That is why we check this explicitly here:
8682   // if any argument is passed indirectly, the call cannot be a tail call.
8683   if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8684         assert((A.getLocInfo() != CCValAssign::Indirect ||
8685                 A.getValVT().isScalableVector() ||
8686                 Subtarget->isWindowsArm64EC()) &&
8687                "Expected value to be scalable");
8688         return A.getLocInfo() == CCValAssign::Indirect;
8689       }))
8690     return false;
8691 
8692   // If the stack arguments for this call do not fit into our own save area then
8693   // the call cannot be made tail.
8694   if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8695     return false;
8696 
8697   const MachineRegisterInfo &MRI = MF.getRegInfo();
8698   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8699     return false;
8700 
8701   return true;
8702 }
8703 
8704 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8705                                                    SelectionDAG &DAG,
8706                                                    MachineFrameInfo &MFI,
8707                                                    int ClobberedFI) const {
8708   SmallVector<SDValue, 8> ArgChains;
8709   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8710   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8711 
8712   // Include the original chain at the beginning of the list. When this is
8713   // used by target LowerCall hooks, this helps legalize find the
8714   // CALLSEQ_BEGIN node.
8715   ArgChains.push_back(Chain);
8716 
8717   // Add a chain value for each stack argument that overlaps ClobberedFI.
8718   for (SDNode *U : DAG.getEntryNode().getNode()->users())
8719     if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8720       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8721         if (FI->getIndex() < 0) {
8722           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8723           int64_t InLastByte = InFirstByte;
8724           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8725 
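               // Chain in this load if its stack slot overlaps the byte range
               // being clobbered, [FirstByte, LastByte].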
8726           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8727               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8728             ArgChains.push_back(SDValue(L, 1));
8729         }
8730 
8731   // Build a tokenfactor for all the chains.
8732   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8733 }
8734 
8735 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8736                                                    bool TailCallOpt) const {
8737   return (CallCC == CallingConv::Fast && TailCallOpt) ||
8738          CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8739 }
8740 
8741 // Check whether the value is already zero-extended from i1 to i8.
8742 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8743   unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8744   if (SizeInBits < 8)
8745     return false;
8746 
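       // The mask 0xFE covers bits [1,7] of the low byte: if those bits are all
       // known to be zero then the low 8 bits of the value can only be 0 or 1,
       // i.e. the i1 has already been zero-extended to i8 as the AAPCS requires.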
8747   APInt RequiredZero(SizeInBits, 0xFE);
8748   KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8749   bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8750   return ZExtBool;
8751 }
8752 
8753 // The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
8754 // input operands are copy nodes where the source register is in a
8755 // StridedOrContiguous class. For example:
8756 //
8757 //   %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
8758 //   %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
8759 //   %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
8760 //   %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
8761 //   %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
8762 //   %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
8763 //   %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
8764 //
8765 bool shouldUseFormStridedPseudo(MachineInstr &MI) {
8766   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
8767 
8768   const TargetRegisterClass *RegClass = nullptr;
8769   switch (MI.getOpcode()) {
8770   case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
8771     RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
8772     break;
8773   case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
8774     RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
8775     break;
8776   default:
8777     llvm_unreachable("Unexpected opcode.");
8778   }
8779 
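       // Every input operand must have a single defining COPY of the same
       // sub-register index, taken from a register in the expected
       // StridedOrContiguous class.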
8780   MCRegister SubReg = MCRegister::NoRegister;
8781   for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8782     MachineOperand &MO = MI.getOperand(I);
8783     assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
8784 
8785     MachineOperand *Def = MRI.getOneDef(MO.getReg());
8786     if (!Def || !Def->getParent()->isCopy())
8787       return false;
8788 
8789     const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
8790     unsigned OpSubReg = CopySrc.getSubReg();
8791     if (SubReg == MCRegister::NoRegister)
8792       SubReg = OpSubReg;
8793 
8794     MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
8795     if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
8796         MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
8797       return false;
8798   }
8799 
8800   return true;
8801 }
8802 
8803 void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8804                                                           SDNode *Node) const {
8805   // Live-in physreg copies that are glued to SMSTART are applied as
8806   // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8807   // register allocator to pass call args in callee saved regs, without extra
8808   // copies to avoid these fake clobbers of actually-preserved GPRs.
8809   if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8810       MI.getOpcode() == AArch64::MSRpstatePseudo) {
8811     for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8812       if (MachineOperand &MO = MI.getOperand(I);
8813           MO.isReg() && MO.isImplicit() && MO.isDef() &&
8814           (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8815            AArch64::GPR64RegClass.contains(MO.getReg())))
8816         MI.removeOperand(I);
8817 
8818     // The SVE vector length can change when entering/leaving streaming mode.
8819     if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8820         MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8821       MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8822                                               /*IsImplicit=*/true));
8823       MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8824                                               /*IsImplicit=*/true));
8825     }
8826   }
8827 
8828   if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8829       MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
8830     // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
8831     // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
8832     if (shouldUseFormStridedPseudo(MI))
8833       return;
8834 
8835     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8836     MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
8837                                       TII->get(TargetOpcode::REG_SEQUENCE),
8838                                       MI.getOperand(0).getReg());
8839 
8840     for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8841       MIB.add(MI.getOperand(I));
8842       MIB.addImm(AArch64::zsub0 + (I - 1));
8843     }
8844 
8845     MI.eraseFromParent();
8846     return;
8847   }
8848 
8849   // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions would
8850   // have nothing to do with VG, were it not that they are used to materialise
8851   // a frame-address. If they contain a frame-index to a scalable vector, this
8852   // will likely require an ADDVL instruction to materialise the address, thus
8853   // reading VG.
8854   const MachineFunction &MF = *MI.getMF();
8855   if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8856       (MI.getOpcode() == AArch64::ADDXri ||
8857        MI.getOpcode() == AArch64::SUBXri)) {
8858     const MachineOperand &MO = MI.getOperand(1);
8859     if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8860                          TargetStackID::ScalableVector)
8861       MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8862                                               /*IsImplicit=*/true));
8863   }
8864 }
8865 
8866 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8867                                                    bool Enable, SDValue Chain,
8868                                                    SDValue InGlue,
8869                                                    unsigned Condition,
8870                                                    SDValue PStateSM) const {
8871   MachineFunction &MF = DAG.getMachineFunction();
8872   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8873   FuncInfo->setHasStreamingModeChanges(true);
8874 
8875   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8876   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8877   SDValue MSROp =
8878       DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8879   SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8880   SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8881   if (Condition != AArch64SME::Always) {
8882     assert(PStateSM && "PStateSM should be defined");
8883     Ops.push_back(PStateSM);
8884   }
8885   Ops.push_back(RegMask);
8886 
8887   if (InGlue)
8888     Ops.push_back(InGlue);
8889 
8890   unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8891   return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8892 }
8893 
8894 // Emit a call to __arm_sme_save or __arm_sme_restore.
8895 static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8896                                        SelectionDAG &DAG,
8897                                        AArch64FunctionInfo *Info, SDLoc DL,
8898                                        SDValue Chain, bool IsSave) {
8899   MachineFunction &MF = DAG.getMachineFunction();
8900   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8901   FuncInfo->setSMESaveBufferUsed();
8902 
8903   TargetLowering::ArgListTy Args;
8904   TargetLowering::ArgListEntry Entry;
8905   Entry.Ty = PointerType::getUnqual(*DAG.getContext());
8906   Entry.Node =
8907       DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
8908   Args.push_back(Entry);
8909 
8910   SDValue Callee =
8911       DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8912                             TLI.getPointerTy(DAG.getDataLayout()));
8913   auto *RetTy = Type::getVoidTy(*DAG.getContext());
8914   TargetLowering::CallLoweringInfo CLI(DAG);
8915   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8916       CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy,
8917       Callee, std::move(Args));
8918   return TLI.LowerCallTo(CLI).second;
8919 }
8920 
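     // Return the AArch64SME condition under which an SMSTART/SMSTOP inserted
     // around a call must actually execute, based on the streaming attributes of
     // the caller and the callee.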
8921 static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8922                                const SMEAttrs &CalleeAttrs) {
8923   if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8924       CallerAttrs.hasStreamingBody())
8925     return AArch64SME::Always;
8926   if (CalleeAttrs.hasNonStreamingInterface())
8927     return AArch64SME::IfCallerIsStreaming;
8928   if (CalleeAttrs.hasStreamingInterface())
8929     return AArch64SME::IfCallerIsNonStreaming;
8930 
8931   llvm_unreachable("Unsupported attributes");
8932 }
8933 
8934 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8935 /// and add input and output parameter nodes.
8936 SDValue
8937 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8938                                  SmallVectorImpl<SDValue> &InVals) const {
8939   SelectionDAG &DAG = CLI.DAG;
8940   SDLoc &DL = CLI.DL;
8941   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8942   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8943   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8944   SDValue Chain = CLI.Chain;
8945   SDValue Callee = CLI.Callee;
8946   bool &IsTailCall = CLI.IsTailCall;
8947   CallingConv::ID &CallConv = CLI.CallConv;
8948   bool IsVarArg = CLI.IsVarArg;
8949 
8950   MachineFunction &MF = DAG.getMachineFunction();
8951   MachineFunction::CallSiteInfo CSInfo;
8952   bool IsThisReturn = false;
8953 
8954   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8955   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8956   bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8957   bool IsSibCall = false;
8958   bool GuardWithBTI = false;
8959 
8960   if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8961       !Subtarget->noBTIAtReturnTwice()) {
8962     GuardWithBTI = FuncInfo->branchTargetEnforcement();
8963   }
8964 
8965   // Analyze operands of the call, assigning locations to each operand.
8966   SmallVector<CCValAssign, 16> ArgLocs;
8967   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8968 
8969   if (IsVarArg) {
8970     unsigned NumArgs = Outs.size();
8971 
8972     for (unsigned i = 0; i != NumArgs; ++i) {
8973       if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8974         report_fatal_error("Passing SVE types to variadic functions is "
8975                            "currently not supported");
8976     }
8977   }
8978 
8979   analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8980 
8981   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8982   // Assign locations to each value returned by this call.
8983   SmallVector<CCValAssign, 16> RVLocs;
8984   CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8985                     *DAG.getContext());
8986   RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8987 
8988   // Check callee args/returns for SVE registers and set calling convention
8989   // accordingly.
8990   if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8991     auto HasSVERegLoc = [](CCValAssign &Loc) {
8992       if (!Loc.isRegLoc())
8993         return false;
8994       return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8995              AArch64::PPRRegClass.contains(Loc.getLocReg());
8996     };
8997     if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8998       CallConv = CallingConv::AArch64_SVE_VectorCall;
8999   }
9000 
9001   if (IsTailCall) {
9002     // Check if it's really possible to do a tail call.
9003     IsTailCall = isEligibleForTailCallOptimization(CLI);
9004 
9005     // A sibling call is one where we're under the usual C ABI and not planning
9006     // to change that but can still do a tail call:
9007     if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
9008         CallConv != CallingConv::SwiftTail)
9009       IsSibCall = true;
9010 
9011     if (IsTailCall)
9012       ++NumTailCalls;
9013   }
9014 
9015   if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9016     report_fatal_error("failed to perform tail call elimination on a call "
9017                        "site marked musttail");
9018 
9019   // Get a count of how many bytes are to be pushed on the stack.
9020   unsigned NumBytes = CCInfo.getStackSize();
9021 
9022   if (IsSibCall) {
9023     // Since we're not changing the ABI to make this a tail call, the memory
9024     // operands are already available in the caller's incoming argument space.
9025     NumBytes = 0;
9026   }
9027 
9028   // FPDiff is the byte offset of the call's argument area from the callee's.
9029   // Stores to callee stack arguments will be placed in FixedStackSlots offset
9030   // by this amount for a tail call. In a sibling call it must be 0 because the
9031   // caller will deallocate the entire stack and the callee still expects its
9032   // arguments to begin at SP+0. Completely unused for non-tail calls.
9033   int FPDiff = 0;
9034 
9035   if (IsTailCall && !IsSibCall) {
9036     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9037 
9038     // Since callee will pop argument stack as a tail call, we must keep the
9039     // popped size 16-byte aligned.
9040     NumBytes = alignTo(NumBytes, 16);
9041 
9042     // FPDiff will be negative if this tail call requires more space than we
9043     // would automatically have in our incoming argument space. Positive if we
9044     // can actually shrink the stack.
9045     FPDiff = NumReusableBytes - NumBytes;
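         // For example (illustrative numbers only): with 32 bytes of reusable
         // incoming argument space and 48 bytes of outgoing tail-call arguments,
         // FPDiff = 32 - 48 = -16, so 16 extra bytes are reserved just below.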
9046 
9047     // Update the required reserved area if this is the tail call requiring the
9048     // most argument stack space.
9049     if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9050       FuncInfo->setTailCallReservedStack(-FPDiff);
9051 
9052     // The stack pointer must be 16-byte aligned at all times it's used for a
9053     // memory operation, which in practice means at *all* times and in
9054     // particular across call boundaries. Therefore our own arguments started at
9055     // a 16-byte aligned SP and the delta applied for the tail call should
9056     // satisfy the same constraint.
9057     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9058   }
9059 
9060   // Determine whether we need any streaming mode changes.
9061   SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
9062   if (CLI.CB)
9063     CalleeAttrs = SMEAttrs(*CLI.CB);
9064   else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9065     CalleeAttrs = SMEAttrs(ES->getSymbol());
9066 
9067   auto DescribeCallsite =
9068       [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9069     R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9070     if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9071       R << ore::NV("Callee", ES->getSymbol());
9072     else if (CLI.CB && CLI.CB->getCalledFunction())
9073       R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9074     else
9075       R << "unknown callee";
9076     R << "'";
9077     return R;
9078   };
9079 
9080   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
9081   bool RequiresSaveAllZA =
9082       CallerAttrs.requiresPreservingAllZAState(CalleeAttrs);
9083   if (RequiresLazySave) {
9084     const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9085     MachinePointerInfo MPI =
9086         MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
9087     SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9088         TPIDR2.FrameIndex,
9089         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9090     SDValue NumZaSaveSlicesAddr =
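         // Store the number of ZA save slices (RDSVL #1, i.e. the streaming
         // vector length in bytes) as a 16-bit value at byte offset 8 of the
         // TPIDR2 block.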
9091         DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9092                     DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9093     SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9094                                           DAG.getConstant(1, DL, MVT::i32));
9095     Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9096                               MPI, MVT::i16);
9097     Chain = DAG.getNode(
9098         ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9099         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9100         TPIDR2ObjAddr);
9101     OptimizationRemarkEmitter ORE(&MF.getFunction());
9102     ORE.emit([&]() {
9103       auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9104                                                    CLI.CB)
9105                       : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9106                                                    &MF.getFunction());
9107       return DescribeCallsite(R) << " sets up a lazy save for ZA";
9108     });
9109   } else if (RequiresSaveAllZA) {
9110     assert(!CalleeAttrs.hasSharedZAInterface() &&
9111            "Cannot share state that may not exist");
9112     Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9113                                     /*IsSave=*/true);
9114   }
9115 
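       // When a streaming-mode change is needed, PStateSM holds the caller's
       // streaming-mode state: known statically for streaming and non-streaming
       // callers, and read at runtime for streaming-compatible callers.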
9116   SDValue PStateSM;
9117   bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
9118   if (RequiresSMChange) {
9119     if (CallerAttrs.hasStreamingInterfaceOrBody())
9120       PStateSM = DAG.getConstant(1, DL, MVT::i64);
9121     else if (CallerAttrs.hasNonStreamingInterface())
9122       PStateSM = DAG.getConstant(0, DL, MVT::i64);
9123     else
9124       PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
9125     OptimizationRemarkEmitter ORE(&MF.getFunction());
9126     ORE.emit([&]() {
9127       auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9128                                                    CLI.CB)
9129                       : OptimizationRemarkAnalysis("sme", "SMETransition",
9130                                                    &MF.getFunction());
9131       DescribeCallsite(R) << " requires a streaming mode transition";
9132       return R;
9133     });
9134   }
9135 
9136   SDValue ZTFrameIdx;
9137   MachineFrameInfo &MFI = MF.getFrameInfo();
9138   bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
9139 
9140   // If the caller has ZT0 state which will not be preserved by the callee,
9141   // spill ZT0 before the call.
9142   if (ShouldPreserveZT0) {
9143     unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9144     ZTFrameIdx = DAG.getFrameIndex(
9145         ZTObj,
9146         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9147 
9148     Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9149                         {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9150   }
9151 
9152   // If the caller shares ZT0 but the callee does not share ZA, we need to
9153   // stop PSTATE.ZA before the call if there is no lazy-save active.
9154   bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
9155   assert((!DisableZA || !RequiresLazySave) &&
9156          "Lazy-save should have PSTATE.SM=1 on entry to the function");
9157 
9158   if (DisableZA)
9159     Chain = DAG.getNode(
9160         AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
9161         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9162         DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9163 
9164   // Adjust the stack pointer for the new arguments...
9165   // These operations are automatically eliminated by the prolog/epilog pass
9166   if (!IsSibCall)
9167     Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9168 
9169   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9170                                         getPointerTy(DAG.getDataLayout()));
9171 
9172   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9173   SmallSet<unsigned, 8> RegsUsed;
9174   SmallVector<SDValue, 8> MemOpChains;
9175   auto PtrVT = getPointerTy(DAG.getDataLayout());
9176 
9177   if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9178     const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9179     for (const auto &F : Forwards) {
9180       SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9181       RegsToPass.emplace_back(F.PReg, Val);
9182     }
9183   }
9184 
9185   // Walk the register/memloc assignments, inserting copies/loads.
9186   unsigned ExtraArgLocs = 0;
9187   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9188     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9189     SDValue Arg = OutVals[i];
9190     ISD::ArgFlagsTy Flags = Outs[i].Flags;
9191 
9192     // Promote the value if needed.
9193     switch (VA.getLocInfo()) {
9194     default:
9195       llvm_unreachable("Unknown loc info!");
9196     case CCValAssign::Full:
9197       break;
9198     case CCValAssign::SExt:
9199       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9200       break;
9201     case CCValAssign::ZExt:
9202       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9203       break;
9204     case CCValAssign::AExt:
9205       if (Outs[i].ArgVT == MVT::i1) {
9206         // AAPCS requires i1 to be zero-extended to 8 bits by the caller.
9207         //
9208         // Check if we actually have to do this, because the value may
9209         // already be zero-extended.
9210         //
9211         // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9212         // and rely on DAGCombiner to fold this, because the following
9213         // (anyext i32) is combined with (zext i8) in DAG.getNode:
9214         //
9215         //   (ext (zext x)) -> (zext x)
9216         //
9217         // This will give us (zext i32), which we cannot remove, so
9218         // try to check this beforehand.
9219         if (!checkZExtBool(Arg, DAG)) {
9220           Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9221           Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9222         }
9223       }
9224       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9225       break;
9226     case CCValAssign::AExtUpper:
9227       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9228       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9229       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9230                         DAG.getConstant(32, DL, VA.getLocVT()));
9231       break;
9232     case CCValAssign::BCvt:
9233       Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9234       break;
9235     case CCValAssign::Trunc:
9236       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9237       break;
9238     case CCValAssign::FPExt:
9239       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9240       break;
9241     case CCValAssign::Indirect:
9242       bool isScalable = VA.getValVT().isScalableVT();
9243       assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9244              "Indirect arguments should be scalable on most subtargets");
9245 
9246       uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9247       uint64_t PartSize = StoreSize;
9248       unsigned NumParts = 1;
9249       if (Outs[i].Flags.isInConsecutiveRegs()) {
9250         while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9251           ++NumParts;
9252         StoreSize *= NumParts;
9253       }
9254 
9255       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9256       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9257       MachineFrameInfo &MFI = MF.getFrameInfo();
9258       int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9259       if (isScalable)
9260         MFI.setStackID(FI, TargetStackID::ScalableVector);
9261 
9262       MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9263       SDValue Ptr = DAG.getFrameIndex(
9264           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9265       SDValue SpillSlot = Ptr;
9266 
9267       // Ensure we generate all stores for each tuple part, whilst updating the
9268       // pointer after each store correctly using vscale.
9269       while (NumParts) {
9270         SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9271         MemOpChains.push_back(Store);
9272 
9273         NumParts--;
9274         if (NumParts > 0) {
9275           SDValue BytesIncrement;
9276           if (isScalable) {
9277             BytesIncrement = DAG.getVScale(
9278                 DL, Ptr.getValueType(),
9279                 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9280           } else {
9281             BytesIncrement = DAG.getConstant(
9282                 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9283                 Ptr.getValueType());
9284           }
9285           MPI = MachinePointerInfo(MPI.getAddrSpace());
9286           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9287                             BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9288           ExtraArgLocs++;
9289           i++;
9290         }
9291       }
9292 
9293       Arg = SpillSlot;
9294       break;
9295     }
9296 
9297     if (VA.isRegLoc()) {
9298       if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9299           Outs[0].VT == MVT::i64) {
9300         assert(VA.getLocVT() == MVT::i64 &&
9301                "unexpected calling convention register assignment");
9302         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9303                "unexpected use of 'returned'");
9304         IsThisReturn = true;
9305       }
9306       if (RegsUsed.count(VA.getLocReg())) {
9307         // If this register has already been used then we're trying to pack
9308         // parts of an [N x i32] into an X-register. The extension type will
9309         // take care of putting the two halves in the right place but we have to
9310         // combine them.
9311         SDValue &Bits =
9312             llvm::find_if(RegsToPass,
9313                           [=](const std::pair<unsigned, SDValue> &Elt) {
9314                             return Elt.first == VA.getLocReg();
9315                           })
9316                 ->second;
9317         Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9318         // Call site info is used for tracking a function's parameter entry
9319         // values. For now we track only the simple cases in which a parameter
9320         // is transferred through a whole register.
9321         llvm::erase_if(CSInfo.ArgRegPairs,
9322                        [&VA](MachineFunction::ArgRegPair ArgReg) {
9323                          return ArgReg.Reg == VA.getLocReg();
9324                        });
9325       } else {
9326         // Add an extra level of indirection for streaming mode changes by
9327         // using a pseudo copy node that cannot be rematerialised between a
9328         // smstart/smstop and the call by the simple register coalescer.
9329         if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9330           Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9331                             Arg.getValueType(), Arg);
9332         RegsToPass.emplace_back(VA.getLocReg(), Arg);
9333         RegsUsed.insert(VA.getLocReg());
9334         const TargetOptions &Options = DAG.getTarget().Options;
9335         if (Options.EmitCallSiteInfo)
9336           CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9337       }
9338     } else {
9339       assert(VA.isMemLoc());
9340 
9341       SDValue DstAddr;
9342       MachinePointerInfo DstInfo;
9343 
9344       // FIXME: This works on big-endian for composite byvals, which are the
9345       // common case. It should also work for fundamental types too.
9346       uint32_t BEAlign = 0;
9347       unsigned OpSize;
9348       if (VA.getLocInfo() == CCValAssign::Indirect ||
9349           VA.getLocInfo() == CCValAssign::Trunc)
9350         OpSize = VA.getLocVT().getFixedSizeInBits();
9351       else
9352         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9353                                  : VA.getValVT().getSizeInBits();
9354       OpSize = (OpSize + 7) / 8;
9355       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9356           !Flags.isInConsecutiveRegs()) {
9357         if (OpSize < 8)
9358           BEAlign = 8 - OpSize;
9359       }
9360       unsigned LocMemOffset = VA.getLocMemOffset();
9361       int32_t Offset = LocMemOffset + BEAlign;
9362       SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9363       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9364 
9365       if (IsTailCall) {
9366         Offset = Offset + FPDiff;
9367         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9368 
9369         DstAddr = DAG.getFrameIndex(FI, PtrVT);
9370         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9371 
9372         // Make sure any stack arguments overlapping with where we're storing
9373         // are loaded before this eventual operation. Otherwise they'll be
9374         // clobbered.
9375         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9376       } else {
9377         SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9378 
9379         DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9380         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9381       }
9382 
9383       if (Outs[i].Flags.isByVal()) {
9384         SDValue SizeNode =
9385             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9386         SDValue Cpy = DAG.getMemcpy(
9387             Chain, DL, DstAddr, Arg, SizeNode,
9388             Outs[i].Flags.getNonZeroByValAlign(),
9389             /*isVol = */ false, /*AlwaysInline = */ false,
9390             /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9391 
9392         MemOpChains.push_back(Cpy);
9393       } else {
9394         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9395         // promoted to a legal register type i32, we should truncate Arg back to
9396         // i1/i8/i16.
9397         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9398             VA.getValVT() == MVT::i16)
9399           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9400 
9401         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9402         MemOpChains.push_back(Store);
9403       }
9404     }
9405   }
9406 
9407   if (IsVarArg && Subtarget->isWindowsArm64EC()) {
9408     SDValue ParamPtr = StackPtr;
9409     if (IsTailCall) {
9410       // Create a dummy object at the top of the stack that can be used to get
9411       // the SP after the epilogue
9412       int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9413       ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9414     }
9415 
9416     // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9417     // describing the argument list.  x4 contains the address of the
9418     // first stack parameter. x5 contains the size in bytes of all parameters
9419     // passed on the stack.
9420     RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9421     RegsToPass.emplace_back(AArch64::X5,
9422                             DAG.getConstant(NumBytes, DL, MVT::i64));
9423   }
9424 
9425   if (!MemOpChains.empty())
9426     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9427 
9428   SDValue InGlue;
9429   if (RequiresSMChange) {
9430     if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9431       Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
9432                           DAG.getVTList(MVT::Other, MVT::Glue), Chain);
9433       InGlue = Chain.getValue(1);
9434     }
9435 
9436     SDValue NewChain = changeStreamingMode(
9437         DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
9438         getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9439     Chain = NewChain.getValue(0);
9440     InGlue = NewChain.getValue(1);
9441   }
9442 
9443   // Build a sequence of copy-to-reg nodes chained together with token chain
9444   // and flag operands which copy the outgoing args into the appropriate regs.
9445   for (auto &RegToPass : RegsToPass) {
9446     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9447                              RegToPass.second, InGlue);
9448     InGlue = Chain.getValue(1);
9449   }
9450 
9451   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9452   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9453   // node so that legalize doesn't hack it.
9454   const GlobalValue *CalledGlobal = nullptr;
9455   unsigned OpFlags = 0;
9456   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9457     CalledGlobal = G->getGlobal();
9458     OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9459                                                          getTargetMachine());
9460     if (OpFlags & AArch64II::MO_GOT) {
9461       Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9462       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9463     } else {
9464       const GlobalValue *GV = G->getGlobal();
9465       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9466     }
9467   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9468     bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9469                    Subtarget->isTargetMachO()) ||
9470                   MF.getFunction().getParent()->getRtLibUseGOT();
9471     const char *Sym = S->getSymbol();
9472     if (UseGot) {
9473       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9474       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9475     } else {
9476       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9477     }
9478   }
9479 
9480   // We don't usually want to end the call-sequence here because we would tidy
9481   // the frame up *after* the call. However, in the ABI-changing tail-call case
9482   // we've carefully laid out the parameters so that when sp is reset they'll be
9483   // in the correct location.
9484   if (IsTailCall && !IsSibCall) {
9485     Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9486     InGlue = Chain.getValue(1);
9487   }
9488 
9489   unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9490 
9491   std::vector<SDValue> Ops;
9492   Ops.push_back(Chain);
9493   Ops.push_back(Callee);
9494 
9495   // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9496   // be expanded to the call, directly followed by a special marker sequence and
9497   // a call to an ObjC library function.  Use CALL_RVMARKER to do that.
9498   if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9499     assert(!IsTailCall &&
9500            "tail calls cannot be marked with clang.arc.attachedcall");
9501     Opc = AArch64ISD::CALL_RVMARKER;
9502 
9503     // Add a target global address for the retainRV/claimRV runtime function
9504     // just before the call target.
9505     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9506     auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9507     Ops.insert(Ops.begin() + 1, GA);
9508   } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9509     Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9510   } else if (GuardWithBTI) {
9511     Opc = AArch64ISD::CALL_BTI;
9512   }
9513 
9514   if (IsTailCall) {
9515     // Each tail call may have to adjust the stack by a different amount, so
9516     // this information must travel along with the operation for eventual
9517     // consumption by emitEpilogue.
9518     Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9519   }
9520 
9521   if (CLI.PAI) {
9522     const uint64_t Key = CLI.PAI->Key;
9523     assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9524            "Invalid auth call key");
9525 
9526     // Split the discriminator into address/integer components.
9527     SDValue AddrDisc, IntDisc;
9528     std::tie(IntDisc, AddrDisc) =
9529         extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9530 
9531     if (Opc == AArch64ISD::CALL_RVMARKER)
9532       Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9533     else
9534       Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9535     Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9536     Ops.push_back(IntDisc);
9537     Ops.push_back(AddrDisc);
9538   }
9539 
9540   // Add argument registers to the end of the list so that they are known live
9541   // into the call.
9542   for (auto &RegToPass : RegsToPass)
9543     Ops.push_back(DAG.getRegister(RegToPass.first,
9544                                   RegToPass.second.getValueType()));
9545 
9546   // Add a register mask operand representing the call-preserved registers.
9547   const uint32_t *Mask;
9548   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9549   if (IsThisReturn) {
9550     // For 'this' returns, use the X0-preserving mask if applicable
9551     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9552     if (!Mask) {
9553       IsThisReturn = false;
9554       Mask = TRI->getCallPreservedMask(MF, CallConv);
9555     }
9556   } else
9557     Mask = TRI->getCallPreservedMask(MF, CallConv);
9558 
9559   if (Subtarget->hasCustomCallingConv())
9560     TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9561 
9562   if (TRI->isAnyArgRegReserved(MF))
9563     TRI->emitReservedArgRegCallError(MF);
9564 
9565   assert(Mask && "Missing call preserved mask for calling convention");
9566   Ops.push_back(DAG.getRegisterMask(Mask));
9567 
9568   if (InGlue.getNode())
9569     Ops.push_back(InGlue);
9570 
9571   // If we're doing a tail call, use a TC_RETURN here rather than an
9572   // actual call instruction.
9573   if (IsTailCall) {
9574     MF.getFrameInfo().setHasTailCall();
9575     SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9576     if (IsCFICall)
9577       Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9578 
9579     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9580     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9581     if (CalledGlobal)
9582       DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9583     return Ret;
9584   }
9585 
9586   // Returns a chain and a flag for retval copy to use.
9587   Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9588   if (IsCFICall)
9589     Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9590 
9591   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9592   InGlue = Chain.getValue(1);
9593   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9594   if (CalledGlobal)
9595     DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9596 
9597   uint64_t CalleePopBytes =
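       // Conventions that restore the stack themselves (fastcc under
       // GuaranteedTailCallOpt, tail, swifttail) pop the whole 16-byte-aligned
       // argument area; for all other conventions the callee pops nothing.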
9598       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9599 
9600   Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9601   InGlue = Chain.getValue(1);
9602 
9603   // Handle result values, copying them out of physregs into vregs that we
9604   // return.
9605   SDValue Result = LowerCallResult(
9606       Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9607       IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9608 
9609   if (!Ins.empty())
9610     InGlue = Result.getValue(Result->getNumValues() - 1);
9611 
9612   if (RequiresSMChange) {
9613     assert(PStateSM && "Expected a PStateSM to be set");
9614     Result = changeStreamingMode(
9615         DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
9616         getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9617 
9618     if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9619       InGlue = Result.getValue(1);
9620       Result =
9621           DAG.getNode(AArch64ISD::VG_RESTORE, DL,
9622                       DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
9623     }
9624   }
9625 
9626   if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
9627     // Unconditionally resume ZA.
9628     Result = DAG.getNode(
9629         AArch64ISD::SMSTART, DL, MVT::Other, Result,
9630         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9631         DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9632 
9633   if (ShouldPreserveZT0)
9634     Result =
9635         DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9636                     {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9637 
9638   if (RequiresLazySave) {
9639     // Conditionally restore the lazy save using a pseudo node.
9640     TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9641     SDValue RegMask = DAG.getRegisterMask(
9642         TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9643     SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9644         "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
9645     SDValue TPIDR2_EL0 = DAG.getNode(
9646         ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9647         DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9648 
9649     // Copy the address of the TPIDR2 block into X0 before 'calling' the
9650     // RESTORE_ZA pseudo.
9651     SDValue Glue;
9652     SDValue TPIDR2Block = DAG.getFrameIndex(
9653         TPIDR2.FrameIndex,
9654         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9655     Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9656     Result =
9657         DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9658                     {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9659                      RestoreRoutine, RegMask, Result.getValue(1)});
9660 
9661     // Finally reset the TPIDR2_EL0 register to 0.
9662     Result = DAG.getNode(
9663         ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9664         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9665         DAG.getConstant(0, DL, MVT::i64));
9666     TPIDR2.Uses++;
9667   } else if (RequiresSaveAllZA) {
9668     Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9669                                      /*IsSave=*/false);
9670   }
9671 
9672   if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9673       RequiresSaveAllZA) {
9674     for (unsigned I = 0; I < InVals.size(); ++I) {
9675       // The smstart/smstop is chained as part of the call, but when the
9676       // resulting chain is discarded (which happens when the call is not part
9677       // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9678       // smstart/smstop is chained to the result value. We can do that by doing
9679       // a vreg -> vreg copy.
9680       Register Reg = MF.getRegInfo().createVirtualRegister(
9681           getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9682       SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9683       InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9684                                      InVals[I].getValueType());
9685     }
9686   }
9687 
9688   if (CallConv == CallingConv::PreserveNone) {
9689     for (const ISD::OutputArg &O : Outs) {
9690       if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9691           O.Flags.isSwiftAsync()) {
9692         MachineFunction &MF = DAG.getMachineFunction();
9693         DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9694             MF.getFunction(),
9695             "Swift attributes can't be used with preserve_none",
9696             DL.getDebugLoc()));
9697         break;
9698       }
9699     }
9700   }
9701 
9702   return Result;
9703 }
9704 
9705 bool AArch64TargetLowering::CanLowerReturn(
9706     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9707     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9708     const Type *RetTy) const {
9709   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9710   SmallVector<CCValAssign, 16> RVLocs;
9711   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9712   return CCInfo.CheckReturn(Outs, RetCC);
9713 }
9714 
9715 SDValue
9716 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9717                                    bool isVarArg,
9718                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
9719                                    const SmallVectorImpl<SDValue> &OutVals,
9720                                    const SDLoc &DL, SelectionDAG &DAG) const {
9721   auto &MF = DAG.getMachineFunction();
9722   auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9723 
9724   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9725   SmallVector<CCValAssign, 16> RVLocs;
9726   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9727   CCInfo.AnalyzeReturn(Outs, RetCC);
9728 
9729   // Copy the result values into the output registers.
9730   SDValue Glue;
9731   SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9732   SmallSet<unsigned, 4> RegsUsed;
9733   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9734        ++i, ++realRVLocIdx) {
9735     CCValAssign &VA = RVLocs[i];
9736     assert(VA.isRegLoc() && "Can only return in registers!");
9737     SDValue Arg = OutVals[realRVLocIdx];
9738 
9739     switch (VA.getLocInfo()) {
9740     default:
9741       llvm_unreachable("Unknown loc info!");
9742     case CCValAssign::Full:
9743       if (Outs[i].ArgVT == MVT::i1) {
9744         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9745         // value. This is strictly redundant on Darwin (which uses "zeroext
9746         // i1"), but will be optimised out before ISel.
9747         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9748         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9749       }
9750       break;
9751     case CCValAssign::BCvt:
9752       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9753       break;
9754     case CCValAssign::AExt:
9755     case CCValAssign::ZExt:
9756       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9757       break;
9758     case CCValAssign::AExtUpper:
9759       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9760       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9761       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9762                         DAG.getConstant(32, DL, VA.getLocVT()));
9763       break;
9764     }
9765 
9766     if (RegsUsed.count(VA.getLocReg())) {
9767       SDValue &Bits =
9768           llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9769             return Elt.first == VA.getLocReg();
9770           })->second;
9771       Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9772     } else {
9773       RetVals.emplace_back(VA.getLocReg(), Arg);
9774       RegsUsed.insert(VA.getLocReg());
9775     }
9776   }
9777 
9778   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9779 
9780   // Emit SMSTOP before returning from a locally streaming function
9781   SMEAttrs FuncAttrs(MF.getFunction());
9782   if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9783     if (FuncAttrs.hasStreamingCompatibleInterface()) {
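           // The entry value of PSTATE.SM was saved in a virtual register at
           // function entry; the SMSTOP below is conditional on the caller
           // having been non-streaming.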
9784       Register Reg = FuncInfo->getPStateSMReg();
9785       assert(Reg.isValid() && "PStateSM Register is invalid");
9786       SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9787       Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9788                                   /*Glue*/ SDValue(),
9789                                   AArch64SME::IfCallerIsNonStreaming, PStateSM);
9790     } else
9791       Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9792                                   /*Glue*/ SDValue(), AArch64SME::Always);
9793     Glue = Chain.getValue(1);
9794   }
9795 
9796   SmallVector<SDValue, 4> RetOps(1, Chain);
9797   for (auto &RetVal : RetVals) {
9798     if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9799         isPassedInFPR(RetVal.second.getValueType()))
9800       RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9801                                   RetVal.second.getValueType(), RetVal.second);
9802     Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9803     Glue = Chain.getValue(1);
9804     RetOps.push_back(
9805         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9806   }
9807 
9808   // Windows AArch64 ABIs require that for returning structs by value we copy
9809   // the sret argument into X0 for the return.
9810   // We saved the argument into a virtual register in the entry block,
9811   // so now we copy the value out and into X0.
9812   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9813     SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9814                                      getPointerTy(MF.getDataLayout()));
9815 
9816     unsigned RetValReg = AArch64::X0;
9817     if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9818       RetValReg = AArch64::X8;
9819     Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9820     Glue = Chain.getValue(1);
9821 
9822     RetOps.push_back(
9823       DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9824   }
9825 
9826   const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9827   if (I) {
9828     for (; *I; ++I) {
9829       if (AArch64::GPR64RegClass.contains(*I))
9830         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9831       else if (AArch64::FPR64RegClass.contains(*I))
9832         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9833       else
9834         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9835     }
9836   }
9837 
9838   RetOps[0] = Chain; // Update chain.
9839 
9840   // Add the glue if we have it.
9841   if (Glue.getNode())
9842     RetOps.push_back(Glue);
9843 
9844   if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9845     // ARM64EC entry thunks use a special return sequence: instead of a regular
9846     // "ret" instruction, they need to explicitly call the emulator.
9847     EVT PtrVT = getPointerTy(DAG.getDataLayout());
9848     SDValue Arm64ECRetDest =
9849         DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9850     Arm64ECRetDest =
9851         getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9852     Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9853                                  MachinePointerInfo());
9854     RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9855     RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9856     return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9857   }
9858 
9859   return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9860 }
9861 
9862 //===----------------------------------------------------------------------===//
9863 //  Other Lowering Code
9864 //===----------------------------------------------------------------------===//
9865 
9866 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9867                                              SelectionDAG &DAG,
9868                                              unsigned Flag) const {
9869   return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9870                                     N->getOffset(), Flag);
9871 }
9872 
9873 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9874                                              SelectionDAG &DAG,
9875                                              unsigned Flag) const {
9876   return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9877 }
9878 
9879 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9880                                              SelectionDAG &DAG,
9881                                              unsigned Flag) const {
9882   return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9883                                    N->getOffset(), Flag);
9884 }
9885 
9886 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9887                                              SelectionDAG &DAG,
9888                                              unsigned Flag) const {
9889   return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9890 }
9891 
9892 SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9893                                              SelectionDAG &DAG,
9894                                              unsigned Flag) const {
9895   return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9896 }
9897 
9898 // (loadGOT sym)
9899 template <class NodeTy>
9900 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9901                                       unsigned Flags) const {
9902   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9903   SDLoc DL(N);
9904   EVT Ty = getPointerTy(DAG.getDataLayout());
9905   SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9906   // FIXME: Once remat is capable of dealing with instructions with register
9907   // operands, expand this into two nodes instead of using a wrapper node.
9908   if (DAG.getMachineFunction()
9909           .getInfo<AArch64FunctionInfo>()
9910           ->hasELFSignedGOT())
9911     return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
9912                    0);
9913   return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9914 }
9915 
9916 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9917 template <class NodeTy>
9918 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9919                                             unsigned Flags) const {
9920   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9921   SDLoc DL(N);
9922   EVT Ty = getPointerTy(DAG.getDataLayout());
9923   const unsigned char MO_NC = AArch64II::MO_NC;
9924   return DAG.getNode(
9925       AArch64ISD::WrapperLarge, DL, Ty,
9926       getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9927       getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9928       getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9929       getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9930 }
9931 
9932 // (addlow (adrp %hi(sym)) %lo(sym))
9933 template <class NodeTy>
9934 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9935                                        unsigned Flags) const {
9936   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9937   SDLoc DL(N);
9938   EVT Ty = getPointerTy(DAG.getDataLayout());
9939   SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9940   SDValue Lo = getTargetNode(N, Ty, DAG,
9941                              AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9942   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9943   return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9944 }
9945 
9946 // (adr sym)
9947 template <class NodeTy>
9948 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9949                                            unsigned Flags) const {
9950   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9951   SDLoc DL(N);
9952   EVT Ty = getPointerTy(DAG.getDataLayout());
9953   SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9954   return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9955 }
9956 
9957 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9958                                                   SelectionDAG &DAG) const {
9959   GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9960   const GlobalValue *GV = GN->getGlobal();
9961   unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9962 
9963   if (OpFlags != AArch64II::MO_NO_FLAG)
9964     assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9965            "unexpected offset in global node");
9966 
9967   // This also catches the large code model case for Darwin, and tiny code
9968   // model with got relocations.
9969   if ((OpFlags & AArch64II::MO_GOT) != 0) {
9970     return getGOT(GN, DAG, OpFlags);
9971   }
9972 
9973   SDValue Result;
9974   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9975       !getTargetMachine().isPositionIndependent()) {
9976     Result = getAddrLarge(GN, DAG, OpFlags);
9977   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9978     Result = getAddrTiny(GN, DAG, OpFlags);
9979   } else {
9980     Result = getAddr(GN, DAG, OpFlags);
9981   }
9982   EVT PtrVT = getPointerTy(DAG.getDataLayout());
9983   SDLoc DL(GN);
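  // dllimport and COFF-stub references resolve to a pointer slot rather than
  // to the symbol itself, so an extra load is needed to get the real address.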
9984   if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9985     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9986                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9987   return Result;
9988 }
9989 
9990 /// Convert a TLS address reference into the correct sequence of loads
9991 /// and calls to compute the variable's address (for Darwin, currently) and
9992 /// return an SDValue containing the final node.
9993 
9994 /// Darwin only has one TLS scheme which must be capable of dealing with the
9995 /// fully general situation, in the worst case. This means:
9996 ///     + "extern __thread" declaration.
9997 ///     + Defined in a possibly unknown dynamic library.
9998 ///
9999 /// The general system is that each __thread variable has a [3 x i64] descriptor
10000 /// which contains information used by the runtime to calculate the address. The
10001 /// only part of this the compiler needs to know about is the first xword, which
10002 /// contains a function pointer that must be called with the address of the
10003 /// entire descriptor in "x0".
10004 ///
10005 /// Since this descriptor may be in a different unit, in general even the
10006 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
10007 /// is:
10008 ///     adrp x0, _var@TLVPPAGE
10009 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
10010 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
10011 ///                                      ; the function pointer
10012 ///     blr x1                           ; Uses descriptor address in x0
10013 ///     ; Address of _var is now in x0.
10014 ///
10015 /// If the address of _var's descriptor *is* known to the linker, then it can
10016 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10017 /// a slight efficiency gain.
10018 SDValue
10019 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10020                                                    SelectionDAG &DAG) const {
10021   assert(Subtarget->isTargetDarwin() &&
10022          "This function expects a Darwin target");
10023 
10024   SDLoc DL(Op);
10025   MVT PtrVT = getPointerTy(DAG.getDataLayout());
10026   MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10027   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10028 
10029   SDValue TLVPAddr =
10030       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10031   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10032 
10033   // The first entry in the descriptor is a function pointer that we must call
10034   // to obtain the address of the variable.
10035   SDValue Chain = DAG.getEntryNode();
10036   SDValue FuncTLVGet = DAG.getLoad(
10037       PtrMemVT, DL, Chain, DescAddr,
10038       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10039       Align(PtrMemVT.getSizeInBits() / 8),
10040       MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10041   Chain = FuncTLVGet.getValue(1);
10042 
10043   // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10044   FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10045 
10046   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10047   MFI.setAdjustsStack(true);
10048 
10049   // TLS calls preserve all registers except those that absolutely must be
10050   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10051   // silly).
10052   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10053   const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10054   if (Subtarget->hasCustomCallingConv())
10055     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10056 
10057   // Finally, we can make the call. This is just a degenerate version of a
10058   // normal AArch64 call node: x0 takes the address of the descriptor, and
10059   // returns the address of the variable in this thread.
10060   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10061 
10062   unsigned Opcode = AArch64ISD::CALL;
10063   SmallVector<SDValue, 8> Ops;
10064   Ops.push_back(Chain);
10065   Ops.push_back(FuncTLVGet);
10066 
10067   // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10068   if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10069     Opcode = AArch64ISD::AUTH_CALL;
10070     Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10071     Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10072     Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10073   }
10074 
10075   Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10076   Ops.push_back(DAG.getRegisterMask(Mask));
10077   Ops.push_back(Chain.getValue(1));
10078   Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10079   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10080 }
10081 
10082 /// Convert a thread-local variable reference into a sequence of instructions to
10083 /// compute the variable's address for the local exec TLS model of ELF targets.
10084 /// The sequence depends on the maximum TLS area size.
10085 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10086                                                     SDValue ThreadBase,
10087                                                     const SDLoc &DL,
10088                                                     SelectionDAG &DAG) const {
10089   EVT PtrVT = getPointerTy(DAG.getDataLayout());
10090   SDValue TPOff, Addr;
10091 
10092   switch (DAG.getTarget().Options.TLSSize) {
10093   default:
10094     llvm_unreachable("Unexpected TLS size");
10095 
10096   case 12: {
10097     // mrs   x0, TPIDR_EL0
10098     // add   x0, x0, :tprel_lo12:a
10099     SDValue Var = DAG.getTargetGlobalAddress(
10100         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10101     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10102                                       Var,
10103                                       DAG.getTargetConstant(0, DL, MVT::i32)),
10104                    0);
10105   }
10106 
10107   case 24: {
10108     // mrs   x0, TPIDR_EL0
10109     // add   x0, x0, :tprel_hi12:a
10110     // add   x0, x0, :tprel_lo12_nc:a
10111     SDValue HiVar = DAG.getTargetGlobalAddress(
10112         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10113     SDValue LoVar = DAG.getTargetGlobalAddress(
10114         GV, DL, PtrVT, 0,
10115         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10116     Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10117                                       HiVar,
10118                                       DAG.getTargetConstant(0, DL, MVT::i32)),
10119                    0);
10120     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10121                                       LoVar,
10122                                       DAG.getTargetConstant(0, DL, MVT::i32)),
10123                    0);
10124   }
10125 
10126   case 32: {
10127     // mrs   x1, TPIDR_EL0
10128     // movz  x0, #:tprel_g1:a
10129     // movk  x0, #:tprel_g0_nc:a
10130     // add   x0, x1, x0
10131     SDValue HiVar = DAG.getTargetGlobalAddress(
10132         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10133     SDValue LoVar = DAG.getTargetGlobalAddress(
10134         GV, DL, PtrVT, 0,
10135         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10136     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10137                                        DAG.getTargetConstant(16, DL, MVT::i32)),
10138                     0);
10139     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10140                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10141                     0);
10142     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10143   }
10144 
10145   case 48: {
10146     // mrs   x1, TPIDR_EL0
10147     // movz  x0, #:tprel_g2:a
10148     // movk  x0, #:tprel_g1_nc:a
10149     // movk  x0, #:tprel_g0_nc:a
10150     // add   x0, x1, x0
10151     SDValue HiVar = DAG.getTargetGlobalAddress(
10152         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10153     SDValue MiVar = DAG.getTargetGlobalAddress(
10154         GV, DL, PtrVT, 0,
10155         AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10156     SDValue LoVar = DAG.getTargetGlobalAddress(
10157         GV, DL, PtrVT, 0,
10158         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10159     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10160                                        DAG.getTargetConstant(32, DL, MVT::i32)),
10161                     0);
10162     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10163                                        DAG.getTargetConstant(16, DL, MVT::i32)),
10164                     0);
10165     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10166                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10167                     0);
10168     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10169   }
10170   }
10171 }
10172 
10173 /// When accessing thread-local variables under either the general-dynamic or
10174 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10175 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10176 /// is a function pointer to carry out the resolution.
10177 ///
10178 /// The sequence is:
10179 ///    adrp  x0, :tlsdesc:var
10180 ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
10181 ///    add   x0, x0, #:tlsdesc_lo12:var
10182 ///    .tlsdesccall var
10183 ///    blr   x1
10184 ///    (TPIDR_EL0 offset now in x0)
10185 ///
10186 ///  The above sequence must be produced unscheduled, to enable the linker to
10187 ///  optimize/relax it.
10188 ///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10189 ///  whole sequence; it is expanded very late in the compilation flow, to ensure
10190 ///  the sequence is emitted exactly as above.
10191 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10192                                                       const SDLoc &DL,
10193                                                       SelectionDAG &DAG) const {
10194   EVT PtrVT = getPointerTy(DAG.getDataLayout());
10195 
10196   SDValue Chain = DAG.getEntryNode();
10197   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10198 
10199   unsigned Opcode =
10200       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10201           ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10202           : AArch64ISD::TLSDESC_CALLSEQ;
10203   Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10204   SDValue Glue = Chain.getValue(1);
10205 
10206   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10207 }
10208 
10209 SDValue
10210 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10211                                                 SelectionDAG &DAG) const {
10212   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10213 
10214   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10215   AArch64FunctionInfo *MFI =
10216       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10217 
10218   TLSModel::Model Model = MFI->hasELFSignedGOT()
10219                               ? TLSModel::GeneralDynamic
10220                               : getTargetMachine().getTLSModel(GA->getGlobal());
10221 
10222   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10223     if (Model == TLSModel::LocalDynamic)
10224       Model = TLSModel::GeneralDynamic;
10225   }
10226 
10227   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10228       Model != TLSModel::LocalExec)
10229     report_fatal_error("ELF TLS only supported in small memory model or "
10230                        "in local exec TLS model");
10231   // Different choices can be made for the maximum size of the TLS area for a
10232   // module. For the small address model, the default TLS size is 16MiB and the
10233   // maximum TLS size is 4GiB.
10234   // FIXME: add tiny and large code model support for TLS access models other
10235   // than local exec. We currently generate the same code as small for tiny,
10236   // which may be larger than needed.
10237 
10238   SDValue TPOff;
10239   EVT PtrVT = getPointerTy(DAG.getDataLayout());
10240   SDLoc DL(Op);
10241   const GlobalValue *GV = GA->getGlobal();
10242 
10243   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10244 
10245   if (Model == TLSModel::LocalExec) {
10246     return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10247   } else if (Model == TLSModel::InitialExec) {
10248     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10249     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10250   } else if (Model == TLSModel::LocalDynamic) {
10251     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10252     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10253     // the beginning of the module's TLS region, followed by a DTPREL offset
10254     // calculation.
10255 
10256     // These accesses will need deduplicating if there's more than one.
10257     MFI->incNumLocalDynamicTLSAccesses();
10258 
10259     // The call needs a relocation too for linker relaxation. It doesn't make
10260     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10261     // the address.
10262     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10263                                                   AArch64II::MO_TLS);
10264 
10265     // Now we can calculate the offset from TPIDR_EL0 to this module's
10266     // thread-local area.
10267     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10268 
10269     // Now use :dtprel_whatever: operations to calculate this variable's offset
10270     // in its thread-storage area.
10271     SDValue HiVar = DAG.getTargetGlobalAddress(
10272         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10273     SDValue LoVar = DAG.getTargetGlobalAddress(
10274         GV, DL, MVT::i64, 0,
10275         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10276 
10277     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10278                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10279                     0);
10280     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10281                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10282                     0);
10283   } else if (Model == TLSModel::GeneralDynamic) {
10284     // The call needs a relocation too for linker relaxation. It doesn't make
10285     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10286     // the address.
10287     SDValue SymAddr =
10288         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10289 
10290     // Finally we can make a call to calculate the offset from tpidr_el0.
10291     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10292   } else
10293     llvm_unreachable("Unsupported ELF TLS access model");
10294 
10295   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10296 }
10297 
10298 SDValue
10299 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10300                                                     SelectionDAG &DAG) const {
10301   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10302 
10303   SDValue Chain = DAG.getEntryNode();
10304   EVT PtrVT = getPointerTy(DAG.getDataLayout());
10305   SDLoc DL(Op);
10306 
10307   SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10308 
10309   // Load the ThreadLocalStoragePointer from the TEB
10310   // A pointer to the TLS array is located at offset 0x58 from the TEB.
10311   SDValue TLSArray =
10312       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10313   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10314   Chain = TLSArray.getValue(1);
10315 
10316   // Load the TLS index from the C runtime.
10317   // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10318   // This also does the same as LOADgot, but using a generic i32 load,
10319   // while LOADgot only loads i64.
10320   SDValue TLSIndexHi =
10321       DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10322   SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10323       "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10324   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10325   SDValue TLSIndex =
10326       DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10327   TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10328   Chain = TLSIndex.getValue(1);
10329 
10330   // The pointer to this thread's TLS data area is found at the offset
10331   // TLS Index * 8 into the TLSArray.
10332   TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10333   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10334                              DAG.getConstant(3, DL, PtrVT));
10335   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10336                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10337                             MachinePointerInfo());
10338   Chain = TLS.getValue(1);
10339 
10340   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10341   const GlobalValue *GV = GA->getGlobal();
10342   SDValue TGAHi = DAG.getTargetGlobalAddress(
10343       GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10344   SDValue TGALo = DAG.getTargetGlobalAddress(
10345       GV, DL, PtrVT, 0,
10346       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10347 
10348   // Add the offset from the start of the .tls section (section base).
10349   SDValue Addr =
10350       SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10351                                  DAG.getTargetConstant(0, DL, MVT::i32)),
10352               0);
10353   Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10354   return Addr;
10355 }
10356 
10357 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10358                                                      SelectionDAG &DAG) const {
10359   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10360   if (DAG.getTarget().useEmulatedTLS())
10361     return LowerToTLSEmulatedModel(GA, DAG);
10362 
10363   if (Subtarget->isTargetDarwin())
10364     return LowerDarwinGlobalTLSAddress(Op, DAG);
10365   if (Subtarget->isTargetELF())
10366     return LowerELFGlobalTLSAddress(Op, DAG);
10367   if (Subtarget->isTargetWindows())
10368     return LowerWindowsGlobalTLSAddress(Op, DAG);
10369 
10370   llvm_unreachable("Unexpected platform trying to use TLS");
10371 }
10372 
10373 //===----------------------------------------------------------------------===//
10374 //                      PtrAuthGlobalAddress lowering
10375 //
10376 // We have 3 lowering alternatives to choose from:
10377 // - MOVaddrPAC: similar to MOVaddr, with added PAC.
10378 //   If the GV doesn't need a GOT load (i.e., is locally defined)
10379 //   materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10380 //
10381 // - LOADgotPAC: similar to LOADgot, with added PAC.
10382 //   If the GV needs a GOT load, materialize the pointer using the usual
10383 //   GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10384 //   section is assumed to be read-only (for example, via relro mechanism). See
10385 //   LowerMOVaddrPAC.
10386 //
10387 // - LOADauthptrstatic: similar to LOADgot, but use a
10388 //   special stub slot instead of a GOT slot.
10389 //   Load a signed pointer for symbol 'sym' from a stub slot named
10390 //   'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10391 //   resolving. This usually lowers to adrp+ldr, but also emits an entry into
10392 //   .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10393 //
10394 // All 3 are pseudos that are expanded late to longer sequences: this lets us
10395 // provide integrity guarantees on the to-be-signed intermediate values.
10396 //
10397 // LOADauthptrstatic is undesirable because it requires a large section filled
10398 // with often similarly-signed pointers, making it a good harvesting target.
10399 // Thus, it's only used for ptrauth references to extern_weak to avoid null
10400 // checks.
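//
// For illustration, with a constant discriminator and the IA key, MOVaddrPAC
// for a locally-defined symbol expands to roughly:
//   adrp x16, sym
//   add  x16, x16, :lo12:sym
//   mov  x17, #discriminator
//   pacia x16, x17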
10401 
10402 static SDValue LowerPtrAuthGlobalAddressStatically(
10403     SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10404     SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10405   const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10406   assert(TGN->getGlobal()->hasExternalWeakLinkage());
10407 
10408   // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10409   // offset alone as a pointer if the symbol wasn't available, which would
10410   // probably break null checks in users. Ptrauth complicates things further:
10411   // error out.
10412   if (TGN->getOffset() != 0)
10413     report_fatal_error(
10414         "unsupported non-zero offset in weak ptrauth global reference");
10415 
10416   if (!isNullConstant(AddrDiscriminator))
10417     report_fatal_error("unsupported weak addr-div ptrauth global");
10418 
10419   SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10420   return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10421                                     {TGA, Key, Discriminator}),
10422                  0);
10423 }
10424 
10425 SDValue
10426 AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10427                                                  SelectionDAG &DAG) const {
10428   SDValue Ptr = Op.getOperand(0);
10429   uint64_t KeyC = Op.getConstantOperandVal(1);
10430   SDValue AddrDiscriminator = Op.getOperand(2);
10431   uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10432   EVT VT = Op.getValueType();
10433   SDLoc DL(Op);
10434 
10435   if (KeyC > AArch64PACKey::LAST)
10436     report_fatal_error("key in ptrauth global out of range [0, " +
10437                        Twine((int)AArch64PACKey::LAST) + "]");
10438 
10439   // Blend only works if the integer discriminator is 16-bit wide.
10440   if (!isUInt<16>(DiscriminatorC))
10441     report_fatal_error(
10442         "constant discriminator in ptrauth global out of range [0, 0xffff]");
10443 
10444   // Choosing between 3 lowering alternatives is target-specific.
10445   if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10446     report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10447 
10448   int64_t PtrOffsetC = 0;
10449   if (Ptr.getOpcode() == ISD::ADD) {
10450     PtrOffsetC = Ptr.getConstantOperandVal(1);
10451     Ptr = Ptr.getOperand(0);
10452   }
10453   const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10454   const GlobalValue *PtrGV = PtrN->getGlobal();
10455 
10456   // Classify the reference to determine whether it needs a GOT load.
10457   const unsigned OpFlags =
10458       Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10459   const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10460   assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10461          "unsupported non-GOT op flags on ptrauth global reference");
10462 
10463   // Fold any offset into the GV; our pseudos expect it there.
10464   PtrOffsetC += PtrN->getOffset();
10465   SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10466                                             /*TargetFlags=*/0);
10467   assert(PtrN->getTargetFlags() == 0 &&
10468          "unsupported target flags on ptrauth global");
10469 
10470   SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10471   SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10472   SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10473                                    ? AddrDiscriminator
10474                                    : DAG.getRegister(AArch64::XZR, MVT::i64);
10475 
10476   // No GOT load needed -> MOVaddrPAC
10477   if (!NeedsGOTLoad) {
10478     assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10479     return SDValue(
10480         DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10481                            {TPtr, Key, TAddrDiscriminator, Discriminator}),
10482         0);
10483   }
10484 
10485   // GOT load -> LOADgotPAC
10486   // Note that we disallow extern_weak refs to avoid null checks later.
10487   if (!PtrGV->hasExternalWeakLinkage())
10488     return SDValue(
10489         DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10490                            {TPtr, Key, TAddrDiscriminator, Discriminator}),
10491         0);
10492 
10493   // extern_weak ref -> LOADauthptrstatic
10494   return LowerPtrAuthGlobalAddressStatically(
10495       TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10496       DAG);
10497 }
10498 
10499 // Looks through \p Val to determine the bit that can be used to
10500 // check the sign of the value. It returns the unextended value and
10501 // the sign bit position.
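// For example, (sign_extend_inreg x, i8) yields {x, 7}.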
10502 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10503   if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10504     return {Val.getOperand(0),
10505             cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10506                 1};
10507 
10508   if (Val.getOpcode() == ISD::SIGN_EXTEND)
10509     return {Val.getOperand(0),
10510             Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10511 
10512   return {Val, Val.getValueSizeInBits() - 1};
10513 }
10514 
10515 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10516   SDValue Chain = Op.getOperand(0);
10517   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10518   SDValue LHS = Op.getOperand(2);
10519   SDValue RHS = Op.getOperand(3);
10520   SDValue Dest = Op.getOperand(4);
10521   SDLoc dl(Op);
10522 
10523   MachineFunction &MF = DAG.getMachineFunction();
10524   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10525   // will not be produced, as they are conditional branch instructions that do
10526   // not set flags.
10527   bool ProduceNonFlagSettingCondBr =
10528       !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10529 
10530   // Handle f128 first, since lowering it will result in comparing the return
10531   // value of a libcall against zero, which is just what the rest of LowerBR_CC
10532   // is expecting to deal with.
10533   if (LHS.getValueType() == MVT::f128) {
10534     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10535 
10536     // If softenSetCCOperands returned a scalar, we need to compare the result
10537     // against zero to select between true and false values.
10538     if (!RHS.getNode()) {
10539       RHS = DAG.getConstant(0, dl, LHS.getValueType());
10540       CC = ISD::SETNE;
10541     }
10542   }
10543 
10544   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10545   // instruction.
10546   if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10547       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10548     // Only lower legal XALUO ops.
10549     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10550       return SDValue();
10551 
10552     // The actual operation with overflow check.
10553     AArch64CC::CondCode OFCC;
10554     SDValue Value, Overflow;
10555     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10556 
10557     if (CC == ISD::SETNE)
10558       OFCC = getInvertedCondCode(OFCC);
10559     SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
10560 
10561     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10562                        Overflow);
10563   }
10564 
10565   if (LHS.getValueType().isInteger()) {
10566     assert((LHS.getValueType() == RHS.getValueType()) &&
10567            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10568 
10569     // If the RHS of the comparison is zero, we can potentially fold this
10570     // to a specialized branch.
10571     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10572     if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10573       if (CC == ISD::SETEQ) {
10574         // See if we can use a TBZ to fold in an AND as well.
10575         // TBZ has a smaller branch displacement than CBZ.  If the offset is
10576         // out of bounds, a late MI-layer pass rewrites branches.
10577         // 403.gcc is an example that hits this case.
10578         if (LHS.getOpcode() == ISD::AND &&
10579             isa<ConstantSDNode>(LHS.getOperand(1)) &&
10580             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10581           SDValue Test = LHS.getOperand(0);
10582           uint64_t Mask = LHS.getConstantOperandVal(1);
10583           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
10584                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10585                              Dest);
10586         }
10587 
10588         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
10589       } else if (CC == ISD::SETNE) {
10590         // See if we can use a TBZ to fold in an AND as well.
10591         // TBZ has a smaller branch displacement than CBZ.  If the offset is
10592         // out of bounds, a late MI-layer pass rewrites branches.
10593         // 403.gcc is an example that hits this case.
10594         if (LHS.getOpcode() == ISD::AND &&
10595             isa<ConstantSDNode>(LHS.getOperand(1)) &&
10596             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10597           SDValue Test = LHS.getOperand(0);
10598           uint64_t Mask = LHS.getConstantOperandVal(1);
10599           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
10600                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10601                              Dest);
10602         }
10603 
10604         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
10605       } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10606         // Don't combine AND since emitComparison converts the AND to an ANDS
10607         // (a.k.a. TST) and the test in the test bit and branch instruction
10608         // becomes redundant.  This would also increase register pressure.
10609         uint64_t SignBitPos;
10610         std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10611         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
10612                            DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10613       }
10614     }
10615     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10616         LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10617       // Don't combine AND since emitComparison converts the AND to an ANDS
10618       // (a.k.a. TST) and the test in the test bit and branch instruction
10619       // becomes redundant.  This would also increase register pressure.
10620       uint64_t SignBitPos;
10621       std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10622       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
10623                          DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10624     }
10625 
10626     SDValue CCVal;
10627     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10628     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10629                        Cmp);
10630   }
10631 
10632   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10633          LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10634 
10635   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10636   // clean.  Some of them require two branches to implement.
10637   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10638   AArch64CC::CondCode CC1, CC2;
10639   changeFPCCToAArch64CC(CC, CC1, CC2);
10640   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10641   SDValue BR1 =
10642       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
10643   if (CC2 != AArch64CC::AL) {
10644     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10645     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
10646                        Cmp);
10647   }
10648 
10649   return BR1;
10650 }
10651 
10652 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10653                                               SelectionDAG &DAG) const {
10654   if (!Subtarget->isNeonAvailable() &&
10655       !Subtarget->useSVEForFixedLengthVectors())
10656     return SDValue();
10657 
10658   EVT VT = Op.getValueType();
10659   EVT IntVT = VT.changeTypeToInteger();
10660   SDLoc DL(Op);
10661 
10662   SDValue In1 = Op.getOperand(0);
10663   SDValue In2 = Op.getOperand(1);
10664   EVT SrcVT = In2.getValueType();
10665 
10666   if (!SrcVT.bitsEq(VT))
10667     In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10668 
10669   if (VT.isScalableVector())
10670     IntVT =
10671         getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10672 
10673   if (VT.isFixedLengthVector() &&
10674       useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10675     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10676 
10677     In1 = convertToScalableVector(DAG, ContainerVT, In1);
10678     In2 = convertToScalableVector(DAG, ContainerVT, In2);
10679 
10680     SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10681     return convertFromScalableVector(DAG, VT, Res);
10682   }
10683 
10684   auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10685     if (VT.isScalableVector())
10686       return getSVESafeBitCast(VT, Op, DAG);
10687 
10688     return DAG.getBitcast(VT, Op);
10689   };
10690 
10691   SDValue VecVal1, VecVal2;
10692   EVT VecVT;
10693   auto SetVecVal = [&](int Idx = -1) {
10694     if (!VT.isVector()) {
10695       VecVal1 =
10696           DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10697       VecVal2 =
10698           DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10699     } else {
10700       VecVal1 = BitCast(VecVT, In1, DAG);
10701       VecVal2 = BitCast(VecVT, In2, DAG);
10702     }
10703   };
10704   if (VT.isVector()) {
10705     VecVT = IntVT;
10706     SetVecVal();
10707   } else if (VT == MVT::f64) {
10708     VecVT = MVT::v2i64;
10709     SetVecVal(AArch64::dsub);
10710   } else if (VT == MVT::f32) {
10711     VecVT = MVT::v4i32;
10712     SetVecVal(AArch64::ssub);
10713   } else if (VT == MVT::f16 || VT == MVT::bf16) {
10714     VecVT = MVT::v8i16;
10715     SetVecVal(AArch64::hsub);
10716   } else {
10717     llvm_unreachable("Invalid type for copysign!");
10718   }
10719 
10720   unsigned BitWidth = In1.getScalarValueSizeInBits();
10721   SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10722 
10723   // We want to materialize a mask with every bit but the high bit set, but the
10724   // AdvSIMD immediate moves cannot materialize that in a single instruction for
10725   // 64-bit elements. Instead, materialize all bits set and then negate that.
10726   if (VT == MVT::f64 || VT == MVT::v2f64) {
10727     SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10728     SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10729     SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10730     SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10731   }
10732 
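  // BSP picks bits from VecVal1 where SignMaskV is set and from VecVal2
  // elsewhere, i.e. the magnitude of In1 combined with the sign bit of In2.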
10733   SDValue BSP =
10734       DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10735   if (VT == MVT::f16 || VT == MVT::bf16)
10736     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10737   if (VT == MVT::f32)
10738     return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10739   if (VT == MVT::f64)
10740     return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10741 
10742   return BitCast(VT, BSP, DAG);
10743 }
10744 
10745 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10746                                                  SelectionDAG &DAG) const {
10747   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10748           Attribute::NoImplicitFloat))
10749     return SDValue();
10750 
10751   EVT VT = Op.getValueType();
10752   if (VT.isScalableVector() ||
10753       useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
10754     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10755 
10756   if (!Subtarget->isNeonAvailable())
10757     return SDValue();
10758 
10759   bool IsParity = Op.getOpcode() == ISD::PARITY;
10760   SDValue Val = Op.getOperand(0);
10761   SDLoc DL(Op);
10762 
10763   // For i32, a general parity computation using EORs is more efficient than
10764   // going through the floating-point/SIMD registers.
10765   if (VT == MVT::i32 && IsParity)
10766     return SDValue();
10767 
10768   // If there is no CNT instruction available, GPR popcount can
10769   // be more efficiently lowered to the following sequence that uses
10770   // AdvSIMD registers/instructions as long as the copies to/from
10771   // the AdvSIMD registers are cheap.
10772   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
10773   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
10774   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
10775   //  FMOV    X0, D0        // copy result back to integer reg
10776   if (VT == MVT::i32 || VT == MVT::i64) {
10777     if (VT == MVT::i32)
10778       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10779     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10780 
10781     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10782     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10783     if (VT == MVT::i32)
10784       AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10785                          DAG.getConstant(0, DL, MVT::i64));
10786     AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10787     if (IsParity)
10788       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10789     return AddV;
10790   } else if (VT == MVT::i128) {
10791     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10792 
10793     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10794     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10795     AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10796     if (IsParity)
10797       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10798     return AddV;
10799   }
10800 
10801   assert(!IsParity && "ISD::PARITY of vector types not supported");
10802 
10803   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10804           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10805          "Unexpected type for custom ctpop lowering");
10806 
10807   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10808   Val = DAG.getBitcast(VT8Bit, Val);
10809   Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10810 
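  // With dot-product instructions, sum the per-byte counts by taking the
  // unsigned dot product against an all-ones vector, accumulating into 32-bit
  // lanes (then pairwise-widening to 64-bit lanes for v2i64).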
10811   if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10812       VT.getVectorNumElements() >= 2) {
10813     EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10814     SDValue Zeros = DAG.getConstant(0, DL, DT);
10815     SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10816 
10817     if (VT == MVT::v2i64) {
10818       Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10819       Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10820     } else if (VT == MVT::v2i32) {
10821       Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10822     } else if (VT == MVT::v4i32) {
10823       Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10824     } else {
10825       llvm_unreachable("Unexpected type for custom ctpop lowering");
10826     }
10827 
10828     return Val;
10829   }
10830 
10831   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10832   unsigned EltSize = 8;
10833   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10834   while (EltSize != VT.getScalarSizeInBits()) {
10835     EltSize *= 2;
10836     NumElts /= 2;
10837     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10838     Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10839   }
10840 
10841   return Val;
10842 }
10843 
10844 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10845   EVT VT = Op.getValueType();
10846   assert(VT.isScalableVector() ||
10847          useSVEForFixedLengthVectorVT(
10848              VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10849 
10850   SDLoc DL(Op);
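  // cttz(x) == ctlz(bitreverse(x)), so reverse the bits and count the leading
  // zeros instead.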
10851   SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10852   return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10853 }
10854 
10855 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10856                                            SelectionDAG &DAG) const {
10857 
10858   EVT VT = Op.getValueType();
10859   SDLoc DL(Op);
10860   unsigned Opcode = Op.getOpcode();
10861   ISD::CondCode CC;
10862   switch (Opcode) {
10863   default:
10864     llvm_unreachable("Wrong instruction");
10865   case ISD::SMAX:
10866     CC = ISD::SETGT;
10867     break;
10868   case ISD::SMIN:
10869     CC = ISD::SETLT;
10870     break;
10871   case ISD::UMAX:
10872     CC = ISD::SETUGT;
10873     break;
10874   case ISD::UMIN:
10875     CC = ISD::SETULT;
10876     break;
10877   }
10878 
10879   if (VT.isScalableVector() ||
10880       useSVEForFixedLengthVectorVT(
10881           VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10882     switch (Opcode) {
10883     default:
10884       llvm_unreachable("Wrong instruction");
10885     case ISD::SMAX:
10886       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10887     case ISD::SMIN:
10888       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10889     case ISD::UMAX:
10890       return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10891     case ISD::UMIN:
10892       return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10893     }
10894   }
10895 
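  // Otherwise lower to a comparison followed by a select on its result.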
10896   SDValue Op0 = Op.getOperand(0);
10897   SDValue Op1 = Op.getOperand(1);
10898   SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10899   return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10900 }
10901 
10902 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10903                                                SelectionDAG &DAG) const {
10904   EVT VT = Op.getValueType();
10905 
10906   if (VT.isScalableVector() ||
10907       useSVEForFixedLengthVectorVT(
10908           VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10909     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10910 
10911   SDLoc DL(Op);
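  // For NEON, reverse the bytes within each element (REV32/REV64), then
  // bit-reverse each byte with a v8i8/v16i8 ISD::BITREVERSE (RBIT); together
  // this gives a full per-element bit reversal.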
10912   SDValue REVB;
10913   MVT VST;
10914 
10915   switch (VT.getSimpleVT().SimpleTy) {
10916   default:
10917     llvm_unreachable("Invalid type for bitreverse!");
10918 
10919   case MVT::v2i32: {
10920     VST = MVT::v8i8;
10921     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10922 
10923     break;
10924   }
10925 
10926   case MVT::v4i32: {
10927     VST = MVT::v16i8;
10928     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10929 
10930     break;
10931   }
10932 
10933   case MVT::v1i64: {
10934     VST = MVT::v8i8;
10935     REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10936 
10937     break;
10938   }
10939 
10940   case MVT::v2i64: {
10941     VST = MVT::v16i8;
10942     REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10943 
10944     break;
10945   }
10946   }
10947 
10948   return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10949                      DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10950 }
10951 
10952 // Check whether N forms a continuous comparison sequence (ORs of XORs).
10953 static bool
10954 isOrXorChain(SDValue N, unsigned &Num,
10955              SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10956   if (Num == MaxXors)
10957     return false;
10958 
10959   // Skip the one-use zext
10960   if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10961     N = N->getOperand(0);
10962 
10963   // The leaf node must be XOR
10964   if (N->getOpcode() == ISD::XOR) {
10965     WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10966     Num++;
10967     return true;
10968   }
10969 
10970   // All the non-leaf nodes must be OR.
10971   if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10972     return false;
10973 
10974   if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10975       isOrXorChain(N->getOperand(1), Num, WorkList))
10976     return true;
10977   return false;
10978 }
10979 
10980 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp.
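// For example, (setcc eq (or (xor a, b), (xor c, d)), 0) becomes a pair of
// equality compares combined with AND, which later folds into a cmp/ccmp
// chain.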
10981 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10982   SDValue LHS = N->getOperand(0);
10983   SDValue RHS = N->getOperand(1);
10984   SDLoc DL(N);
10985   EVT VT = N->getValueType(0);
10986   SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10987 
10988   // Only handle integer compares.
10989   if (N->getOpcode() != ISD::SETCC)
10990     return SDValue();
10991 
10992   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10993   // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10994   // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10995   unsigned NumXors = 0;
10996   if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10997       LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10998       isOrXorChain(LHS, NumXors, WorkList)) {
10999     SDValue XOR0, XOR1;
11000     std::tie(XOR0, XOR1) = WorkList[0];
11001     unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11002     SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11003     for (unsigned I = 1; I < WorkList.size(); I++) {
11004       std::tie(XOR0, XOR1) = WorkList[I];
11005       SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11006       Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11007     }
11008 
11009     // Exit early by inverting the condition, which helps reduce indentation.
11010     return Cmp;
11011   }
11012 
11013   return SDValue();
11014 }
11015 
11016 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11017 
11018   if (Op.getValueType().isVector())
11019     return LowerVSETCC(Op, DAG);
11020 
11021   bool IsStrict = Op->isStrictFPOpcode();
11022   bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11023   unsigned OpNo = IsStrict ? 1 : 0;
11024   SDValue Chain;
11025   if (IsStrict)
11026     Chain = Op.getOperand(0);
11027   SDValue LHS = Op.getOperand(OpNo + 0);
11028   SDValue RHS = Op.getOperand(OpNo + 1);
11029   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11030   SDLoc dl(Op);
11031 
11032   // We chose ZeroOrOneBooleanContents, so use zero and one.
11033   EVT VT = Op.getValueType();
11034   SDValue TVal = DAG.getConstant(1, dl, VT);
11035   SDValue FVal = DAG.getConstant(0, dl, VT);
11036 
11037   // Handle f128 first, since one possible outcome is a normal integer
11038   // comparison which gets picked up by the next if statement.
11039   if (LHS.getValueType() == MVT::f128) {
11040     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
11041                         IsSignaling);
11042 
11043     // If softenSetCCOperands returned a scalar, use it.
11044     if (!RHS.getNode()) {
11045       assert(LHS.getValueType() == Op.getValueType() &&
11046              "Unexpected setcc expansion!");
11047       return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
11048     }
11049   }
11050 
11051   if (LHS.getValueType().isInteger()) {
11052 
11053     simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
11054 
11055     SDValue CCVal;
11056     SDValue Cmp = getAArch64Cmp(
11057         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
11058 
11059     // Note that we inverted the condition above, so we reverse the order of
11060     // the true and false operands here.  This will allow the setcc to be
11061     // matched to a single CSINC instruction.
11062     SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
11063     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
11064   }
11065 
11066   // Now we know we're dealing with FP values.
11067   assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11068          LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11069 
11070   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
11071   // and do the comparison.
11072   SDValue Cmp;
11073   if (IsStrict)
11074     Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
11075   else
11076     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11077 
11078   AArch64CC::CondCode CC1, CC2;
11079   changeFPCCToAArch64CC(CC, CC1, CC2);
11080   SDValue Res;
11081   if (CC2 == AArch64CC::AL) {
11082     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11083                           CC2);
11084     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11085 
11086     // Note that we inverted the condition above, so we reverse the order of
11087     // the true and false operands here.  This will allow the setcc to be
11088     // matched to a single CSINC instruction.
11089     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
11090   } else {
11091     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11092     // totally clean.  Some of them require two CSELs to implement.  As is in
11093     // this case, we emit the first CSEL and then emit a second using the output
11094     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
11095 
11096     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11097     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11098     SDValue CS1 =
11099         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11100 
11101     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11102     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11103   }
11104   return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
11105 }
11106 
11107 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11108                                                SelectionDAG &DAG) const {
11109 
11110   SDValue LHS = Op.getOperand(0);
11111   SDValue RHS = Op.getOperand(1);
11112   EVT VT = LHS.getValueType();
11113   if (VT != MVT::i32 && VT != MVT::i64)
11114     return SDValue();
11115 
11116   SDLoc DL(Op);
11117   SDValue Carry = Op.getOperand(2);
11118   // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11119   SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11120   SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
11121                             LHS, RHS, InvCarry);
11122 
11123   EVT OpVT = Op.getValueType();
11124   SDValue TVal = DAG.getConstant(1, DL, OpVT);
11125   SDValue FVal = DAG.getConstant(0, DL, OpVT);
11126 
11127   ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11128   ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11129   SDValue CCVal =
11130       DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
11131   // Inputs are swapped because the condition is inverted. This will allow
11132   // matching with a single CSINC instruction.
11133   return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11134                      Cmp.getValue(1));
11135 }
11136 
11137 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
11138                                               SDValue RHS, SDValue TVal,
11139                                               SDValue FVal, const SDLoc &dl,
11140                                               SelectionDAG &DAG) const {
11141   // Handle f128 first, because it will result in a comparison of some RTLIB
11142   // call result against zero.
11143   if (LHS.getValueType() == MVT::f128) {
11144     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
11145 
11146     // If softenSetCCOperands returned a scalar, we need to compare the result
11147     // against zero to select between true and false values.
11148     if (!RHS.getNode()) {
11149       RHS = DAG.getConstant(0, dl, LHS.getValueType());
11150       CC = ISD::SETNE;
11151     }
11152   }
11153 
11154   // Also handle f16, for which we need to do a f32 comparison.
11155   if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11156       LHS.getValueType() == MVT::bf16) {
11157     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
11158     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
11159   }
11160 
11161   // Next, handle integers.
11162   if (LHS.getValueType().isInteger()) {
11163     assert((LHS.getValueType() == RHS.getValueType()) &&
11164            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11165 
11166     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11167     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11168     ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11169     // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11170     // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
11171     // supported types.
11172     if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11173         CTVal->isOne() && CFVal->isAllOnes() &&
11174         LHS.getValueType() == TVal.getValueType()) {
11175       EVT VT = LHS.getValueType();
11176       SDValue Shift =
11177           DAG.getNode(ISD::SRA, dl, VT, LHS,
11178                       DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11179       return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
11180     }
11181 
11182     // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11183     // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11184     // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11185     // Both require fewer instructions than a compare and conditional select.
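          // For example, for i32 SMIN(x, 0) becomes roughly "asr wT, wX, #31"
          // plus "and wD, wX, wT", while SMAX(x, 0) uses "bic wD, wX, wT"
          // instead (register names are illustrative).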
11186     if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11187         RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11188         LHS.getValueType() == RHS.getValueType()) {
11189       EVT VT = LHS.getValueType();
11190       SDValue Shift =
11191           DAG.getNode(ISD::SRA, dl, VT, LHS,
11192                       DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11193 
11194       if (CC == ISD::SETGT)
11195         Shift = DAG.getNOT(dl, Shift, VT);
11196 
11197       return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
11198     }
11199 
11200     unsigned Opcode = AArch64ISD::CSEL;
11201 
11202     // If both the TVal and the FVal are constants, see if we can swap them in
11203     // order to form a CSINV or CSINC out of them.
11204     if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11205       std::swap(TVal, FVal);
11206       std::swap(CTVal, CFVal);
11207       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11208     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11209       std::swap(TVal, FVal);
11210       std::swap(CTVal, CFVal);
11211       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11212     } else if (TVal.getOpcode() == ISD::XOR) {
11213       // If TVal is a NOT we want to swap TVal and FVal so that we can match
11214       // with a CSINV rather than a CSEL.
11215       if (isAllOnesConstant(TVal.getOperand(1))) {
11216         std::swap(TVal, FVal);
11217         std::swap(CTVal, CFVal);
11218         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11219       }
11220     } else if (TVal.getOpcode() == ISD::SUB) {
11221       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11222       // that we can match with a CSNEG rather than a CSEL.
11223       if (isNullConstant(TVal.getOperand(0))) {
11224         std::swap(TVal, FVal);
11225         std::swap(CTVal, CFVal);
11226         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11227       }
11228     } else if (CTVal && CFVal) {
11229       const int64_t TrueVal = CTVal->getSExtValue();
11230       const int64_t FalseVal = CFVal->getSExtValue();
11231       bool Swap = false;
11232 
11233       // If both TVal and FVal are constants, see if FVal is the
11234       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11235       // instead of a CSEL in that case.
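            // For example (with illustrative constants), a (TVal, FVal) pair
            // of (7, -8) selects CSINV since 7 == ~(-8), (7, -7) selects
            // CSNEG, and (7, 6) or (6, 7) selects CSINC; the checks below also
            // handle the required operand swap and 32-bit wraparound.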
11236       if (TrueVal == ~FalseVal) {
11237         Opcode = AArch64ISD::CSINV;
11238       } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11239                  TrueVal == -FalseVal) {
11240         Opcode = AArch64ISD::CSNEG;
11241       } else if (TVal.getValueType() == MVT::i32) {
11242         // If our operands are only 32-bit wide, make sure we use 32-bit
11243         // arithmetic for the check whether we can use CSINC. This ensures that
11244         // the addition in the check will wrap around properly in case there is
11245         // an overflow (which would not be the case if we do the check with
11246         // 64-bit arithmetic).
11247         const uint32_t TrueVal32 = CTVal->getZExtValue();
11248         const uint32_t FalseVal32 = CFVal->getZExtValue();
11249 
11250         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11251           Opcode = AArch64ISD::CSINC;
11252 
11253           if (TrueVal32 > FalseVal32) {
11254             Swap = true;
11255           }
11256         }
11257       } else {
11258         // 64-bit check whether we can use CSINC.
11259         const uint64_t TrueVal64 = TrueVal;
11260         const uint64_t FalseVal64 = FalseVal;
11261 
11262         if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11263           Opcode = AArch64ISD::CSINC;
11264 
11265           if (TrueVal > FalseVal) {
11266             Swap = true;
11267           }
11268         }
11269       }
11270 
11271       // Swap TVal and FVal if necessary.
11272       if (Swap) {
11273         std::swap(TVal, FVal);
11274         std::swap(CTVal, CFVal);
11275         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11276       }
11277 
11278       if (Opcode != AArch64ISD::CSEL) {
11279         // Drop FVal since we can get its value by simply inverting/negating
11280         // TVal.
11281         FVal = TVal;
11282       }
11283     }
11284 
11285     // Avoid materializing a constant when possible by reusing a known value in
11286     // a register.  However, don't perform this optimization if the known value
11287     // is one, zero or negative one in the case of a CSEL.  We can always
11288     // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11289     // FVal, respectively.
11290     ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11291     if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11292         !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11293       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11294       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11295       // "a != C ? x : a" to avoid materializing C.
11296       if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11297         TVal = LHS;
11298       else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11299         FVal = LHS;
11300     } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11301       assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
11302       // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11303       // avoid materializing C.
11304       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11305       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11306         Opcode = AArch64ISD::CSINV;
11307         TVal = LHS;
11308         FVal = DAG.getConstant(0, dl, FVal.getValueType());
11309       }
11310     }
11311 
11312     SDValue CCVal;
11313     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
11314     EVT VT = TVal.getValueType();
11315     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
11316   }
11317 
11318   // Now we know we're dealing with FP values.
11319   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11320          LHS.getValueType() == MVT::f64);
11321   assert(LHS.getValueType() == RHS.getValueType());
11322   EVT VT = TVal.getValueType();
11323   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11324 
11325   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11326   // clean.  Some of them require two CSELs to implement.
11327   AArch64CC::CondCode CC1, CC2;
11328   changeFPCCToAArch64CC(CC, CC1, CC2);
11329 
11330   if (DAG.getTarget().Options.UnsafeFPMath) {
11331     // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11332     // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11333     ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11334     if (RHSVal && RHSVal->isZero()) {
11335       ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11336       ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11337 
11338       if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11339           CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11340         TVal = LHS;
11341       else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11342                CFVal && CFVal->isZero() &&
11343                FVal.getValueType() == LHS.getValueType())
11344         FVal = LHS;
11345     }
11346   }
11347 
11348   // Emit first, and possibly only, CSEL.
11349   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11350   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11351 
11352   // If we need a second CSEL, emit it, using the output of the first as the
11353   // RHS.  We're effectively OR'ing the two CC's together.
11354   if (CC2 != AArch64CC::AL) {
11355     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11356     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11357   }
11358 
11359   // Otherwise, return the output of the first CSEL.
11360   return CS1;
11361 }
11362 
11363 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11364                                                   SelectionDAG &DAG) const {
11365   EVT Ty = Op.getValueType();
11366   auto Idx = Op.getConstantOperandAPInt(2);
11367   int64_t IdxVal = Idx.getSExtValue();
11368   assert(Ty.isScalableVector() &&
11369          "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11370 
11371   // We can use the splice instruction for certain index values where we are
11372   // able to efficiently generate the correct predicate. The index will be
11373   // inverted and used directly as the input to the ptrue instruction, i.e.
11374   // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11375   // splice predicate. However, we can only do this if we can guarantee that
11376   // there are enough elements in the vector, hence we check the index <= min
11377   // number of elements.
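        // For example, a splice with index -2 uses a ptrue with pattern vl2
        // that is then reversed, so only the last two elements of the first
        // operand are active ahead of the elements of the second operand.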
11378   std::optional<unsigned> PredPattern;
11379   if (Ty.isScalableVector() && IdxVal < 0 &&
11380       (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11381           std::nullopt) {
11382     SDLoc DL(Op);
11383 
11384     // Create a predicate where all but the last -IdxVal elements are false.
11385     EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11386     SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11387     Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11388 
11389     // Now splice the two inputs together using the predicate.
11390     return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11391                        Op.getOperand(1));
11392   }
11393 
11394   // We can select to an EXT instruction when indexing the first 256 bytes.
11395   unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
11396   if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11397     return Op;
11398 
11399   return SDValue();
11400 }
11401 
11402 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11403                                               SelectionDAG &DAG) const {
11404   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11405   SDValue LHS = Op.getOperand(0);
11406   SDValue RHS = Op.getOperand(1);
11407   SDValue TVal = Op.getOperand(2);
11408   SDValue FVal = Op.getOperand(3);
11409   SDLoc DL(Op);
11410   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11411 }
11412 
11413 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11414                                            SelectionDAG &DAG) const {
11415   SDValue CCVal = Op->getOperand(0);
11416   SDValue TVal = Op->getOperand(1);
11417   SDValue FVal = Op->getOperand(2);
11418   SDLoc DL(Op);
11419 
11420   EVT Ty = Op.getValueType();
11421   if (Ty == MVT::aarch64svcount) {
11422     TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11423     FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11424     SDValue Sel =
11425         DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11426     return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11427   }
11428 
11429   if (Ty.isScalableVector()) {
11430     MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11431     SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11432     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11433   }
11434 
11435   if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11436     // FIXME: Ideally this would be the same as above using i1 types; however,
11437     // for the moment we can't deal with fixed i1 vector types properly, so
11438     // instead extend the predicate to a result-type-sized integer vector.
11439     MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11440     MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11441     SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11442     SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11443     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11444   }
11445 
11446   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11447   // instruction.
11448   if (ISD::isOverflowIntrOpRes(CCVal)) {
11449     // Only lower legal XALUO ops.
11450     if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11451       return SDValue();
11452 
11453     AArch64CC::CondCode OFCC;
11454     SDValue Value, Overflow;
11455     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11456     SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
11457 
11458     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11459                        CCVal, Overflow);
11460   }
11461 
11462   // Lower it the same way as we would lower a SELECT_CC node.
11463   ISD::CondCode CC;
11464   SDValue LHS, RHS;
11465   if (CCVal.getOpcode() == ISD::SETCC) {
11466     LHS = CCVal.getOperand(0);
11467     RHS = CCVal.getOperand(1);
11468     CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11469   } else {
11470     LHS = CCVal;
11471     RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11472     CC = ISD::SETNE;
11473   }
11474 
11475   // If we are lowering an f16 or bf16 and do not have full FP16 support,
11476   // convert to an f32 in order to use FCSELSrrr.
11477   if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11478     TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11479                                      DAG.getUNDEF(MVT::f32), TVal);
11480     FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11481                                      DAG.getUNDEF(MVT::f32), FVal);
11482   }
11483 
11484   SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11485 
11486   if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11487     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11488   }
11489 
11490   return Res;
11491 }
11492 
11493 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11494                                               SelectionDAG &DAG) const {
11495   // Jump table entries are emitted as PC-relative offsets. No additional
11496   // tweaking is necessary here; just get the address of the jump table.
11497   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11498 
11499   CodeModel::Model CM = getTargetMachine().getCodeModel();
11500   if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
11501       !Subtarget->isTargetMachO())
11502     return getAddrLarge(JT, DAG);
11503   if (CM == CodeModel::Tiny)
11504     return getAddrTiny(JT, DAG);
11505   return getAddr(JT, DAG);
11506 }
11507 
11508 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11509                                           SelectionDAG &DAG) const {
11510   // Jump table entries are emitted as PC-relative offsets. No additional
11511   // tweaking is necessary here; just get the address of the jump table.
11512   SDLoc DL(Op);
11513   SDValue JT = Op.getOperand(1);
11514   SDValue Entry = Op.getOperand(2);
11515   int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11516 
11517   auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11518   AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11519 
11520   // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11521   // sequence later, to guarantee the integrity of the intermediate values.
11522   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11523           "aarch64-jump-table-hardening")) {
11524     CodeModel::Model CM = getTargetMachine().getCodeModel();
11525     if (Subtarget->isTargetMachO()) {
11526       if (CM != CodeModel::Small && CM != CodeModel::Large)
11527         report_fatal_error("Unsupported code-model for hardened jump-table");
11528     } else {
11529       // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11530       assert(Subtarget->isTargetELF() &&
11531              "jump table hardening only supported on MachO/ELF");
11532       if (CM != CodeModel::Small)
11533         report_fatal_error("Unsupported code-model for hardened jump-table");
11534     }
11535 
11536     SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11537                                        Entry, SDValue());
11538     SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11539                                    DAG.getTargetJumpTable(JTI, MVT::i32),
11540                                    X16Copy.getValue(0), X16Copy.getValue(1));
11541     return SDValue(B, 0);
11542   }
11543 
11544   SDNode *Dest =
11545       DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11546                          Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11547   SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11548   return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11549 }
11550 
11551 SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11552   SDValue Chain = Op.getOperand(0);
11553   SDValue Dest = Op.getOperand(1);
11554 
11555   // BR_JT is lowered to BRIND, but the lowering here is specific to indirectbr.
11556   // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11557   if (Dest->isMachineOpcode() &&
11558       Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11559     return SDValue();
11560 
11561   const MachineFunction &MF = DAG.getMachineFunction();
11562   std::optional<uint16_t> BADisc =
11563       Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
11564   if (!BADisc)
11565     return SDValue();
11566 
11567   SDLoc DL(Op);
11568 
11569   SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11570   SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11571   SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11572 
11573   SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11574                                    {Dest, Key, Disc, AddrDisc, Chain});
11575   return SDValue(BrA, 0);
11576 }
11577 
11578 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11579                                                  SelectionDAG &DAG) const {
11580   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11581   CodeModel::Model CM = getTargetMachine().getCodeModel();
11582   if (CM == CodeModel::Large) {
11583     // Use the GOT for the large code model on iOS.
11584     if (Subtarget->isTargetMachO()) {
11585       return getGOT(CP, DAG);
11586     }
11587     if (!getTargetMachine().isPositionIndependent())
11588       return getAddrLarge(CP, DAG);
11589   } else if (CM == CodeModel::Tiny) {
11590     return getAddrTiny(CP, DAG);
11591   }
11592   return getAddr(CP, DAG);
11593 }
11594 
11595 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11596                                                SelectionDAG &DAG) const {
11597   BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11598   const BlockAddress *BA = BAN->getBlockAddress();
11599 
11600   if (std::optional<uint16_t> BADisc =
11601           Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
11602               *BA->getFunction())) {
11603     SDLoc DL(Op);
11604 
11605     // This isn't cheap, but BRIND is rare.
11606     SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11607 
11608     SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11609 
11610     SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11611     SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11612 
11613     SDNode *MOV =
11614         DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11615                            {TargetBA, Key, AddrDisc, Disc});
11616     return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11617                               SDValue(MOV, 1));
11618   }
11619 
11620   CodeModel::Model CM = getTargetMachine().getCodeModel();
11621   if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11622     if (!getTargetMachine().isPositionIndependent())
11623       return getAddrLarge(BAN, DAG);
11624   } else if (CM == CodeModel::Tiny) {
11625     return getAddrTiny(BAN, DAG);
11626   }
11627   return getAddr(BAN, DAG);
11628 }
11629 
11630 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11631                                                  SelectionDAG &DAG) const {
11632   AArch64FunctionInfo *FuncInfo =
11633       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11634 
11635   SDLoc DL(Op);
11636   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11637                                  getPointerTy(DAG.getDataLayout()));
11638   FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11639   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11640   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11641                       MachinePointerInfo(SV));
11642 }
11643 
11644 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11645                                                   SelectionDAG &DAG) const {
11646   MachineFunction &MF = DAG.getMachineFunction();
11647   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11648 
11649   SDLoc DL(Op);
11650   SDValue FR;
11651   if (Subtarget->isWindowsArm64EC()) {
11652     // With the Arm64EC ABI, we compute the address of the varargs save area
11653     // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11654     // but calls from an entry thunk can pass in a different address.
11655     Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11656     SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11657     uint64_t StackOffset;
11658     if (FuncInfo->getVarArgsGPRSize() > 0)
11659       StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11660     else
11661       StackOffset = FuncInfo->getVarArgsStackOffset();
11662     FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11663                      DAG.getConstant(StackOffset, DL, MVT::i64));
11664   } else {
11665     FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11666                                ? FuncInfo->getVarArgsGPRIndex()
11667                                : FuncInfo->getVarArgsStackIndex(),
11668                            getPointerTy(DAG.getDataLayout()));
11669   }
11670   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11671   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11672                       MachinePointerInfo(SV));
11673 }
11674 
11675 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11676                                                   SelectionDAG &DAG) const {
11677   // The layout of the va_list struct is specified in the AArch64 Procedure Call
11678   // Standard, section B.3.
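        // Roughly, the structure being initialized here is:
        //   struct va_list {
        //     void *__stack;   // offset 0
        //     void *__gr_top;  // offset 8  (4 on ILP32)
        //     void *__vr_top;  // offset 16 (8 on ILP32)
        //     int   __gr_offs; // offset 24 (12 on ILP32)
        //     int   __vr_offs; // offset 28 (16 on ILP32)
        //   };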
11679   MachineFunction &MF = DAG.getMachineFunction();
11680   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11681   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11682   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11683   auto PtrVT = getPointerTy(DAG.getDataLayout());
11684   SDLoc DL(Op);
11685 
11686   SDValue Chain = Op.getOperand(0);
11687   SDValue VAList = Op.getOperand(1);
11688   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11689   SmallVector<SDValue, 4> MemOps;
11690 
11691   // void *__stack at offset 0
11692   unsigned Offset = 0;
11693   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
11694   Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
11695   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
11696                                 MachinePointerInfo(SV), Align(PtrSize)));
11697 
11698   // void *__gr_top at offset 8 (4 on ILP32)
11699   Offset += PtrSize;
11700   int GPRSize = FuncInfo->getVarArgsGPRSize();
11701   if (GPRSize > 0) {
11702     SDValue GRTop, GRTopAddr;
11703 
11704     GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11705                             DAG.getConstant(Offset, DL, PtrVT));
11706 
11707     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
11708     GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
11709                         DAG.getSignedConstant(GPRSize, DL, PtrVT));
11710     GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
11711 
11712     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11713                                   MachinePointerInfo(SV, Offset),
11714                                   Align(PtrSize)));
11715   }
11716 
11717   // void *__vr_top at offset 16 (8 on ILP32)
11718   Offset += PtrSize;
11719   int FPRSize = FuncInfo->getVarArgsFPRSize();
11720   if (FPRSize > 0) {
11721     SDValue VRTop, VRTopAddr;
11722     VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11723                             DAG.getConstant(Offset, DL, PtrVT));
11724 
11725     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11726     VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11727                         DAG.getSignedConstant(FPRSize, DL, PtrVT));
11728     VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11729 
11730     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11731                                   MachinePointerInfo(SV, Offset),
11732                                   Align(PtrSize)));
11733   }
11734 
11735   // int __gr_offs at offset 24 (12 on ILP32)
11736   Offset += PtrSize;
11737   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11738                                    DAG.getConstant(Offset, DL, PtrVT));
11739   MemOps.push_back(
11740       DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
11741                    GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11742 
11743   // int __vr_offs at offset 28 (16 on ILP32)
11744   Offset += 4;
11745   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11746                                    DAG.getConstant(Offset, DL, PtrVT));
11747   MemOps.push_back(
11748       DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
11749                    VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11750 
11751   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11752 }
11753 
11754 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11755                                             SelectionDAG &DAG) const {
11756   MachineFunction &MF = DAG.getMachineFunction();
11757   Function &F = MF.getFunction();
11758 
11759   if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11760     return LowerWin64_VASTART(Op, DAG);
11761   else if (Subtarget->isTargetDarwin())
11762     return LowerDarwin_VASTART(Op, DAG);
11763   else
11764     return LowerAAPCS_VASTART(Op, DAG);
11765 }
11766 
11767 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11768                                            SelectionDAG &DAG) const {
11769   // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
11770   // pointer.
11771   SDLoc DL(Op);
11772   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11773   unsigned VaListSize =
11774       (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11775           ? PtrSize
11776           : Subtarget->isTargetILP32() ? 20 : 32;
11777   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11778   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11779 
11780   return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11781                        DAG.getConstant(VaListSize, DL, MVT::i32),
11782                        Align(PtrSize), false, false, /*CI=*/nullptr,
11783                        std::nullopt, MachinePointerInfo(DestSV),
11784                        MachinePointerInfo(SrcSV));
11785 }
11786 
11787 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11788   assert(Subtarget->isTargetDarwin() &&
11789          "automatic va_arg instruction only works on Darwin");
11790 
11791   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11792   EVT VT = Op.getValueType();
11793   SDLoc DL(Op);
11794   SDValue Chain = Op.getOperand(0);
11795   SDValue Addr = Op.getOperand(1);
11796   MaybeAlign Align(Op.getConstantOperandVal(3));
11797   unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11798   auto PtrVT = getPointerTy(DAG.getDataLayout());
11799   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11800   SDValue VAList =
11801       DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11802   Chain = VAList.getValue(1);
11803   VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11804 
11805   if (VT.isScalableVector())
11806     report_fatal_error("Passing SVE types to variadic functions is "
11807                        "currently not supported");
11808 
11809   if (Align && *Align > MinSlotSize) {
11810     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11811                          DAG.getConstant(Align->value() - 1, DL, PtrVT));
11812     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11813                          DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11814   }
11815 
11816   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11817   unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11818 
11819   // Scalar integer and FP values smaller than 64 bits are implicitly extended
11820   // up to 64 bits.  At the very least, we have to increase the striding of the
11821   // vaargs list to match this, and for FP values we need to introduce
11822   // FP_ROUND nodes as well.
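        // For example, a float vararg occupies an 8-byte slot; it is loaded
        // below as an f64 and then rounded back down to f32 with FP_ROUND.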
11823   if (VT.isInteger() && !VT.isVector())
11824     ArgSize = std::max(ArgSize, MinSlotSize);
11825   bool NeedFPTrunc = false;
11826   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11827     ArgSize = 8;
11828     NeedFPTrunc = true;
11829   }
11830 
11831   // Increment the pointer, VAList, to the next vaarg
11832   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11833                                DAG.getConstant(ArgSize, DL, PtrVT));
11834   VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11835 
11836   // Store the incremented VAList to the legalized pointer
11837   SDValue APStore =
11838       DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11839 
11840   // Load the actual argument out of the pointer VAList
11841   if (NeedFPTrunc) {
11842     // Load the value as an f64.
11843     SDValue WideFP =
11844         DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11845     // Round the value down to an f32.
11846     SDValue NarrowFP =
11847         DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11848                     DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11849     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11850     // Merge the rounded value with the chain output of the load.
11851     return DAG.getMergeValues(Ops, DL);
11852   }
11853 
11854   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11855 }
11856 
11857 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11858                                               SelectionDAG &DAG) const {
11859   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11860   MFI.setFrameAddressIsTaken(true);
11861 
11862   EVT VT = Op.getValueType();
11863   SDLoc DL(Op);
11864   unsigned Depth = Op.getConstantOperandVal(0);
11865   SDValue FrameAddr =
11866       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11867   while (Depth--)
11868     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11869                             MachinePointerInfo());
11870 
11871   if (Subtarget->isTargetILP32())
11872     FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11873                             DAG.getValueType(VT));
11874 
11875   return FrameAddr;
11876 }
11877 
11878 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11879                                               SelectionDAG &DAG) const {
11880   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11881 
11882   EVT VT = getPointerTy(DAG.getDataLayout());
11883   SDLoc DL(Op);
11884   int FI = MFI.CreateFixedObject(4, 0, false);
11885   return DAG.getFrameIndex(FI, VT);
11886 }
11887 
11888 #define GET_REGISTER_MATCHER
11889 #include "AArch64GenAsmMatcher.inc"
11890 
11891 // FIXME? Maybe this could be a TableGen attribute on some registers and
11892 // this table could be generated automatically from RegInfo.
11893 Register AArch64TargetLowering::
11894 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11895   Register Reg = MatchRegisterName(RegName);
11896   if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11897     const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11898     unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11899     if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11900         !MRI->isReservedReg(MF, Reg))
11901       Reg = 0;
11902   }
11903   if (Reg)
11904     return Reg;
11905   report_fatal_error(Twine("Invalid register name \""
11906                               + StringRef(RegName)  + "\"."));
11907 }
11908 
11909 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11910                                                      SelectionDAG &DAG) const {
11911   DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
11912 
11913   EVT VT = Op.getValueType();
11914   SDLoc DL(Op);
11915 
11916   SDValue FrameAddr =
11917       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11918   SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11919 
11920   return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11921 }
11922 
11923 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11924                                                SelectionDAG &DAG) const {
11925   MachineFunction &MF = DAG.getMachineFunction();
11926   MachineFrameInfo &MFI = MF.getFrameInfo();
11927   MFI.setReturnAddressIsTaken(true);
11928 
11929   EVT VT = Op.getValueType();
11930   SDLoc DL(Op);
11931   unsigned Depth = Op.getConstantOperandVal(0);
11932   SDValue ReturnAddress;
11933   if (Depth) {
11934     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11935     SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11936     ReturnAddress = DAG.getLoad(
11937         VT, DL, DAG.getEntryNode(),
11938         DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11939   } else {
11940     // Return LR, which contains the return address. Mark it an implicit
11941     // live-in.
11942     Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11943     ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11944   }
11945 
11946   // The XPACLRI instruction assembles to a hint-space instruction before
11947   // Armv8.3-A, so it can be safely used on any pre-Armv8.3-A
11948   // architecture. On Armv8.3-A and onwards, XPACI is available, so use
11949   // that instead.
11950   SDNode *St;
11951   if (Subtarget->hasPAuth()) {
11952     St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11953   } else {
11954     // XPACLRI operates on LR therefore we must move the operand accordingly.
11955     SDValue Chain =
11956         DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11957     St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11958   }
11959   return SDValue(St, 0);
11960 }
11961 
11962 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11963 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
11964 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11965                                                SelectionDAG &DAG) const {
11966   SDValue Lo, Hi;
11967   expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11968   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11969 }
11970 
11971 bool AArch64TargetLowering::isOffsetFoldingLegal(
11972     const GlobalAddressSDNode *GA) const {
11973   // Offsets are folded in the DAG combine rather than here so that we can
11974   // intelligently choose an offset based on the uses.
11975   return false;
11976 }
11977 
11978 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11979                                          bool OptForSize) const {
11980   bool IsLegal = false;
11981   // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
11982   // and for the 16-bit case when the target has full fp16 support.
11983   // We encode bf16 bit patterns as if they were fp16. This results in very
11984   // strange-looking assembly but should populate the register with appropriate
11985   // values. Let's say we wanted to encode 0xR3FC0, which is 1.5 in BF16. We will
11986   // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11987   // FP16 value 1.9375, which shares the same bit pattern as BF16 1.5.
11988   // FIXME: We should be able to handle f128 as well with a clever lowering.
11989   const APInt ImmInt = Imm.bitcastToAPInt();
11990   if (VT == MVT::f64)
11991     IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11992   else if (VT == MVT::f32)
11993     IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11994   else if (VT == MVT::f16 || VT == MVT::bf16)
11995     IsLegal =
11996         (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11997         Imm.isPosZero();
11998 
11999   // If we cannot materialize the value in the fmov immediate field, check if
12000   // it can be encoded as the immediate operand of a logical instruction.
12001   // The immediate value will be created with either MOVZ, MOVN, or ORR.
12002   // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12003   //       generate that fmov.
12004   if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12005     // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12006     // however the mov+fmov sequence is always better because of the reduced
12007     // cache pressure. The timings are still the same if you consider
12008     // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12009     // movw+movk is fused). So we limit it to at most 2 instructions.
12010     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12011     AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12012     assert(Insn.size() <= 4 &&
12013            "Should be able to build any value with at most 4 moves");
12014     unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12015     IsLegal = Insn.size() <= Limit;
12016   }
12017 
12018   LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12019                     << " imm value: "; Imm.dump(););
12020   return IsLegal;
12021 }
12022 
12023 //===----------------------------------------------------------------------===//
12024 //                          AArch64 Optimization Hooks
12025 //===----------------------------------------------------------------------===//
12026 
12027 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12028                            SDValue Operand, SelectionDAG &DAG,
12029                            int &ExtraSteps) {
12030   EVT VT = Operand.getValueType();
12031   if ((ST->hasNEON() &&
12032        (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12033         VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12034         VT == MVT::v4f32)) ||
12035       (ST->hasSVE() &&
12036        (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12037     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12038       // For the reciprocal estimates, convergence is quadratic, so the number
12039       // of digits is doubled after each iteration.  In ARMv8, the accuracy of
12040       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
12041       // the result for float (23 mantissa bits) is 2 and for double (52
12042       // mantissa bits) is 3.
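            // For instance, f32 has a precision of 24 bits (23 stored mantissa
            // bits plus the implicit bit), so ceil(log2(24)) - ceil(log2(8)) =
            // 5 - 3 = 2 extra steps; f64 (53 bits of precision) needs 6 - 3 = 3.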
12043       constexpr unsigned AccurateBits = 8;
12044       unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12045       ExtraSteps = DesiredBits <= AccurateBits
12046                        ? 0
12047                        : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12048     }
12049 
12050     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12051   }
12052 
12053   return SDValue();
12054 }
12055 
12056 SDValue
12057 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12058                                         const DenormalMode &Mode) const {
12059   SDLoc DL(Op);
12060   EVT VT = Op.getValueType();
12061   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12062   SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12063   return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12064 }
12065 
12066 SDValue
12067 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12068                                                    SelectionDAG &DAG) const {
12069   return Op;
12070 }
12071 
12072 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12073                                                SelectionDAG &DAG, int Enabled,
12074                                                int &ExtraSteps,
12075                                                bool &UseOneConst,
12076                                                bool Reciprocal) const {
12077   if (Enabled == ReciprocalEstimate::Enabled ||
12078       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12079     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12080                                        DAG, ExtraSteps)) {
12081       SDLoc DL(Operand);
12082       EVT VT = Operand.getValueType();
12083 
12084       SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12085 
12086       // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12087       // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
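            // Each iteration below therefore computes
            //   Step = FRSQRTS(X, E * E) = 0.5 * (3 - X * E * E)
            // and refines the estimate as E = E * Step.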
12088       for (int i = ExtraSteps; i > 0; --i) {
12089         SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12090                                    Flags);
12091         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12092         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12093       }
12094       if (!Reciprocal)
12095         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12096 
12097       ExtraSteps = 0;
12098       return Estimate;
12099     }
12100 
12101   return SDValue();
12102 }
12103 
12104 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12105                                                 SelectionDAG &DAG, int Enabled,
12106                                                 int &ExtraSteps) const {
12107   if (Enabled == ReciprocalEstimate::Enabled)
12108     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12109                                        DAG, ExtraSteps)) {
12110       SDLoc DL(Operand);
12111       EVT VT = Operand.getValueType();
12112 
12113       SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12114 
12115       // Newton reciprocal iteration: E * (2 - X * E)
12116       // AArch64 reciprocal iteration instruction: (2 - M * N)
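            // Each iteration below therefore computes
            //   Step = FRECPS(X, E) = 2 - X * E
            // and refines the estimate as E = E * Step.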
12117       for (int i = ExtraSteps; i > 0; --i) {
12118         SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12119                                    Estimate, Flags);
12120         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12121       }
12122 
12123       ExtraSteps = 0;
12124       return Estimate;
12125     }
12126 
12127   return SDValue();
12128 }
12129 
12130 //===----------------------------------------------------------------------===//
12131 //                          AArch64 Inline Assembly Support
12132 //===----------------------------------------------------------------------===//
12133 
12134 // Table of Constraints
12135 // TODO: This is the current set of constraints supported by ARM for the
12136 // compiler; not all of them may make sense.
12137 //
12138 // r - A general register
12139 // w - An FP/SIMD register of some size in the range v0-v31
12140 // x - An FP/SIMD register of some size in the range v0-v15
12141 // I - Constant that can be used with an ADD instruction
12142 // J - Constant that can be used with a SUB instruction
12143 // K - Constant that can be used with a 32-bit logical instruction
12144 // L - Constant that can be used with a 64-bit logical instruction
12145 // M - Constant that can be used as a 32-bit MOV immediate
12146 // N - Constant that can be used as a 64-bit MOV immediate
12147 // Q - A memory reference with base register and no offset
12148 // S - A symbolic address
12149 // Y - Floating point constant zero
12150 // Z - Integer constant zero
12151 //
12152 //   Note that general register operands will be output using their 64-bit x
12153 // register name, whatever the size of the variable, unless the asm operand
12154 // is prefixed by the %w modifier. Floating-point and SIMD register operands
12155 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12156 // %q modifier.
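      // For example, an illustrative use of the 'r' constraint and the %w
      // modifier from C would be:
      //   int a = 1, b = 2, res;
      //   asm("add %w0, %w1, %w2" : "=r"(res) : "r"(a), "r"(b));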
12157 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12158   // At this point, we have to lower this constraint to something else, so we
12159   // lower it to an "r" or "w". However, by doing this we will force the result
12160   // to be in a register, while the X constraint is much more permissive.
12161   //
12162   // Although we are correct (we are free to emit anything, without
12163   // constraints), we might break use cases that would expect us to be more
12164   // efficient and emit something else.
12165   if (!Subtarget->hasFPARMv8())
12166     return "r";
12167 
12168   if (ConstraintVT.isFloatingPoint())
12169     return "w";
12170 
12171   if (ConstraintVT.isVector() &&
12172      (ConstraintVT.getSizeInBits() == 64 ||
12173       ConstraintVT.getSizeInBits() == 128))
12174     return "w";
12175 
12176   return "r";
12177 }
12178 
12179 enum class PredicateConstraint { Uph, Upl, Upa };
12180 
12181 // Returns a {Reg, RegisterClass} tuple if the constraint is
12182 // a specific predicate register.
12183 //
12184 // For a constraint like "{pn3}", the default path in
12185 // TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12186 // suitable register class for this register is "PPRorPNR", after which it
12187 // determines that nxv16i1 is an appropriate type for the constraint, which is
12188 // not what we want. The code here pre-empts this by matching the register
12189 // explicitly.
12190 static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12191 parsePredicateRegAsConstraint(StringRef Constraint) {
12192   if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12193       Constraint[1] != 'p')
12194     return std::nullopt;
12195 
12196   Constraint = Constraint.substr(2, Constraint.size() - 3);
12197   bool IsPredicateAsCount = Constraint.starts_with("n");
12198   if (IsPredicateAsCount)
12199     Constraint = Constraint.drop_front(1);
12200 
12201   unsigned V;
12202   if (Constraint.getAsInteger(10, V) || V > 31)
12203     return std::nullopt;
12204 
12205   if (IsPredicateAsCount)
12206     return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12207   else
12208     return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12209 }
12210 
12211 static std::optional<PredicateConstraint>
12212 parsePredicateConstraint(StringRef Constraint) {
12213   return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12214       .Case("Uph", PredicateConstraint::Uph)
12215       .Case("Upl", PredicateConstraint::Upl)
12216       .Case("Upa", PredicateConstraint::Upa)
12217       .Default(std::nullopt);
12218 }
12219 
12220 static const TargetRegisterClass *
12221 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12222   if (VT != MVT::aarch64svcount &&
12223       (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12224     return nullptr;
12225 
12226   switch (Constraint) {
12227   case PredicateConstraint::Uph:
12228     return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12229                                      : &AArch64::PPR_p8to15RegClass;
12230   case PredicateConstraint::Upl:
12231     return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12232                                      : &AArch64::PPR_3bRegClass;
12233   case PredicateConstraint::Upa:
12234     return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12235                                      : &AArch64::PPRRegClass;
12236   }
12237 
12238   llvm_unreachable("Missing PredicateConstraint!");
12239 }
12240 
12241 enum class ReducedGprConstraint { Uci, Ucj };
12242 
12243 static std::optional<ReducedGprConstraint>
12244 parseReducedGprConstraint(StringRef Constraint) {
12245   return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12246       .Case("Uci", ReducedGprConstraint::Uci)
12247       .Case("Ucj", ReducedGprConstraint::Ucj)
12248       .Default(std::nullopt);
12249 }
12250 
12251 static const TargetRegisterClass *
12252 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12253   if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12254     return nullptr;
12255 
12256   switch (Constraint) {
12257   case ReducedGprConstraint::Uci:
12258     return &AArch64::MatrixIndexGPR32_8_11RegClass;
12259   case ReducedGprConstraint::Ucj:
12260     return &AArch64::MatrixIndexGPR32_12_15RegClass;
12261   }
12262 
12263   llvm_unreachable("Missing ReducedGprConstraint!");
12264 }
12265 
12266 // The set of cc codes supported is from
12267 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
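      // For example, a flag output operand written as "=@cceq" in C inline
      // assembly reaches this code as the constraint string "{@cceq}":
      //   int a = 1, b = 2, is_eq;
      //   asm("cmp %w1, %w2" : "=@cceq"(is_eq) : "r"(a), "r"(b));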
12268 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12269   AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12270                                  .Case("{@cchi}", AArch64CC::HI)
12271                                  .Case("{@cccs}", AArch64CC::HS)
12272                                  .Case("{@cclo}", AArch64CC::LO)
12273                                  .Case("{@ccls}", AArch64CC::LS)
12274                                  .Case("{@cccc}", AArch64CC::LO)
12275                                  .Case("{@cceq}", AArch64CC::EQ)
12276                                  .Case("{@ccgt}", AArch64CC::GT)
12277                                  .Case("{@ccge}", AArch64CC::GE)
12278                                  .Case("{@cclt}", AArch64CC::LT)
12279                                  .Case("{@ccle}", AArch64CC::LE)
12280                                  .Case("{@cchs}", AArch64CC::HS)
12281                                  .Case("{@ccne}", AArch64CC::NE)
12282                                  .Case("{@ccvc}", AArch64CC::VC)
12283                                  .Case("{@ccpl}", AArch64CC::PL)
12284                                  .Case("{@ccvs}", AArch64CC::VS)
12285                                  .Case("{@ccmi}", AArch64CC::MI)
12286                                  .Default(AArch64CC::Invalid);
12287   return Cond;
12288 }
12289 
12290 /// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12291 /// WZR, invert(<cond>)'.
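      /// For example, getSETCC(AArch64CC::EQ, NZCV, DL, DAG) yields
      /// CSINC(0, 0, NE, NZCV), which selects to "cset wN, eq" (wN illustrative).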
12292 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12293                         SelectionDAG &DAG) {
12294   return DAG.getNode(
12295       AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
12296       DAG.getConstant(0, DL, MVT::i32),
12297       DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
12298 }
12299 
12300 // Lower @cc flag output via getSETCC.
12301 SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12302     SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12303     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12304   AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12305   if (Cond == AArch64CC::Invalid)
12306     return SDValue();
12307   // The output variable should be a scalar integer.
12308   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12309       OpInfo.ConstraintVT.getSizeInBits() < 8)
12310     report_fatal_error("Flag output operand is of invalid type");
12311 
12312   // Get NZCV register. Only update chain when copyfrom is glued.
12313   if (Glue.getNode()) {
12314     Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
12315     Chain = Glue.getValue(1);
12316   } else
12317     Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
12318   // Extract CC code.
12319   SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12320 
12321   SDValue Result;
12322 
12323   // Truncate or ZERO_EXTEND based on value types.
12324   if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12325     Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12326   else
12327     Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12328 
12329   return Result;
12330 }
12331 
12332 /// getConstraintType - Given a constraint letter, return the type of
12333 /// constraint it is for this target.
12334 AArch64TargetLowering::ConstraintType
12335 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12336   if (Constraint.size() == 1) {
12337     switch (Constraint[0]) {
12338     default:
12339       break;
12340     case 'x':
12341     case 'w':
12342     case 'y':
12343       return C_RegisterClass;
12344     // An address with a single base register. Due to the way we
12345     // currently handle addresses it is the same as 'r'.
12346     case 'Q':
12347       return C_Memory;
12348     case 'I':
12349     case 'J':
12350     case 'K':
12351     case 'L':
12352     case 'M':
12353     case 'N':
12354     case 'Y':
12355     case 'Z':
12356       return C_Immediate;
12357     case 'z':
12358     case 'S': // A symbol or label reference with a constant offset
12359       return C_Other;
12360     }
12361   } else if (parsePredicateConstraint(Constraint))
12362     return C_RegisterClass;
12363   else if (parseReducedGprConstraint(Constraint))
12364     return C_RegisterClass;
12365   else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12366     return C_Other;
12367   return TargetLowering::getConstraintType(Constraint);
12368 }
12369 
12370 /// Examine constraint type and operand type and determine a weight value.
12371 /// This object must already have been set up with the operand type
12372 /// and the current alternative constraint selected.
12373 TargetLowering::ConstraintWeight
12374 AArch64TargetLowering::getSingleConstraintMatchWeight(
12375     AsmOperandInfo &info, const char *constraint) const {
12376   ConstraintWeight weight = CW_Invalid;
12377   Value *CallOperandVal = info.CallOperandVal;
12378   // If we don't have a value, we can't do a match,
12379   // but allow it at the lowest weight.
12380   if (!CallOperandVal)
12381     return CW_Default;
12382   Type *type = CallOperandVal->getType();
12383   // Look at the constraint type.
12384   switch (*constraint) {
12385   default:
12386     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12387     break;
12388   case 'x':
12389   case 'w':
12390   case 'y':
12391     if (type->isFloatingPointTy() || type->isVectorTy())
12392       weight = CW_Register;
12393     break;
12394   case 'z':
12395     weight = CW_Constant;
12396     break;
12397   case 'U':
12398     if (parsePredicateConstraint(constraint) ||
12399         parseReducedGprConstraint(constraint))
12400       weight = CW_Register;
12401     break;
12402   }
12403   return weight;
12404 }
12405 
12406 std::pair<unsigned, const TargetRegisterClass *>
12407 AArch64TargetLowering::getRegForInlineAsmConstraint(
12408     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12409   if (Constraint.size() == 1) {
12410     switch (Constraint[0]) {
12411     case 'r':
12412       if (VT.isScalableVector())
12413         return std::make_pair(0U, nullptr);
12414       if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12415         return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12416       if (VT.getFixedSizeInBits() == 64)
12417         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12418       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12419     case 'w': {
12420       if (!Subtarget->hasFPARMv8())
12421         break;
12422       if (VT.isScalableVector()) {
12423         if (VT.getVectorElementType() != MVT::i1)
12424           return std::make_pair(0U, &AArch64::ZPRRegClass);
12425         return std::make_pair(0U, nullptr);
12426       }
12427       if (VT == MVT::Other)
12428         break;
12429       uint64_t VTSize = VT.getFixedSizeInBits();
12430       if (VTSize == 16)
12431         return std::make_pair(0U, &AArch64::FPR16RegClass);
12432       if (VTSize == 32)
12433         return std::make_pair(0U, &AArch64::FPR32RegClass);
12434       if (VTSize == 64)
12435         return std::make_pair(0U, &AArch64::FPR64RegClass);
12436       if (VTSize == 128)
12437         return std::make_pair(0U, &AArch64::FPR128RegClass);
12438       break;
12439     }
12440     // The instructions that this constraint is designed for can
12441     // only take 128-bit registers so just use that regclass.
12442     case 'x':
12443       if (!Subtarget->hasFPARMv8())
12444         break;
12445       if (VT.isScalableVector())
12446         return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12447       if (VT.getSizeInBits() == 128)
12448         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12449       break;
12450     case 'y':
12451       if (!Subtarget->hasFPARMv8())
12452         break;
12453       if (VT.isScalableVector())
12454         return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12455       break;
12456     }
12457   } else {
12458     if (const auto P = parsePredicateRegAsConstraint(Constraint))
12459       return *P;
12460     if (const auto PC = parsePredicateConstraint(Constraint))
12461       if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12462         return std::make_pair(0U, RegClass);
12463 
12464     if (const auto RGC = parseReducedGprConstraint(Constraint))
12465       if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12466         return std::make_pair(0U, RegClass);
12467   }
12468   if (StringRef("{cc}").equals_insensitive(Constraint) ||
12469       parseConstraintCode(Constraint) != AArch64CC::Invalid)
12470     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12471 
12472   if (Constraint == "{za}") {
12473     return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12474   }
12475 
12476   if (Constraint == "{zt0}") {
12477     return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12478   }
12479 
12480   // Use the default implementation in TargetLowering to convert the register
12481   // constraint into a member of a register class.
12482   std::pair<unsigned, const TargetRegisterClass *> Res;
12483   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12484 
12485   // Not found as a standard register?
12486   if (!Res.second) {
12487     unsigned Size = Constraint.size();
12488     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12489         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12490       int RegNo;
12491       bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12492       if (!Failed && RegNo >= 0 && RegNo <= 31) {
12493         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12494         // By default we'll emit v0-v31 for this unless there's a modifier where
12495         // we'll emit the correct register as well.
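        // For illustration (hypothetical constraint): "{v5}" with a 64-bit
        // operand type selects d5 via FPR64, while a 128-bit or unknown type
        // selects q5 via FPR128.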
12496         if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12497           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12498           Res.second = &AArch64::FPR64RegClass;
12499         } else {
12500           Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12501           Res.second = &AArch64::FPR128RegClass;
12502         }
12503       }
12504     }
12505   }
12506 
12507   if (Res.second && !Subtarget->hasFPARMv8() &&
12508       !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12509       !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12510     return std::make_pair(0U, nullptr);
12511 
12512   return Res;
12513 }
12514 
12515 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
12516                                                   llvm::Type *Ty,
12517                                                   bool AllowUnknown) const {
12518   if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12519     return EVT(MVT::i64x8);
12520 
12521   return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12522 }
12523 
12524 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12525 /// vector.  If it is invalid, don't add anything to Ops.
12526 void AArch64TargetLowering::LowerAsmOperandForConstraint(
12527     SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12528     SelectionDAG &DAG) const {
12529   SDValue Result;
12530 
12531   // Currently only support length 1 constraints.
12532   if (Constraint.size() != 1)
12533     return;
12534 
12535   char ConstraintLetter = Constraint[0];
12536   switch (ConstraintLetter) {
12537   default:
12538     break;
12539 
12540   // This set of constraints deals with valid constants for various instructions.
12541   // Validate and return a target constant for them if we can.
12542   case 'z': {
12543     // 'z' maps to xzr or wzr so it needs an input of 0.
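    // A minimal illustration (hypothetical snippet, not from this file):
    //   asm volatile("str %x1, [%0]" :: "r"(ptr), "z"(0LL) : "memory");
    // lets the zero register itself be used as the store source, so no GPR has
    // to be materialised for the constant 0.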
12544     if (!isNullConstant(Op))
12545       return;
12546 
12547     if (Op.getValueType() == MVT::i64)
12548       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12549     else
12550       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12551     break;
12552   }
12553   case 'S':
12554     // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12555     // supported for PIC while "s" isn't, making "s" less useful. We implement
12556     // "S" but not "s".
12557     TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
12558     break;
12559 
12560   case 'I':
12561   case 'J':
12562   case 'K':
12563   case 'L':
12564   case 'M':
12565   case 'N':
12566     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12567     if (!C)
12568       return;
12569 
12570     // Grab the value and do some validation.
12571     uint64_t CVal = C->getZExtValue();
12572     switch (ConstraintLetter) {
12573     // The I constraint applies only to simple ADD or SUB immediate operands:
12574     // i.e. 0 to 4095 with optional shift by 12
12575     // The J constraint applies only to ADD or SUB immediates that would be
12576     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12577     // instruction [or vice versa], in other words -1 to -4095 with optional
12578     // left shift by 12.
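    // For illustration (hypothetical values): 4095 and 0x7ff000 (0x7ff << 12)
    // both satisfy 'I', whereas 0x1001 satisfies neither form; -1 .. -4095,
    // and likewise those values shifted left by 12, satisfy 'J'.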
12579     case 'I':
12580       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12581         break;
12582       return;
12583     case 'J': {
12584       uint64_t NVal = -C->getSExtValue();
12585       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12586         CVal = C->getSExtValue();
12587         break;
12588       }
12589       return;
12590     }
12591     // The K and L constraints apply *only* to logical immediates, including
12592     // what used to be the MOVI alias for ORR (though the MOVI alias has now
12593     // been removed and MOV should be used). So these constraints have to
12594     // distinguish between bit patterns that are valid 32-bit or 64-bit
12595     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12596     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12597     // versa.
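    // For illustration (hypothetical values): 0x00ff00ff is a valid bimm32 and
    // so satisfies 'K', 0x5555555555555555 is a valid bimm64 and satisfies 'L',
    // while 0x12345678 is neither.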
12598     case 'K':
12599       if (AArch64_AM::isLogicalImmediate(CVal, 32))
12600         break;
12601       return;
12602     case 'L':
12603       if (AArch64_AM::isLogicalImmediate(CVal, 64))
12604         break;
12605       return;
12606     // The M and N constraints are a superset of K and L respectively, for use
12607     // with the MOV (immediate) alias. As well as the logical immediates they
12608     // also match 32 or 64-bit immediates that can be loaded either using a
12609     // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12610     // (M) or 64-bit 0x1234000000000000 (N) etc.
12611     // As a note, some of this code is liberally stolen from the asm parser.
12612     case 'M': {
12613       if (!isUInt<32>(CVal))
12614         return;
12615       if (AArch64_AM::isLogicalImmediate(CVal, 32))
12616         break;
12617       if ((CVal & 0xFFFF) == CVal)
12618         break;
12619       if ((CVal & 0xFFFF0000ULL) == CVal)
12620         break;
12621       uint64_t NCVal = ~(uint32_t)CVal;
12622       if ((NCVal & 0xFFFFULL) == NCVal)
12623         break;
12624       if ((NCVal & 0xFFFF0000ULL) == NCVal)
12625         break;
12626       return;
12627     }
12628     case 'N': {
12629       if (AArch64_AM::isLogicalImmediate(CVal, 64))
12630         break;
12631       if ((CVal & 0xFFFFULL) == CVal)
12632         break;
12633       if ((CVal & 0xFFFF0000ULL) == CVal)
12634         break;
12635       if ((CVal & 0xFFFF00000000ULL) == CVal)
12636         break;
12637       if ((CVal & 0xFFFF000000000000ULL) == CVal)
12638         break;
12639       uint64_t NCVal = ~CVal;
12640       if ((NCVal & 0xFFFFULL) == NCVal)
12641         break;
12642       if ((NCVal & 0xFFFF0000ULL) == NCVal)
12643         break;
12644       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12645         break;
12646       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12647         break;
12648       return;
12649     }
12650     default:
12651       return;
12652     }
12653 
12654     // All assembler immediates are 64-bit integers.
12655     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12656     break;
12657   }
12658 
12659   if (Result.getNode()) {
12660     Ops.push_back(Result);
12661     return;
12662   }
12663 
12664   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12665 }
12666 
12667 //===----------------------------------------------------------------------===//
12668 //                     AArch64 Advanced SIMD Support
12669 //===----------------------------------------------------------------------===//
12670 
12671 /// WidenVector - Given a value in the V64 register class, produce the
12672 /// equivalent value in the V128 register class.
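/// For example, a v2f32 value becomes the low half of a v4f32 whose upper two
/// lanes are undef.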
12673 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
12674   EVT VT = V64Reg.getValueType();
12675   unsigned NarrowSize = VT.getVectorNumElements();
12676   MVT EltTy = VT.getVectorElementType().getSimpleVT();
12677   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
12678   SDLoc DL(V64Reg);
12679 
12680   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
12681                      V64Reg, DAG.getConstant(0, DL, MVT::i64));
12682 }
12683 
12684 /// getExtFactor - Determine the adjustment factor for the position when
12685 /// generating an "extract from vector registers" instruction.
12686 static unsigned getExtFactor(SDValue &V) {
12687   EVT EltType = V.getValueType().getVectorElementType();
12688   return EltType.getSizeInBits() / 8;
12689 }
12690 
12691 // Check if a vector is built from one vector via extracted elements of
12692 // another together with an AND mask, ensuring that all elements fit
12693 // within range. This can be reconstructed using AND and NEON's TBL1.
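// Illustrative shape of the pattern (for v16i8/v8i8): operand i of the
// BUILD_VECTOR is extractelt(Src, and(extractelt(MaskVec, i), C_i)) (or the
// plain extract when no AND is present), and the whole node is rebuilt as
// tbl1(Src, and(MaskVec, build_vector(C_0..C_n))).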
12694 SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
12695   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12696   SDLoc dl(Op);
12697   EVT VT = Op.getValueType();
12698   assert(!VT.isScalableVector() &&
12699          "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12700 
12701   // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12702   // directly to TBL1.
12703   if (VT != MVT::v16i8 && VT != MVT::v8i8)
12704     return SDValue();
12705 
12706   unsigned NumElts = VT.getVectorNumElements();
12707   assert((NumElts == 8 || NumElts == 16) &&
12708          "Need to have exactly 8 or 16 elements in vector.");
12709 
12710   SDValue SourceVec;
12711   SDValue MaskSourceVec;
12712   SmallVector<SDValue, 16> AndMaskConstants;
12713 
12714   for (unsigned i = 0; i < NumElts; ++i) {
12715     SDValue V = Op.getOperand(i);
12716     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12717       return SDValue();
12718 
12719     SDValue OperandSourceVec = V.getOperand(0);
12720     if (!SourceVec)
12721       SourceVec = OperandSourceVec;
12722     else if (SourceVec != OperandSourceVec)
12723       return SDValue();
12724 
12725     // This only looks at shuffles with elements that are
12726     // a) truncated by a constant AND mask extracted from a mask vector, or
12727     // b) extracted directly from a mask vector.
12728     SDValue MaskSource = V.getOperand(1);
12729     if (MaskSource.getOpcode() == ISD::AND) {
12730       if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
12731         return SDValue();
12732 
12733       AndMaskConstants.push_back(MaskSource.getOperand(1));
12734       MaskSource = MaskSource->getOperand(0);
12735     } else if (!AndMaskConstants.empty()) {
12736       // Either all or no operands should have an AND mask.
12737       return SDValue();
12738     }
12739 
12740     // An ANY_EXTEND may be inserted between the AND and the source vector
12741     // extraction. We don't care about that, so we can just skip it.
12742     if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12743       MaskSource = MaskSource.getOperand(0);
12744 
12745     if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12746       return SDValue();
12747 
12748     SDValue MaskIdx = MaskSource.getOperand(1);
12749     if (!isa<ConstantSDNode>(MaskIdx) ||
12750         !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12751       return SDValue();
12752 
12753     // We only apply this if all elements come from the same vector with the
12754     // same vector type.
12755     if (!MaskSourceVec) {
12756       MaskSourceVec = MaskSource->getOperand(0);
12757       if (MaskSourceVec.getValueType() != VT)
12758         return SDValue();
12759     } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12760       return SDValue();
12761     }
12762   }
12763 
12764   // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12765   // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12766   // insert, we know that the index in the mask must be smaller than the number
12767   // of elements in the source, or we would have an out-of-bounds access.
12768   if (NumElts == 8)
12769     SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12770                             DAG.getUNDEF(VT));
12771 
12772   // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12773   if (!AndMaskConstants.empty())
12774     MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12775                                 DAG.getBuildVector(VT, dl, AndMaskConstants));
12776 
12777   return DAG.getNode(
12778       ISD::INTRINSIC_WO_CHAIN, dl, VT,
12779       DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12780       MaskSourceVec);
12781 }
12782 
12783 // Gather data to see if the operation can be modelled as a
12784 // shuffle in combination with VEXTs.
12785 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
12786                                                   SelectionDAG &DAG) const {
12787   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12788   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12789   SDLoc dl(Op);
12790   EVT VT = Op.getValueType();
12791   assert(!VT.isScalableVector() &&
12792          "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12793   unsigned NumElts = VT.getVectorNumElements();
12794 
12795   struct ShuffleSourceInfo {
12796     SDValue Vec;
12797     unsigned MinElt;
12798     unsigned MaxElt;
12799 
12800     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12801     // be compatible with the shuffle we intend to construct. As a result
12802     // ShuffleVec will be some sliding window into the original Vec.
12803     SDValue ShuffleVec;
12804 
12805     // Code should guarantee that element i in Vec starts at element
12806     // "WindowBase + i * WindowScale" in ShuffleVec.
12807     int WindowBase;
12808     int WindowScale;
12809 
12810     ShuffleSourceInfo(SDValue Vec)
12811       : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12812           ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12813 
12814     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12815   };
12816 
12817   // First gather all vectors used as an immediate source for this BUILD_VECTOR
12818   // node.
12819   SmallVector<ShuffleSourceInfo, 2> Sources;
12820   for (unsigned i = 0; i < NumElts; ++i) {
12821     SDValue V = Op.getOperand(i);
12822     if (V.isUndef())
12823       continue;
12824     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12825              !isa<ConstantSDNode>(V.getOperand(1)) ||
12826              V.getOperand(0).getValueType().isScalableVector()) {
12827       LLVM_DEBUG(
12828           dbgs() << "Reshuffle failed: "
12829                     "a shuffle can only come from building a vector from "
12830                     "various elements of other fixed-width vectors, provided "
12831                     "their indices are constant\n");
12832       return SDValue();
12833     }
12834 
12835     // Add this element source to the list if it's not already there.
12836     SDValue SourceVec = V.getOperand(0);
12837     auto Source = find(Sources, SourceVec);
12838     if (Source == Sources.end())
12839       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12840 
12841     // Update the minimum and maximum lane number seen.
12842     unsigned EltNo = V.getConstantOperandVal(1);
12843     Source->MinElt = std::min(Source->MinElt, EltNo);
12844     Source->MaxElt = std::max(Source->MaxElt, EltNo);
12845   }
12846 
12847   // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12848   // better than moving to/from gpr registers for larger vectors.
12849   if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12850     // Construct a mask for the tbl. We may need to adjust the index for types
12851     // larger than i8.
12852     SmallVector<unsigned, 16> Mask;
12853     unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12854     for (unsigned I = 0; I < NumElts; ++I) {
12855       SDValue V = Op.getOperand(I);
12856       if (V.isUndef()) {
12857         for (unsigned OF = 0; OF < OutputFactor; OF++)
12858           Mask.push_back(-1);
12859         continue;
12860       }
12861       // Set the Mask lanes adjusted for the size of the input and output
12862       // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12863       // output element, adjusted in their positions per input and output types.
12864       unsigned Lane = V.getConstantOperandVal(1);
12865       for (unsigned S = 0; S < Sources.size(); S++) {
12866         if (V.getOperand(0) == Sources[S].Vec) {
12867           unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12868           unsigned InputBase = 16 * S + Lane * InputSize / 8;
12869           for (unsigned OF = 0; OF < OutputFactor; OF++)
12870             Mask.push_back(InputBase + OF);
12871           break;
12872         }
12873       }
12874     }
12875 
12876     // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12877     // v16i8, and the TBLMask
12878     SmallVector<SDValue, 16> TBLOperands;
12879     TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12880                                               ? Intrinsic::aarch64_neon_tbl3
12881                                               : Intrinsic::aarch64_neon_tbl4,
12882                                           dl, MVT::i32));
12883     for (unsigned i = 0; i < Sources.size(); i++) {
12884       SDValue Src = Sources[i].Vec;
12885       EVT SrcVT = Src.getValueType();
12886       Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12887       assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12888              "Expected a legally typed vector");
12889       if (SrcVT.is64BitVector())
12890         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12891                           DAG.getUNDEF(MVT::v8i8));
12892       TBLOperands.push_back(Src);
12893     }
12894 
12895     SmallVector<SDValue, 16> TBLMask;
12896     for (unsigned i = 0; i < Mask.size(); i++)
12897       TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12898     assert((Mask.size() == 8 || Mask.size() == 16) &&
12899            "Expected a v8i8 or v16i8 Mask");
12900     TBLOperands.push_back(
12901         DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12902 
12903     SDValue Shuffle =
12904         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
12905                     Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12906     return DAG.getBitcast(VT, Shuffle);
12907   }
12908 
12909   if (Sources.size() > 2) {
12910     LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12911                       << "sensible when at most two source vectors are "
12912                       << "involved\n");
12913     return SDValue();
12914   }
12915 
12916   // Find out the smallest element size among result and two sources, and use
12917   // it as element size to build the shuffle_vector.
12918   EVT SmallestEltTy = VT.getVectorElementType();
12919   for (auto &Source : Sources) {
12920     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12921     if (SrcEltTy.bitsLT(SmallestEltTy)) {
12922       SmallestEltTy = SrcEltTy;
12923     }
12924   }
12925   unsigned ResMultiplier =
12926       VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12927   uint64_t VTSize = VT.getFixedSizeInBits();
12928   NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12929   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12930 
12931   // If the source vector is too wide or too narrow, we may nevertheless be able
12932   // to construct a compatible shuffle either by concatenating it with UNDEF or
12933   // extracting a suitable range of elements.
12934   for (auto &Src : Sources) {
12935     EVT SrcVT = Src.ShuffleVec.getValueType();
12936 
12937     TypeSize SrcVTSize = SrcVT.getSizeInBits();
12938     if (SrcVTSize == TypeSize::getFixed(VTSize))
12939       continue;
12940 
12941     // This stage of the search produces a source with the same element type as
12942     // the original, but with a total width matching the BUILD_VECTOR output.
12943     EVT EltVT = SrcVT.getVectorElementType();
12944     unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12945     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12946 
12947     if (SrcVTSize.getFixedValue() < VTSize) {
12948       assert(2 * SrcVTSize == VTSize);
12949       // We can pad out the smaller vector for free, so if it's part of a
12950       // shuffle...
12951       Src.ShuffleVec =
12952           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12953                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12954       continue;
12955     }
12956 
12957     if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12958       LLVM_DEBUG(
12959           dbgs() << "Reshuffle failed: result vector too small to extract\n");
12960       return SDValue();
12961     }
12962 
12963     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12964       LLVM_DEBUG(
12965           dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12966       return SDValue();
12967     }
12968 
12969     if (Src.MinElt >= NumSrcElts) {
12970       // The extraction can just take the second half
12971       Src.ShuffleVec =
12972           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12973                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
12974       Src.WindowBase = -NumSrcElts;
12975     } else if (Src.MaxElt < NumSrcElts) {
12976       // The extraction can just take the first half
12977       Src.ShuffleVec =
12978           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12979                       DAG.getConstant(0, dl, MVT::i64));
12980     } else {
12981       // An actual VEXT is needed
12982       SDValue VEXTSrc1 =
12983           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12984                       DAG.getConstant(0, dl, MVT::i64));
12985       SDValue VEXTSrc2 =
12986           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12987                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
12988       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12989 
12990       if (!SrcVT.is64BitVector()) {
12991         LLVM_DEBUG(
12992           dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12993                     "for SVE vectors.");
12994         return SDValue();
12995       }
12996 
12997       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12998                                    VEXTSrc2,
12999                                    DAG.getConstant(Imm, dl, MVT::i32));
13000       Src.WindowBase = -Src.MinElt;
13001     }
13002   }
13003 
13004   // Another possible incompatibility occurs from the vector element types. We
13005   // can fix this by bitcasting the source vectors to the same type we intend
13006   // for the shuffle.
13007   for (auto &Src : Sources) {
13008     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13009     if (SrcEltTy == SmallestEltTy)
13010       continue;
13011     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13012     if (DAG.getDataLayout().isBigEndian()) {
13013       Src.ShuffleVec =
13014           DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
13015     } else {
13016       Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
13017     }
13018     Src.WindowScale =
13019         SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13020     Src.WindowBase *= Src.WindowScale;
13021   }
13022 
13023   // Final check before we try to actually produce a shuffle.
13024   LLVM_DEBUG({
13025     for (auto Src : Sources)
13026       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13027   });
13028 
13029   // The stars all align; our next step is to produce the mask for the shuffle.
13030   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13031   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13032   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13033     SDValue Entry = Op.getOperand(i);
13034     if (Entry.isUndef())
13035       continue;
13036 
13037     auto Src = find(Sources, Entry.getOperand(0));
13038     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13039 
13040     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13041     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13042     // segment.
13043     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13044     int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13045                                VT.getScalarSizeInBits());
13046     int LanesDefined = BitsDefined / BitsPerShuffleLane;
13047 
13048     // This source is expected to fill ResMultiplier lanes of the final shuffle,
13049     // starting at the appropriate offset.
13050     int *LaneMask = &Mask[i * ResMultiplier];
13051 
13052     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13053     ExtractBase += NumElts * (Src - Sources.begin());
13054     for (int j = 0; j < LanesDefined; ++j)
13055       LaneMask[j] = ExtractBase + j;
13056   }
13057 
13058   // Final check before we try to produce nonsense...
13059   if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13060     LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13061     return SDValue();
13062   }
13063 
13064   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13065   for (unsigned i = 0; i < Sources.size(); ++i)
13066     ShuffleOps[i] = Sources[i].ShuffleVec;
13067 
13068   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
13069                                          ShuffleOps[1], Mask);
13070   SDValue V;
13071   if (DAG.getDataLayout().isBigEndian()) {
13072     V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
13073   } else {
13074     V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13075   }
13076 
13077   LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13078              dbgs() << "Reshuffle, creating node: "; V.dump(););
13079 
13080   return V;
13081 }
13082 
13083 // Check if an EXT instruction can handle the shuffle mask when the
13084 // vector sources of the shuffle are the same.
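// For example, for v8i8 the mask <3,4,5,6,7,0,1,2> (where lanes after the
// first may also be undef) yields Imm = 3.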
13085 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13086   unsigned NumElts = VT.getVectorNumElements();
13087 
13088   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
13089   if (M[0] < 0)
13090     return false;
13091 
13092   Imm = M[0];
13093 
13094   // If this is a VEXT shuffle, the immediate value is the index of the first
13095   // element.  The other shuffle indices must be the successive elements after
13096   // the first one.
13097   unsigned ExpectedElt = Imm;
13098   for (unsigned i = 1; i < NumElts; ++i) {
13099     // Increment the expected index.  If it wraps around, just follow it
13100     // back to index zero and keep going.
13101     ++ExpectedElt;
13102     if (ExpectedElt == NumElts)
13103       ExpectedElt = 0;
13104 
13105     if (M[i] < 0)
13106       continue; // ignore UNDEF indices
13107     if (ExpectedElt != static_cast<unsigned>(M[i]))
13108       return false;
13109   }
13110 
13111   return true;
13112 }
13113 
13114 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13115 // v4i32s. This is really a truncate, which we can construct out of (legal)
13116 // concats and truncate nodes.
13117 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
13118   if (V.getValueType() != MVT::v16i8)
13119     return SDValue();
13120   assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13121 
13122   for (unsigned X = 0; X < 4; X++) {
13123     // Check the first item in each group is an extract from lane 0 of a v4i32
13124     // or v4i16.
13125     SDValue BaseExt = V.getOperand(X * 4);
13126     if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13127         (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13128          BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13129         !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13130         BaseExt.getConstantOperandVal(1) != 0)
13131       return SDValue();
13132     SDValue Base = BaseExt.getOperand(0);
13133     // And check the other items are extracts from the same vector.
13134     for (unsigned Y = 1; Y < 4; Y++) {
13135       SDValue Ext = V.getOperand(X * 4 + Y);
13136       if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13137           Ext.getOperand(0) != Base ||
13138           !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13139           Ext.getConstantOperandVal(1) != Y)
13140         return SDValue();
13141     }
13142   }
13143 
13144   // Turn the buildvector into a series of truncates and concats, which will
13145   // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
13146   // concatenated together to produce 2 v8i16. These are both truncated and
13147   // concatenated together.
13148   SDLoc DL(V);
13149   SDValue Trunc[4] = {
13150       V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13151       V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13152   for (SDValue &V : Trunc)
13153     if (V.getValueType() == MVT::v4i32)
13154       V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13155   SDValue Concat0 =
13156       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13157   SDValue Concat1 =
13158       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13159   SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13160   SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13161   return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13162 }
13163 
13164 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
13165 /// element width than the vector lane type. If that is the case the function
13166 /// returns true and writes the value of the DUP instruction lane operand into
13167 /// DupLaneOp.
13168 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13169                           unsigned &DupLaneOp) {
13170   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13171          "Only possible block sizes for wide DUP are: 16, 32, 64");
13172 
13173   if (BlockSize <= VT.getScalarSizeInBits())
13174     return false;
13175   if (BlockSize % VT.getScalarSizeInBits() != 0)
13176     return false;
13177   if (VT.getSizeInBits() % BlockSize != 0)
13178     return false;
13179 
13180   size_t SingleVecNumElements = VT.getVectorNumElements();
13181   size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13182   size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13183 
13184   // We are looking for masks like
13185   // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13186   // might be replaced by 'undefined'. BlockIndices will eventually contain
13187   // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13188   // for the above examples)
13189   SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13190   for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13191     for (size_t I = 0; I < NumEltsPerBlock; I++) {
13192       int Elt = M[BlockIndex * NumEltsPerBlock + I];
13193       if (Elt < 0)
13194         continue;
13195       // For now we don't support shuffles that use the second operand
13196       if ((unsigned)Elt >= SingleVecNumElements)
13197         return false;
13198       if (BlockElts[I] < 0)
13199         BlockElts[I] = Elt;
13200       else if (BlockElts[I] != Elt)
13201         return false;
13202     }
13203 
13204   // We found a candidate block (possibly with some undefs). It must be a
13205   // sequence of consecutive integers starting with a value divisible by
13206   // NumEltsPerBlock, with some values possibly replaced by undefs.
13207 
13208   // Find first non-undef element
13209   auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13210   assert(FirstRealEltIter != BlockElts.end() &&
13211          "Shuffle with all-undefs must have been caught by previous cases, "
13212          "e.g. isSplat()");
13213   if (FirstRealEltIter == BlockElts.end()) {
13214     DupLaneOp = 0;
13215     return true;
13216   }
13217 
13218   // Index of FirstRealElt in BlockElts
13219   size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13220 
13221   if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13222     return false;
13223   // BlockElts[0] must have the following value if it isn't undef:
13224   size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13225 
13226   // Check the first element
13227   if (Elt0 % NumEltsPerBlock != 0)
13228     return false;
13229   // Check that the sequence indeed consists of consecutive integers (modulo
13230   // undefs)
13231   for (size_t I = 0; I < NumEltsPerBlock; I++)
13232     if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13233       return false;
13234 
13235   DupLaneOp = Elt0 / NumEltsPerBlock;
13236   return true;
13237 }
13238 
13239 // Check if an EXT instruction can handle the shuffle mask when the
13240 // vector sources of the shuffle are different.
13241 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13242                       unsigned &Imm) {
13243   // Look for the first non-undef element.
13244   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13245 
13246   // Benefit from APInt to handle overflow when calculating the expected element.
13247   unsigned NumElts = VT.getVectorNumElements();
13248   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13249   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13250                             /*implicitTrunc=*/true);
13251   // The following shuffle indices must be the successive elements after the
13252   // first real element.
13253   bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13254     return Elt != ExpectedElt++ && Elt != -1;
13255   });
13256   if (FoundWrongElt)
13257     return false;
13258 
13259   // The index of an EXT is the first element if it is not UNDEF.
13260   // Watch out for the beginning UNDEFs. The EXT index should be the expected
13261   // value of the first element.  E.g.
13262   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13263   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13264   // ExpectedElt is the last mask index plus 1.
13265   Imm = ExpectedElt.getZExtValue();
13266 
13267   // There are two different cases that require reversing the input vectors.
13268   // For example, for vector <4 x i32> we have the following cases,
13269   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13270   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13271   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13272   // to reverse two input vectors.
13273   if (Imm < NumElts)
13274     ReverseEXT = true;
13275   else
13276     Imm -= NumElts;
13277 
13278   return true;
13279 }
13280 
13281 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13282 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13283 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13284 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13285   unsigned NumElts = VT.getVectorNumElements();
13286   if (NumElts % 2 != 0)
13287     return false;
13288   WhichResult = (M[0] == 0 ? 0 : 1);
13289   unsigned Idx = WhichResult * NumElts / 2;
13290   for (unsigned i = 0; i != NumElts; i += 2) {
13291     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13292         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13293       return false;
13294     Idx += 1;
13295   }
13296 
13297   return true;
13298 }
13299 
13300 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13301 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13302 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13303 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13304   unsigned Half = VT.getVectorNumElements() / 2;
13305   WhichResult = (M[0] == 0 ? 0 : 1);
13306   for (unsigned j = 0; j != 2; ++j) {
13307     unsigned Idx = WhichResult;
13308     for (unsigned i = 0; i != Half; ++i) {
13309       int MIdx = M[i + j * Half];
13310       if (MIdx >= 0 && (unsigned)MIdx != Idx)
13311         return false;
13312       Idx += 2;
13313     }
13314   }
13315 
13316   return true;
13317 }
13318 
13319 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13320 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13321 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13322 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13323   unsigned NumElts = VT.getVectorNumElements();
13324   if (NumElts % 2 != 0)
13325     return false;
13326   WhichResult = (M[0] == 0 ? 0 : 1);
13327   for (unsigned i = 0; i < NumElts; i += 2) {
13328     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13329         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13330       return false;
13331   }
13332   return true;
13333 }
13334 
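// Returns true if all but one lane of the mask simply passes through the
// corresponding lane of one of the inputs; the remaining (anomalous) lane can
// then be handled with an INS (insert lane). For illustration, with 4 elements
// the mask <0,1,6,3> gives DstIsLeft = true and Anomaly = 2.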
13335 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13336                       bool &DstIsLeft, int &Anomaly) {
13337   if (M.size() != static_cast<size_t>(NumInputElements))
13338     return false;
13339 
13340   int NumLHSMatch = 0, NumRHSMatch = 0;
13341   int LastLHSMismatch = -1, LastRHSMismatch = -1;
13342 
13343   for (int i = 0; i < NumInputElements; ++i) {
13344     if (M[i] == -1) {
13345       ++NumLHSMatch;
13346       ++NumRHSMatch;
13347       continue;
13348     }
13349 
13350     if (M[i] == i)
13351       ++NumLHSMatch;
13352     else
13353       LastLHSMismatch = i;
13354 
13355     if (M[i] == i + NumInputElements)
13356       ++NumRHSMatch;
13357     else
13358       LastRHSMismatch = i;
13359   }
13360 
13361   if (NumLHSMatch == NumInputElements - 1) {
13362     DstIsLeft = true;
13363     Anomaly = LastLHSMismatch;
13364     return true;
13365   } else if (NumRHSMatch == NumInputElements - 1) {
13366     DstIsLeft = false;
13367     Anomaly = LastRHSMismatch;
13368     return true;
13369   }
13370 
13371   return false;
13372 }
13373 
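// Returns true if the mask is a simple concatenation of the low halves of the
// two sources (when SplitLHS is true). For illustration, with v4i32 the mask
// <0,1,4,5> matches when SplitLHS is true.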
13374 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13375   if (VT.getSizeInBits() != 128)
13376     return false;
13377 
13378   unsigned NumElts = VT.getVectorNumElements();
13379 
13380   for (int I = 0, E = NumElts / 2; I != E; I++) {
13381     if (Mask[I] != I)
13382       return false;
13383   }
13384 
13385   int Offset = NumElts / 2;
13386   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13387     if (Mask[I] != I + SplitLHS * Offset)
13388       return false;
13389   }
13390 
13391   return true;
13392 }
13393 
13394 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13395   SDLoc DL(Op);
13396   EVT VT = Op.getValueType();
13397   SDValue V0 = Op.getOperand(0);
13398   SDValue V1 = Op.getOperand(1);
13399   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13400 
13401   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13402       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13403     return SDValue();
13404 
13405   bool SplitV0 = V0.getValueSizeInBits() == 128;
13406 
13407   if (!isConcatMask(Mask, VT, SplitV0))
13408     return SDValue();
13409 
13410   EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13411   if (SplitV0) {
13412     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13413                      DAG.getConstant(0, DL, MVT::i64));
13414   }
13415   if (V1.getValueSizeInBits() == 128) {
13416     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13417                      DAG.getConstant(0, DL, MVT::i64));
13418   }
13419   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13420 }
13421 
13422 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13423 /// the specified operations to build the shuffle. ID is the perfect-shuffle
13424 /// ID, V1 and V2 are the original shuffle inputs, PFEntry is the perfect-shuffle
13425 /// table entry, and LHS/RHS are the immediate inputs for this stage of the
13426 /// shuffle.
13427 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
13428                                       SDValue V2, unsigned PFEntry, SDValue LHS,
13429                                       SDValue RHS, SelectionDAG &DAG,
13430                                       const SDLoc &dl) {
13431   unsigned OpNum = (PFEntry >> 26) & 0x0F;
13432   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13433   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13434 
13435   enum {
13436     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13437     OP_VREV,
13438     OP_VDUP0,
13439     OP_VDUP1,
13440     OP_VDUP2,
13441     OP_VDUP3,
13442     OP_VEXT1,
13443     OP_VEXT2,
13444     OP_VEXT3,
13445     OP_VUZPL,  // VUZP, left result
13446     OP_VUZPR,  // VUZP, right result
13447     OP_VZIPL,  // VZIP, left result
13448     OP_VZIPR,  // VZIP, right result
13449     OP_VTRNL,  // VTRN, left result
13450     OP_VTRNR,  // VTRN, right result
13451     OP_MOVLANE // Move lane. RHSID is the lane to move into
13452   };
13453 
13454   if (OpNum == OP_COPY) {
13455     if (LHSID == (1 * 9 + 2) * 9 + 3)
13456       return LHS;
13457     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13458     return RHS;
13459   }
13460 
13461   if (OpNum == OP_MOVLANE) {
13462     // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13463     auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13464       assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13465       Elt = 3 - Elt;
13466       while (Elt > 0) {
13467         ID /= 9;
13468         Elt--;
13469       }
13470       return (ID % 9 == 8) ? -1 : ID % 9;
13471     };
13472 
13473     // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13474     // get the lane to move from the PFID, which is always from the
13475     // original vectors (V1 or V2).
13476     SDValue OpLHS = GeneratePerfectShuffle(
13477         LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
13478     EVT VT = OpLHS.getValueType();
13479     assert(RHSID < 8 && "Expected a lane index for RHSID!");
13480     unsigned ExtLane = 0;
13481     SDValue Input;
13482 
13483     // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13484     // convert into a higher type.
13485     if (RHSID & 0x4) {
13486       int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13487       if (MaskElt == -1)
13488         MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13489       assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13490       ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13491       Input = MaskElt < 2 ? V1 : V2;
13492       if (VT.getScalarSizeInBits() == 16) {
13493         Input = DAG.getBitcast(MVT::v2f32, Input);
13494         OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13495       } else {
13496         assert(VT.getScalarSizeInBits() == 32 &&
13497                "Expected 16 or 32 bit shuffle elements");
13498         Input = DAG.getBitcast(MVT::v2f64, Input);
13499         OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13500       }
13501     } else {
13502       int MaskElt = getPFIDLane(ID, RHSID);
13503       assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13504       ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13505       Input = MaskElt < 4 ? V1 : V2;
13506       // Be careful about creating illegal types. Use f16 instead of i16.
13507       if (VT == MVT::v4i16) {
13508         Input = DAG.getBitcast(MVT::v4f16, Input);
13509         OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13510       }
13511     }
13512     SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13513                               Input.getValueType().getVectorElementType(),
13514                               Input, DAG.getVectorIdxConstant(ExtLane, dl));
13515     SDValue Ins =
13516         DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
13517                     Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
13518     return DAG.getBitcast(VT, Ins);
13519   }
13520 
13521   SDValue OpLHS, OpRHS;
13522   OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13523                                  RHS, DAG, dl);
13524   OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13525                                  RHS, DAG, dl);
13526   EVT VT = OpLHS.getValueType();
13527 
13528   switch (OpNum) {
13529   default:
13530     llvm_unreachable("Unknown shuffle opcode!");
13531   case OP_VREV:
13532     // VREV divides the vector in half and swaps within the half.
13533     if (VT.getVectorElementType() == MVT::i32 ||
13534         VT.getVectorElementType() == MVT::f32)
13535       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
13536     // vrev <4 x i16> -> REV32
13537     if (VT.getVectorElementType() == MVT::i16 ||
13538         VT.getVectorElementType() == MVT::f16 ||
13539         VT.getVectorElementType() == MVT::bf16)
13540       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
13541     // vrev <4 x i8> -> REV16
13542     assert(VT.getVectorElementType() == MVT::i8);
13543     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
13544   case OP_VDUP0:
13545   case OP_VDUP1:
13546   case OP_VDUP2:
13547   case OP_VDUP3: {
13548     EVT EltTy = VT.getVectorElementType();
13549     unsigned Opcode;
13550     if (EltTy == MVT::i8)
13551       Opcode = AArch64ISD::DUPLANE8;
13552     else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13553       Opcode = AArch64ISD::DUPLANE16;
13554     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13555       Opcode = AArch64ISD::DUPLANE32;
13556     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13557       Opcode = AArch64ISD::DUPLANE64;
13558     else
13559       llvm_unreachable("Invalid vector element type?");
13560 
13561     if (VT.getSizeInBits() == 64)
13562       OpLHS = WidenVector(OpLHS, DAG);
13563     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
13564     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
13565   }
13566   case OP_VEXT1:
13567   case OP_VEXT2:
13568   case OP_VEXT3: {
13569     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13570     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
13571                        DAG.getConstant(Imm, dl, MVT::i32));
13572   }
13573   case OP_VUZPL:
13574     return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
13575   case OP_VUZPR:
13576     return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
13577   case OP_VZIPL:
13578     return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
13579   case OP_VZIPR:
13580     return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
13581   case OP_VTRNL:
13582     return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
13583   case OP_VTRNR:
13584     return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
13585   }
13586 }
13587 
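// Lower the shuffle via NEON TBL. Illustratively, for v8i16 inputs each mask
// element expands into BytesPerElt == 2 consecutive byte indices of the
// byte-wise TBL mask; when the second source is undef or zero, out-of-range
// bytes are forced to 255 so that TBL1 fills those lanes with zero.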
13588 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
13589                            SelectionDAG &DAG) {
13590   // Check to see if we can use the TBL instruction.
13591   SDValue V1 = Op.getOperand(0);
13592   SDValue V2 = Op.getOperand(1);
13593   SDLoc DL(Op);
13594 
13595   EVT EltVT = Op.getValueType().getVectorElementType();
13596   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13597 
13598   bool Swap = false;
13599   if (V1.isUndef() || isZerosVector(V1.getNode())) {
13600     std::swap(V1, V2);
13601     Swap = true;
13602   }
13603 
13604   // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13605   // out of range values with 0s. We do need to make sure that any out-of-range
13606   // values are really out-of-range for a v16i8 vector.
13607   bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13608   MVT IndexVT = MVT::v8i8;
13609   unsigned IndexLen = 8;
13610   if (Op.getValueSizeInBits() == 128) {
13611     IndexVT = MVT::v16i8;
13612     IndexLen = 16;
13613   }
13614 
13615   SmallVector<SDValue, 8> TBLMask;
13616   for (int Val : ShuffleMask) {
13617     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13618       unsigned Offset = Byte + Val * BytesPerElt;
13619       if (Swap)
13620         Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13621       if (IsUndefOrZero && Offset >= IndexLen)
13622         Offset = 255;
13623       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13624     }
13625   }
13626 
13627   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13628   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13629 
13630   SDValue Shuffle;
13631   if (IsUndefOrZero) {
13632     if (IndexLen == 8)
13633       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13634     Shuffle = DAG.getNode(
13635         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13636         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13637         DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13638   } else {
13639     if (IndexLen == 8) {
13640       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13641       Shuffle = DAG.getNode(
13642           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13643           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13644           DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13645     } else {
13646       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13647       // cannot currently represent the register constraints on the input
13648       // table registers.
13649       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13650       //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13651       //                   IndexLen));
13652       Shuffle = DAG.getNode(
13653           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13654           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13655           V2Cst,
13656           DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13657     }
13658   }
13659   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13660 }
13661 
13662 static unsigned getDUPLANEOp(EVT EltType) {
13663   if (EltType == MVT::i8)
13664     return AArch64ISD::DUPLANE8;
13665   if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13666     return AArch64ISD::DUPLANE16;
13667   if (EltType == MVT::i32 || EltType == MVT::f32)
13668     return AArch64ISD::DUPLANE32;
13669   if (EltType == MVT::i64 || EltType == MVT::f64)
13670     return AArch64ISD::DUPLANE64;
13671 
13672   llvm_unreachable("Invalid vector element type?");
13673 }
13674 
13675 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
13676                             unsigned Opcode, SelectionDAG &DAG) {
13677   // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13678   auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13679     // Match: dup (bitcast (extract_subv X, C)), LaneC
13680     if (BitCast.getOpcode() != ISD::BITCAST ||
13681         BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13682       return false;
13683 
13684     // The extract index must align in the destination type. That may not
13685     // happen if the bitcast is from narrow to wide type.
13686     SDValue Extract = BitCast.getOperand(0);
13687     unsigned ExtIdx = Extract.getConstantOperandVal(1);
13688     unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13689     unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13690     unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13691     if (ExtIdxInBits % CastedEltBitWidth != 0)
13692       return false;
13693 
13694     // Can't handle cases where vector size is not 128-bit
13695     if (!Extract.getOperand(0).getValueType().is128BitVector())
13696       return false;
13697 
13698     // Update the lane value by offsetting with the scaled extract index.
13699     LaneC += ExtIdxInBits / CastedEltBitWidth;
13700 
13701     // Determine the casted vector type of the wide vector input.
13702     // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13703     // Examples:
13704     // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13705     // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13706     unsigned SrcVecNumElts =
13707         Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
13708     CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
13709                               SrcVecNumElts);
13710     return true;
13711   };
13712   MVT CastVT;
13713   if (getScaledOffsetDup(V, Lane, CastVT)) {
13714     V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
13715   } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13716              V.getOperand(0).getValueType().is128BitVector()) {
13717     // The lane is incremented by the index of the extract.
13718     // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13719     Lane += V.getConstantOperandVal(1);
13720     V = V.getOperand(0);
13721   } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13722     // The lane is decremented if we are splatting from the 2nd operand.
13723     // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13724     unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13725     Lane -= Idx * VT.getVectorNumElements() / 2;
13726     V = WidenVector(V.getOperand(Idx), DAG);
13727   } else if (VT.getSizeInBits() == 64) {
13728     // Widen the operand to 128-bit register with undef.
13729     V = WidenVector(V, DAG);
13730   }
13731   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
13732 }
13733 
13734 // Try to widen element type to get a new mask value for a better permutation
13735 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13736 // UZP1/2, TRN1/2, REV, INS, etc.
13737 // For example:
13738 //  shufflevector <4 x i32> %a, <4 x i32> %b,
13739 //                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13740 // is equivalent to:
13741 //  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13742 // Finally, we can get:
13743 //  mov     v0.d[0], v1.d[1]
13744 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13745   SDLoc DL(Op);
13746   EVT VT = Op.getValueType();
13747   EVT ScalarVT = VT.getVectorElementType();
13748   unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13749   SDValue V0 = Op.getOperand(0);
13750   SDValue V1 = Op.getOperand(1);
13751   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13752 
13753   // When combining adjacent elements (e.g. two i16's -> i32, two i32's -> i64),
13754   // we need to make sure the wider element type is legal. Thus, ElementSize
13755   // should be no larger than 32 bits, and the i1 type should be excluded.
13756   if (ElementSize > 32 || ElementSize == 1)
13757     return SDValue();
13758 
13759   SmallVector<int, 8> NewMask;
13760   if (widenShuffleMaskElts(Mask, NewMask)) {
13761     MVT NewEltVT = VT.isFloatingPoint()
13762                        ? MVT::getFloatingPointVT(ElementSize * 2)
13763                        : MVT::getIntegerVT(ElementSize * 2);
13764     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13765     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13766       V0 = DAG.getBitcast(NewVT, V0);
13767       V1 = DAG.getBitcast(NewVT, V1);
13768       return DAG.getBitcast(VT,
13769                             DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13770     }
13771   }
13772 
13773   return SDValue();
13774 }
13775 
13776 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
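      // Each tbl2 reads from two 16-byte table registers, so its mask indexes
      // bytes 0-31 of its own table. In the combined tbl4 the second tbl2's
      // registers become tables 3 and 4 (bytes 32-63), so mask entries taken
      // from the second operand are rebased by +32 below.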
13777 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13778                                                ArrayRef<int> ShuffleMask,
13779                                                SelectionDAG &DAG) {
13780   SDValue Tbl1 = Op->getOperand(0);
13781   SDValue Tbl2 = Op->getOperand(1);
13782   SDLoc dl(Op);
13783   SDValue Tbl2ID =
13784       DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13785 
13786   EVT VT = Op.getValueType();
13787   if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13788       Tbl1->getOperand(0) != Tbl2ID ||
13789       Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13790       Tbl2->getOperand(0) != Tbl2ID)
13791     return SDValue();
13792 
13793   if (Tbl1->getValueType(0) != MVT::v16i8 ||
13794       Tbl2->getValueType(0) != MVT::v16i8)
13795     return SDValue();
13796 
13797   SDValue Mask1 = Tbl1->getOperand(3);
13798   SDValue Mask2 = Tbl2->getOperand(3);
13799   SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13800   for (unsigned I = 0; I < 16; I++) {
13801     if (ShuffleMask[I] < 16)
13802       TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13803     else {
13804       auto *C =
13805           dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13806       if (!C)
13807         return SDValue();
13808       TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13809     }
13810   }
13811 
13812   SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13813   SDValue ID =
13814       DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13815 
13816   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13817                      {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13818                       Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13819 }
13820 
13821 // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13822 // but we don't have an appropriate instruction,
13823 // so custom-lower it as ZIP1-with-zeros.
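      // For example (assuming little-endian lane order), zero-extending the low
      // half of a v8i8 into a v4i16 becomes roughly:
      //   zip1 v0.8b, vsrc.8b, vzero.8b   // a0,0,a1,0,a2,0,a3,0
      // with the result then reinterpreted as v4i16.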
13824 SDValue
13825 AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13826                                                      SelectionDAG &DAG) const {
13827   SDLoc dl(Op);
13828   EVT VT = Op.getValueType();
13829   SDValue SrcOp = Op.getOperand(0);
13830   EVT SrcVT = SrcOp.getValueType();
13831   assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13832          "Unexpected extension factor.");
13833   unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13834   // FIXME: support multi-step zipping?
13835   if (Scale != 2)
13836     return SDValue();
13837   SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13838   return DAG.getBitcast(VT,
13839                         DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13840 }
13841 
13842 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13843                                                    SelectionDAG &DAG) const {
13844   SDLoc dl(Op);
13845   EVT VT = Op.getValueType();
13846 
13847   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13848 
13849   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13850     return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13851 
13852   // Convert shuffles that are directly supported on NEON to target-specific
13853   // DAG nodes, instead of keeping them as shuffles and matching them again
13854   // during code selection.  This is more efficient and avoids the possibility
13855   // of inconsistencies between legalization and selection.
13856   ArrayRef<int> ShuffleMask = SVN->getMask();
13857 
13858   SDValue V1 = Op.getOperand(0);
13859   SDValue V2 = Op.getOperand(1);
13860 
13861   assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13862   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13863          "Unexpected VECTOR_SHUFFLE mask size!");
13864 
13865   if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13866     return Res;
13867 
13868   if (SVN->isSplat()) {
13869     int Lane = SVN->getSplatIndex();
13870     // If this is undef splat, generate it via "just" vdup, if possible.
13871     // If this is an undef splat, generate it via "just" vdup, if possible.
13872       Lane = 0;
13873 
13874     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13875       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13876                          V1.getOperand(0));
13877     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13878     // constant. If so, we can just reference the lane's definition directly.
13879     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13880         !isa<ConstantSDNode>(V1.getOperand(Lane)))
13881       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13882 
13883     // Otherwise, duplicate from the lane of the input vector.
13884     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13885     return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13886   }
13887 
13888   // Check if the mask matches a DUP for a wider element
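        // For example, the v8i16 mask <0,1,0,1,0,1,0,1> splats a single 32-bit
        // lane, so it can be lowered as DUPLANE32 of lane 0 after bitcasting the
        // source to v4i32.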
13889   for (unsigned LaneSize : {64U, 32U, 16U}) {
13890     unsigned Lane = 0;
13891     if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13892       unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13893                                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
13894                                                         : AArch64ISD::DUPLANE16;
13895       // Cast V1 to an integer vector with required lane size
13896       MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13897       unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13898       MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13899       V1 = DAG.getBitcast(NewVecTy, V1);
13900       // Construct the DUP instruction
13901       V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13902       // Cast back to the original type
13903       return DAG.getBitcast(VT, V1);
13904     }
13905   }
13906 
13907   unsigned NumElts = VT.getVectorNumElements();
13908   unsigned EltSize = VT.getScalarSizeInBits();
13909   if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13910     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
13911   if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13912     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
13913   if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13914     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
13915 
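        // A full reverse of 8-bit or 16-bit lanes across a 128-bit vector can be
        // done as a REV64 (reverse within each 64-bit half) followed by an
        // EXT #8 that swaps the two halves.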
13916   if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13917       ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13918     SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13919     return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13920                        DAG.getConstant(8, dl, MVT::i32));
13921   }
13922 
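        // An EXT shuffle selects a contiguous window of lanes from the V1:V2
        // concatenation; the lane offset is scaled to a byte immediate via
        // getExtFactor below.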
13923   bool ReverseEXT = false;
13924   unsigned Imm;
13925   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13926     if (ReverseEXT)
13927       std::swap(V1, V2);
13928     Imm *= getExtFactor(V1);
13929     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13930                        DAG.getConstant(Imm, dl, MVT::i32));
13931   } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13932     Imm *= getExtFactor(V1);
13933     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13934                        DAG.getConstant(Imm, dl, MVT::i32));
13935   }
13936 
13937   unsigned WhichResult;
13938   if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13939     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13940     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13941   }
13942   if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13943     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13944     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13945   }
13946   if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13947     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13948     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13949   }
13950 
13951   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13952     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13953     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13954   }
13955   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13956     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13957     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13958   }
13959   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13960     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13961     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13962   }
13963 
13964   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13965     return Concat;
13966 
13967   bool DstIsLeft;
13968   int Anomaly;
13969   int NumInputElements = V1.getValueType().getVectorNumElements();
13970   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13971     SDValue DstVec = DstIsLeft ? V1 : V2;
13972     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13973 
13974     SDValue SrcVec = V1;
13975     int SrcLane = ShuffleMask[Anomaly];
13976     if (SrcLane >= NumInputElements) {
13977       SrcVec = V2;
13978       SrcLane -= NumElts;
13979     }
13980     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13981 
13982     EVT ScalarVT = VT.getVectorElementType();
13983 
13984     if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13985       ScalarVT = MVT::i32;
13986 
13987     return DAG.getNode(
13988         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13989         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13990         DstLaneV);
13991   }
13992 
13993   if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13994     return NewSD;
13995 
13996   // If the shuffle is not directly supported and it has 4 elements, use
13997   // the PerfectShuffle-generated table to synthesize it from other shuffles.
13998   if (NumElts == 4) {
13999     unsigned PFIndexes[4];
14000     for (unsigned i = 0; i != 4; ++i) {
14001       if (ShuffleMask[i] < 0)
14002         PFIndexes[i] = 8;
14003       else
14004         PFIndexes[i] = ShuffleMask[i];
14005     }
14006 
14007     // Compute the index in the perfect shuffle table.
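          // Each index is a base-9 digit: 0-7 select a source lane and 8 marks
          // an undef lane, so the four digits form the table index directly.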
14008     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14009                             PFIndexes[2] * 9 + PFIndexes[3];
14010     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14011     return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14012                                   dl);
14013   }
14014 
14015   // Check for a "select shuffle", generating a BSL to pick between lanes in
14016   // V1/V2.
14017   if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14018     assert(VT.getScalarSizeInBits() <= 32 &&
14019            "Expected larger vector element sizes to be handled already");
14020     SmallVector<SDValue> MaskElts;
14021     for (int M : ShuffleMask)
14022       MaskElts.push_back(DAG.getConstant(
14023           M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14024     EVT IVT = VT.changeVectorElementTypeToInteger();
14025     SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14026     return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14027                                           DAG.getBitcast(IVT, V1),
14028                                           DAG.getBitcast(IVT, V2)));
14029   }
14030 
14031   // Fall back to generating a TBL
14032   return GenerateTBL(Op, ShuffleMask, DAG);
14033 }
14034 
14035 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14036                                                  SelectionDAG &DAG) const {
14037   EVT VT = Op.getValueType();
14038 
14039   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14040     return LowerToScalableOp(Op, DAG);
14041 
14042   assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14043          "Unexpected vector type!");
14044 
14045   // We can handle the constant cases during isel.
14046   if (isa<ConstantSDNode>(Op.getOperand(0)))
14047     return Op;
14048 
14049   // There isn't a natural way to handle the general i1 case, so we use some
14050   // trickery with whilelo.
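        // The splat value is sign-extended from i1 to i64, giving either 0 or -1.
        // whilelo(0, 0) then produces an all-false predicate, while
        // whilelo(0, 0xffffffffffffffff) produces an all-true one, since the
        // unsigned comparison 0 + lane < 0xffffffffffffffff holds for every lane.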
14051   SDLoc DL(Op);
14052   SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14053   SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14054                          DAG.getValueType(MVT::i1));
14055   SDValue ID =
14056       DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14057   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14058   if (VT == MVT::nxv1i1)
14059     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14060                        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14061                                    Zero, SplatVal),
14062                        Zero);
14063   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14064 }
14065 
14066 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14067                                              SelectionDAG &DAG) const {
14068   SDLoc DL(Op);
14069 
14070   EVT VT = Op.getValueType();
14071   if (!isTypeLegal(VT) || !VT.isScalableVector())
14072     return SDValue();
14073 
14074   // Current lowering only supports the SVE-ACLE types.
14075   if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14076     return SDValue();
14077 
14078   // The DUPQ operation is independent of element type so normalise to i64s.
14079   SDValue Idx128 = Op.getOperand(2);
14080 
14081   // DUPQ can be used when idx is in range.
14082   auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14083   if (CIdx && (CIdx->getZExtValue() <= 3)) {
14084     SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14085     return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14086   }
14087 
14088   SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14089 
14090   // The ACLE says this must produce the same result as:
14091   //   svtbl(data, svadd_x(svptrue_b64(),
14092   //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14093   //                       index * 2))
14094   SDValue One = DAG.getConstant(1, DL, MVT::i64);
14095   SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14096 
14097   // create the vector 0,1,0,1,...
14098   SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14099   SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14100 
14101   // create the vector idx64,idx64+1,idx64,idx64+1,...
14102   SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14103   SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14104   SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14105 
14106   // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14107   SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14108   return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14109 }
14110 
14111 
14112 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14113                                APInt &UndefBits) {
14114   EVT VT = BVN->getValueType(0);
14115   APInt SplatBits, SplatUndef;
14116   unsigned SplatBitSize;
14117   bool HasAnyUndefs;
14118   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14119     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14120 
14121     for (unsigned i = 0; i < NumSplats; ++i) {
14122       CnstBits <<= SplatBitSize;
14123       UndefBits <<= SplatBitSize;
14124       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14125       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14126     }
14127 
14128     return true;
14129   }
14130 
14131   return false;
14132 }
14133 
14134 // Try 64-bit splatted SIMD immediate.
14135 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14136                                  const APInt &Bits) {
14137   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14138     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14139     EVT VT = Op.getValueType();
14140     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14141 
14142     if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14143       Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14144 
14145       SDLoc dl(Op);
14146       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14147                                 DAG.getConstant(Value, dl, MVT::i32));
14148       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14149     }
14150   }
14151 
14152   return SDValue();
14153 }
14154 
14155 // Try 32-bit splatted SIMD immediate.
14156 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14157                                   const APInt &Bits,
14158                                   const SDValue *LHS = nullptr) {
14159   EVT VT = Op.getValueType();
14160   if (VT.isFixedLengthVector() &&
14161       !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14162     return SDValue();
14163 
14164   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14165     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14166     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14167     bool isAdvSIMDModImm = false;
14168     uint64_t Shift;
14169 
14170     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14171       Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14172       Shift = 0;
14173     }
14174     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14175       Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14176       Shift = 8;
14177     }
14178     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14179       Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14180       Shift = 16;
14181     }
14182     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14183       Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14184       Shift = 24;
14185     }
14186 
14187     if (isAdvSIMDModImm) {
14188       SDLoc dl(Op);
14189       SDValue Mov;
14190 
14191       if (LHS)
14192         Mov = DAG.getNode(NewOp, dl, MovTy,
14193                           DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14194                           DAG.getConstant(Value, dl, MVT::i32),
14195                           DAG.getConstant(Shift, dl, MVT::i32));
14196       else
14197         Mov = DAG.getNode(NewOp, dl, MovTy,
14198                           DAG.getConstant(Value, dl, MVT::i32),
14199                           DAG.getConstant(Shift, dl, MVT::i32));
14200 
14201       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14202     }
14203   }
14204 
14205   return SDValue();
14206 }
14207 
14208 // Try 16-bit splatted SIMD immediate.
14209 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14210                                   const APInt &Bits,
14211                                   const SDValue *LHS = nullptr) {
14212   EVT VT = Op.getValueType();
14213   if (VT.isFixedLengthVector() &&
14214       !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14215     return SDValue();
14216 
14217   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14218     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14219     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14220     bool isAdvSIMDModImm = false;
14221     uint64_t Shift;
14222 
14223     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14224       Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14225       Shift = 0;
14226     }
14227     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14228       Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14229       Shift = 8;
14230     }
14231 
14232     if (isAdvSIMDModImm) {
14233       SDLoc dl(Op);
14234       SDValue Mov;
14235 
14236       if (LHS)
14237         Mov = DAG.getNode(NewOp, dl, MovTy,
14238                           DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14239                           DAG.getConstant(Value, dl, MVT::i32),
14240                           DAG.getConstant(Shift, dl, MVT::i32));
14241       else
14242         Mov = DAG.getNode(NewOp, dl, MovTy,
14243                           DAG.getConstant(Value, dl, MVT::i32),
14244                           DAG.getConstant(Shift, dl, MVT::i32));
14245 
14246       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14247     }
14248   }
14249 
14250   return SDValue();
14251 }
14252 
14253 // Try 32-bit splatted SIMD immediate with shifted ones.
14254 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14255                                     SelectionDAG &DAG, const APInt &Bits) {
14256   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14257     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14258     EVT VT = Op.getValueType();
14259     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14260     bool isAdvSIMDModImm = false;
14261     uint64_t Shift;
14262 
14263     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14264       Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14265       Shift = 264;
14266     }
14267     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14268       Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14269       Shift = 272;
14270     }
14271 
14272     if (isAdvSIMDModImm) {
14273       SDLoc dl(Op);
14274       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14275                                 DAG.getConstant(Value, dl, MVT::i32),
14276                                 DAG.getConstant(Shift, dl, MVT::i32));
14277       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14278     }
14279   }
14280 
14281   return SDValue();
14282 }
14283 
14284 // Try 8-bit splatted SIMD immediate.
14285 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14286                                  const APInt &Bits) {
14287   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14288     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14289     EVT VT = Op.getValueType();
14290     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14291 
14292     if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14293       Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14294 
14295       SDLoc dl(Op);
14296       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14297                                 DAG.getConstant(Value, dl, MVT::i32));
14298       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14299     }
14300   }
14301 
14302   return SDValue();
14303 }
14304 
14305 // Try FP splatted SIMD immediate.
14306 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14307                                   const APInt &Bits) {
14308   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14309     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14310     EVT VT = Op.getValueType();
14311     bool isWide = (VT.getSizeInBits() == 128);
14312     MVT MovTy;
14313     bool isAdvSIMDModImm = false;
14314 
14315     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14316       Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14317       MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14318     }
14319     else if (isWide &&
14320              (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14321       Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14322       MovTy = MVT::v2f64;
14323     }
14324 
14325     if (isAdvSIMDModImm) {
14326       SDLoc dl(Op);
14327       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14328                                 DAG.getConstant(Value, dl, MVT::i32));
14329       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14330     }
14331   }
14332 
14333   return SDValue();
14334 }
14335 
14336 // Specialized code to quickly find if PotentialBVec is a BuildVector that
14337 // consists of only the same constant int value, returned in reference arg
14338 // ConstVal.
14339 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14340                                      uint64_t &ConstVal) {
14341   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14342   if (!Bvec)
14343     return false;
14344   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14345   if (!FirstElt)
14346     return false;
14347   EVT VT = Bvec->getValueType(0);
14348   unsigned NumElts = VT.getVectorNumElements();
14349   for (unsigned i = 1; i < NumElts; ++i)
14350     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14351       return false;
14352   ConstVal = FirstElt->getZExtValue();
14353   return true;
14354 }
14355 
14356 static bool isAllInactivePredicate(SDValue N) {
14357   // Look through cast.
14358   while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14359     N = N.getOperand(0);
14360 
14361   return ISD::isConstantSplatVectorAllZeros(N.getNode());
14362 }
14363 
14364 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14365   unsigned NumElts = N.getValueType().getVectorMinNumElements();
14366 
14367   // Look through cast.
14368   while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14369     N = N.getOperand(0);
14370     // When reinterpreting from a type with fewer elements the "new" elements
14371     // are not active, so bail if they're likely to be used.
14372     if (N.getValueType().getVectorMinNumElements() < NumElts)
14373       return false;
14374   }
14375 
14376   if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14377     return true;
14378 
14379   // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14380   // or smaller than the implicit element type represented by N.
14381   // NOTE: A larger element count implies a smaller element type.
14382   if (N.getOpcode() == AArch64ISD::PTRUE &&
14383       N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14384     return N.getValueType().getVectorMinNumElements() >= NumElts;
14385 
14386   // If we're compiling for a specific vector-length, we can check if the
14387   // pattern's VL equals that of the scalable vector at runtime.
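        // For example, with a fixed 256-bit SVE register (VScale == 2) a
        // "ptrue p0.s, vl8" covers all 4 * 2 = 8 lanes of an nxv4i1 predicate and
        // is therefore all active.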
14388   if (N.getOpcode() == AArch64ISD::PTRUE) {
14389     const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14390     unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14391     unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14392     if (MaxSVESize && MinSVESize == MaxSVESize) {
14393       unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14394       unsigned PatNumElts =
14395           getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14396       return PatNumElts == (NumElts * VScale);
14397     }
14398   }
14399 
14400   return false;
14401 }
14402 
14403 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14404 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14405 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
14406 //   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14407 //   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14408 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
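      // For example, with v8i8 operands and C2 == 3 the SLI form requires
      // C1 == 0x07 (the low three bits), i.e. the AND keeps exactly the bits the
      // shifted value cannot overwrite:
      //   (or (and X, splat(0x07)), (lsl Y, 3))  -->  (SLI X, Y, 3)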
14409 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14410   EVT VT = N->getValueType(0);
14411 
14412   if (!VT.isVector())
14413     return SDValue();
14414 
14415   SDLoc DL(N);
14416 
14417   SDValue And;
14418   SDValue Shift;
14419 
14420   SDValue FirstOp = N->getOperand(0);
14421   unsigned FirstOpc = FirstOp.getOpcode();
14422   SDValue SecondOp = N->getOperand(1);
14423   unsigned SecondOpc = SecondOp.getOpcode();
14424 
14425   // Is one of the operands an AND or a BICi? The AND may have been optimised to
14426   // a BICi in order to use an immediate instead of a register.
14427   // Is the other operand a shl or lshr? This will have been turned into:
14428   // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14429   // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14430   if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14431       (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14432        SecondOpc == AArch64ISD::SHL_PRED ||
14433        SecondOpc == AArch64ISD::SRL_PRED)) {
14434     And = FirstOp;
14435     Shift = SecondOp;
14436 
14437   } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14438              (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14439               FirstOpc == AArch64ISD::SHL_PRED ||
14440               FirstOpc == AArch64ISD::SRL_PRED)) {
14441     And = SecondOp;
14442     Shift = FirstOp;
14443   } else
14444     return SDValue();
14445 
14446   bool IsAnd = And.getOpcode() == ISD::AND;
14447   bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14448                       Shift.getOpcode() == AArch64ISD::SRL_PRED;
14449   bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14450                         Shift.getOpcode() == AArch64ISD::SRL_PRED;
14451 
14452   // Is the shift amount constant and are all lanes active?
14453   uint64_t C2;
14454   if (ShiftHasPredOp) {
14455     if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14456       return SDValue();
14457     APInt C;
14458     if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14459       return SDValue();
14460     C2 = C.getZExtValue();
14461   } else if (ConstantSDNode *C2node =
14462                  dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14463     C2 = C2node->getZExtValue();
14464   else
14465     return SDValue();
14466 
14467   APInt C1AsAPInt;
14468   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14469   if (IsAnd) {
14470     // Is the and mask vector all constant?
14471     if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14472       return SDValue();
14473   } else {
14474     // Reconstruct the corresponding AND immediate from the two BICi immediates.
14475     ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14476     ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14477     assert(C1nodeImm && C1nodeShift);
14478     C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14479     C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14480   }
14481 
14482   // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14483   // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14484   // how much one can shift elements of a particular size?
14485   if (C2 > ElemSizeInBits)
14486     return SDValue();
14487 
14488   APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14489                                   : APInt::getLowBitsSet(ElemSizeInBits, C2);
14490   if (C1AsAPInt != RequiredC1)
14491     return SDValue();
14492 
14493   SDValue X = And.getOperand(0);
14494   SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14495   SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14496                                : Shift.getOperand(1);
14497 
14498   unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14499   SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14500 
14501   LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
14502   LLVM_DEBUG(N->dump(&DAG));
14503   LLVM_DEBUG(dbgs() << "into: \n");
14504   LLVM_DEBUG(ResultSLI->dump(&DAG));
14505 
14506   ++NumShiftInserts;
14507   return ResultSLI;
14508 }
14509 
14510 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14511                                              SelectionDAG &DAG) const {
14512   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14513                                    !Subtarget->isNeonAvailable()))
14514     return LowerToScalableOp(Op, DAG);
14515 
14516   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14517   if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14518     return Res;
14519 
14520   EVT VT = Op.getValueType();
14521   if (VT.isScalableVector())
14522     return Op;
14523 
14524   SDValue LHS = Op.getOperand(0);
14525   BuildVectorSDNode *BVN =
14526       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14527   if (!BVN) {
14528     // OR commutes, so try swapping the operands.
14529     LHS = Op.getOperand(1);
14530     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14531   }
14532   if (!BVN)
14533     return Op;
14534 
14535   APInt DefBits(VT.getSizeInBits(), 0);
14536   APInt UndefBits(VT.getSizeInBits(), 0);
14537   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14538     SDValue NewOp;
14539 
14540     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14541                                     DefBits, &LHS)) ||
14542         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14543                                     DefBits, &LHS)))
14544       return NewOp;
14545 
14546     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14547                                     UndefBits, &LHS)) ||
14548         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14549                                     UndefBits, &LHS)))
14550       return NewOp;
14551   }
14552 
14553   // We can always fall back to a non-immediate OR.
14554   return Op;
14555 }
14556 
14557 // Normalize the operands of BUILD_VECTOR. The values of constant operands will
14558 // be truncated to fit the element width.
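      // For example, an i32 constant operand 0x1234 in a v8i8 BUILD_VECTOR is
      // rewritten as the i32 constant 0x34, so later constant handling only sees
      // values that fit the element width.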
14559 static SDValue NormalizeBuildVector(SDValue Op,
14560                                     SelectionDAG &DAG) {
14561   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14562   SDLoc dl(Op);
14563   EVT VT = Op.getValueType();
14564   EVT EltTy = VT.getVectorElementType();
14565 
14566   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14567     return Op;
14568 
14569   SmallVector<SDValue, 16> Ops;
14570   for (SDValue Lane : Op->ops()) {
14571     // For integer vectors, type legalization would have promoted the
14572     // operands already. Otherwise, if Op is a floating-point splat
14573     // (with operands cast to integers), then the only possibilities
14574     // are constants and UNDEFs.
14575     if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14576       Lane = DAG.getConstant(
14577           CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14578           dl, MVT::i32);
14579     } else if (Lane.getNode()->isUndef()) {
14580       Lane = DAG.getUNDEF(MVT::i32);
14581     } else {
14582       assert(Lane.getValueType() == MVT::i32 &&
14583              "Unexpected BUILD_VECTOR operand type");
14584     }
14585     Ops.push_back(Lane);
14586   }
14587   return DAG.getBuildVector(VT, dl, Ops);
14588 }
14589 
14590 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14591                                    const AArch64Subtarget *ST) {
14592   EVT VT = Op.getValueType();
14593   assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14594          "Expected a legal NEON vector");
14595 
14596   APInt DefBits(VT.getSizeInBits(), 0);
14597   APInt UndefBits(VT.getSizeInBits(), 0);
14598   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14599   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14600     auto TryMOVIWithBits = [&](APInt DefBits) {
14601       SDValue NewOp;
14602       if ((NewOp =
14603                tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
14604           (NewOp =
14605                tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14606           (NewOp =
14607                tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
14608           (NewOp =
14609                tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14610           (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
14611           (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
14612         return NewOp;
14613 
14614       APInt NotDefBits = ~DefBits;
14615       if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
14616                                       NotDefBits)) ||
14617           (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
14618                                         NotDefBits)) ||
14619           (NewOp =
14620                tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
14621         return NewOp;
14622       return SDValue();
14623     };
14624     if (SDValue R = TryMOVIWithBits(DefBits))
14625       return R;
14626     if (SDValue R = TryMOVIWithBits(UndefBits))
14627       return R;
14628 
14629     // See if a fneg of the constant can be materialized with a MOVI, etc
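          // Only the per-element sign bit differs between a constant and its
          // negation, so a sign-flipped bit pattern that happens to be encodable
          // lets the original be built as an FNEG of the cheaper constant.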
14630     auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14631       // FNegate each sub-element of the constant
14632       assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14633       APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
14634                       .zext(VT.getSizeInBits());
14635       APInt NegBits(VT.getSizeInBits(), 0);
14636       unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14637       for (unsigned i = 0; i < NumElts; i++)
14638         NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14639       NegBits = DefBits ^ NegBits;
14640 
14641       // Try to create the new constants with MOVI, and if so generate a fneg
14642       // for it.
14643       if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14644         SDLoc DL(Op);
14645         MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
14646         return DAG.getNode(
14647             AArch64ISD::NVCAST, DL, VT,
14648             DAG.getNode(ISD::FNEG, DL, VFVT,
14649                         DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
14650       }
14651       return SDValue();
14652     };
14653     SDValue R;
14654     if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14655         (R = TryWithFNeg(DefBits, MVT::f64)) ||
14656         (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14657       return R;
14658   }
14659 
14660   return SDValue();
14661 }
14662 
14663 SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14664     SDValue Op, SelectionDAG &DAG) const {
14665   EVT VT = Op.getValueType();
14666   SDLoc DL(Op);
14667   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14668   auto *BVN = cast<BuildVectorSDNode>(Op);
14669 
14670   if (auto SeqInfo = BVN->isConstantSequence()) {
14671     SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
14672     SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
14673     SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
14674     return convertFromScalableVector(DAG, VT, Seq);
14675   }
14676 
14677   unsigned NumElems = VT.getVectorNumElements();
14678   if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14679       NumElems <= 1 || BVN->isConstant())
14680     return SDValue();
14681 
14682   auto IsExtractElt = [](SDValue Op) {
14683     return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14684   };
14685 
14686   // For integer types that are not already in vectors, limit to at most four
14687   // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14688   if (VT.getScalarType().isInteger() &&
14689       NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
14690     return SDValue();
14691 
14692   // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
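        // For example, a v4i32 (a, b, c, d) is built as
        // zip1(zip1(a, b), zip1(c, d)), with each zip level interleaving elements
        // twice as wide as the previous one.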
14693   SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
14694   SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14695       Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
14696         return Op.isUndef() ? Undef
14697                             : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
14698                                           ContainerVT, Undef, Op, ZeroI64);
14699       });
14700 
14701   ElementCount ZipEC = ContainerVT.getVectorElementCount();
14702   while (Intermediates.size() > 1) {
14703     EVT ZipVT = getPackedSVEVectorVT(ZipEC);
14704 
14705     for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14706       SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
14707       SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
14708       Intermediates[I / 2] =
14709           Op1.isUndef() ? Op0
14710                         : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
14711     }
14712 
14713     Intermediates.resize(Intermediates.size() / 2);
14714     ZipEC = ZipEC.divideCoefficientBy(2);
14715   }
14716 
14717   assert(Intermediates.size() == 1);
14718   SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
14719   return convertFromScalableVector(DAG, VT, Vec);
14720 }
14721 
14722 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14723                                                  SelectionDAG &DAG) const {
14724   EVT VT = Op.getValueType();
14725 
14726   bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14727                       cast<BuildVectorSDNode>(Op)->isConstantSequence();
14728   if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14729     return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14730 
14731   // Try to build a simple constant vector.
14732   Op = NormalizeBuildVector(Op, DAG);
14733   // Note that this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS);
14734   // if so, abort.
14735   if (Op.getOpcode() != ISD::BUILD_VECTOR)
14736     return SDValue();
14737 
14738   // Certain vector constants, used to express things like logical NOT and
14739   // arithmetic NEG, are passed through unmodified.  This allows special
14740   // patterns for these operations to match, which will lower these constants
14741   // to whatever is proven necessary.
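        // For example, the all-ones splat used by (xor X, -1) and the all-zeros
        // splat used by (sub 0, X) stay as BUILD_VECTORs so the NOT/NEG patterns
        // can still match them.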
14742   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14743   if (BVN->isConstant()) {
14744     if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14745       unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14746       APInt Val(BitSize,
14747                 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
14748       if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14749         return Op;
14750     }
14751     if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14752       if (Const->isZero() && !Const->isNegative())
14753         return Op;
14754   }
14755 
14756   if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
14757     return V;
14758 
14759   // Scan through the operands to find some interesting properties we can
14760   // exploit:
14761   //   1) If only one value is used, we can use a DUP, or
14762   //   2) if only the low element is not undef, we can just insert that, or
14763   //   3) if only one constant value is used (w/ some non-constant lanes),
14764   //      we can splat the constant value into the whole vector then fill
14765   //      in the non-constant lanes.
14766   //   4) FIXME: If different constant values are used, but we can intelligently
14767   //             select the values we'll be overwriting for the non-constant
14768   //             lanes such that we can directly materialize the vector
14769   //             some other way (MOVI, e.g.), we can be sneaky.
14770   //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14771   SDLoc dl(Op);
14772   unsigned NumElts = VT.getVectorNumElements();
14773   bool isOnlyLowElement = true;
14774   bool usesOnlyOneValue = true;
14775   bool usesOnlyOneConstantValue = true;
14776   bool isConstant = true;
14777   bool AllLanesExtractElt = true;
14778   unsigned NumConstantLanes = 0;
14779   unsigned NumDifferentLanes = 0;
14780   unsigned NumUndefLanes = 0;
14781   SDValue Value;
14782   SDValue ConstantValue;
14783   SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14784   unsigned ConsecutiveValCount = 0;
14785   SDValue PrevVal;
14786   for (unsigned i = 0; i < NumElts; ++i) {
14787     SDValue V = Op.getOperand(i);
14788     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14789       AllLanesExtractElt = false;
14790     if (V.isUndef()) {
14791       ++NumUndefLanes;
14792       continue;
14793     }
14794     if (i > 0)
14795       isOnlyLowElement = false;
14796     if (!isIntOrFPConstant(V))
14797       isConstant = false;
14798 
14799     if (isIntOrFPConstant(V)) {
14800       ++NumConstantLanes;
14801       if (!ConstantValue.getNode())
14802         ConstantValue = V;
14803       else if (ConstantValue != V)
14804         usesOnlyOneConstantValue = false;
14805     }
14806 
14807     if (!Value.getNode())
14808       Value = V;
14809     else if (V != Value) {
14810       usesOnlyOneValue = false;
14811       ++NumDifferentLanes;
14812     }
14813 
14814     if (PrevVal != V) {
14815       ConsecutiveValCount = 0;
14816       PrevVal = V;
14817     }
14818 
14819     // Keep each different value and its last consecutive count. For example,
14820     //
14821     //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14822     //                            t24, t24, t24, t24, t24, t24, t24, t24
14823     //  t23 = consecutive count 8
14824     //  t24 = consecutive count 8
14825     // ------------------------------------------------------------------
14826     //  t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14827     //                            t24, t24, t24, t24, t24, t24, t24, t24
14828     //  t23 = consecutive count 5
14829     //  t24 = consecutive count 9
14830     DifferentValueMap[V] = ++ConsecutiveValCount;
14831   }
14832 
14833   if (!Value.getNode()) {
14834     LLVM_DEBUG(
14835         dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14836     return DAG.getUNDEF(VT);
14837   }
14838 
14839   // Convert BUILD_VECTOR where all elements but the lowest are undef into
14840   // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14841   // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14842   if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14843     LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14844                          "SCALAR_TO_VECTOR node\n");
14845     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14846   }
14847 
14848   if (AllLanesExtractElt) {
14849     SDNode *Vector = nullptr;
14850     bool Even = false;
14851     bool Odd = false;
14852     // Check whether the extract elements match the Even pattern <0,2,4,...> or
14853     // the Odd pattern <1,3,5,...>.
14854     for (unsigned i = 0; i < NumElts; ++i) {
14855       SDValue V = Op.getOperand(i);
14856       const SDNode *N = V.getNode();
14857       if (!isa<ConstantSDNode>(N->getOperand(1))) {
14858         Even = false;
14859         Odd = false;
14860         break;
14861       }
14862       SDValue N0 = N->getOperand(0);
14863 
14864       // All elements are extracted from the same vector.
14865       if (!Vector) {
14866         Vector = N0.getNode();
14867         // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14868         // BUILD_VECTOR.
14869         if (VT.getVectorElementType() !=
14870             N0.getValueType().getVectorElementType())
14871           break;
14872       } else if (Vector != N0.getNode()) {
14873         Odd = false;
14874         Even = false;
14875         break;
14876       }
14877 
14878       // Extracted values are either at Even indices <0,2,4,...> or at Odd
14879       // indices <1,3,5,...>.
14880       uint64_t Val = N->getConstantOperandVal(1);
14881       if (Val == 2 * i) {
14882         Even = true;
14883         continue;
14884       }
14885       if (Val - 1 == 2 * i) {
14886         Odd = true;
14887         continue;
14888       }
14889 
14890       // Something does not match: abort.
14891       Odd = false;
14892       Even = false;
14893       break;
14894     }
14895     if (Even || Odd) {
14896       SDValue LHS =
14897           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14898                       DAG.getConstant(0, dl, MVT::i64));
14899       SDValue RHS =
14900           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14901                       DAG.getConstant(NumElts, dl, MVT::i64));
14902 
14903       if (Even && !Odd)
14904         return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14905       if (Odd && !Even)
14906         return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14907     }
14908   }
14909 
14910   // Use DUP for non-constant splats. For f32 constant splats, reduce to
14911   // i32 and try again.
14912   if (usesOnlyOneValue) {
14913     if (!isConstant) {
14914       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14915           Value.getValueType() != VT) {
14916         LLVM_DEBUG(
14917             dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14918         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14919       }
14920 
14921       // This is actually a DUPLANExx operation, which keeps everything vectory.
14922 
14923       SDValue Lane = Value.getOperand(1);
14924       Value = Value.getOperand(0);
14925       if (Value.getValueSizeInBits() == 64) {
14926         LLVM_DEBUG(
14927             dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14928                       "widening it\n");
14929         Value = WidenVector(Value, DAG);
14930       }
14931 
14932       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14933       return DAG.getNode(Opcode, dl, VT, Value, Lane);
14934     }
14935 
14936     if (VT.getVectorElementType().isFloatingPoint()) {
14937       SmallVector<SDValue, 8> Ops;
14938       EVT EltTy = VT.getVectorElementType();
14939       assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14940                EltTy == MVT::f64) && "Unsupported floating-point vector type");
14941       LLVM_DEBUG(
14942           dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14943                     "BITCASTS, and try again\n");
14944       MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14945       for (unsigned i = 0; i < NumElts; ++i)
14946         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14947       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14948       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14949       LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14950                  Val.dump(););
14951       Val = LowerBUILD_VECTOR(Val, DAG);
14952       if (Val.getNode())
14953         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14954     }
14955   }
14956 
14957   // If we need to insert a small number of different non-constant elements and
14958   // the vector width is sufficiently large, prefer using DUP with the common
14959   // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14960   // skip the constant lane handling below.
14961   bool PreferDUPAndInsert =
14962       !isConstant && NumDifferentLanes >= 1 &&
14963       NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14964       NumDifferentLanes >= NumConstantLanes;
14965 
14966   // If there was only one constant value used and for more than one lane,
14967   // start by splatting that value, then replace the non-constant lanes. This
14968   // is better than the default, which will perform a separate initialization
14969   // for each lane.
14970   if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14971     // Firstly, try to materialize the splat constant.
14972     SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
14973     unsigned BitSize = VT.getScalarSizeInBits();
14974     APInt ConstantValueAPInt(1, 0);
14975     if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
14976       ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
14977     if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
14978         !ConstantValueAPInt.isAllOnes()) {
14979       Val = ConstantBuildVector(Val, DAG, Subtarget);
14980       if (!Val)
14981         // Otherwise, materialize the constant and splat it.
14982         Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
14983     }
14984 
14985     // Now insert the non-constant lanes.
14986     for (unsigned i = 0; i < NumElts; ++i) {
14987       SDValue V = Op.getOperand(i);
14988       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14989       if (!isIntOrFPConstant(V))
14990         // Note that type legalization likely mucked about with the VT of the
14991         // source operand, so we may have to convert it here before inserting.
14992         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
14993     }
14994     return Val;
14995   }
14996 
14997   // This will generate a load from the constant pool.
14998   if (isConstant) {
14999     LLVM_DEBUG(
15000         dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15001                   "expansion\n");
15002     return SDValue();
15003   }
15004 
15005   // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15006   // v4i32s. This is really a truncate, which we can construct out of (legal)
15007   // concats and truncate nodes.
15008   if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15009     return M;
15010 
15011   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15012   if (NumElts >= 4) {
15013     if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15014       return Shuffle;
15015 
15016     if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15017       return Shuffle;
15018   }
15019 
15020   if (PreferDUPAndInsert) {
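    // For example, <a, a, b, a, a, a, a, a> becomes a DUP of a followed by a
    // single INSERT_VECTOR_ELT for lane 2.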
15021     // First, build a vector with the common element splatted into every lane.
15022     SmallVector<SDValue, 8> Ops(NumElts, Value);
15023     SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
15024     // Next, insert the elements that do not match the common value.
15025     for (unsigned I = 0; I < NumElts; ++I)
15026       if (Op.getOperand(I) != Value)
15027         NewVector =
15028             DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
15029                         Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
15030 
15031     return NewVector;
15032   }
15033 
15034   // If the vector consists of two different values, try to generate two DUPs
15035   // combined with either CONCAT_VECTORS or VECTOR_SHUFFLE.
15036   if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15037     SmallVector<SDValue, 2> Vals;
15038     // Check that the consecutive count of each value is half the number of
15039     // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15040     //
15041     // canUseVECTOR_CONCAT = true;
15042     //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15043     //                            t24, t24, t24, t24, t24, t24, t24, t24
15044     //
15045     // canUseVECTOR_CONCAT = false;
15046     //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15047     //                            t24, t24, t24, t24, t24, t24, t24, t24
15048     bool canUseVECTOR_CONCAT = true;
15049     for (auto Pair : DifferentValueMap) {
15050       // Check that each different value has the same count, which is NumElts / 2.
15051       if (Pair.second != NumElts / 2)
15052         canUseVECTOR_CONCAT = false;
15053       Vals.push_back(Pair.first);
15054     }
15055 
15056     // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15057     // CONCAT_VECTORs. For example,
15058     //
15059     //  t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15060     //                            t24, t24, t24, t24, t24, t24, t24, t24
15061     // ==>
15062     //    t26: v8i8 = AArch64ISD::DUP t23
15063     //    t28: v8i8 = AArch64ISD::DUP t24
15064     //  t29: v16i8 = concat_vectors t26, t28
15065     if (canUseVECTOR_CONCAT) {
15066       EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15067       if (isTypeLegal(SubVT) && SubVT.isVector() &&
15068           SubVT.getVectorNumElements() >= 2) {
15069         SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15070         SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15071         SDValue DUP1 =
15072             LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
15073         SDValue DUP2 =
15074             LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
15075         SDValue CONCAT_VECTORS =
15076             DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
15077         return CONCAT_VECTORS;
15078       }
15079     }
15080 
15081     // Let's try to generate VECTOR_SHUFFLE. For example,
15082     //
15083     //  t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15084     //  ==>
15085     //    t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15086     //    t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15087     //  t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15088     if (NumElts >= 8) {
15089       SmallVector<int, 16> MaskVec;
15090       // Build the mask for VECTOR_SHUFFLE.
15091       SDValue FirstLaneVal = Op.getOperand(0);
15092       for (unsigned i = 0; i < NumElts; ++i) {
15093         SDValue Val = Op.getOperand(i);
15094         if (FirstLaneVal == Val)
15095           MaskVec.push_back(i);
15096         else
15097           MaskVec.push_back(i + NumElts);
15098       }
15099 
15100       SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15101       SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15102       SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
15103       SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
15104       SDValue VECTOR_SHUFFLE =
15105           DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
15106       return VECTOR_SHUFFLE;
15107     }
15108   }
15109 
15110   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15111   // know the default expansion would otherwise fall back on something even
15112   // worse. For a vector with one or two non-undef values, that default is a
15113   // scalar_to_vector for the elements followed by a shuffle (provided the
15114   // shuffle is valid for the target); for everything else, it is element-by-
15115   // element materialization on the stack followed by a load.
15116   if (!isConstant && !usesOnlyOneValue) {
15117     LLVM_DEBUG(
15118         dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15119                   "of INSERT_VECTOR_ELT\n");
15120 
15121     SDValue Vec = DAG.getUNDEF(VT);
15122     SDValue Op0 = Op.getOperand(0);
15123     unsigned i = 0;
15124 
15125     // Use SCALAR_TO_VECTOR for lane zero to
15126     // a) Avoid a RMW dependency on the full vector register, and
15127     // b) Allow the register coalescer to fold away the copy if the
15128     //    value is already in an S or D register, and we're forced to emit an
15129     //    INSERT_SUBREG that we can't fold anywhere.
15130     //
15131     // We also allow types like i8 and i16 which are illegal scalar but legal
15132     // vector element types. After type-legalization the inserted value is
15133     // extended (to i32) and it is safe to cast it to the vector type by ignoring
15134     // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15135     if (!Op0.isUndef()) {
15136       LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15137       Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
15138       ++i;
15139     }
15140     LLVM_DEBUG({
15141       if (i < NumElts)
15142         dbgs() << "Creating nodes for the other vector elements:\n";
15143     });
15144     for (; i < NumElts; ++i) {
15145       SDValue V = Op.getOperand(i);
15146       if (V.isUndef())
15147         continue;
15148       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15149       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
15150     }
15151     return Vec;
15152   }
15153 
15154   LLVM_DEBUG(
15155       dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15156                 "better alternative\n");
15157   return SDValue();
15158 }
15159 
15160 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15161                                                    SelectionDAG &DAG) const {
15162   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15163                                    !Subtarget->isNeonAvailable()))
15164     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15165 
15166   assert(Op.getValueType().isScalableVector() &&
15167          isTypeLegal(Op.getValueType()) &&
15168          "Expected legal scalable vector type!");
15169 
15170   if (isTypeLegal(Op.getOperand(0).getValueType())) {
15171     unsigned NumOperands = Op->getNumOperands();
15172     assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15173            "Unexpected number of operands in CONCAT_VECTORS");
15174 
15175     if (NumOperands == 2)
15176       return Op;
15177 
15178     // Concat each pair of subvectors and pack into the lower half of the array.
15179     SmallVector<SDValue> ConcatOps(Op->ops());
15180     while (ConcatOps.size() > 1) {
15181       for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15182         SDValue V1 = ConcatOps[I];
15183         SDValue V2 = ConcatOps[I + 1];
15184         EVT SubVT = V1.getValueType();
15185         EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15186         ConcatOps[I / 2] =
15187             DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15188       }
15189       ConcatOps.resize(ConcatOps.size() / 2);
15190     }
15191     return ConcatOps[0];
15192   }
15193 
15194   return SDValue();
15195 }
15196 
15197 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15198                                                       SelectionDAG &DAG) const {
15199   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15200 
15201   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15202                                    !Subtarget->isNeonAvailable()))
15203     return LowerFixedLengthInsertVectorElt(Op, DAG);
15204 
15205   EVT VT = Op.getOperand(0).getValueType();
15206 
15207   if (VT.getScalarType() == MVT::i1) {
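    // i1 vectors can't be inserted into directly; promote to an integer vector,
    // perform the insert there, and truncate the result back.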
15208     EVT VectorVT = getPromotedVTForPredicate(VT);
15209     SDLoc DL(Op);
15210     SDValue ExtendedVector =
15211         DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15212     SDValue ExtendedValue =
15213         DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15214                              VectorVT.getScalarType().getSizeInBits() < 32
15215                                  ? MVT::i32
15216                                  : VectorVT.getScalarType());
15217     ExtendedVector =
15218         DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15219                     ExtendedValue, Op.getOperand(2));
15220     return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15221   }
15222 
15223   // Check for non-constant or out of range lane.
15224   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15225   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15226     return SDValue();
15227 
15228   return Op;
15229 }
15230 
15231 SDValue
15232 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15233                                                SelectionDAG &DAG) const {
15234   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15235   EVT VT = Op.getOperand(0).getValueType();
15236 
15237   if (VT.getScalarType() == MVT::i1) {
15238     // We can't directly extract from an SVE predicate; extend it first.
15239     // (This isn't the only possible lowering, but it's straightforward.)
15240     EVT VectorVT = getPromotedVTForPredicate(VT);
15241     SDLoc DL(Op);
15242     SDValue Extend =
15243         DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15244     MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15245     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15246                                   Extend, Op.getOperand(1));
15247     return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15248   }
15249 
15250   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15251     return LowerFixedLengthExtractVectorElt(Op, DAG);
15252 
15253   // Check for non-constant or out of range lane.
15254   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15255   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15256     return SDValue();
15257 
15258   // Insertion/extraction are legal for V128 types.
15259   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15260       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15261       VT == MVT::v8f16 || VT == MVT::v8bf16)
15262     return Op;
15263 
15264   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15265       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15266       VT != MVT::v4bf16)
15267     return SDValue();
15268 
15269   // For V64 types, we perform extraction by expanding the value
15270   // to a V128 type and perform the extraction on that.
15271   SDLoc DL(Op);
15272   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15273   EVT WideTy = WideVec.getValueType();
15274 
15275   EVT ExtrTy = WideTy.getVectorElementType();
15276   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15277     ExtrTy = MVT::i32;
15278 
15279   // For extractions, we just return the result directly.
15280   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15281                      Op.getOperand(1));
15282 }
15283 
15284 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15285                                                       SelectionDAG &DAG) const {
15286   EVT VT = Op.getValueType();
15287   assert(VT.isFixedLengthVector() &&
15288          "Only cases that extract a fixed length vector are supported!");
15289   EVT InVT = Op.getOperand(0).getValueType();
15290 
15291   // If we don't have legal types yet, do nothing
15292   if (!isTypeLegal(InVT))
15293     return SDValue();
15294 
15295   if (InVT.is128BitVector()) {
15296     assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15297     unsigned Idx = Op.getConstantOperandVal(1);
15298 
15299     // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15300     if (Idx == 0)
15301       return Op;
15302 
15303     // If this is extracting the upper 64-bits of a 128-bit vector, we match
15304     // that directly.
15305     if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15306       return Op;
15307   }
15308 
15309   if (InVT.isScalableVector() ||
15310       useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15311     SDLoc DL(Op);
15312     SDValue Vec = Op.getOperand(0);
15313     SDValue Idx = Op.getOperand(1);
15314 
15315     EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15316     if (PackedVT != InVT) {
15317       // Pack input into the bottom part of an SVE register and try again.
15318       SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15319                                       DAG.getUNDEF(PackedVT), Vec,
15320                                       DAG.getVectorIdxConstant(0, DL));
15321       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15322     }
15323 
15324     // This will get matched by custom code during ISelDAGToDAG.
15325     if (isNullConstant(Idx))
15326       return Op;
15327 
15328     assert(InVT.isScalableVector() && "Unexpected vector type!");
15329     // Move requested subvector to the start of the vector and try again.
15330     SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15331     return convertFromScalableVector(DAG, VT, Splice);
15332   }
15333 
15334   return SDValue();
15335 }
15336 
15337 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15338                                                      SelectionDAG &DAG) const {
15339   assert(Op.getValueType().isScalableVector() &&
15340          "Only expect to lower inserts into scalable vectors!");
15341 
15342   EVT InVT = Op.getOperand(1).getValueType();
15343   unsigned Idx = Op.getConstantOperandVal(2);
15344 
15345   SDValue Vec0 = Op.getOperand(0);
15346   SDValue Vec1 = Op.getOperand(1);
15347   SDLoc DL(Op);
15348   EVT VT = Op.getValueType();
15349 
15350   if (InVT.isScalableVector()) {
15351     if (!isTypeLegal(VT))
15352       return SDValue();
15353 
15354     // Break down insert_subvector into simpler parts.
15355     if (VT.getVectorElementType() == MVT::i1) {
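      // Split the predicate into two halves, insert the subvector into whichever
      // half contains Idx, and concatenate the halves back together.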
15356       unsigned NumElts = VT.getVectorMinNumElements();
15357       EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15358 
15359       SDValue Lo, Hi;
15360       Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15361                        DAG.getVectorIdxConstant(0, DL));
15362       Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15363                        DAG.getVectorIdxConstant(NumElts / 2, DL));
15364       if (Idx < (NumElts / 2))
15365         Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15366                          DAG.getVectorIdxConstant(Idx, DL));
15367       else
15368         Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15369                          DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15370 
15371       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15372     }
15373 
15374     // We can select these directly.
15375     if (isTypeLegal(InVT) && Vec0.isUndef())
15376       return Op;
15377 
15378     // Ensure the subvector is half the size of the main vector.
15379     if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15380       return SDValue();
15381 
15382     // Here "narrow" and "wide" refer to the vector element types. After "casting",
15383     // both vectors must have the same bit length, so because the subvector has
15384     // fewer elements, those elements need to be bigger.
15385     EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15386     EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15387 
15388     // NOP cast operands to the largest legal vector of the same element count.
15389     if (VT.isFloatingPoint()) {
15390       Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15391       Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15392     } else {
15393       // Legal integer vectors are already their largest so Vec0 is fine as is.
15394       Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15395       Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15396     }
15397 
15398     // To replace the top/bottom half of vector V with vector SubV we widen the
15399     // preserved half of V, concatenate this to SubV (the order depending on the
15400     // half being replaced) and then narrow the result.
15401     SDValue Narrow;
15402     if (Idx == 0) {
15403       SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15404       HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15405       Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15406     } else {
15407       assert(Idx == InVT.getVectorMinNumElements() &&
15408              "Invalid subvector index!");
15409       SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15410       LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15411       Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15412     }
15413 
15414     return getSVESafeBitCast(VT, Narrow, DAG);
15415   }
15416 
15417   if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15418     // This will be matched by custom code during ISelDAGToDAG.
15419     if (Vec0.isUndef())
15420       return Op;
15421 
15422     std::optional<unsigned> PredPattern =
15423         getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15424     auto PredTy = VT.changeVectorElementType(MVT::i1);
15425     SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15426     SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15427     return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15428   }
15429 
15430   return SDValue();
15431 }
15432 
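/// Returns true if \p Op is a splat of a constant whose absolute value is a
/// power of two. On success, \p SplatVal is set to that absolute value and
/// \p Negated records whether the original constant was negative.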
15433 static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15434   if (Op.getOpcode() != AArch64ISD::DUP &&
15435       Op.getOpcode() != ISD::SPLAT_VECTOR &&
15436       Op.getOpcode() != ISD::BUILD_VECTOR)
15437     return false;
15438 
15439   if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15440       !isAllConstantBuildVector(Op, SplatVal))
15441     return false;
15442 
15443   if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15444       !isa<ConstantSDNode>(Op->getOperand(0)))
15445     return false;
15446 
15447   SplatVal = Op->getConstantOperandVal(0);
15448   if (Op.getValueType().getVectorElementType() != MVT::i64)
15449     SplatVal = (int32_t)SplatVal;
15450 
15451   Negated = false;
15452   if (isPowerOf2_64(SplatVal))
15453     return true;
15454 
15455   Negated = true;
15456   if (isPowerOf2_64(-SplatVal)) {
15457     SplatVal = -SplatVal;
15458     return true;
15459   }
15460 
15461   return false;
15462 }
15463 
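// Lower scalable vector [SU]DIV. A signed divide by a (possibly negated)
// power-of-two splat becomes a predicated arithmetic shift; other divides are
// lowered to predicated operations, widening i8/i16 elements to i32 first
// since SVE only provides 32-bit and 64-bit integer divides.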
15464 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15465   EVT VT = Op.getValueType();
15466   SDLoc dl(Op);
15467 
15468   if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15469     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15470 
15471   assert(VT.isScalableVector() && "Expected a scalable vector.");
15472 
15473   bool Signed = Op.getOpcode() == ISD::SDIV;
15474   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15475 
15476   bool Negated;
15477   uint64_t SplatVal;
15478   if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
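    // sdiv(X, 2^K) is lowered to a predicated arithmetic-shift-right-for-divide
    // by K; if the divisor was -2^K, the result is negated afterwards.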
15479     SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
15480     SDValue Res =
15481         DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
15482                     DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
15483     if (Negated)
15484       Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
15485 
15486     return Res;
15487   }
15488 
15489   if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15490     return LowerToPredicatedOp(Op, DAG, PredOpcode);
15491 
15492   // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15493   // operations, and truncate the result.
15494   EVT WidenedVT;
15495   if (VT == MVT::nxv16i8)
15496     WidenedVT = MVT::nxv8i16;
15497   else if (VT == MVT::nxv8i16)
15498     WidenedVT = MVT::nxv4i32;
15499   else
15500     llvm_unreachable("Unexpected Custom DIV operation");
15501 
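  // Unpack both operands into the wider element type, divide the low and high
  // halves separately, then use UZP1 on the bit-cast results to gather the
  // truncated quotients back into a vector of the original type.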
15502   unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15503   unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15504   SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
15505   SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
15506   SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
15507   SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
15508   SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
15509   SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
15510   SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo);
15511   SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi);
15512   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast);
15513 }
15514 
15515 bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15516     EVT VT, unsigned DefinedValues) const {
15517   if (!Subtarget->isNeonAvailable())
15518     return false;
15519   return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15520 }
15521 
15522 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15523   // Currently no fixed length shuffles that require SVE are legal.
15524   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15525     return false;
15526 
15527   if (VT.getVectorNumElements() == 4 &&
15528       (VT.is128BitVector() || VT.is64BitVector())) {
15529     unsigned Cost = getPerfectShuffleCost(M);
15530     if (Cost <= 1)
15531       return true;
15532   }
15533 
15534   bool DummyBool;
15535   int DummyInt;
15536   unsigned DummyUnsigned;
15537 
15538   unsigned EltSize = VT.getScalarSizeInBits();
15539   unsigned NumElts = VT.getVectorNumElements();
15540   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
15541           isREVMask(M, EltSize, NumElts, 64) ||
15542           isREVMask(M, EltSize, NumElts, 32) ||
15543           isREVMask(M, EltSize, NumElts, 16) ||
15544           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15545           isTRNMask(M, NumElts, DummyUnsigned) ||
15546           isUZPMask(M, NumElts, DummyUnsigned) ||
15547           isZIPMask(M, NumElts, DummyUnsigned) ||
15548           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15549           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15550           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15551           isINSMask(M, NumElts, DummyBool, DummyInt) ||
15552           isConcatMask(M, VT, VT.getSizeInBits() == 128));
15553 }
15554 
15555 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15556                                                    EVT VT) const {
15557   // Just delegate to the generic legality check; clear masks aren't special.
15558   return isShuffleMaskLegal(M, VT);
15559 }
15560 
15561 /// getVShiftImm - Check if this is a valid build_vector for the immediate
15562 /// operand of a vector shift operation, where all the elements of the
15563 /// build_vector must have the same constant integer value.
15564 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15565   // Ignore bit_converts.
15566   while (Op.getOpcode() == ISD::BITCAST)
15567     Op = Op.getOperand(0);
15568   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15569   APInt SplatBits, SplatUndef;
15570   unsigned SplatBitSize;
15571   bool HasAnyUndefs;
15572   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15573                                     HasAnyUndefs, ElementBits) ||
15574       SplatBitSize > ElementBits)
15575     return false;
15576   Cnt = SplatBits.getSExtValue();
15577   return true;
15578 }
15579 
15580 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
15581 /// operand of a vector shift left operation.  That value must be in the range:
15582 ///   0 <= Value < ElementBits for a left shift; or
15583 ///   0 <= Value <= ElementBits for a long left shift.
15584 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15585   assert(VT.isVector() && "vector shift count is not a vector type");
15586   int64_t ElementBits = VT.getScalarSizeInBits();
15587   if (!getVShiftImm(Op, ElementBits, Cnt))
15588     return false;
15589   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15590 }
15591 
15592 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
15593 /// operand of a vector shift right operation. The value must be in the range:
15594 ///   1 <= Value <= ElementBits for a right shift, or ElementBits / 2 for a narrowing shift.
15595 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15596   assert(VT.isVector() && "vector shift count is not a vector type");
15597   int64_t ElementBits = VT.getScalarSizeInBits();
15598   if (!getVShiftImm(Op, ElementBits, Cnt))
15599     return false;
15600   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15601 }
15602 
15603 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15604                                              SelectionDAG &DAG) const {
15605   EVT VT = Op.getValueType();
15606 
15607   if (VT.getScalarType() == MVT::i1) {
15608     // Lower i1 truncate to `(x & 1) != 0`.
15609     SDLoc dl(Op);
15610     EVT OpVT = Op.getOperand(0).getValueType();
15611     SDValue Zero = DAG.getConstant(0, dl, OpVT);
15612     SDValue One = DAG.getConstant(1, dl, OpVT);
15613     SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
15614     return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
15615   }
15616 
15617   if (!VT.isVector() || VT.isScalableVector())
15618     return SDValue();
15619 
15620   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15621                                    !Subtarget->isNeonAvailable()))
15622     return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15623 
15624   return SDValue();
15625 }
15626 
15627 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
15628 // possibly a truncated type; it tells how many bits of the value are to be
15629 // used.
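// The pattern recognized is srl(add(X, 1 << (ShiftValue - 1)), ShiftValue),
// i.e. an unsigned shift right with rounding to nearest.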
15630 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15631                                             SelectionDAG &DAG,
15632                                             unsigned &ShiftValue,
15633                                             SDValue &RShOperand) {
15634   if (Shift->getOpcode() != ISD::SRL)
15635     return false;
15636 
15637   EVT VT = Shift.getValueType();
15638   assert(VT.isScalableVT());
15639 
15640   auto ShiftOp1 =
15641       dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
15642   if (!ShiftOp1)
15643     return false;
15644 
15645   ShiftValue = ShiftOp1->getZExtValue();
15646   if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15647     return false;
15648 
15649   SDValue Add = Shift->getOperand(0);
15650   if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15651     return false;
15652 
15653   assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
15654          "ResVT must be truncated or same type as the shift.");
15655   // Check if an overflow can lead to incorrect results.
15656   uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15657   if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15658     return false;
15659 
15660   auto AddOp1 =
15661       dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
15662   if (!AddOp1)
15663     return false;
15664   uint64_t AddValue = AddOp1->getZExtValue();
15665   if (AddValue != 1ULL << (ShiftValue - 1))
15666     return false;
15667 
15668   RShOperand = Add->getOperand(0);
15669   return true;
15670 }
15671 
15672 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15673                                                       SelectionDAG &DAG) const {
15674   EVT VT = Op.getValueType();
15675   SDLoc DL(Op);
15676   int64_t Cnt;
15677 
15678   if (!Op.getOperand(1).getValueType().isVector())
15679     return Op;
15680   unsigned EltSize = VT.getScalarSizeInBits();
15681 
15682   switch (Op.getOpcode()) {
15683   case ISD::SHL:
15684     if (VT.isScalableVector() ||
15685         useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15686       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
15687 
15688     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
15689       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
15690                          DAG.getConstant(Cnt, DL, MVT::i32));
15691     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15692                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
15693                                        MVT::i32),
15694                        Op.getOperand(0), Op.getOperand(1));
15695   case ISD::SRA:
15696   case ISD::SRL:
15697     if (VT.isScalableVector() &&
15698         (Subtarget->hasSVE2() ||
15699          (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15700       SDValue RShOperand;
15701       unsigned ShiftValue;
15702       if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
15703         return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
15704                            getPredicateForVector(DAG, DL, VT), RShOperand,
15705                            DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
15706     }
15707 
15708     if (VT.isScalableVector() ||
15709         useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
15710       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15711                                                 : AArch64ISD::SRL_PRED;
15712       return LowerToPredicatedOp(Op, DAG, Opc);
15713     }
15714 
15715     // Right shift immediate
15716     if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
15717       unsigned Opc =
15718           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15719       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
15720                          DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
15721     }
15722 
15723     // Right shift register. Note that there is no right-shift-by-register
15724     // instruction, but the left-shift-by-register instruction takes a signed
15725     // value, where negative amounts specify a right shift.
15726     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15727                                                 : Intrinsic::aarch64_neon_ushl;
15728     // Negate the shift amount.
15729     SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
15730                                    Op.getOperand(1));
15731     SDValue NegShiftLeft =
15732         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15733                     DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
15734                     NegShift);
15735     return NegShiftLeft;
15736   }
15737 
15738   llvm_unreachable("unexpected shift opcode");
15739 }
15740 
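// Emit a NEON vector comparison for the given AArch64 condition code, using
// the compare-against-zero forms (e.g. CMEQz/FCMEQz) when RHS is a constant
// zero splat. Returns an empty SDValue if no single comparison matches.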
15741 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15742                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
15743                                     const SDLoc &dl, SelectionDAG &DAG) {
15744   EVT SrcVT = LHS.getValueType();
15745   assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15746          "function only supposed to emit natural comparisons");
15747 
15748   APInt SplatValue;
15749   APInt SplatUndef;
15750   unsigned SplatBitSize = 0;
15751   bool HasAnyUndefs;
15752 
15753   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15754   bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
15755                                             SplatBitSize, HasAnyUndefs);
15756 
15757   bool IsZero = IsCnst && SplatValue == 0;
15758   bool IsOne =
15759       IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
15760   bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
15761 
15762   if (SrcVT.getVectorElementType().isFloatingPoint()) {
15763     switch (CC) {
15764     default:
15765       return SDValue();
15766     case AArch64CC::NE: {
15767       SDValue Fcmeq;
15768       if (IsZero)
15769         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15770       else
15771         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15772       return DAG.getNOT(dl, Fcmeq, VT);
15773     }
15774     case AArch64CC::EQ:
15775       if (IsZero)
15776         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15777       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15778     case AArch64CC::GE:
15779       if (IsZero)
15780         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
15781       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15782     case AArch64CC::GT:
15783       if (IsZero)
15784         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
15785       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15786     case AArch64CC::LE:
15787       if (!NoNans)
15788         return SDValue();
15789       // If we ignore NaNs then we can use the LS implementation.
15790       [[fallthrough]];
15791     case AArch64CC::LS:
15792       if (IsZero)
15793         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15794       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15795     case AArch64CC::LT:
15796       if (!NoNans)
15797         return SDValue();
15798       // If we ignore NaNs then we can use the MI implementation.
15799       [[fallthrough]];
15800     case AArch64CC::MI:
15801       if (IsZero)
15802         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15803       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15804     }
15805   }
15806 
15807   switch (CC) {
15808   default:
15809     return SDValue();
15810   case AArch64CC::NE: {
15811     SDValue Cmeq;
15812     if (IsZero)
15813       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15814     else
15815       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15816     return DAG.getNOT(dl, Cmeq, VT);
15817   }
15818   case AArch64CC::EQ:
15819     if (IsZero)
15820       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15821     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15822   case AArch64CC::GE:
15823     if (IsZero)
15824       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15825     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15826   case AArch64CC::GT:
15827     if (IsZero)
15828       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15829     if (IsMinusOne)
15830       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15831     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15832   case AArch64CC::LE:
15833     if (IsZero)
15834       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15835     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15836   case AArch64CC::LS:
15837     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15838   case AArch64CC::LO:
15839     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15840   case AArch64CC::LT:
15841     if (IsZero)
15842       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15843     if (IsOne)
15844       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15845     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15846   case AArch64CC::HI:
15847     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15848   case AArch64CC::HS:
15849     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15850   }
15851 }
15852 
15853 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15854                                            SelectionDAG &DAG) const {
15855   if (Op.getValueType().isScalableVector())
15856     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15857 
15858   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15859                                    !Subtarget->isNeonAvailable()))
15860     return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15861 
15862   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15863   SDValue LHS = Op.getOperand(0);
15864   SDValue RHS = Op.getOperand(1);
15865   EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15866   SDLoc dl(Op);
15867 
15868   if (LHS.getValueType().getVectorElementType().isInteger()) {
15869     assert(LHS.getValueType() == RHS.getValueType());
15870     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15871     SDValue Cmp =
15872         EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15873     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15874   }
15875 
15876   // Lower isnan(x) | isnan(never-nan) to x != x.
15877   // Lower !isnan(x) & !isnan(never-nan) to x == x.
15878   if (CC == ISD::SETUO || CC == ISD::SETO) {
15879     bool OneNaN = false;
15880     if (LHS == RHS) {
15881       OneNaN = true;
15882     } else if (DAG.isKnownNeverNaN(RHS)) {
15883       OneNaN = true;
15884       RHS = LHS;
15885     } else if (DAG.isKnownNeverNaN(LHS)) {
15886       OneNaN = true;
15887       LHS = RHS;
15888     }
15889     if (OneNaN) {
15890       CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15891     }
15892   }
15893 
15894   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15895 
15896   // Make v4f16 (only) fcmp operations utilise vector instructions.
15897   // v8f16 support will be a little more complicated.
15898   if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15899       LHS.getValueType().getVectorElementType() == MVT::bf16) {
15900     if (LHS.getValueType().getVectorNumElements() == 4) {
15901       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15902       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15903       SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15904       DAG.ReplaceAllUsesWith(Op, NewSetcc);
15905       CmpVT = MVT::v4i32;
15906     } else
15907       return SDValue();
15908   }
15909 
15910   assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15911          LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15912          LHS.getValueType().getVectorElementType() != MVT::f128);
15913 
15914   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15915   // clean.  Some of them require two branches to implement.
15916   AArch64CC::CondCode CC1, CC2;
15917   bool ShouldInvert;
15918   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15919 
15920   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15921   SDValue Cmp =
15922       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15923   if (!Cmp.getNode())
15924     return SDValue();
15925 
15926   if (CC2 != AArch64CC::AL) {
15927     SDValue Cmp2 =
15928         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15929     if (!Cmp2.getNode())
15930       return SDValue();
15931 
15932     Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15933   }
15934 
15935   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15936 
15937   if (ShouldInvert)
15938     Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15939 
15940   return Cmp;
15941 }
15942 
15943 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15944                                   SelectionDAG &DAG) {
15945   SDValue VecOp = ScalarOp.getOperand(0);
15946   auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15947   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15948                      DAG.getConstant(0, DL, MVT::i64));
15949 }
15950 
15951 static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15952                                       SDLoc DL, SelectionDAG &DAG) {
15953   unsigned ScalarOpcode;
15954   switch (Opcode) {
15955   case ISD::VECREDUCE_AND:
15956     ScalarOpcode = ISD::AND;
15957     break;
15958   case ISD::VECREDUCE_OR:
15959     ScalarOpcode = ISD::OR;
15960     break;
15961   case ISD::VECREDUCE_XOR:
15962     ScalarOpcode = ISD::XOR;
15963     break;
15964   default:
15965     llvm_unreachable("Expected bitwise vector reduction");
15966     return SDValue();
15967   }
15968 
15969   EVT VecVT = Vec.getValueType();
15970   assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15971          "Expected power-of-2 length vector");
15972 
15973   EVT ElemVT = VecVT.getVectorElementType();
15974 
15975   SDValue Result;
15976   unsigned NumElems = VecVT.getVectorNumElements();
15977 
15978   // Special case for boolean reductions
15979   if (ElemVT == MVT::i1) {
15980     // Split large vectors into smaller ones
15981     if (NumElems > 16) {
15982       SDValue Lo, Hi;
15983       std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15984       EVT HalfVT = Lo.getValueType();
15985       SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
15986       return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
15987     }
15988 
15989     // Results of setcc operations get widened to 128 bits if their input
15990     // operands are 128 bits wide; otherwise, vectors of fewer than 64 bits get
15991     // widened to neatly fit a 64-bit register, so e.g. <4 x i1> gets lowered to
15992     // either <4 x i16> or <4 x i32>. Sign-extending to this element size leads
15993     // to the best codegen, since e.g. setcc results might otherwise need to be
15994     // truncated.
15995     unsigned ExtendedWidth = 64;
15996     if (Vec.getOpcode() == ISD::SETCC &&
15997         Vec.getOperand(0).getValueSizeInBits() >= 128) {
15998       ExtendedWidth = 128;
15999     }
16000     EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16001 
16002     // any_ext doesn't work with umin/umax, so only use it for uadd.
16003     unsigned ExtendOp =
16004         ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16005     SDValue Extended = DAG.getNode(
16006         ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16007     // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16008     // in that case we bitcast the sign extended values from v2i64 to v4i32
16009     // before reduction for optimal code generation.
16010     if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16011         NumElems == 2 && ExtendedWidth == 128) {
16012       Extended = DAG.getBitcast(MVT::v4i32, Extended);
16013       ExtendedVT = MVT::i32;
16014     }
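    // With i1 lanes sign-extended to 0 or all-ones, AND reduces to an unsigned
    // minimum, OR to an unsigned maximum, and XOR to the low bit of a lane-wise
    // add (parity).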
16015     switch (ScalarOpcode) {
16016     case ISD::AND:
16017       Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16018       break;
16019     case ISD::OR:
16020       Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16021       break;
16022     case ISD::XOR:
16023       Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16024       break;
16025     default:
16026       llvm_unreachable("Unexpected Opcode");
16027     }
16028 
16029     Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16030   } else {
16031     // Iteratively split the vector in half and combine using the bitwise
16032     // operation until it fits in a 64 bit register.
16033     while (VecVT.getSizeInBits() > 64) {
16034       SDValue Lo, Hi;
16035       std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16036       VecVT = Lo.getValueType();
16037       NumElems = VecVT.getVectorNumElements();
16038       Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16039     }
16040 
16041     EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16042 
16043     // Do the remaining work on a scalar since it allows the code generator to
16044     // combine the shift and bitwise operation into one instruction and since
16045     // integer instructions can have higher throughput than vector instructions.
16046     SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16047 
16048     // Iteratively combine the lower and upper halves of the scalar using the
16049     // bitwise operation, halving the relevant region of the scalar in each
16050     // iteration, until the relevant region is just one element of the original
16051     // vector.
16052     for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16053       SDValue ShiftAmount =
16054           DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16055       SDValue Shifted =
16056           DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16057       Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16058     }
16059 
16060     Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16061   }
16062 
16063   return DAG.getAnyExtOrTrunc(Result, DL, VT);
16064 }
16065 
16066 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16067                                               SelectionDAG &DAG) const {
16068   SDValue Src = Op.getOperand(0);
16069 
16070   // Try to lower fixed length reductions to SVE.
16071   EVT SrcVT = Src.getValueType();
16072   bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16073                       Op.getOpcode() == ISD::VECREDUCE_AND ||
16074                       Op.getOpcode() == ISD::VECREDUCE_OR ||
16075                       Op.getOpcode() == ISD::VECREDUCE_XOR ||
16076                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
16077                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16078                        SrcVT.getVectorElementType() == MVT::i64);
16079   if (SrcVT.isScalableVector() ||
16080       useSVEForFixedLengthVectorVT(
16081           SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16082 
16083     if (SrcVT.getVectorElementType() == MVT::i1)
16084       return LowerPredReductionToSVE(Op, DAG);
16085 
16086     switch (Op.getOpcode()) {
16087     case ISD::VECREDUCE_ADD:
16088       return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16089     case ISD::VECREDUCE_AND:
16090       return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16091     case ISD::VECREDUCE_OR:
16092       return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16093     case ISD::VECREDUCE_SMAX:
16094       return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16095     case ISD::VECREDUCE_SMIN:
16096       return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16097     case ISD::VECREDUCE_UMAX:
16098       return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16099     case ISD::VECREDUCE_UMIN:
16100       return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16101     case ISD::VECREDUCE_XOR:
16102       return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16103     case ISD::VECREDUCE_FADD:
16104       return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16105     case ISD::VECREDUCE_FMAX:
16106       return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16107     case ISD::VECREDUCE_FMIN:
16108       return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16109     case ISD::VECREDUCE_FMAXIMUM:
16110       return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16111     case ISD::VECREDUCE_FMINIMUM:
16112       return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16113     default:
16114       llvm_unreachable("Unhandled fixed length reduction");
16115     }
16116   }
16117 
16118   // Lower NEON reductions.
16119   SDLoc dl(Op);
16120   switch (Op.getOpcode()) {
16121   case ISD::VECREDUCE_AND:
16122   case ISD::VECREDUCE_OR:
16123   case ISD::VECREDUCE_XOR:
16124     return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16125                                   Op.getValueType(), dl, DAG);
16126   case ISD::VECREDUCE_ADD:
16127     return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
16128   case ISD::VECREDUCE_SMAX:
16129     return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
16130   case ISD::VECREDUCE_SMIN:
16131     return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
16132   case ISD::VECREDUCE_UMAX:
16133     return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
16134   case ISD::VECREDUCE_UMIN:
16135     return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
16136   default:
16137     llvm_unreachable("Unhandled reduction");
16138   }
16139 }
16140 
16141 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16142                                                     SelectionDAG &DAG) const {
16143   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16144   // No point replacing if we don't have the relevant instruction/libcall anyway
16145   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16146     return SDValue();
16147 
16148   // LSE has an atomic load-clear instruction, but not a load-and.
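  // Lower (atomicrmw and X, Y) as (atomicrmw clr X, (xor Y, -1)).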
16149   SDLoc dl(Op);
16150   MVT VT = Op.getSimpleValueType();
16151   assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16152   SDValue RHS = Op.getOperand(2);
16153   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16154   RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getAllOnesConstant(dl, VT), RHS);
16155   return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
16156                        Op.getOperand(0), Op.getOperand(1), RHS,
16157                        AN->getMemOperand());
16158 }
16159 
16160 SDValue
16161 AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16162                                                       SelectionDAG &DAG) const {
16163 
16164   SDLoc dl(Op);
16165   // Get the inputs.
16166   SDNode *Node = Op.getNode();
16167   SDValue Chain = Op.getOperand(0);
16168   SDValue Size = Op.getOperand(1);
16169   MaybeAlign Align =
16170       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16171   EVT VT = Node->getValueType(0);
16172 
16173   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16174           "no-stack-arg-probe")) {
16175     SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16176     Chain = SP.getValue(1);
16177     SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16178     if (Align)
16179       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16180                        DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16181     Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16182     SDValue Ops[2] = {SP, Chain};
16183     return DAG.getMergeValues(Ops, dl);
16184   }
16185 
16186   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
16187 
16188   EVT PtrVT = getPointerTy(DAG.getDataLayout());
16189   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16190                                                PtrVT, 0);
16191 
16192   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16193   const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16194   if (Subtarget->hasCustomCallingConv())
16195     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16196 
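  // The stack probe helper expects the allocation size in X15, in units of 16
  // bytes; hence the shift right by 4 before the call and the matching shift
  // left afterwards.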
16197   Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
16198                      DAG.getConstant(4, dl, MVT::i64));
16199   Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
16200   Chain =
16201       DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
16202                   Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16203                   DAG.getRegisterMask(Mask), Chain.getValue(1));
16204   // To match the actual intent better, we should read the output from X15 here
16205   // again (instead of potentially spilling it to the stack), but rereading Size
16206   // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16207   // here.
16208 
16209   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
16210                      DAG.getConstant(4, dl, MVT::i64));
16211 
16212   SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16213   Chain = SP.getValue(1);
16214   SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16215   if (Align)
16216     SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16217                      DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16218   Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16219 
16220   Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
16221 
16222   SDValue Ops[2] = {SP, Chain};
16223   return DAG.getMergeValues(Ops, dl);
16224 }
16225 
16226 SDValue
16227 AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16228                                                      SelectionDAG &DAG) const {
16229   // Get the inputs.
16230   SDNode *Node = Op.getNode();
16231   SDValue Chain = Op.getOperand(0);
16232   SDValue Size = Op.getOperand(1);
16233 
16234   MaybeAlign Align =
16235       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16236   SDLoc dl(Op);
16237   EVT VT = Node->getValueType(0);
16238 
16239   // Construct the new SP value in a GPR.
16240   SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16241   Chain = SP.getValue(1);
16242   SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16243   if (Align)
16244     SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16245                      DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16246 
16247   // Set the real SP to the new value with a probing loop.
16248   Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
16249   SDValue Ops[2] = {SP, Chain};
16250   return DAG.getMergeValues(Ops, dl);
16251 }
16252 
16253 SDValue
16254 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16255                                                SelectionDAG &DAG) const {
16256   MachineFunction &MF = DAG.getMachineFunction();
16257 
16258   if (Subtarget->isTargetWindows())
16259     return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16260   else if (hasInlineStackProbe(MF))
16261     return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16262   else
16263     return SDValue();
16264 }
16265 
16266 SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16267                                         unsigned NewOp) const {
16268   if (Subtarget->hasSVE2())
16269     return LowerToPredicatedOp(Op, DAG, NewOp);
16270 
16271   // Default to expand.
16272   return SDValue();
16273 }
16274 
16275 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16276                                            SelectionDAG &DAG) const {
16277   EVT VT = Op.getValueType();
16278   assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16279 
16280   SDLoc DL(Op);
16281   APInt MulImm = Op.getConstantOperandAPInt(0);
16282   return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16283                             VT);
16284 }
16285 
16286 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16287 template <unsigned NumVecs>
16288 static bool
16289 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16290               AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16291   Info.opc = ISD::INTRINSIC_VOID;
16292   // Retrieve EC from first vector argument.
16293   const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16294   ElementCount EC = VT.getVectorElementCount();
16295 #ifndef NDEBUG
16296   // Check the assumption that all input vectors are the same type.
16297   for (unsigned I = 0; I < NumVecs; ++I)
16298     assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16299            "Invalid type.");
16300 #endif
16301   // memVT is `NumVecs * VT`.
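  // (e.g. an st2 of two nxv4i32 values is modelled with an nxv8i32 memVT;
  // illustrative).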
16302   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16303                                 EC * NumVecs);
16304   Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16305   Info.offset = 0;
16306   Info.align.reset();
16307   Info.flags = MachineMemOperand::MOStore;
16308   return true;
16309 }
16310 
16311 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16312 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
16313 /// specified in the intrinsic calls.
16314 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16315                                                const CallInst &I,
16316                                                MachineFunction &MF,
16317                                                unsigned Intrinsic) const {
16318   auto &DL = I.getDataLayout();
16319   switch (Intrinsic) {
16320   case Intrinsic::aarch64_sve_st2:
16321     return setInfoSVEStN<2>(*this, DL, Info, I);
16322   case Intrinsic::aarch64_sve_st3:
16323     return setInfoSVEStN<3>(*this, DL, Info, I);
16324   case Intrinsic::aarch64_sve_st4:
16325     return setInfoSVEStN<4>(*this, DL, Info, I);
16326   case Intrinsic::aarch64_neon_ld2:
16327   case Intrinsic::aarch64_neon_ld3:
16328   case Intrinsic::aarch64_neon_ld4:
16329   case Intrinsic::aarch64_neon_ld1x2:
16330   case Intrinsic::aarch64_neon_ld1x3:
16331   case Intrinsic::aarch64_neon_ld1x4: {
16332     Info.opc = ISD::INTRINSIC_W_CHAIN;
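    // Model the access with enough i64 elements to cover the whole returned
    // struct; e.g. ld2 returning { <4 x i32>, <4 x i32> } (256 bits) uses a
    // v4i64 memVT (illustrative).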
16333     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16334     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16335     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16336     Info.offset = 0;
16337     Info.align.reset();
16338     // volatile loads with NEON intrinsics not supported
16339     Info.flags = MachineMemOperand::MOLoad;
16340     return true;
16341   }
16342   case Intrinsic::aarch64_neon_ld2lane:
16343   case Intrinsic::aarch64_neon_ld3lane:
16344   case Intrinsic::aarch64_neon_ld4lane:
16345   case Intrinsic::aarch64_neon_ld2r:
16346   case Intrinsic::aarch64_neon_ld3r:
16347   case Intrinsic::aarch64_neon_ld4r: {
16348     Info.opc = ISD::INTRINSIC_W_CHAIN;
16349     // The ldN lane/dup intrinsics return a struct of identically typed vectors.
16350     Type *RetTy = I.getType();
16351     auto *StructTy = cast<StructType>(RetTy);
16352     unsigned NumElts = StructTy->getNumElements();
16353     Type *VecTy = StructTy->getElementType(0);
16354     MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16355     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16356     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16357     Info.offset = 0;
16358     Info.align.reset();
16359     // volatile loads with NEON intrinsics not supported
16360     Info.flags = MachineMemOperand::MOLoad;
16361     return true;
16362   }
16363   case Intrinsic::aarch64_neon_st2:
16364   case Intrinsic::aarch64_neon_st3:
16365   case Intrinsic::aarch64_neon_st4:
16366   case Intrinsic::aarch64_neon_st1x2:
16367   case Intrinsic::aarch64_neon_st1x3:
16368   case Intrinsic::aarch64_neon_st1x4: {
16369     Info.opc = ISD::INTRINSIC_VOID;
16370     unsigned NumElts = 0;
16371     for (const Value *Arg : I.args()) {
16372       Type *ArgTy = Arg->getType();
16373       if (!ArgTy->isVectorTy())
16374         break;
16375       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16376     }
16377     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16378     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16379     Info.offset = 0;
16380     Info.align.reset();
16381     // volatile stores with NEON intrinsics not supported
16382     Info.flags = MachineMemOperand::MOStore;
16383     return true;
16384   }
16385   case Intrinsic::aarch64_neon_st2lane:
16386   case Intrinsic::aarch64_neon_st3lane:
16387   case Intrinsic::aarch64_neon_st4lane: {
16388     Info.opc = ISD::INTRINSIC_VOID;
16389     unsigned NumElts = 0;
16390     // All of the vector arguments have the same type.
16391     Type *VecTy = I.getArgOperand(0)->getType();
16392     MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16393 
16394     for (const Value *Arg : I.args()) {
16395       Type *ArgTy = Arg->getType();
16396       if (!ArgTy->isVectorTy())
16397         break;
16398       NumElts += 1;
16399     }
16400 
16401     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16402     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16403     Info.offset = 0;
16404     Info.align.reset();
16405     // volatile stores with NEON intrinsics not supported
16406     Info.flags = MachineMemOperand::MOStore;
16407     return true;
16408   }
16409   case Intrinsic::aarch64_ldaxr:
16410   case Intrinsic::aarch64_ldxr: {
16411     Type *ValTy = I.getParamElementType(0);
16412     Info.opc = ISD::INTRINSIC_W_CHAIN;
16413     Info.memVT = MVT::getVT(ValTy);
16414     Info.ptrVal = I.getArgOperand(0);
16415     Info.offset = 0;
16416     Info.align = DL.getABITypeAlign(ValTy);
16417     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16418     return true;
16419   }
16420   case Intrinsic::aarch64_stlxr:
16421   case Intrinsic::aarch64_stxr: {
16422     Type *ValTy = I.getParamElementType(1);
16423     Info.opc = ISD::INTRINSIC_W_CHAIN;
16424     Info.memVT = MVT::getVT(ValTy);
16425     Info.ptrVal = I.getArgOperand(1);
16426     Info.offset = 0;
16427     Info.align = DL.getABITypeAlign(ValTy);
16428     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16429     return true;
16430   }
16431   case Intrinsic::aarch64_ldaxp:
16432   case Intrinsic::aarch64_ldxp:
16433     Info.opc = ISD::INTRINSIC_W_CHAIN;
16434     Info.memVT = MVT::i128;
16435     Info.ptrVal = I.getArgOperand(0);
16436     Info.offset = 0;
16437     Info.align = Align(16);
16438     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16439     return true;
16440   case Intrinsic::aarch64_stlxp:
16441   case Intrinsic::aarch64_stxp:
16442     Info.opc = ISD::INTRINSIC_W_CHAIN;
16443     Info.memVT = MVT::i128;
16444     Info.ptrVal = I.getArgOperand(2);
16445     Info.offset = 0;
16446     Info.align = Align(16);
16447     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16448     return true;
16449   case Intrinsic::aarch64_sve_ldnt1: {
16450     Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16451     Info.opc = ISD::INTRINSIC_W_CHAIN;
16452     Info.memVT = MVT::getVT(I.getType());
16453     Info.ptrVal = I.getArgOperand(1);
16454     Info.offset = 0;
16455     Info.align = DL.getABITypeAlign(ElTy);
16456     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16457     return true;
16458   }
16459   case Intrinsic::aarch64_sve_stnt1: {
16460     Type *ElTy =
16461         cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16462     Info.opc = ISD::INTRINSIC_W_CHAIN;
16463     Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16464     Info.ptrVal = I.getArgOperand(2);
16465     Info.offset = 0;
16466     Info.align = DL.getABITypeAlign(ElTy);
16467     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16468     return true;
16469   }
16470   case Intrinsic::aarch64_mops_memset_tag: {
16471     Value *Dst = I.getArgOperand(0);
16472     Value *Val = I.getArgOperand(1);
16473     Info.opc = ISD::INTRINSIC_W_CHAIN;
16474     Info.memVT = MVT::getVT(Val->getType());
16475     Info.ptrVal = Dst;
16476     Info.offset = 0;
16477     Info.align = I.getParamAlign(0).valueOrOne();
16478     Info.flags = MachineMemOperand::MOStore;
16479     // The size of the memory being operated on is unknown at this point
16480     Info.size = MemoryLocation::UnknownSize;
16481     return true;
16482   }
16483   default:
16484     break;
16485   }
16486 
16487   return false;
16488 }
16489 
16490 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
16491                                                   ISD::LoadExtType ExtTy,
16492                                                   EVT NewVT) const {
16493   // TODO: This may be worth removing. Check regression tests for diffs.
16494   if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
16495     return false;
16496 
16497   // If we're reducing the load width in order to avoid having to use an extra
16498   // instruction to do the extension, then it's probably a good idea.
16499   if (ExtTy != ISD::NON_EXTLOAD)
16500     return true;
16501   // Don't reduce load width if it would prevent us from combining a shift into
16502   // the offset.
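  // For example, an i64 load from (add base, (shl idx, 3)) can use the
  // register-offset form "ldr xN, [base, idx, lsl #3]"; narrowing the load
  // would stop the shift amount from matching the access size (illustrative).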
16503   MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16504   assert(Mem);
16505   const SDValue &Base = Mem->getBasePtr();
16506   if (Base.getOpcode() == ISD::ADD &&
16507       Base.getOperand(1).getOpcode() == ISD::SHL &&
16508       Base.getOperand(1).hasOneUse() &&
16509       Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16510     // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16511     if (Mem->getMemoryVT().isScalableVector())
16512       return false;
16513     // The shift can be combined if it matches the size of the value being
16514     // loaded (and so reducing the width would make it not match).
16515     uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16516     uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16517     if (ShiftAmount == Log2_32(LoadBytes))
16518       return false;
16519   }
16520   // We have no reason to disallow reducing the load width, so allow it.
16521   return true;
16522 }
16523 
16524 // Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16525 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16526   EVT VT = Extend.getValueType();
16527   if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16528     SDValue Extract = Extend.getOperand(0);
16529     if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16530       Extract = Extract.getOperand(0);
16531     if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16532       EVT VecVT = Extract.getOperand(0).getValueType();
16533       if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16534         return false;
16535     }
16536   }
16537   return true;
16538 }
16539 
16540 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
16541 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16542   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16543     return false;
16544   uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16545   uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16546   return NumBits1 > NumBits2;
16547 }
16548 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16549   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16550     return false;
16551   uint64_t NumBits1 = VT1.getFixedSizeInBits();
16552   uint64_t NumBits2 = VT2.getFixedSizeInBits();
16553   return NumBits1 > NumBits2;
16554 }
16555 
16556 /// Check if it is profitable to hoist an instruction from then/else into if.
16557 /// Not profitable if I and its user can form an FMA instruction
16558 /// because we prefer FMSUB/FMADD.
16559 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16560   if (I->getOpcode() != Instruction::FMul)
16561     return true;
16562 
16563   if (!I->hasOneUse())
16564     return true;
16565 
16566   Instruction *User = I->user_back();
16567 
16568   if (!(User->getOpcode() == Instruction::FSub ||
16569         User->getOpcode() == Instruction::FAdd))
16570     return true;
16571 
16572   const TargetOptions &Options = getTargetMachine().Options;
16573   const Function *F = I->getFunction();
16574   const DataLayout &DL = F->getDataLayout();
16575   Type *Ty = User->getOperand(0)->getType();
16576 
16577   return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16578            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16579            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16580             Options.UnsafeFPMath));
16581 }
16582 
16583 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
16584 // 64-bit GPR.
16585 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16586   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16587     return false;
16588   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16589   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16590   return NumBits1 == 32 && NumBits2 == 64;
16591 }
16592 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16593   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16594     return false;
16595   unsigned NumBits1 = VT1.getSizeInBits();
16596   unsigned NumBits2 = VT2.getSizeInBits();
16597   return NumBits1 == 32 && NumBits2 == 64;
16598 }
16599 
16600 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16601   EVT VT1 = Val.getValueType();
16602   if (isZExtFree(VT1, VT2)) {
16603     return true;
16604   }
16605 
16606   if (Val.getOpcode() != ISD::LOAD)
16607     return false;
16608 
16609   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16610   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16611           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16612           VT1.getSizeInBits() <= 32);
16613 }
16614 
16615 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16616   if (isa<FPExtInst>(Ext))
16617     return false;
16618 
16619   // Vector types are not free.
16620   if (Ext->getType()->isVectorTy())
16621     return false;
16622 
16623   for (const Use &U : Ext->uses()) {
16624     // The extension is free if we can fold it with a left shift in an
16625     // addressing mode or an arithmetic operation: add, sub, and cmp.
16626 
16627     // Is there a shift?
16628     const Instruction *Instr = cast<Instruction>(U.getUser());
16629 
16630     // Is this a constant shift?
16631     switch (Instr->getOpcode()) {
16632     case Instruction::Shl:
16633       if (!isa<ConstantInt>(Instr->getOperand(1)))
16634         return false;
16635       break;
16636     case Instruction::GetElementPtr: {
16637       gep_type_iterator GTI = gep_type_begin(Instr);
16638       auto &DL = Ext->getDataLayout();
16639       std::advance(GTI, U.getOperandNo()-1);
16640       Type *IdxTy = GTI.getIndexedType();
16641       // This extension will end up with a shift because of the scaling factor.
16642       // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16643       // Get the shift amount based on the scaling factor:
16644       // log2(sizeof(IdxTy)) - log2(8).
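      // (e.g. an i32 indexed type gives ShiftAmt = log2(32) - 3 = 2, which can
      // fold into a scaled register-extend address such as
      // "ldr wN, [xM, wK, sxtw #2]"; illustrative).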
16645       if (IdxTy->isScalableTy())
16646         return false;
16647       uint64_t ShiftAmt =
16648           llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16649           3;
16650       // Is the constant foldable in the shift of the addressing mode?
16651       // I.e., shift amount is between 1 and 4 inclusive.
16652       if (ShiftAmt == 0 || ShiftAmt > 4)
16653         return false;
16654       break;
16655     }
16656     case Instruction::Trunc:
16657       // Check if this is a noop.
16658       // trunc(sext ty1 to ty2) to ty1.
16659       if (Instr->getType() == Ext->getOperand(0)->getType())
16660         continue;
16661       [[fallthrough]];
16662     default:
16663       return false;
16664     }
16665 
16666     // At this point we can use the bfm family, so this extension is free
16667     // for that use.
16668   }
16669   return true;
16670 }
16671 
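// Build the byte-shuffle mask used to widen vector elements via TBL: every
// Factor-th slot selects the next source element and the remaining slots
// select index NumElts, i.e. the zero lane of the second shuffle operand.
// For example, widening <8 x i8> to <8 x i32> on little-endian gives roughly
// <0,8,8,8, 1,8,8,8, 2,8,8,8, ...> (illustrative).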
16672 static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16673                                  unsigned NumElts, bool IsLittleEndian,
16674                                  SmallVectorImpl<int> &Mask) {
16675   if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16676     return false;
16677 
16678   assert(DstWidth % SrcWidth == 0 &&
16679          "TBL lowering is not supported for a conversion instruction with this "
16680          "source and destination element type.");
16681 
16682   unsigned Factor = DstWidth / SrcWidth;
16683   unsigned MaskLen = NumElts * Factor;
16684 
16685   Mask.clear();
16686   Mask.resize(MaskLen, NumElts);
16687 
16688   unsigned SrcIndex = 0;
16689   for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16690     Mask[I] = SrcIndex++;
16691 
16692   return true;
16693 }
16694 
16695 static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16696                                       FixedVectorType *ZExtTy,
16697                                       FixedVectorType *DstTy,
16698                                       bool IsLittleEndian) {
16699   auto *SrcTy = cast<FixedVectorType>(Op->getType());
16700   unsigned NumElts = SrcTy->getNumElements();
16701   auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16702   auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16703 
16704   SmallVector<int> Mask;
16705   if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16706     return nullptr;
16707 
16708   auto *FirstEltZero = Builder.CreateInsertElement(
16709       PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16710   Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16711   Result = Builder.CreateBitCast(Result, DstTy);
16712   if (DstTy != ZExtTy)
16713     Result = Builder.CreateZExt(Result, ZExtTy);
16714   return Result;
16715 }
16716 
16717 static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16718                                       FixedVectorType *DstTy,
16719                                       bool IsLittleEndian) {
16720   auto *SrcTy = cast<FixedVectorType>(Op->getType());
16721   auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16722   auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16723 
16724   SmallVector<int> Mask;
16725   if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16726                             !IsLittleEndian, Mask))
16727     return nullptr;
16728 
16729   auto *FirstEltZero = Builder.CreateInsertElement(
16730       PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16731 
16732   return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16733 }
16734 
16735 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16736   IRBuilder<> Builder(TI);
16737   SmallVector<Value *> Parts;
16738   int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16739   auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16740   auto *DstTy = cast<FixedVectorType>(TI->getType());
16741   assert(SrcTy->getElementType()->isIntegerTy() &&
16742          "Non-integer type source vector element is not supported");
16743   assert(DstTy->getElementType()->isIntegerTy(8) &&
16744          "Unsupported destination vector element type");
16745   unsigned SrcElemTySz =
16746       cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16747   unsigned DstElemTySz =
16748       cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16749   assert((SrcElemTySz % DstElemTySz == 0) &&
16750          "Cannot lower truncate to tbl instructions for a source element size "
16751          "that is not divisible by the destination element size");
16752   unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16753   assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16754          "Unsupported source vector element type size");
16755   Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16756 
16757   // Create a mask to choose every nth byte from the source vector table of
16758   // bytes to create the truncated destination vector, where 'n' is the truncate
16759   // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16760   // 0,8,16,..Y*8th bytes for the little-endian format
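  // (e.g. a trunc of <8 x i64> to <8 x i8> selects bytes 0,8,...,56 from the
  // 64-byte table on little-endian; illustrative).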
16761   SmallVector<Constant *, 16> MaskConst;
16762   for (int Itr = 0; Itr < 16; Itr++) {
16763     if (Itr < NumElements)
16764       MaskConst.push_back(Builder.getInt8(
16765           IsLittleEndian ? Itr * TruncFactor
16766                          : Itr * TruncFactor + (TruncFactor - 1)));
16767     else
16768       MaskConst.push_back(Builder.getInt8(255));
16769   }
16770 
16771   int MaxTblSz = 128 * 4;
16772   int MaxSrcSz = SrcElemTySz * NumElements;
16773   int ElemsPerTbl =
16774       (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16775   assert(ElemsPerTbl <= 16 &&
16776          "Maximum elements selected using TBL instruction cannot exceed 16!");
16777 
16778   int ShuffleCount = 128 / SrcElemTySz;
16779   SmallVector<int> ShuffleLanes;
16780   for (int i = 0; i < ShuffleCount; ++i)
16781     ShuffleLanes.push_back(i);
16782 
16783   // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16784   // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16785   // call TBL & save the result in a vector of TBL results for combining later.
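  // For example, a <16 x i32> source spans four 128-bit registers and uses
  // tbl4, while an <8 x i32> source fits in two and uses tbl2 (illustrative).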
16786   SmallVector<Value *> Results;
16787   while (ShuffleLanes.back() < NumElements) {
16788     Parts.push_back(Builder.CreateBitCast(
16789         Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16790 
16791     if (Parts.size() == 4) {
16792       Parts.push_back(ConstantVector::get(MaskConst));
16793       Results.push_back(
16794           Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
16795       Parts.clear();
16796     }
16797 
16798     for (int i = 0; i < ShuffleCount; ++i)
16799       ShuffleLanes[i] += ShuffleCount;
16800   }
16801 
16802   assert((Parts.empty() || Results.empty()) &&
16803          "Lowering trunc for vectors requiring different TBL instructions is "
16804          "not supported!");
16805   // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16806   // registers
16807   if (!Parts.empty()) {
16808     Intrinsic::ID TblID;
16809     switch (Parts.size()) {
16810     case 1:
16811       TblID = Intrinsic::aarch64_neon_tbl1;
16812       break;
16813     case 2:
16814       TblID = Intrinsic::aarch64_neon_tbl2;
16815       break;
16816     case 3:
16817       TblID = Intrinsic::aarch64_neon_tbl3;
16818       break;
16819     }
16820 
16821     Parts.push_back(ConstantVector::get(MaskConst));
16822     Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
16823   }
16824 
16825   // Extract the destination vector from TBL result(s) after combining them
16826   // where applicable. Currently, at most two TBLs are supported.
16827   assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16828                                 "more than 2 tbl instructions!");
16829   Value *FinalResult = Results[0];
16830   if (Results.size() == 1) {
16831     if (ElemsPerTbl < 16) {
16832       SmallVector<int> FinalMask(ElemsPerTbl);
16833       std::iota(FinalMask.begin(), FinalMask.end(), 0);
16834       FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16835     }
16836   } else {
16837     SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16838     if (ElemsPerTbl < 16) {
16839       std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16840       std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16841     } else {
16842       std::iota(FinalMask.begin(), FinalMask.end(), 0);
16843     }
16844     FinalResult =
16845         Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16846   }
16847 
16848   TI->replaceAllUsesWith(FinalResult);
16849   TI->eraseFromParent();
16850 }
16851 
16852 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16853     Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16854   // shuffle_vector instructions are serialized when targeting SVE,
16855   // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16856   if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16857     return false;
16858 
16859   // Try to optimize conversions using tbl. This requires materializing constant
16860   // index vectors, which can increase code size and add loads. Skip the
16861   // transform unless the conversion is in a loop block guaranteed to execute
16862   // and we are not optimizing for size.
16863   Function *F = I->getParent()->getParent();
16864   if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16865       F->hasOptSize())
16866     return false;
16867 
16868   auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16869   auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16870   if (!SrcTy || !DstTy)
16871     return false;
16872 
16873   // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16874   // lowered to tbl instructions to insert the original i8 elements
16875   // into i8x lanes. This is enabled for cases where it is beneficial.
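  // For example (roughly, little-endian; operand names illustrative):
  //   %w = zext <8 x i8> %x to <8 x i32>
  // becomes a byte shuffle against a vector whose lane 0 is zero, followed by
  // a bitcast:
  //   %s = shufflevector <8 x i8> %x, <8 x i8> %zero0, <32 x i32> <widening mask>
  //   %w = bitcast <32 x i8> %s to <8 x i32>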
16876   auto *ZExt = dyn_cast<ZExtInst>(I);
16877   if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16878     auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16879     if (DstWidth % 8 != 0)
16880       return false;
16881 
16882     auto *TruncDstType =
16883         cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16884     // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16885     // the remaining ZExt folded into the user, don't use tbl lowering.
16886     auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16887     if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16888                              TargetTransformInfo::getCastContextHint(I),
16889                              TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
16890       if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16891         return false;
16892 
16893       DstTy = TruncDstType;
16894     }
16895 
16896     // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16897     // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16898     // most one extra extend step is needed and using tbl is not profitable.
16899     if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16900       auto *SingleUser = cast<Instruction>(*I->user_begin());
16901       if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16902         return false;
16903     }
16904 
16905     if (DstTy->getScalarSizeInBits() >= 64)
16906       return false;
16907 
16908     IRBuilder<> Builder(ZExt);
16909     Value *Result = createTblShuffleForZExt(
16910         Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16911         DstTy, Subtarget->isLittleEndian());
16912     if (!Result)
16913       return false;
16914     ZExt->replaceAllUsesWith(Result);
16915     ZExt->eraseFromParent();
16916     return true;
16917   }
16918 
16919   auto *UIToFP = dyn_cast<UIToFPInst>(I);
16920   if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16921                   DstTy->getElementType()->isFloatTy()) ||
16922                  (SrcTy->getElementType()->isIntegerTy(16) &&
16923                   DstTy->getElementType()->isDoubleTy()))) {
16924     IRBuilder<> Builder(I);
16925     Value *ZExt = createTblShuffleForZExt(
16926         Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16927         FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16928     assert(ZExt && "Cannot fail for the i8 to float conversion");
16929     auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16930     I->replaceAllUsesWith(UI);
16931     I->eraseFromParent();
16932     return true;
16933   }
16934 
16935   auto *SIToFP = dyn_cast<SIToFPInst>(I);
16936   if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16937       DstTy->getElementType()->isFloatTy()) {
16938     IRBuilder<> Builder(I);
16939     auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16940                                             FixedVectorType::getInteger(DstTy),
16941                                             Subtarget->isLittleEndian());
16942     assert(Shuffle && "Cannot fail for the i8 to float conversion");
16943     auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16944     auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16945     auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16946     I->replaceAllUsesWith(SI);
16947     I->eraseFromParent();
16948     return true;
16949   }
16950 
16951   // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16952   // followed by a truncate lowered to using tbl.4.
16953   auto *FPToUI = dyn_cast<FPToUIInst>(I);
16954   if (FPToUI &&
16955       (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16956       SrcTy->getElementType()->isFloatTy() &&
16957       DstTy->getElementType()->isIntegerTy(8)) {
16958     IRBuilder<> Builder(I);
16959     auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16960                                           VectorType::getInteger(SrcTy));
16961     auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16962     I->replaceAllUsesWith(TruncI);
16963     I->eraseFromParent();
16964     createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16965     return true;
16966   }
16967 
16968   // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16969   // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16970   // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16971   // registers
16972   auto *TI = dyn_cast<TruncInst>(I);
16973   if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16974       ((SrcTy->getElementType()->isIntegerTy(32) ||
16975         SrcTy->getElementType()->isIntegerTy(64)) &&
16976        (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16977     createTblForTrunc(TI, Subtarget->isLittleEndian());
16978     return true;
16979   }
16980 
16981   return false;
16982 }
16983 
16984 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16985                                           Align &RequiredAligment) const {
16986   if (!LoadedType.isSimple() ||
16987       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16988     return false;
16989   // Cyclone supports unaligned accesses.
16990   RequiredAligment = Align(1);
16991   unsigned NumBits = LoadedType.getSizeInBits();
16992   return NumBits == 32 || NumBits == 64;
16993 }
16994 
16995 /// A helper function for determining the number of interleaved accesses we
16996 /// will generate when lowering accesses of the given type.
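/// For example, a <16 x i32> access (512 bits) lowered with 128-bit NEON
/// vectors needs ceil(512 / 128) = 4 interleaved accesses (illustrative).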
16997 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16998     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16999   unsigned VecSize = 128;
17000   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17001   unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17002   if (UseScalable && isa<FixedVectorType>(VecTy))
17003     VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17004   return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17005 }
17006 
17007 MachineMemOperand::Flags
17008 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17009   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17010       I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17011     return MOStridedAccess;
17012   return MachineMemOperand::MONone;
17013 }
17014 
17015 bool AArch64TargetLowering::isLegalInterleavedAccessType(
17016     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17017   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17018   auto EC = VecTy->getElementCount();
17019   unsigned MinElts = EC.getKnownMinValue();
17020 
17021   UseScalable = false;
17022 
17023   if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17024       (!Subtarget->useSVEForFixedLengthVectors() ||
17025        !getSVEPredPatternFromNumElements(MinElts)))
17026     return false;
17027 
17028   if (isa<ScalableVectorType>(VecTy) &&
17029       !Subtarget->isSVEorStreamingSVEAvailable())
17030     return false;
17031 
17032   // Ensure the number of vector elements is greater than 1.
17033   if (MinElts < 2)
17034     return false;
17035 
17036   // Ensure the element type is legal.
17037   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17038     return false;
17039 
17040   if (EC.isScalable()) {
17041     UseScalable = true;
17042     return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17043   }
17044 
17045   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17046   if (Subtarget->useSVEForFixedLengthVectors()) {
17047     unsigned MinSVEVectorSize =
17048         std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17049     if (VecSize % MinSVEVectorSize == 0 ||
17050         (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17051          (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17052       UseScalable = true;
17053       return true;
17054     }
17055   }
17056 
17057   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17058   // 128 will be split into multiple interleaved accesses.
17059   return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17060 }
17061 
17062 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17063   if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17064     return ScalableVectorType::get(VTy->getElementType(), 2);
17065 
17066   if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17067     return ScalableVectorType::get(VTy->getElementType(), 4);
17068 
17069   if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17070     return ScalableVectorType::get(VTy->getElementType(), 8);
17071 
17072   if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17073     return ScalableVectorType::get(VTy->getElementType(), 8);
17074 
17075   if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17076     return ScalableVectorType::get(VTy->getElementType(), 2);
17077 
17078   if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17079     return ScalableVectorType::get(VTy->getElementType(), 4);
17080 
17081   if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17082     return ScalableVectorType::get(VTy->getElementType(), 8);
17083 
17084   if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17085     return ScalableVectorType::get(VTy->getElementType(), 16);
17086 
17087   llvm_unreachable("Cannot handle input vector type");
17088 }
17089 
17090 static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17091                                            bool Scalable, Type *LDVTy,
17092                                            Type *PtrTy) {
17093   assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17094   static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17095                                             Intrinsic::aarch64_sve_ld3_sret,
17096                                             Intrinsic::aarch64_sve_ld4_sret};
17097   static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17098                                              Intrinsic::aarch64_neon_ld3,
17099                                              Intrinsic::aarch64_neon_ld4};
17100   if (Scalable)
17101     return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17102 
17103   return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17104                                            {LDVTy, PtrTy});
17105 }
17106 
17107 static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17108                                             bool Scalable, Type *STVTy,
17109                                             Type *PtrTy) {
17110   assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17111   static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17112                                              Intrinsic::aarch64_sve_st3,
17113                                              Intrinsic::aarch64_sve_st4};
17114   static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17115                                               Intrinsic::aarch64_neon_st3,
17116                                               Intrinsic::aarch64_neon_st4};
17117   if (Scalable)
17118     return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17119 
17120   return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17121                                            {STVTy, PtrTy});
17122 }
17123 
17124 /// Lower an interleaved load into a ldN intrinsic.
17125 ///
17126 /// E.g. Lower an interleaved load (Factor = 2):
17127 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17128 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
17129 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
17130 ///
17131 ///      Into:
17132 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17133 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17134 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17135 bool AArch64TargetLowering::lowerInterleavedLoad(
17136     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17137     ArrayRef<unsigned> Indices, unsigned Factor) const {
17138   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17139          "Invalid interleave factor");
17140   assert(!Shuffles.empty() && "Empty shufflevector input");
17141   assert(Shuffles.size() == Indices.size() &&
17142          "Unmatched number of shufflevectors and indices");
17143 
17144   const DataLayout &DL = LI->getDataLayout();
17145 
17146   VectorType *VTy = Shuffles[0]->getType();
17147 
17148   // Skip if we do not have NEON and skip illegal vector types. We can
17149   // "legalize" wide vector types into multiple interleaved accesses as long as
17150   // the vector types are divisible by 128.
17151   bool UseScalable;
17152   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17153     return false;
17154 
17155   // Check if the interleave is a zext(shuffle), that can be better optimized
17156   // into shift / and masks. For the moment we do this just for uitofp (not
17157   // zext) to avoid issues with widening instructions.
17158   if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17159         return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17160                SI->getType()->getScalarSizeInBits() * 4 ==
17161                    SI->user_back()->getType()->getScalarSizeInBits();
17162       }))
17163     return false;
17164 
17165   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17166 
17167   auto *FVTy = cast<FixedVectorType>(VTy);
17168 
17169   // A pointer vector cannot be the return type of the ldN intrinsics. We need to
17170   // load integer vectors first and then convert to pointer vectors.
17171   Type *EltTy = FVTy->getElementType();
17172   if (EltTy->isPointerTy())
17173     FVTy =
17174         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17175 
17176   // If we're going to generate more than one load, reset the sub-vector type
17177   // to something legal.
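  // For example, a factor-2 interleave whose deinterleaved halves are
  // <16 x i32> is split into four ld2 calls on <4 x i32> sub-vectors
  // (illustrative).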
17178   FVTy = FixedVectorType::get(FVTy->getElementType(),
17179                               FVTy->getNumElements() / NumLoads);
17180 
17181   auto *LDVTy =
17182       UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17183 
17184   IRBuilder<> Builder(LI);
17185 
17186   // The base address of the load.
17187   Value *BaseAddr = LI->getPointerOperand();
17188 
17189   Type *PtrTy = LI->getPointerOperandType();
17190   Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17191                                  LDVTy->getElementCount());
17192 
17193   Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17194                                                 UseScalable, LDVTy, PtrTy);
17195 
17196   // Holds sub-vectors extracted from the load intrinsic return values. The
17197   // sub-vectors are associated with the shufflevector instructions they will
17198   // replace.
17199   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17200 
17201   Value *PTrue = nullptr;
17202   if (UseScalable) {
17203     std::optional<unsigned> PgPattern =
17204         getSVEPredPatternFromNumElements(FVTy->getNumElements());
17205     if (Subtarget->getMinSVEVectorSizeInBits() ==
17206             Subtarget->getMaxSVEVectorSizeInBits() &&
17207         Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17208       PgPattern = AArch64SVEPredPattern::all;
17209 
17210     auto *PTruePat =
17211         ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17212     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17213                                     {PTruePat});
17214   }
17215 
17216   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17217 
17218     // If we're generating more than one load, compute the base address of
17219     // subsequent loads as an offset from the previous.
17220     if (LoadCount > 0)
17221       BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17222                                             FVTy->getNumElements() * Factor);
17223 
17224     CallInst *LdN;
17225     if (UseScalable)
17226       LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17227     else
17228       LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17229 
17230     // Extract and store the sub-vectors returned by the load intrinsic.
17231     for (unsigned i = 0; i < Shuffles.size(); i++) {
17232       ShuffleVectorInst *SVI = Shuffles[i];
17233       unsigned Index = Indices[i];
17234 
17235       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17236 
17237       if (UseScalable)
17238         SubVec = Builder.CreateExtractVector(
17239             FVTy, SubVec,
17240             ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
17241 
17242       // Convert the integer vector to pointer vector if the element is pointer.
17243       if (EltTy->isPointerTy())
17244         SubVec = Builder.CreateIntToPtr(
17245             SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17246                                          FVTy->getNumElements()));
17247 
17248       SubVecs[SVI].push_back(SubVec);
17249     }
17250   }
17251 
17252   // Replace uses of the shufflevector instructions with the sub-vectors
17253   // returned by the load intrinsic. If a shufflevector instruction is
17254   // associated with more than one sub-vector, those sub-vectors will be
17255   // concatenated into a single wide vector.
17256   for (ShuffleVectorInst *SVI : Shuffles) {
17257     auto &SubVec = SubVecs[SVI];
17258     auto *WideVec =
17259         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17260     SVI->replaceAllUsesWith(WideVec);
17261   }
17262 
17263   return true;
17264 }
17265 
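// Scan a short window of instructions and report whether there is another
// store whose address is exactly 16 bytes away from Ptr; e.g. a store to [p]
// with a neighbouring store to [p, #16] (illustrative).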
17266 template <typename Iter>
17267 bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17268   int MaxLookupDist = 20;
17269   unsigned IdxWidth = DL.getIndexSizeInBits(0);
17270   APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17271   const Value *PtrA1 =
17272       Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17273 
17274   while (++It != End) {
17275     if (It->isDebugOrPseudoInst())
17276       continue;
17277     if (MaxLookupDist-- == 0)
17278       break;
17279     if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17280       const Value *PtrB1 =
17281           SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17282               DL, OffsetB);
17283       if (PtrA1 == PtrB1 &&
17284           (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17285                   .abs() == 16)
17286         return true;
17287     }
17288   }
17289 
17290   return false;
17291 }
17292 
17293 /// Lower an interleaved store into a stN intrinsic.
17294 ///
17295 /// E.g. Lower an interleaved store (Factor = 3):
17296 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17297 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17298 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
17299 ///
17300 ///      Into:
17301 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17302 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17303 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17304 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17305 ///
17306 /// Note that the new shufflevectors will be removed and we'll only generate one
17307 /// st3 instruction in CodeGen.
17308 ///
17309 /// Example for a more general valid mask (Factor 3). Lower:
17310 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17311 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17312 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
17313 ///
17314 ///      Into:
17315 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17316 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17317 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17318 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17319 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17320                                                   ShuffleVectorInst *SVI,
17321                                                   unsigned Factor) const {
17322 
17323   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17324          "Invalid interleave factor");
17325 
17326   auto *VecTy = cast<FixedVectorType>(SVI->getType());
17327   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17328 
17329   unsigned LaneLen = VecTy->getNumElements() / Factor;
17330   Type *EltTy = VecTy->getElementType();
17331   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17332 
17333   const DataLayout &DL = SI->getDataLayout();
17334   bool UseScalable;
17335 
17336   // Skip if we do not have NEON and skip illegal vector types. We can
17337   // "legalize" wide vector types into multiple interleaved accesses as long as
17338   // the vector types are divisible by 128.
17339   if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17340     return false;
17341 
17342   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17343 
17344   Value *Op0 = SVI->getOperand(0);
17345   Value *Op1 = SVI->getOperand(1);
17346   IRBuilder<> Builder(SI);
17347 
17348   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17349   // vectors to integer vectors.
17350   if (EltTy->isPointerTy()) {
17351     Type *IntTy = DL.getIntPtrType(EltTy);
17352     unsigned NumOpElts =
17353         cast<FixedVectorType>(Op0->getType())->getNumElements();
17354 
17355     // Convert to the corresponding integer vector.
17356     auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17357     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17358     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17359 
17360     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17361   }
17362 
17363   // If we're going to generate more than one store, reset the lane length
17364   // and sub-vector type to something legal.
17365   LaneLen /= NumStores;
17366   SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17367 
17368   auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17369                             : SubVecTy;
17370 
17371   // The base address of the store.
17372   Value *BaseAddr = SI->getPointerOperand();
17373 
17374   auto Mask = SVI->getShuffleMask();
17375 
17376   // Bail out if none of the mask indices is in range. If the shuffle mask is
17377   // entirely `poison`, `Mask` is a vector of -1s and indexing with it below
17378   // would read out of bounds.
17379   if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17380     return false;
17381   }
17382   // A 64-bit st2 which does not start at element 0 will involve adding extra
17383   // ext elements, making the st2 unprofitable. Also, if there is a nearby store
17384   // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
17385   // zip;stp pair, which has higher throughput.
17386   if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17387       (Mask[0] != 0 ||
17388        hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17389                             DL) ||
17390        hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17391                             BaseAddr, DL)))
17392     return false;
17393 
17394   Type *PtrTy = SI->getPointerOperandType();
17395   Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17396                                  STVTy->getElementCount());
17397 
17398   Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17399                                                  UseScalable, STVTy, PtrTy);
17400 
17401   Value *PTrue = nullptr;
17402   if (UseScalable) {
17403     std::optional<unsigned> PgPattern =
17404         getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17405     if (Subtarget->getMinSVEVectorSizeInBits() ==
17406             Subtarget->getMaxSVEVectorSizeInBits() &&
17407         Subtarget->getMinSVEVectorSizeInBits() ==
17408             DL.getTypeSizeInBits(SubVecTy))
17409       PgPattern = AArch64SVEPredPattern::all;
17410 
17411     auto *PTruePat =
17412         ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17413     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17414                                     {PTruePat});
17415   }
17416 
17417   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17418 
17419     SmallVector<Value *, 5> Ops;
17420 
17421     // Split the shufflevector operands into sub vectors for the new stN call.
17422     for (unsigned i = 0; i < Factor; i++) {
17423       Value *Shuffle;
17424       unsigned IdxI = StoreCount * LaneLen * Factor + i;
17425       if (Mask[IdxI] >= 0) {
17426         Shuffle = Builder.CreateShuffleVector(
17427             Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17428       } else {
17429         unsigned StartMask = 0;
17430         for (unsigned j = 1; j < LaneLen; j++) {
17431           unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17432           if (Mask[IdxJ] >= 0) {
17433             StartMask = Mask[IdxJ] - j;
17434             break;
17435           }
17436         }
17437         // Note: Filling undef gaps with arbitrary elements is OK, since those
17438         // elements were being written anyway (with undefs).
17439         // In the case of all undefs we default to using elements from lane 0.
17440         // Note: StartMask cannot be negative; that is checked in
17441         // isReInterleaveMask.
17442         Shuffle = Builder.CreateShuffleVector(
17443             Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17444       }
17445 
17446       if (UseScalable)
17447         Shuffle = Builder.CreateInsertVector(
17448             STVTy, UndefValue::get(STVTy), Shuffle,
17449             ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17450 
17451       Ops.push_back(Shuffle);
17452     }
17453 
17454     if (UseScalable)
17455       Ops.push_back(PTrue);
17456 
17457     // If we're generating more than one store, we compute the base address of
17458     // subsequent stores as an offset from the previous.
17459     if (StoreCount > 0)
17460       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17461                                             BaseAddr, LaneLen * Factor);
17462 
17463     Ops.push_back(BaseAddr);
17464     Builder.CreateCall(StNFunc, Ops);
17465   }
17466   return true;
17467 }
17468 
17469 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17470     LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
17471   unsigned Factor = DeinterleavedValues.size();
17472   if (Factor != 2 && Factor != 4) {
17473     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17474     return false;
17475   }
17476 
17477   VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
17478 
17479   const DataLayout &DL = LI->getModule()->getDataLayout();
17480   bool UseScalable;
17481   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17482     return false;
17483 
17484   // TODO: Add support for using SVE instructions with fixed types later, using
17485   // the code from lowerInterleavedLoad to obtain the correct container type.
17486   if (UseScalable && !VTy->isScalableTy())
17487     return false;
17488 
17489   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17490   VectorType *LdTy =
17491       VectorType::get(VTy->getElementType(),
17492                       VTy->getElementCount().divideCoefficientBy(NumLoads));
17493 
17494   Type *PtrTy = LI->getPointerOperandType();
17495   Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17496                                                 UseScalable, LdTy, PtrTy);
17497 
17498   IRBuilder<> Builder(LI);
17499   Value *Pred = nullptr;
17500   if (UseScalable)
17501     Pred =
17502         Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17503 
17504   Value *BaseAddr = LI->getPointerOperand();
17505   if (NumLoads > 1) {
17506     // Create multiple legal small ldN.
17507     SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17508     for (unsigned I = 0; I < NumLoads; ++I) {
17509       Value *Offset = Builder.getInt64(I * Factor);
17510 
17511       Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17512       Value *LdN = nullptr;
17513       if (UseScalable)
17514         LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17515       else
17516         LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17517       Value *Idx =
17518           Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17519       for (unsigned J = 0; J < Factor; ++J) {
17520         ExtractedLdValues[J] = Builder.CreateInsertVector(
17521             VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17522       }
17523       LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17524     }
17525     // Replace the outputs of the deinterleave2 intrinsic with those of ld2/ld4.
17526     for (unsigned J = 0; J < Factor; ++J)
17527       DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
17528   } else {
17529     Value *Result;
17530     if (UseScalable)
17531       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17532     else
17533       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17534     // Replace the outputs of the deinterleave2 intrinsic with those of ld2/ld4.
17535     for (unsigned I = 0; I < Factor; I++) {
17536       Value *NewExtract = Builder.CreateExtractValue(Result, I);
17537       DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17538     }
17539   }
17540   return true;
17541 }
17542 
17543 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17544     StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
17545   unsigned Factor = InterleavedValues.size();
17546   if (Factor != 2 && Factor != 4) {
17547     LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17548     return false;
17549   }
17550 
17551   VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17552   const DataLayout &DL = SI->getModule()->getDataLayout();
17553 
17554   bool UseScalable;
17555   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17556     return false;
17557 
17558   // TODO: Add support for using SVE instructions with fixed types later, using
17559   // the code from lowerInterleavedStore to obtain the correct container type.
17560   if (UseScalable && !VTy->isScalableTy())
17561     return false;
17562 
17563   unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17564 
17565   VectorType *StTy =
17566       VectorType::get(VTy->getElementType(),
17567                       VTy->getElementCount().divideCoefficientBy(NumStores));
17568 
17569   Type *PtrTy = SI->getPointerOperandType();
17570   Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17571                                                  UseScalable, StTy, PtrTy);
17572 
17573   IRBuilder<> Builder(SI);
17574 
17575   Value *BaseAddr = SI->getPointerOperand();
17576   Value *Pred = nullptr;
17577 
17578   if (UseScalable)
17579     Pred =
17580         Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17581 
17582   auto ExtractedValues = InterleavedValues;
17583   SmallVector<Value *, 4> StoreOperands(InterleavedValues.begin(),
17584                                         InterleavedValues.end());
17585   if (UseScalable)
17586     StoreOperands.push_back(Pred);
17587   StoreOperands.push_back(BaseAddr);
17588   for (unsigned I = 0; I < NumStores; ++I) {
17589     Value *Address = BaseAddr;
17590     if (NumStores > 1) {
17591       Value *Offset = Builder.getInt64(I * Factor);
17592       Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17593       Value *Idx =
17594           Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17595       for (unsigned J = 0; J < Factor; J++) {
17596         StoreOperands[J] =
17597             Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17598       }
17599       // Update the address operand to point at the current chunk.
17600       StoreOperands[StoreOperands.size() - 1] = Address;
17601     }
17602     Builder.CreateCall(StNFunc, StoreOperands);
17603   }
17604   return true;
17605 }
17606 
17607 EVT AArch64TargetLowering::getOptimalMemOpType(
17608     const MemOp &Op, const AttributeList &FuncAttributes) const {
17609   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17610   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17611   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17612   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
17613   // would take one instruction to materialize the v2i64 zero and one store
17614   // (with a restrictive addressing mode), so just do i64 stores.
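  // Illustrative example (not from the original comments): a 64-byte,
  // 16-byte-aligned memset gets MVT::v16i8 here and is typically expanded with
  // 16-byte vector stores, while a 16-byte memset counts as "small" and falls
  // through to MVT::i64, i.e. a pair of 8-byte stores.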
17615   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17616   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17617     if (Op.isAligned(AlignCheck))
17618       return true;
17619     unsigned Fast;
17620     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17621                                           MachineMemOperand::MONone, &Fast) &&
17622            Fast;
17623   };
17624 
17625   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17626       AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17627     return MVT::v16i8;
17628   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17629     return MVT::f128;
17630   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17631     return MVT::i64;
17632   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17633     return MVT::i32;
17634   return MVT::Other;
17635 }
17636 
17637 LLT AArch64TargetLowering::getOptimalMemOpLLT(
17638     const MemOp &Op, const AttributeList &FuncAttributes) const {
17639   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17640   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17641   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17642   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
17643   // would take one instruction to materialize the v2i64 zero and one store
17644   // (with a restrictive addressing mode), so just do i64 stores.
17645   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17646   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17647     if (Op.isAligned(AlignCheck))
17648       return true;
17649     unsigned Fast;
17650     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17651                                           MachineMemOperand::MONone, &Fast) &&
17652            Fast;
17653   };
17654 
17655   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17656       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17657     return LLT::fixed_vector(2, 64);
17658   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17659     return LLT::scalar(128);
17660   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17661     return LLT::scalar(64);
17662   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17663     return LLT::scalar(32);
17664   return LLT();
17665 }
17666 
17667 // 12-bit optionally shifted immediates are legal for adds.
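// For instance: 0xfff fits in 12 bits and 0x555000 is a 12-bit value shifted
// left by 12, so both are legal; 0x1001 has bits in both halves and is
// rejected, as is INT64_MIN (std::abs on it would be UB).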
17668 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17669   if (Immed == std::numeric_limits<int64_t>::min()) {
17670     LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17671                       << ": avoid UB for INT64_MIN\n");
17672     return false;
17673   }
17674   // Same encoding for add/sub, just flip the sign.
17675   Immed = std::abs(Immed);
17676   bool IsLegal = ((Immed >> 12) == 0 ||
17677                   ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17678   LLVM_DEBUG(dbgs() << "Is " << Immed
17679                     << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17680   return IsLegal;
17681 }
17682 
17683 bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17684   // We will only emit addvl/inc* instructions for SVE2
17685   if (!Subtarget->hasSVE2())
17686     return false;
17687 
17688   // addvl's immediates are in terms of the number of bytes in a register.
17689   // Since there are 16 bytes in the base supported size (128 bits), we need to
17690   // divide the immediate by that much to give us a useful immediate to
17691   // multiply by vscale. We can't have a remainder as a result of this.
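  // For example (an illustration of the checks below): Imm == 64 maps to
  // "addvl #4" (64 / 16 fits the signed 6-bit multiplier), while Imm == 24 is
  // not a multiple of 16 and is instead handled as "inch" with multiplier 3
  // and pattern 'all'.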
17692   if (Imm % 16 == 0)
17693     return isInt<6>(Imm / 16);
17694 
17695   // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17696   // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17697   // of addvl as a result, so only take h|w|d into account.
17698   // Dec[h|w|d] will cover subtractions.
17699   // Immediates are in the range [1,16], so we can't do a 2's complement check.
17700   // FIXME: Can we make use of other patterns to cover other immediates?
17701 
17702   // inch|dech
17703   if (Imm % 8 == 0)
17704     return std::abs(Imm / 8) <= 16;
17705   // incw|decw
17706   if (Imm % 4 == 0)
17707     return std::abs(Imm / 4) <= 16;
17708   // incd|decd
17709   if (Imm % 2 == 0)
17710     return std::abs(Imm / 2) <= 16;
17711 
17712   return false;
17713 }
17714 
17715 // Return false to prevent folding
17716 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17717 // if the folding leads to worse code.
17718 bool AArch64TargetLowering::isMulAddWithConstProfitable(
17719     SDValue AddNode, SDValue ConstNode) const {
17720   // Let the DAGCombiner decide for vector types and large types.
17721   const EVT VT = AddNode.getValueType();
17722   if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17723     return true;
17724 
17725   // It is worse if c1 is a legal add immediate while c1*c2 is not,
17726   // and c1*c2 has to be composed with at least two instructions.
17727   const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17728   const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17729   const int64_t C1 = C1Node->getSExtValue();
17730   const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17731   if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17732     return true;
17733   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17734   // Adapt to the width of a register.
17735   unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17736   AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17737   if (Insn.size() > 1)
17738     return false;
17739 
17740   // Default to true and let the DAGCombiner decide.
17741   return true;
17742 }
17743 
17744 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17745 // immediates is the same as for an add or a sub.
17746 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17747   return isLegalAddImmediate(Immed);
17748 }
17749 
17750 /// isLegalAddressingMode - Return true if the addressing mode represented
17751 /// by AM is legal for this target, for a load/store of the specified type.
17752 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17753                                                   const AddrMode &AMode, Type *Ty,
17754                                                   unsigned AS, Instruction *I) const {
17755   // AArch64 has five basic addressing modes:
17756   //  reg
17757   //  reg + 9-bit signed offset
17758   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
17759   //  reg1 + reg2
17760   //  reg + SIZE_IN_BYTES * reg
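  //
  // For an i64 load these correspond roughly to forms like:
  //   ldr  x0, [x1]              ; reg
  //   ldur x0, [x1, #-256]       ; reg + 9-bit signed offset
  //   ldr  x0, [x1, #32760]      ; reg + 8 * uimm12
  //   ldr  x0, [x1, x2]          ; reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]  ; reg + 8 * reg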
17761 
17762   // No global is ever allowed as a base.
17763   if (AMode.BaseGV)
17764     return false;
17765 
17766   // No reg+reg+imm addressing.
17767   if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17768     return false;
17769 
17770   // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17771   // `2*ScaledReg` into `BaseReg + ScaledReg`
17772   AddrMode AM = AMode;
17773   if (AM.Scale && !AM.HasBaseReg) {
17774     if (AM.Scale == 1) {
17775       AM.HasBaseReg = true;
17776       AM.Scale = 0;
17777     } else if (AM.Scale == 2) {
17778       AM.HasBaseReg = true;
17779       AM.Scale = 1;
17780     } else {
17781       return false;
17782     }
17783   }
17784 
17785   // A base register is required in all addressing modes.
17786   if (!AM.HasBaseReg)
17787     return false;
17788 
17789   if (Ty->isScalableTy()) {
17790     if (isa<ScalableVectorType>(Ty)) {
17791       // See if we have a foldable vscale-based offset, for vector types which
17792       // are either legal or smaller than the minimum; more work will be
17793       // required if we need to consider addressing for types which need
17794       // legalization by splitting.
17795       uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17796       if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17797           (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17798           isPowerOf2_64(VecNumBytes))
17799         return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17800 
17801       uint64_t VecElemNumBytes =
17802           DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17803       return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17804              (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17805     }
17806 
17807     return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17808   }
17809 
17810   // No scalable offsets allowed for non-scalable types.
17811   if (AM.ScalableOffset)
17812     return false;
17813 
17814   // check reg + imm case:
17815   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17816   uint64_t NumBytes = 0;
17817   if (Ty->isSized()) {
17818     uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17819     NumBytes = NumBits / 8;
17820     if (!isPowerOf2_64(NumBits))
17821       NumBytes = 0;
17822   }
17823 
17824   return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17825                                                           AM.Scale);
17826 }
17827 
17828 // Check whether the two offsets belong to the same imm24 range and share the
17829 // same high 12 bits; if so, the high part can be encoded as an add immediate.
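// Worked example (illustrative): MinOffset = 0x1234 and MaxOffset = 0x1ff8
// share the high part 0x1000, which is itself a legal add immediate, so 0x1000
// is returned and the remaining low 12 bits can go into the load/store offset.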
17830 int64_t
17831 AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17832                                                       int64_t MaxOffset) const {
17833   int64_t HighPart = MinOffset & ~0xfffULL;
17834   if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17835     // Rebase the value to an integer multiple of imm12.
17836     return HighPart;
17837   }
17838 
17839   return 0;
17840 }
17841 
17842 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17843   // Consider splitting large offset of struct or array.
17844   return true;
17845 }
17846 
17847 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17848     const MachineFunction &MF, EVT VT) const {
17849   VT = VT.getScalarType();
17850 
17851   if (!VT.isSimple())
17852     return false;
17853 
17854   switch (VT.getSimpleVT().SimpleTy) {
17855   case MVT::f16:
17856     return Subtarget->hasFullFP16();
17857   case MVT::f32:
17858   case MVT::f64:
17859     return true;
17860   default:
17861     break;
17862   }
17863 
17864   return false;
17865 }
17866 
17867 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17868                                                        Type *Ty) const {
17869   switch (Ty->getScalarType()->getTypeID()) {
17870   case Type::FloatTyID:
17871   case Type::DoubleTyID:
17872     return true;
17873   default:
17874     return false;
17875   }
17876 }
17877 
17878 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17879     EVT VT, CodeGenOptLevel OptLevel) const {
17880   return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17881          !useSVEForFixedLengthVectorVT(VT);
17882 }
17883 
17884 const MCPhysReg *
17885 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17886   // LR is a callee-save register, but we must treat it as clobbered by any call
17887   // site. Hence we include LR in the scratch registers, which are in turn added
17888   // as implicit-defs for stackmaps and patchpoints.
17889   static const MCPhysReg ScratchRegs[] = {
17890     AArch64::X16, AArch64::X17, AArch64::LR, 0
17891   };
17892   return ScratchRegs;
17893 }
17894 
17895 ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17896   static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17897   return RCRegs;
17898 }
17899 
17900 bool
17901 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17902                                                      CombineLevel Level) const {
17903   assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17904           N->getOpcode() == ISD::SRL) &&
17905          "Expected shift op");
17906 
17907   SDValue ShiftLHS = N->getOperand(0);
17908   EVT VT = N->getValueType(0);
17909 
17910   if (!ShiftLHS->hasOneUse())
17911     return false;
17912 
17913   if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
17914       !ShiftLHS.getOperand(0)->hasOneUse())
17915     return false;
17916 
17917   // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
17918   // combine it with shift 'N', so that it can be lowered to UBFX, except for:
17919   // ((x >> C) & mask) << C.
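  // For example (illustrative): ((x >> 16) & 0xff) << 4 is left alone so the
  // extract can become UBFX, whereas ((x >> 16) & 0xff) << 16 is still
  // commuted because the two shift amounts cancel.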
17920   if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17921       isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17922     uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17923     if (isMask_64(TruncMask)) {
17924       SDValue AndLHS = ShiftLHS.getOperand(0);
17925       if (AndLHS.getOpcode() == ISD::SRL) {
17926         if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17927           if (N->getOpcode() == ISD::SHL)
17928             if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17929               return SRLC->getZExtValue() == SHLC->getZExtValue();
17930           return false;
17931         }
17932       }
17933     }
17934   }
17935   return true;
17936 }
17937 
17938 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17939     const SDNode *N) const {
17940   assert(N->getOpcode() == ISD::XOR &&
17941          (N->getOperand(0).getOpcode() == ISD::SHL ||
17942           N->getOperand(0).getOpcode() == ISD::SRL) &&
17943          "Expected XOR(SHIFT) pattern");
17944 
17945   // Only commute if the entire NOT mask is a hidden shifted mask.
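  // For example (illustrative), with i32 operands:
  // xor(shl(x, 8), 0xffffff00) commutes because the NOT mask covers exactly
  // the bits produced by the shift, while xor(shl(x, 8), 0xff00) does not.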
17946   auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17947   auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17948   if (XorC && ShiftC) {
17949     unsigned MaskIdx, MaskLen;
17950     if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17951       unsigned ShiftAmt = ShiftC->getZExtValue();
17952       unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17953       if (N->getOperand(0).getOpcode() == ISD::SHL)
17954         return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17955       return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17956     }
17957   }
17958 
17959   return false;
17960 }
17961 
17962 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17963     const SDNode *N, CombineLevel Level) const {
17964   assert(((N->getOpcode() == ISD::SHL &&
17965            N->getOperand(0).getOpcode() == ISD::SRL) ||
17966           (N->getOpcode() == ISD::SRL &&
17967            N->getOperand(0).getOpcode() == ISD::SHL)) &&
17968          "Expected shift-shift mask");
17969   // Don't allow multiuse shift folding with the same shift amount.
17970   if (!N->getOperand(0)->hasOneUse())
17971     return false;
17972 
17973   // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17974   EVT VT = N->getValueType(0);
17975   if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17976     auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17977     auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
17978     return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17979   }
17980 
17981   // We do not need to fold when this shift is used in the specific load case:
17982   // (ldr x, (add x, (shl (srl x, c1) 2)))
17983   if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
17984     if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
17985       unsigned ShlAmt = C2->getZExtValue();
17986       if (auto ShouldADD = *N->user_begin();
17987           ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
17988         if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
17989           unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
17990           if ((1ULL << ShlAmt) == ByteVT &&
17991               isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
17992             return false;
17993         }
17994       }
17995     }
17996   }
17997 
17998   return true;
17999 }
18000 
18001 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18002     unsigned BinOpcode, EVT VT) const {
18003   return VT.isScalableVector() && isTypeLegal(VT);
18004 }
18005 
18006 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18007                                                               Type *Ty) const {
18008   assert(Ty->isIntegerTy());
18009 
18010   unsigned BitSize = Ty->getPrimitiveSizeInBits();
18011   if (BitSize == 0)
18012     return false;
18013 
18014   int64_t Val = Imm.getSExtValue();
18015   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18016     return true;
18017 
18018   if ((int64_t)Val < 0)
18019     Val = ~Val;
18020   if (BitSize == 32)
18021     Val &= (1LL << 32) - 1;
18022 
18023   unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18024   // MOVZ is free so return true for one or fewer MOVK.
18025   return Shift < 3;
18026 }
18027 
18028 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18029                                                     unsigned Index) const {
18030   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18031     return false;
18032 
18033   return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18034 }
18035 
18036 /// Turn vector tests of the signbit in the form of:
18037 ///   xor (sra X, elt_size(X)-1), -1
18038 /// into:
18039 ///   cmge X, X, #0
18040 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18041                                          const AArch64Subtarget *Subtarget) {
18042   EVT VT = N->getValueType(0);
18043   if (!Subtarget->hasNEON() || !VT.isVector())
18044     return SDValue();
18045 
18046   // There must be a shift right algebraic before the xor, and the xor must be a
18047   // 'not' operation.
18048   SDValue Shift = N->getOperand(0);
18049   SDValue Ones = N->getOperand(1);
18050   if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18051       !ISD::isBuildVectorAllOnes(Ones.getNode()))
18052     return SDValue();
18053 
18054   // The shift should be smearing the sign bit across each vector element.
18055   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18056   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18057   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18058     return SDValue();
18059 
18060   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
18061 }
18062 
18063 // Given a vecreduce_add node, detect the below pattern and convert it to the
18064 // node sequence with UABDL, [S|U]ABD and UADDLP.
18065 //
18066 // i32 vecreduce_add(
18067 //  v16i32 abs(
18068 //    v16i32 sub(
18069 //     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18070 // =================>
18071 // i32 vecreduce_add(
18072 //   v4i32 UADDLP(
18073 //     v8i16 add(
18074 //       v8i16 zext(
18075 //         v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18076 //       v8i16 zext(
18077 //         v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18078 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18079                                                     SelectionDAG &DAG) {
18080   // Assumed i32 vecreduce_add
18081   if (N->getValueType(0) != MVT::i32)
18082     return SDValue();
18083 
18084   SDValue VecReduceOp0 = N->getOperand(0);
18085   unsigned Opcode = VecReduceOp0.getOpcode();
18086   // Assumed v16i32 abs
18087   if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18088     return SDValue();
18089 
18090   SDValue ABS = VecReduceOp0;
18091   // Assumed v16i32 sub
18092   if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18093       ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18094     return SDValue();
18095 
18096   SDValue SUB = ABS->getOperand(0);
18097   unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18098   unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18099   // Assumed v16i32 type
18100   if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18101       SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18102     return SDValue();
18103 
18104   // Assumed zext or sext
18105   bool IsZExt = false;
18106   if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18107     IsZExt = true;
18108   } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18109     IsZExt = false;
18110   } else
18111     return SDValue();
18112 
18113   SDValue EXT0 = SUB->getOperand(0);
18114   SDValue EXT1 = SUB->getOperand(1);
18115   // Assumed zext's operand has v16i8 type
18116   if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18117       EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18118     return SDValue();
18119 
18120   // The pattern is detected. Let's convert it to a sequence of nodes.
18121   SDLoc DL(N);
18122 
18123   // First, create the node pattern of UABD/SABD.
18124   SDValue UABDHigh8Op0 =
18125       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18126                   DAG.getConstant(8, DL, MVT::i64));
18127   SDValue UABDHigh8Op1 =
18128       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18129                   DAG.getConstant(8, DL, MVT::i64));
18130   SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18131                                   UABDHigh8Op0, UABDHigh8Op1);
18132   SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18133 
18134   // Second, create the node pattern of UABAL.
18135   SDValue UABDLo8Op0 =
18136       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18137                   DAG.getConstant(0, DL, MVT::i64));
18138   SDValue UABDLo8Op1 =
18139       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18140                   DAG.getConstant(0, DL, MVT::i64));
18141   SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18142                                 UABDLo8Op0, UABDLo8Op1);
18143   SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18144   SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18145 
18146   // Third, create the node of UADDLP.
18147   SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18148 
18149   // Fourth, create the node of VECREDUCE_ADD.
18150   return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18151 }
18152 
18153 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
18154 //   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18155 //   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18156 // If we have vectors larger than v16i8 we extract v16i8 vectors, follow the
18157 // same steps above to get DOT instructions for each chunk, concatenate them,
18158 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
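// For example (illustrative):
//   i32 vecreduce.add(v16i32 zext(v16i8 %a))
// becomes
//   i32 vecreduce.add(v4i32 UDOT(zeroes, %a, splat(1)))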
18159 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18160                                           const AArch64Subtarget *ST) {
18161   if (!ST->isNeonAvailable())
18162     return SDValue();
18163 
18164   if (!ST->hasDotProd())
18165     return performVecReduceAddCombineWithUADDLP(N, DAG);
18166 
18167   SDValue Op0 = N->getOperand(0);
18168   if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18169       Op0.getValueType().getVectorElementType() != MVT::i32)
18170     return SDValue();
18171 
18172   unsigned ExtOpcode = Op0.getOpcode();
18173   SDValue A = Op0;
18174   SDValue B;
18175   unsigned DotOpcode;
18176   if (ExtOpcode == ISD::MUL) {
18177     A = Op0.getOperand(0);
18178     B = Op0.getOperand(1);
18179     if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18180       return SDValue();
18181     auto OpCodeA = A.getOpcode();
18182     if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18183       return SDValue();
18184 
18185     auto OpCodeB = B.getOpcode();
18186     if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18187       return SDValue();
18188 
18189     if (OpCodeA == OpCodeB) {
18190       DotOpcode =
18191           OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18192     } else {
18193       // Check for USDOT support.
18194       if (!ST->hasMatMulInt8())
18195         return SDValue();
18196       DotOpcode = AArch64ISD::USDOT;
18197       if (OpCodeA == ISD::SIGN_EXTEND)
18198         std::swap(A, B);
18199     }
18200   } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18201     DotOpcode = AArch64ISD::UDOT;
18202   } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18203     DotOpcode = AArch64ISD::SDOT;
18204   } else {
18205     return SDValue();
18206   }
18207 
18208   EVT Op0VT = A.getOperand(0).getValueType();
18209   bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18210   bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18211   if (!IsValidElementCount || !IsValidSize)
18212     return SDValue();
18213 
18214   SDLoc DL(Op0);
18215   // For non-mla reductions B can be set to 1. For MLA we take the operand of
18216   // the extend B.
18217   if (!B)
18218     B = DAG.getConstant(1, DL, Op0VT);
18219   else
18220     B = B.getOperand(0);
18221 
18222   unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18223   unsigned NumOfVecReduce;
18224   EVT TargetType;
18225   if (IsMultipleOf16) {
18226     NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18227     TargetType = MVT::v4i32;
18228   } else {
18229     NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18230     TargetType = MVT::v2i32;
18231   }
18232   // Handle the case where we need to generate only one Dot operation.
18233   if (NumOfVecReduce == 1) {
18234     SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18235     SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18236                               A.getOperand(0), B);
18237     return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18238   }
18239   // Generate Dot instructions that are multiple of 16.
18240   unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18241   SmallVector<SDValue, 4> SDotVec16;
18242   unsigned I = 0;
18243   for (; I < VecReduce16Num; I += 1) {
18244     SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18245     SDValue Op0 =
18246         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18247                     DAG.getConstant(I * 16, DL, MVT::i64));
18248     SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18249                               DAG.getConstant(I * 16, DL, MVT::i64));
18250     SDValue Dot =
18251         DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18252     SDotVec16.push_back(Dot);
18253   }
18254   // Concatenate dot operations.
18255   EVT SDot16EVT =
18256       EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18257   SDValue ConcatSDot16 =
18258       DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18259   SDValue VecReduceAdd16 =
18260       DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18261   unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18262   if (VecReduce8Num == 0)
18263     return VecReduceAdd16;
18264 
18265   // Generate the remainder Dot operation that is multiple of 8.
18266   SmallVector<SDValue, 4> SDotVec8;
18267   SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18268   SDValue Vec8Op0 =
18269       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18270                   DAG.getConstant(I * 16, DL, MVT::i64));
18271   SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18272                                 DAG.getConstant(I * 16, DL, MVT::i64));
18273   SDValue Dot =
18274       DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18275   SDValue VecReduceAdd8 =
18276       DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18277   return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18278                      VecReduceAdd8);
18279 }
18280 
18281 // Given an (integer) vecreduce, we know the order of the inputs does not
18282 // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18283 // into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18284 // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18285 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18286   auto DetectAddExtract = [&](SDValue A) {
18287     // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18288     // UADDLP(x) if found.
18289     assert(A.getOpcode() == ISD::ADD);
18290     EVT VT = A.getValueType();
18291     SDValue Op0 = A.getOperand(0);
18292     SDValue Op1 = A.getOperand(1);
18293     if (Op0.getOpcode() != Op1.getOpcode() ||
18294         (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18295          Op0.getOpcode() != ISD::SIGN_EXTEND))
18296       return SDValue();
18297     SDValue Ext0 = Op0.getOperand(0);
18298     SDValue Ext1 = Op1.getOperand(0);
18299     if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18300         Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18301         Ext0.getOperand(0) != Ext1.getOperand(0))
18302       return SDValue();
18303     // Check that the type is twice the width of the add's type, and that the
18304     // extracts are from the upper/lower halves of the same source.
18305     if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18306         VT.getVectorNumElements() * 2)
18307       return SDValue();
18308     if ((Ext0.getConstantOperandVal(1) != 0 ||
18309          Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18310         (Ext1.getConstantOperandVal(1) != 0 ||
18311          Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18312       return SDValue();
18313     unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18314                                                           : AArch64ISD::SADDLP;
18315     return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18316   };
18317 
18318   if (SDValue R = DetectAddExtract(A))
18319     return R;
18320 
18321   if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18322     if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18323       return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18324                          A.getOperand(1));
18325   if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18326     if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18327       return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18328                          A.getOperand(0));
18329   return SDValue();
18330 }
18331 
18332 // We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18333 // UADDLV(concat), where the concat represents the 64-bit zext sources.
18334 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18335   // Look for add(zext(64-bit source), zext(64-bit source)), returning
18336   // UADDLV(concat(zext, zext)) if found.
18337   assert(A.getOpcode() == ISD::ADD);
18338   EVT VT = A.getValueType();
18339   if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18340     return SDValue();
18341   SDValue Op0 = A.getOperand(0);
18342   SDValue Op1 = A.getOperand(1);
18343   if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18344     return SDValue();
18345   SDValue Ext0 = Op0.getOperand(0);
18346   SDValue Ext1 = Op1.getOperand(0);
18347   EVT ExtVT0 = Ext0.getValueType();
18348   EVT ExtVT1 = Ext1.getValueType();
18349   // Check zext VTs are the same and 64-bit length.
18350   if (ExtVT0 != ExtVT1 ||
18351       VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18352     return SDValue();
18353   // Get VT for concat of zext sources.
18354   EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18355   SDValue Concat =
18356       DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18357 
18358   switch (VT.getSimpleVT().SimpleTy) {
18359   case MVT::v2i64:
18360   case MVT::v4i32:
18361     return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18362   case MVT::v8i16: {
18363     SDValue Uaddlv =
18364         DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18365     return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18366   }
18367   default:
18368     llvm_unreachable("Unhandled vector type");
18369   }
18370 }
18371 
18372 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18373   SDValue A = N->getOperand(0);
18374   if (A.getOpcode() == ISD::ADD) {
18375     if (SDValue R = performUADDVAddCombine(A, DAG))
18376       return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18377     else if (SDValue R = performUADDVZextCombine(A, DAG))
18378       return R;
18379   }
18380   return SDValue();
18381 }
18382 
18383 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18384                                  TargetLowering::DAGCombinerInfo &DCI,
18385                                  const AArch64Subtarget *Subtarget) {
18386   if (DCI.isBeforeLegalizeOps())
18387     return SDValue();
18388 
18389   return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18390 }
18391 
18392 SDValue
18393 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18394                                      SelectionDAG &DAG,
18395                                      SmallVectorImpl<SDNode *> &Created) const {
18396   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18397   if (isIntDivCheap(N->getValueType(0), Attr))
18398     return SDValue(N, 0); // Lower SDIV as SDIV
18399 
18400   EVT VT = N->getValueType(0);
18401 
18402   // For scalable and fixed types, mark them as cheap so we can handle them much
18403   // later. This allows us to handle larger-than-legal types.
18404   if (VT.isScalableVector() ||
18405       (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18406     return SDValue(N, 0);
18407 
18408   // fold (sdiv X, pow2)
18409   if ((VT != MVT::i32 && VT != MVT::i64) ||
18410       !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18411     return SDValue();
18412 
18413   // If the divisor is 2 or -2, the default expansion is better. It will add
18414   // (X >> (BitWidth - 1)) to the dividend X before shifting right.
18415   if (Divisor == 2 ||
18416       Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18417     return SDValue();
18418 
18419   return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18420 }
18421 
18422 SDValue
18423 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18424                                      SelectionDAG &DAG,
18425                                      SmallVectorImpl<SDNode *> &Created) const {
18426   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18427   if (isIntDivCheap(N->getValueType(0), Attr))
18428     return SDValue(N, 0); // Lower SREM as SREM
18429 
18430   EVT VT = N->getValueType(0);
18431 
18432   // For scalable and fixed types, mark them as cheap so we can handle them much
18433   // later. This allows us to handle larger-than-legal types.
18434   if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18435     return SDValue(N, 0);
18436 
18437   // fold (srem X, pow2)
18438   if ((VT != MVT::i32 && VT != MVT::i64) ||
18439       !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18440     return SDValue();
18441 
18442   unsigned Lg2 = Divisor.countr_zero();
18443   if (Lg2 == 0)
18444     return SDValue();
18445 
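  // Roughly: the remainder is (X & (2^k - 1)) when X is non-negative and
  // -((-X) & (2^k - 1)) otherwise, selected below with a CSNEG keyed off the
  // flags of a compare of X against zero.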
18446   SDLoc DL(N);
18447   SDValue N0 = N->getOperand(0);
18448   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18449   SDValue Zero = DAG.getConstant(0, DL, VT);
18450   SDValue CCVal, CSNeg;
18451   if (Lg2 == 1) {
18452     SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18453     SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18454     CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18455 
18456     Created.push_back(Cmp.getNode());
18457     Created.push_back(And.getNode());
18458   } else {
18459     SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
18460     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18461 
18462     SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18463     SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18464     SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18465     CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18466                         Negs.getValue(1));
18467 
18468     Created.push_back(Negs.getNode());
18469     Created.push_back(AndPos.getNode());
18470     Created.push_back(AndNeg.getNode());
18471   }
18472 
18473   return CSNeg;
18474 }
18475 
18476 static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18477   switch(getIntrinsicID(S.getNode())) {
18478   default:
18479     break;
18480   case Intrinsic::aarch64_sve_cntb:
18481     return 8;
18482   case Intrinsic::aarch64_sve_cnth:
18483     return 16;
18484   case Intrinsic::aarch64_sve_cntw:
18485     return 32;
18486   case Intrinsic::aarch64_sve_cntd:
18487     return 64;
18488   }
18489   return {};
18490 }
18491 
18492 /// Calculates what the pre-extend type is, based on the extension
18493 /// operation node provided by \p Extend.
18494 ///
18495 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18496 /// pre-extend type is pulled directly from the operand, while other extend
18497 /// operations need a bit more inspection to get this information.
18498 ///
18499 /// \param Extend The SDNode from the DAG that represents the extend operation
18500 ///
18501 /// \returns The type representing the \p Extend source type, or \p MVT::Other
18502 /// if no valid type can be determined
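///
/// For example: (sext i8 %x to i32) yields i8 directly from its operand,
/// (AssertSext %x, i16) yields i16 from its VT operand, and (and %x, 0xff) is
/// treated as an i8 pre-extend type.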
18503 static EVT calculatePreExtendType(SDValue Extend) {
18504   switch (Extend.getOpcode()) {
18505   case ISD::SIGN_EXTEND:
18506   case ISD::ZERO_EXTEND:
18507   case ISD::ANY_EXTEND:
18508     return Extend.getOperand(0).getValueType();
18509   case ISD::AssertSext:
18510   case ISD::AssertZext:
18511   case ISD::SIGN_EXTEND_INREG: {
18512     VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18513     if (!TypeNode)
18514       return MVT::Other;
18515     return TypeNode->getVT();
18516   }
18517   case ISD::AND: {
18518     ConstantSDNode *Constant =
18519         dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18520     if (!Constant)
18521       return MVT::Other;
18522 
18523     uint32_t Mask = Constant->getZExtValue();
18524 
18525     if (Mask == UCHAR_MAX)
18526       return MVT::i8;
18527     else if (Mask == USHRT_MAX)
18528       return MVT::i16;
18529     else if (Mask == UINT_MAX)
18530       return MVT::i32;
18531 
18532     return MVT::Other;
18533   }
18534   default:
18535     return MVT::Other;
18536   }
18537 }
18538 
18539 /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18540 /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18541 /// SExt/ZExt rather than the scalar SExt/ZExt
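///
/// For example (illustrative):
///   (v8i16 build_vector (sext i8 %a to i16), (sext i8 %b to i16), ...)
/// becomes
///   (v8i16 sign_extend (v8i8 build_vector %a, %b, ...))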
18542 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18543   EVT VT = BV.getValueType();
18544   if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18545       BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18546     return SDValue();
18547 
18548   // Use the first item in the buildvector/shuffle to get the size of the
18549   // extend, and make sure it looks valid.
18550   SDValue Extend = BV->getOperand(0);
18551   unsigned ExtendOpcode = Extend.getOpcode();
18552   bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18553   bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18554                 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18555                 ExtendOpcode == ISD::AssertSext;
18556   if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18557       ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18558     return SDValue();
18559   // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to
18560   // ensure calculatePreExtendType will work without issue.
18561   if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18562       ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18563     return SDValue();
18564 
18565   // Restrict valid pre-extend data type
18566   EVT PreExtendType = calculatePreExtendType(Extend);
18567   if (PreExtendType == MVT::Other ||
18568       PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18569     return SDValue();
18570 
18571   // Make sure all other operands are equally extended.
18572   bool SeenZExtOrSExt = !IsAnyExt;
18573   for (SDValue Op : drop_begin(BV->ops())) {
18574     if (Op.isUndef())
18575       continue;
18576 
18577     if (calculatePreExtendType(Op) != PreExtendType)
18578       return SDValue();
18579 
18580     unsigned Opc = Op.getOpcode();
18581     if (Opc == ISD::ANY_EXTEND)
18582       continue;
18583 
18584     bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18585                      Opc == ISD::AssertSext;
18586 
18587     if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18588       return SDValue();
18589 
18590     IsSExt = OpcIsSExt;
18591     SeenZExtOrSExt = true;
18592   }
18593 
18594   SDValue NBV;
18595   SDLoc DL(BV);
18596   if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18597     EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18598     EVT PreExtendLegalType =
18599         PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18600     SmallVector<SDValue, 8> NewOps;
18601     for (SDValue Op : BV->ops())
18602       NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18603                                     : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18604                                                            PreExtendLegalType));
18605     NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18606   } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18607     EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18608     NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18609                                BV.getOperand(1).isUndef()
18610                                    ? DAG.getUNDEF(PreExtendVT)
18611                                    : BV.getOperand(1).getOperand(0),
18612                                cast<ShuffleVectorSDNode>(BV)->getMask());
18613   }
18614   unsigned ExtOpc = !SeenZExtOrSExt
18615                         ? ISD::ANY_EXTEND
18616                         : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18617   return DAG.getNode(ExtOpc, DL, VT, NBV);
18618 }
18619 
18620 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18621 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
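///
/// For example (illustrative):
///   (mul %x, (v8i16 dup (sext i8 %c to i16)))
/// can become
///   (mul %x, (v8i16 sign_extend (v8i8 dup %c)))
/// which can then be matched as a widening multiply such as smull.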
18622 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18623   // If the value type isn't a vector, none of the operands are going to be dups
18624   EVT VT = Mul->getValueType(0);
18625   if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18626     return SDValue();
18627 
18628   SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18629   SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18630 
18631   // Neither operand has been changed; don't make any further changes.
18632   if (!Op0 && !Op1)
18633     return SDValue();
18634 
18635   SDLoc DL(Mul);
18636   return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18637                      Op1 ? Op1 : Mul->getOperand(1));
18638 }
18639 
18640 // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18641 // Same for other types with equivalent constants.
18642 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18643   EVT VT = N->getValueType(0);
18644   if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18645       VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18646     return SDValue();
18647   if (N->getOperand(0).getOpcode() != ISD::AND ||
18648       N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18649     return SDValue();
18650 
18651   SDValue And = N->getOperand(0);
18652   SDValue Srl = And.getOperand(0);
18653 
18654   APInt V1, V2, V3;
18655   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18656       !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18657       !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18658     return SDValue();
18659 
18660   unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18661   if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18662       V3 != (HalfSize - 1))
18663     return SDValue();
18664 
18665   EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18666                                 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18667                                 VT.getVectorElementCount() * 2);
18668 
18669   SDLoc DL(N);
18670   SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18671   SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18672   return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18673 }
18674 
18675 // Transform vector add(zext i8 to i32, zext i8 to i32)
18676 //  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18677 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
18678 // extends.
18679 static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18680   EVT VT = N->getValueType(0);
18681   if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18682       (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18683        N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18684       (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18685        N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18686       N->getOperand(0).getOperand(0).getValueType() !=
18687           N->getOperand(1).getOperand(0).getValueType())
18688     return SDValue();
18689 
18690   if (N->getOpcode() == ISD::MUL &&
18691       N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18692     return SDValue();
18693 
18694   SDValue N0 = N->getOperand(0).getOperand(0);
18695   SDValue N1 = N->getOperand(1).getOperand(0);
18696   EVT InVT = N0.getValueType();
18697 
18698   EVT S1 = InVT.getScalarType();
18699   EVT S2 = VT.getScalarType();
18700   if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18701       (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18702     SDLoc DL(N);
18703     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18704                                   S2.getHalfSizedIntegerVT(*DAG.getContext()),
18705                                   VT.getVectorElementCount());
18706     SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18707     SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18708     SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18709     return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18710                                                   : (unsigned)ISD::SIGN_EXTEND,
18711                        DL, VT, NewOp);
18712   }
18713   return SDValue();
18714 }
18715 
18716 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18717                                  TargetLowering::DAGCombinerInfo &DCI,
18718                                  const AArch64Subtarget *Subtarget) {
18719 
18720   if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18721     return Ext;
18722   if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18723     return Ext;
18724   if (SDValue Ext = performVectorExtCombine(N, DAG))
18725     return Ext;
18726 
18727   if (DCI.isBeforeLegalizeOps())
18728     return SDValue();
18729 
18730   // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18731   // and in MachineCombiner pass, add+mul will be combined into madd.
18732   // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18733   SDLoc DL(N);
18734   EVT VT = N->getValueType(0);
18735   SDValue N0 = N->getOperand(0);
18736   SDValue N1 = N->getOperand(1);
18737   SDValue MulOper;
18738   unsigned AddSubOpc;
18739 
18740   auto IsAddSubWith1 = [&](SDValue V) -> bool {
18741     AddSubOpc = V->getOpcode();
18742     if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18743       SDValue Opnd = V->getOperand(1);
18744       MulOper = V->getOperand(0);
18745       if (AddSubOpc == ISD::SUB)
18746         std::swap(Opnd, MulOper);
18747       if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18748         return C->isOne();
18749     }
18750     return false;
18751   };
18752 
18753   if (IsAddSubWith1(N0)) {
18754     SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18755     return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18756   }
18757 
18758   if (IsAddSubWith1(N1)) {
18759     SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18760     return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18761   }
18762 
18763   // The below optimizations require a constant RHS.
18764   if (!isa<ConstantSDNode>(N1))
18765     return SDValue();
18766 
18767   ConstantSDNode *C = cast<ConstantSDNode>(N1);
18768   const APInt &ConstValue = C->getAPIntValue();
18769 
18770   // Allow the scaling to be folded into the `cnt` instruction by preventing
18771   // the scaling from being obscured here. This makes it easier to pattern match.
18772   if (IsSVECntIntrinsic(N0) ||
18773      (N0->getOpcode() == ISD::TRUNCATE &&
18774       (IsSVECntIntrinsic(N0->getOperand(0)))))
18775        if (ConstValue.sge(1) && ConstValue.sle(16))
18776          return SDValue();
18777 
18778   // Multiplication of a power of two plus/minus one can be done more
18779   // cheaply as shift+add/sub. For now, this is true unilaterally. If
18780   // future CPUs have a cheaper MADD instruction, this may need to be
18781   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18782   // 64-bit is 5 cycles, so this is always a win.
18783   // More aggressively, some multiplications N0 * C can be lowered to
18784   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18785   // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18786   // TODO: lower more cases.
18787 
18788   // TrailingZeroes is used to test if the mul can be lowered to
18789   // shift+add+shift.
18790   unsigned TrailingZeroes = ConstValue.countr_zero();
18791   if (TrailingZeroes) {
18792     // Conservatively do not lower to shift+add+shift if the mul might be
18793     // folded into smul or umul.
18794     if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18795                             isZeroExtended(N0, DAG)))
18796       return SDValue();
18797     // Conservatively do not lower to shift+add+shift if the mul might be
18798     // folded into madd or msub.
18799     if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18800                            N->user_begin()->getOpcode() == ISD::SUB))
18801       return SDValue();
18802   }
18803   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18804   // and shift+add+shift.
18805   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18806   unsigned ShiftAmt;
18807 
18808   auto Shl = [&](SDValue N0, unsigned N1) {
18809     if (!N0.getNode())
18810       return SDValue();
18811     // If shift causes overflow, ignore this combine.
18812     if (N1 >= N0.getValueSizeInBits())
18813       return SDValue();
18814     SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18815     return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18816   };
18817   auto Add = [&](SDValue N0, SDValue N1) {
18818     if (!N0.getNode() || !N1.getNode())
18819       return SDValue();
18820     return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18821   };
18822   auto Sub = [&](SDValue N0, SDValue N1) {
18823     if (!N0.getNode() || !N1.getNode())
18824       return SDValue();
18825     return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18826   };
18827   auto Negate = [&](SDValue N) {
18828     if (!N.getNode())
18829       return SDValue();
18830     SDValue Zero = DAG.getConstant(0, DL, VT);
18831     return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18832   };
18833 
18834   // Can the const C be decomposed into (1+2^M)*(1+2^N), e.g.:
18835   // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1),
18836   // as multiplying by (2^N - 1) can't be done in a single instruction.
18837   auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18838     unsigned BitWidth = C.getBitWidth();
18839     for (unsigned i = 1; i < BitWidth / 2; i++) {
18840       APInt Rem;
18841       APInt X(BitWidth, (1 << i) + 1);
18842       APInt::sdivrem(C, X, N, Rem);
18843       APInt NVMinus1 = N - 1;
18844       if (Rem == 0 && NVMinus1.isPowerOf2()) {
18845         M = X;
18846         return true;
18847       }
18848     }
18849     return false;
18850   };
18851 
18852   // Can the const C be decomposed into (2^M + 1) * 2^N + 1, e.g.:
18853   // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1,
18854   // as multiplying by (2^N - 1) can't be done in a single instruction.
18855   auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18856     APInt CVMinus1 = C - 1;
18857     if (CVMinus1.isNegative())
18858       return false;
18859     unsigned TrailingZeroes = CVMinus1.countr_zero();
18860     APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18861     if (SCVMinus1.isPowerOf2()) {
18862       unsigned BitWidth = SCVMinus1.getBitWidth();
18863       M = APInt(BitWidth, SCVMinus1.logBase2());
18864       N = APInt(BitWidth, TrailingZeroes);
18865       return true;
18866     }
18867     return false;
18868   };
18869 
18870   // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), e.g.:
18871   // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18872   auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18873     APInt CVMinus1 = C - 1;
18874     if (CVMinus1.isNegative())
18875       return false;
18876     unsigned TrailingZeroes = CVMinus1.countr_zero();
18877     APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18878     if (CVPlus1.isPowerOf2()) {
18879       unsigned BitWidth = CVPlus1.getBitWidth();
18880       M = APInt(BitWidth, CVPlus1.logBase2());
18881       N = APInt(BitWidth, TrailingZeroes);
18882       return true;
18883     }
18884     return false;
18885   };
18886 
18887   if (ConstValue.isNonNegative()) {
18888     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18889     // (mul x, 2^N - 1) => (sub (shl x, N), x)
18890     // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18891     // (mul x, (2^M + 1) * (2^N + 1))
18892     //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18893   // (mul x, (2^M + 1) * 2^N + 1)
18894   //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
18895   // (mul x, 1 - (1 - 2^M) * 2^N)
18896   //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
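        // Worked examples (the last three additionally require ALULSLFast):
        //   x * 7  (2^3 - 1):          (sub (shl x, 3), x)
        //   x * 14 ((2^3 - 1) * 2):    (sub (shl x, 4), (shl x, 1))
        //   x * 45 ((1+4) * (1+8)):    MV = (add (shl x, 2), x); (add (shl MV, 3), MV)
        //   x * 11 ((1+4) * 2 + 1):    MV = (add (shl x, 2), x); (add (shl MV, 1), x)
        //   x * 29 (1 - (1 - 8) * 4):  MV = (sub x, (shl x, 3)); (sub x, (shl MV, 2))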
18897     APInt SCVMinus1 = ShiftedConstValue - 1;
18898     APInt SCVPlus1 = ShiftedConstValue + 1;
18899     APInt CVPlus1 = ConstValue + 1;
18900     APInt CVM, CVN;
18901     if (SCVMinus1.isPowerOf2()) {
18902       ShiftAmt = SCVMinus1.logBase2();
18903       return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18904     } else if (CVPlus1.isPowerOf2()) {
18905       ShiftAmt = CVPlus1.logBase2();
18906       return Sub(Shl(N0, ShiftAmt), N0);
18907     } else if (SCVPlus1.isPowerOf2()) {
18908       ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18909       return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18910     }
18911     if (Subtarget->hasALULSLFast() &&
18912         isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18913       APInt CVMMinus1 = CVM - 1;
18914       APInt CVNMinus1 = CVN - 1;
18915       unsigned ShiftM1 = CVMMinus1.logBase2();
18916       unsigned ShiftN1 = CVNMinus1.logBase2();
18917       // ALULSLFast implies that shifts by up to 4 places are fast
18918       if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18919         SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18920         return Add(Shl(MVal, ShiftN1), MVal);
18921       }
18922     }
18923     if (Subtarget->hasALULSLFast() &&
18924         isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18925       unsigned ShiftM = CVM.getZExtValue();
18926       unsigned ShiftN = CVN.getZExtValue();
18927       // ALULSLFast implies that shifts by up to 4 places are fast
18928       if (ShiftM <= 4 && ShiftN <= 4) {
18929         SDValue MVal = Add(Shl(N0, ShiftM), N0);
18930         return Add(Shl(MVal, ShiftN), N0);
18931       }
18932     }
18933 
18934     if (Subtarget->hasALULSLFast() &&
18935         isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18936       unsigned ShiftM = CVM.getZExtValue();
18937       unsigned ShiftN = CVN.getZExtValue();
18938       // ALULSLFast implies that shifts by up to 4 places are fast
18939       if (ShiftM <= 4 && ShiftN <= 4) {
18940         SDValue MVal = Sub(N0, Shl(N0, ShiftM));
18941         return Sub(N0, Shl(MVal, ShiftN));
18942       }
18943     }
18944   } else {
18945     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18946     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18947     // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
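          // For example:
          //   x * -7  (-(2^3 - 1)):        (sub x, (shl x, 3))
          //   x * -9  (-(2^3 + 1)):        (sub 0, (add (shl x, 3), x))
          //   x * -24 (-(2^2 - 1) * 2^3):  (sub (shl x, 3), (shl x, 5))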
18948     APInt SCVPlus1 = -ShiftedConstValue + 1;
18949     APInt CVNegPlus1 = -ConstValue + 1;
18950     APInt CVNegMinus1 = -ConstValue - 1;
18951     if (CVNegPlus1.isPowerOf2()) {
18952       ShiftAmt = CVNegPlus1.logBase2();
18953       return Sub(N0, Shl(N0, ShiftAmt));
18954     } else if (CVNegMinus1.isPowerOf2()) {
18955       ShiftAmt = CVNegMinus1.logBase2();
18956       return Negate(Add(Shl(N0, ShiftAmt), N0));
18957     } else if (SCVPlus1.isPowerOf2()) {
18958       ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18959       return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18960     }
18961   }
18962 
18963   return SDValue();
18964 }
18965 
18966 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
18967                                                          SelectionDAG &DAG) {
18968   // Take advantage of vector comparisons producing 0 or -1 in each lane to
18969   // optimize away operation when it's from a constant.
18970   //
18971   // The general transformation is:
18972   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18973   //       AND(VECTOR_CMP(x,y), constant2)
18974   //    constant2 = UNARYOP(constant)
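        //
        // For example (a sketch with a v4i32 compare feeding a v4f32 result):
        //    (sint_to_fp (and (setcc a, b), <i32 1, 1, 1, 1>))
        // becomes
        //    (bitcast (and (setcc a, b), (bitcast <float 1.0, 1.0, 1.0, 1.0>)))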
18975 
18976   // Early exit if this isn't a vector operation, the operand of the
18977   // unary operation isn't a bitwise AND, or if the sizes of the operations
18978   // aren't the same.
18979   EVT VT = N->getValueType(0);
18980   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
18981       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
18982       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
18983     return SDValue();
18984 
18985   // Now check that the other operand of the AND is a constant. We could
18986   // make the transformation for non-constant splats as well, but it's unclear
18987   // that would be a benefit as it would not eliminate any operations, just
18988   // perform one more step in scalar code before moving to the vector unit.
18989   if (BuildVectorSDNode *BV =
18990           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
18991     // Bail out if the vector isn't a constant.
18992     if (!BV->isConstant())
18993       return SDValue();
18994 
18995     // Everything checks out. Build up the new and improved node.
18996     SDLoc DL(N);
18997     EVT IntVT = BV->getValueType(0);
18998     // Create a new constant of the appropriate type for the transformed
18999     // DAG.
19000     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19001     // The AND node needs bitcasts to/from an integer vector type around it.
19002     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19003     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19004                                  N->getOperand(0)->getOperand(0), MaskConst);
19005     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19006     return Res;
19007   }
19008 
19009   return SDValue();
19010 }
19011 
19012 /// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19013 /// functions; this can help to reduce the number of fmovs to/from GPRs.
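      /// For example, an (i64 (fp_to_sint f64:x)) can instead insert x into lane 0
      /// of an nxv2f64, convert that vector to nxv2i64, and extract lane 0 of the
      /// result.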
19014 static SDValue
19015 tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19016                                       TargetLowering::DAGCombinerInfo &DCI,
19017                                       const AArch64Subtarget *Subtarget) {
19018   if (N->isStrictFPOpcode())
19019     return SDValue();
19020 
19021   if (DCI.isBeforeLegalizeOps())
19022     return SDValue();
19023 
19024   if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19025       (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19026     return SDValue();
19027 
19028   auto isSupportedType = [](EVT VT) {
19029     return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19030   };
19031 
19032   SDValue SrcVal = N->getOperand(0);
19033   EVT SrcTy = SrcVal.getValueType();
19034   EVT DestTy = N->getValueType(0);
19035 
19036   if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19037     return SDValue();
19038 
19039   EVT SrcVecTy;
19040   EVT DestVecTy;
19041   if (DestTy.bitsGT(SrcTy)) {
19042     DestVecTy = getPackedSVEVectorVT(DestTy);
19043     SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19044   } else {
19045     SrcVecTy = getPackedSVEVectorVT(SrcTy);
19046     DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19047   }
19048 
19049   // Ensure the resulting src/dest vector type is legal.
19050   if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19051     return SDValue();
19052 
19053   SDLoc DL(N);
19054   SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19055   SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19056                             DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19057   SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19058   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19059 }
19060 
19061 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19062                                      TargetLowering::DAGCombinerInfo &DCI,
19063                                      const AArch64Subtarget *Subtarget) {
19064   // First try to optimize away the conversion when it's conditionally from
19065   // a constant. Vectors only.
19066   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19067     return Res;
19068 
19069   if (SDValue Res =
19070           tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19071     return Res;
19072 
19073   EVT VT = N->getValueType(0);
19074   if (VT != MVT::f32 && VT != MVT::f64)
19075     return SDValue();
19076 
19077   // Only optimize when the source and destination types have the same width.
19078   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19079     return SDValue();
19080 
19081   // If the result of an integer load is only used by an integer-to-float
19082   // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
19083   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
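        // For example, (f32 (sint_to_fp (i32 (load addr)))) becomes
        // (f32 (SITOF (f32 (load addr)))), so the loaded value never has to pass
        // through a GPR.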
19084   SDValue N0 = N->getOperand(0);
19085   if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19086       N0.hasOneUse() &&
19087       // Do not change the width of a volatile load.
19088       !cast<LoadSDNode>(N0)->isVolatile()) {
19089     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19090     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19091                                LN0->getPointerInfo(), LN0->getAlign(),
19092                                LN0->getMemOperand()->getFlags());
19093 
19094     // Make sure successors of the original load stay after it by updating them
19095     // to use the new Chain.
19096     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19097 
19098     unsigned Opcode =
19099         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19100     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19101   }
19102 
19103   return SDValue();
19104 }
19105 
19106 /// Fold a floating-point multiply by power of two into floating-point to
19107 /// fixed-point conversion.
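      /// For example, (fptosi (fmul v4f32:x, splat(16.0))) can be performed as
      /// @llvm.aarch64.neon.vcvtfp2fxs(x, 4), i.e. a convert with 4 fractional
      /// bits, instead of a separate multiply and convert.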
19108 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19109                                      TargetLowering::DAGCombinerInfo &DCI,
19110                                      const AArch64Subtarget *Subtarget) {
19111   if (SDValue Res =
19112           tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19113     return Res;
19114 
19115   if (!Subtarget->isNeonAvailable())
19116     return SDValue();
19117 
19118   if (!N->getValueType(0).isSimple())
19119     return SDValue();
19120 
19121   SDValue Op = N->getOperand(0);
19122   if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19123     return SDValue();
19124 
19125   if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19126     return SDValue();
19127 
19128   SDValue ConstVec = Op->getOperand(1);
19129   if (!isa<BuildVectorSDNode>(ConstVec))
19130     return SDValue();
19131 
19132   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19133   uint32_t FloatBits = FloatTy.getSizeInBits();
19134   if (FloatBits != 32 && FloatBits != 64 &&
19135       (FloatBits != 16 || !Subtarget->hasFullFP16()))
19136     return SDValue();
19137 
19138   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19139   uint32_t IntBits = IntTy.getSizeInBits();
19140   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19141     return SDValue();
19142 
19143   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19144   if (IntBits > FloatBits)
19145     return SDValue();
19146 
19147   BitVector UndefElements;
19148   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19149   int32_t Bits = IntBits == 64 ? 64 : 32;
19150   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19151   if (C == -1 || C == 0 || C > Bits)
19152     return SDValue();
19153 
19154   EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19155   if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19156     return SDValue();
19157 
19158   if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19159       N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19160     EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19161     if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19162       return SDValue();
19163   }
19164 
19165   SDLoc DL(N);
19166   bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19167                    N->getOpcode() == ISD::FP_TO_SINT_SAT);
19168   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19169                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
19170   SDValue FixConv =
19171       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
19172                   DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19173                   Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19174   // We can handle smaller integers by generating an extra trunc.
19175   if (IntBits < FloatBits)
19176     FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19177 
19178   return FixConv;
19179 }
19180 
19181 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19182                                const AArch64TargetLowering &TLI) {
19183   EVT VT = N->getValueType(0);
19184   SelectionDAG &DAG = DCI.DAG;
19185   SDLoc DL(N);
19186   const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19187 
19188   if (!VT.isVector())
19189     return SDValue();
19190 
19191   if (VT.isScalableVector() && !Subtarget.hasSVE2())
19192     return SDValue();
19193 
19194   if (VT.isFixedLengthVector() &&
19195       (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19196     return SDValue();
19197 
19198   SDValue N0 = N->getOperand(0);
19199   if (N0.getOpcode() != ISD::AND)
19200     return SDValue();
19201 
19202   SDValue N1 = N->getOperand(1);
19203   if (N1.getOpcode() != ISD::AND)
19204     return SDValue();
19205 
19206   // InstCombine does (not (neg a)) => (add a -1).
19207   // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
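        // This works because (add a, -1) is the bitwise NOT of (neg a), so the two
        // AND masks are complementary and the OR acts as a bit select.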
19208   // Loop over all combinations of AND operands.
19209   for (int i = 1; i >= 0; --i) {
19210     for (int j = 1; j >= 0; --j) {
19211       SDValue O0 = N0->getOperand(i);
19212       SDValue O1 = N1->getOperand(j);
19213       SDValue Sub, Add, SubSibling, AddSibling;
19214 
19215       // Find a SUB and an ADD operand, one from each AND.
19216       if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19217         Sub = O0;
19218         Add = O1;
19219         SubSibling = N0->getOperand(1 - i);
19220         AddSibling = N1->getOperand(1 - j);
19221       } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19222         Add = O0;
19223         Sub = O1;
19224         AddSibling = N0->getOperand(1 - i);
19225         SubSibling = N1->getOperand(1 - j);
19226       } else
19227         continue;
19228 
19229       if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
19230         continue;
19231 
19232       // The all-ones constant is always the right-hand operand of the Add.
19233       if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
19234         continue;
19235 
19236       if (Sub.getOperand(1) != Add.getOperand(0))
19237         continue;
19238 
19239       return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
19240     }
19241   }
19242 
19243   // (or (and a b) (and (not a) c)) => (bsl a b c)
19244   // We only have to look for constant vectors here since the general, variable
19245   // case can be handled in TableGen.
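        // For example, for v8i8:
        //   (or (and x, <0x0f,...>) (and y, <0xf0,...>)) => (bsl <0x0f,...>, x, y)
        // selecting the low nibble of each byte from x and the high nibble from y.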
19246   unsigned Bits = VT.getScalarSizeInBits();
19247   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19248   for (int i = 1; i >= 0; --i)
19249     for (int j = 1; j >= 0; --j) {
19250       APInt Val1, Val2;
19251 
19252       if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
19253           ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
19254           (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19255         return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19256                            N0->getOperand(1 - i), N1->getOperand(1 - j));
19257       }
19258       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
19259       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
19260       if (!BVN0 || !BVN1)
19261         continue;
19262 
19263       bool FoundMatch = true;
19264       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19265         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
19266         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
19267         if (!CN0 || !CN1 ||
19268             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19269           FoundMatch = false;
19270           break;
19271         }
19272       }
19273       if (FoundMatch)
19274         return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19275                            N0->getOperand(1 - i), N1->getOperand(1 - j));
19276     }
19277 
19278   return SDValue();
19279 }
19280 
19281 // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19282 // convert to csel(ccmp(.., cc0)), depending on cc1:
19283 
19284 // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19285 // =>
19286 // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19287 //
19288 // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19289 // =>
19290 // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19291 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19292   EVT VT = N->getValueType(0);
19293   SDValue CSel0 = N->getOperand(0);
19294   SDValue CSel1 = N->getOperand(1);
19295 
19296   if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19297       CSel1.getOpcode() != AArch64ISD::CSEL)
19298     return SDValue();
19299 
19300   if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19301     return SDValue();
19302 
19303   if (!isNullConstant(CSel0.getOperand(0)) ||
19304       !isOneConstant(CSel0.getOperand(1)) ||
19305       !isNullConstant(CSel1.getOperand(0)) ||
19306       !isOneConstant(CSel1.getOperand(1)))
19307     return SDValue();
19308 
19309   SDValue Cmp0 = CSel0.getOperand(3);
19310   SDValue Cmp1 = CSel1.getOperand(3);
19311   AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
19312   AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
19313   if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19314     return SDValue();
19315   if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19316       Cmp0.getOpcode() == AArch64ISD::SUBS) {
19317     std::swap(Cmp0, Cmp1);
19318     std::swap(CC0, CC1);
19319   }
19320 
19321   if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19322     return SDValue();
19323 
19324   SDLoc DL(N);
19325   SDValue CCmp, Condition;
19326   unsigned NZCV;
19327 
19328   if (N->getOpcode() == ISD::AND) {
19329     AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
19330     Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
19331     NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
19332   } else {
19333     AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
19334     Condition = DAG.getConstant(CC0, DL, MVT_CC);
19335     NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
19336   }
19337 
19338   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19339 
19340   auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19341   if (Op1 && Op1->getAPIntValue().isNegative() &&
19342       Op1->getAPIntValue().sgt(-32)) {
19343     // CCMP accepts a constant in the range [0, 31].
19344     // If Op1 is a constant in the range [-31, -1], we can select
19345     // CCMN instead to avoid the extra mov.
19346     SDValue AbsOp1 =
19347         DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19348     CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
19349                        NZCVOp, Condition, Cmp0);
19350   } else {
19351     CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
19352                        Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19353   }
19354   return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19355                      CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
19356                      CCmp);
19357 }
19358 
19359 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19360                                 const AArch64Subtarget *Subtarget,
19361                                 const AArch64TargetLowering &TLI) {
19362   SelectionDAG &DAG = DCI.DAG;
19363   EVT VT = N->getValueType(0);
19364 
19365   if (SDValue R = performANDORCSELCombine(N, DAG))
19366     return R;
19367 
19368   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19369     return SDValue();
19370 
19371   if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19372     return Res;
19373 
19374   return SDValue();
19375 }
19376 
19377 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19378   if (!MemVT.getVectorElementType().isSimple())
19379     return false;
19380 
19381   uint64_t MaskForTy = 0ull;
19382   switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19383   case MVT::i8:
19384     MaskForTy = 0xffull;
19385     break;
19386   case MVT::i16:
19387     MaskForTy = 0xffffull;
19388     break;
19389   case MVT::i32:
19390     MaskForTy = 0xffffffffull;
19391     break;
19392   default:
19393     return false;
19394     break;
19395   }
19396 
19397   if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19398     if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19399       return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19400 
19401   return false;
19402 }
19403 
19404 static SDValue performReinterpretCastCombine(SDNode *N) {
19405   SDValue LeafOp = SDValue(N, 0);
19406   SDValue Op = N->getOperand(0);
19407   while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19408          LeafOp.getValueType() != Op.getValueType())
19409     Op = Op->getOperand(0);
19410   if (LeafOp.getValueType() == Op.getValueType())
19411     return Op;
19412   return SDValue();
19413 }
19414 
19415 static SDValue performSVEAndCombine(SDNode *N,
19416                                     TargetLowering::DAGCombinerInfo &DCI) {
19417   SelectionDAG &DAG = DCI.DAG;
19418   SDValue Src = N->getOperand(0);
19419   unsigned Opc = Src->getOpcode();
19420 
19421   // Zero/any extend of an unsigned unpack
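        // For example, (and (uunpklo nxv16i8:x), splat(0xff)) is just (uunpklo x),
        // since the unpack already zero-extends each element.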
19422   if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19423     SDValue UnpkOp = Src->getOperand(0);
19424     SDValue Dup = N->getOperand(1);
19425 
19426     if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19427       return SDValue();
19428 
19429     SDLoc DL(N);
19430     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19431     if (!C)
19432       return SDValue();
19433 
19434     uint64_t ExtVal = C->getZExtValue();
19435 
19436     auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19437       return ((ExtVal == 0xFF && VT == MVT::i8) ||
19438               (ExtVal == 0xFFFF && VT == MVT::i16) ||
19439               (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19440     };
19441 
19442     // If the mask is fully covered by the unpack, we don't need to push
19443     // a new AND onto the operand
19444     EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19445     if (MaskAndTypeMatch(EltTy))
19446       return Src;
19447 
19448     // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19449     // to see if the mask is all-ones of size MemTy.
19450     auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19451     if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19452                          MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19453       EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19454       if (MaskAndTypeMatch(EltTy))
19455         return Src;
19456     }
19457 
19458     // Truncate to prevent a DUP with an over-wide constant
19459     APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19460 
19461     // Otherwise, make sure we propagate the AND to the operand
19462     // of the unpack
19463     Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19464                       DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19465 
19466     SDValue And = DAG.getNode(ISD::AND, DL,
19467                               UnpkOp->getValueType(0), UnpkOp, Dup);
19468 
19469     return DAG.getNode(Opc, DL, N->getValueType(0), And);
19470   }
19471 
19472   if (DCI.isBeforeLegalizeOps())
19473     return SDValue();
19474 
19475   // If either operand of the AND is an all-active predicate, the AND is a
19476   // no-op and we can simply return the other operand.
19477   if (isAllActivePredicate(DAG, N->getOperand(0)))
19478     return N->getOperand(1);
19479   if (isAllActivePredicate(DAG, N->getOperand(1)))
19480     return N->getOperand(0);
19481 
19482   if (!EnableCombineMGatherIntrinsics)
19483     return SDValue();
19484 
19485   SDValue Mask = N->getOperand(1);
19486 
19487   if (!Src.hasOneUse())
19488     return SDValue();
19489 
19490   EVT MemVT;
19491 
19492   // SVE load instructions perform an implicit zero-extend, which makes them
19493   // perfect candidates for combining.
19494   switch (Opc) {
19495   case AArch64ISD::LD1_MERGE_ZERO:
19496   case AArch64ISD::LDNF1_MERGE_ZERO:
19497   case AArch64ISD::LDFF1_MERGE_ZERO:
19498     MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19499     break;
19500   case AArch64ISD::GLD1_MERGE_ZERO:
19501   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19502   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19503   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19504   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19505   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19506   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19507   case AArch64ISD::GLDFF1_MERGE_ZERO:
19508   case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19509   case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19510   case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19511   case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19512   case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19513   case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19514   case AArch64ISD::GLDNT1_MERGE_ZERO:
19515     MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19516     break;
19517   default:
19518     return SDValue();
19519   }
19520 
19521   if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19522     return Src;
19523 
19524   return SDValue();
19525 }
19526 
19527 // Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19528 static SDValue performANDSETCCCombine(SDNode *N,
19529                                       TargetLowering::DAGCombinerInfo &DCI) {
19530 
19531   // This function performs an optimization on a specific pattern involving
19532   // an AND operation and SETCC (Set Condition Code) node.
19533 
19534   SDValue SetCC = N->getOperand(0);
19535   EVT VT = N->getValueType(0);
19536   SelectionDAG &DAG = DCI.DAG;
19537 
19538   // If the current node (N) is used by any SELECT instruction, return an
19539   // empty SDValue; applying the optimization in that case could produce
19540   // incorrect results.
19541   for (auto U : N->users())
19542     if (U->getOpcode() == ISD::SELECT)
19543       return SDValue();
19544 
19545   // Check if the operand is a SETCC node with floating-point comparison
19546   if (SetCC.getOpcode() == ISD::SETCC &&
19547       SetCC.getOperand(0).getValueType() == MVT::f32) {
19548 
19549     SDValue Cmp;
19550     AArch64CC::CondCode CC;
19551 
19552     // Check if the DAG is after legalization and if we can emit the conjunction
19553     if (!DCI.isBeforeLegalize() &&
19554         (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19555 
19556       AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
19557 
19558       SDLoc DL(N);
19559       return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19560                          DAG.getConstant(0, DL, VT),
19561                          DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19562     }
19563   }
19564   return SDValue();
19565 }
19566 
19567 static SDValue performANDCombine(SDNode *N,
19568                                  TargetLowering::DAGCombinerInfo &DCI) {
19569   SelectionDAG &DAG = DCI.DAG;
19570   SDValue LHS = N->getOperand(0);
19571   SDValue RHS = N->getOperand(1);
19572   EVT VT = N->getValueType(0);
19573 
19574   if (SDValue R = performANDORCSELCombine(N, DAG))
19575     return R;
19576 
19577   if (SDValue R = performANDSETCCCombine(N, DCI))
19578     return R;
19579 
19580   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19581     return SDValue();
19582 
19583   if (VT.isScalableVector())
19584     return performSVEAndCombine(N, DCI);
19585 
19586   // The combining code below works only for NEON vectors. In particular, it
19587   // does not work for SVE when dealing with vectors wider than 128 bits.
19588   if (!VT.is64BitVector() && !VT.is128BitVector())
19589     return SDValue();
19590 
19591   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19592   if (!BVN)
19593     return SDValue();
19594 
19595   // AND does not accept an immediate, so check if we can use a BIC immediate
19596   // instruction instead. We do this here instead of using a (and x, (mvni imm))
19597   // pattern in isel, because some immediates may be lowered to the preferred
19598   // (and x, (movi imm)) form, even though an mvni representation also exists.
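        // For example, (and v4i32:x, splat(0xffffff00)) can be emitted as a BIC
        // immediate (bic .4s, #0xff), clearing only the low byte of each lane.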
19599   APInt DefBits(VT.getSizeInBits(), 0);
19600   APInt UndefBits(VT.getSizeInBits(), 0);
19601   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19602     SDValue NewOp;
19603 
19604     // Any bits known to already be 0 need not be cleared again, which can help
19605     // reduce the size of the immediate to one supported by the instruction.
19606     KnownBits Known = DAG.computeKnownBits(LHS);
19607     APInt ZeroSplat(VT.getSizeInBits(), 0);
19608     for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19609       ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19610                    << (Known.Zero.getBitWidth() * I);
19611 
19612     DefBits = ~(DefBits | ZeroSplat);
19613     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19614                                     DefBits, &LHS)) ||
19615         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19616                                     DefBits, &LHS)))
19617       return NewOp;
19618 
19619     UndefBits = ~(UndefBits | ZeroSplat);
19620     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19621                                     UndefBits, &LHS)) ||
19622         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19623                                     UndefBits, &LHS)))
19624       return NewOp;
19625   }
19626 
19627   return SDValue();
19628 }
19629 
19630 static SDValue performFADDCombine(SDNode *N,
19631                                   TargetLowering::DAGCombinerInfo &DCI) {
19632   SelectionDAG &DAG = DCI.DAG;
19633   SDValue LHS = N->getOperand(0);
19634   SDValue RHS = N->getOperand(1);
19635   EVT VT = N->getValueType(0);
19636   SDLoc DL(N);
19637 
19638   if (!N->getFlags().hasAllowReassociation())
19639     return SDValue();
19640 
19641   // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19642   auto ReassocComplex = [&](SDValue A, SDValue B) {
19643     if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19644       return SDValue();
19645     unsigned Opc = A.getConstantOperandVal(0);
19646     if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19647         Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19648         Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19649         Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19650       return SDValue();
19651     SDValue VCMLA = DAG.getNode(
19652         ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19653         DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19654         A.getOperand(2), A.getOperand(3));
19655     VCMLA->setFlags(A->getFlags());
19656     return VCMLA;
19657   };
19658   if (SDValue R = ReassocComplex(LHS, RHS))
19659     return R;
19660   if (SDValue R = ReassocComplex(RHS, LHS))
19661     return R;
19662 
19663   return SDValue();
19664 }
19665 
19666 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19667   switch (Opcode) {
19668   case ISD::STRICT_FADD:
19669   case ISD::FADD:
19670     return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19671   case ISD::ADD:
19672     return VT == MVT::i64;
19673   default:
19674     return false;
19675   }
19676 }
19677 
19678 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19679                         AArch64CC::CondCode Cond);
19680 
19681 static bool isPredicateCCSettingOp(SDValue N) {
19682   if ((N.getOpcode() == ISD::SETCC) ||
19683       (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19684        (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19685         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19686         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19687         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19688         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19689         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19690         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19691         N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19692         // get_active_lane_mask is lowered to a whilelo instruction.
19693         N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19694     return true;
19695 
19696   return false;
19697 }
19698 
19699 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19700 // ... into: "ptrue p, all" + PTEST
19701 static SDValue
19702 performFirstTrueTestVectorCombine(SDNode *N,
19703                                   TargetLowering::DAGCombinerInfo &DCI,
19704                                   const AArch64Subtarget *Subtarget) {
19705   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19706   // Make sure PTEST can be legalised with illegal types.
19707   if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19708     return SDValue();
19709 
19710   SDValue N0 = N->getOperand(0);
19711   EVT VT = N0.getValueType();
19712 
19713   if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19714       !isNullConstant(N->getOperand(1)))
19715     return SDValue();
19716 
19717   // Restrict the DAG combine to only cases where we're extracting from a
19718   // flag-setting operation.
19719   if (!isPredicateCCSettingOp(N0))
19720     return SDValue();
19721 
19722   // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19723   SelectionDAG &DAG = DCI.DAG;
19724   SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19725   return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19726 }
19727 
19728 // Materialize : Idx = (add (mul vscale, NumEls), -1)
19729 //               i1 = extract_vector_elt t37, Constant:i64<Idx>
19730 //     ... into: "ptrue p, all" + PTEST
19731 static SDValue
19732 performLastTrueTestVectorCombine(SDNode *N,
19733                                  TargetLowering::DAGCombinerInfo &DCI,
19734                                  const AArch64Subtarget *Subtarget) {
19735   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19736   // Make sure PTEST can be legalised with illegal types.
19737   if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19738     return SDValue();
19739 
19740   SDValue N0 = N->getOperand(0);
19741   EVT OpVT = N0.getValueType();
19742 
19743   if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19744     return SDValue();
19745 
19746   // Idx == (add (mul vscale, NumEls), -1)
19747   SDValue Idx = N->getOperand(1);
19748   if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19749     return SDValue();
19750 
19751   SDValue VS = Idx.getOperand(0);
19752   if (VS.getOpcode() != ISD::VSCALE)
19753     return SDValue();
19754 
19755   unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19756   if (VS.getConstantOperandVal(0) != NumEls)
19757     return SDValue();
19758 
19759   // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19760   SelectionDAG &DAG = DCI.DAG;
19761   SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19762   return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19763 }
19764 
19765 static SDValue
19766 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19767                                const AArch64Subtarget *Subtarget) {
19768   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19769   if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19770     return Res;
19771   if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19772     return Res;
19773 
19774   SelectionDAG &DAG = DCI.DAG;
19775   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19776 
19777   EVT VT = N->getValueType(0);
19778   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19779   bool IsStrict = N0->isStrictFPOpcode();
19780 
19781   // extract(dup x) -> x
19782   if (N0.getOpcode() == AArch64ISD::DUP)
19783     return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19784                           : N0.getOperand(0);
19785 
19786   // Rewrite for pairwise fadd pattern
19787   //   (f32 (extract_vector_elt
19788   //           (fadd (vXf32 Other)
19789   //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19790   // ->
19791   //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19792   //              (extract_vector_elt (vXf32 Other) 1))
19793   // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19794   // we can only do this when it's used only by the extract_vector_elt.
19795   if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19796       (!IsStrict || N0.hasOneUse())) {
19797     SDLoc DL(N0);
19798     SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19799     SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19800 
19801     ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19802     SDValue Other = N00;
19803 
19804     // And handle the commutative case.
19805     if (!Shuffle) {
19806       Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19807       Other = N01;
19808     }
19809 
19810     if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19811         Other == Shuffle->getOperand(0)) {
19812       SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19813                                      DAG.getConstant(0, DL, MVT::i64));
19814       SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19815                                      DAG.getConstant(1, DL, MVT::i64));
19816       if (!IsStrict)
19817         return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19818 
19819       // For strict_fadd we need uses of the final extract_vector to be replaced
19820       // with the strict_fadd, but we also need uses of the chain output of the
19821       // original strict_fadd to use the chain output of the new strict_fadd as
19822       // otherwise it may not be deleted.
19823       SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19824                                 {VT, MVT::Other},
19825                                 {N0->getOperand(0), Extract1, Extract2});
19826       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19827       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19828       return SDValue(N, 0);
19829     }
19830   }
19831 
19832   return SDValue();
19833 }
19834 
19835 static SDValue performConcatVectorsCombine(SDNode *N,
19836                                            TargetLowering::DAGCombinerInfo &DCI,
19837                                            SelectionDAG &DAG) {
19838   SDLoc dl(N);
19839   EVT VT = N->getValueType(0);
19840   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19841   unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19842 
19843   if (VT.isScalableVector())
19844     return SDValue();
19845 
19846   if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19847       N1Opc == ISD::TRUNCATE) {
19848     SDValue N00 = N0->getOperand(0);
19849     SDValue N10 = N1->getOperand(0);
19850     EVT N00VT = N00.getValueType();
19851     unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
19852 
19853     // Optimize concat_vectors of truncated vectors, where the intermediate
19854     // type is illegal, to avoid said illegality, e.g.,
19855     //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19856     //                          (v2i16 (truncate (v2i64)))))
19857     // ->
19858     //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19859     //                                    (v4i32 (bitcast (v2i64))),
19860     //                                    <0, 2, 4, 6>)))
19861     // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19862     // on both input and result type, so we might generate worse code.
19863     // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19864     if (N00VT == N10.getValueType() &&
19865         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19866         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19867       MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19868       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19869       for (size_t i = 0; i < Mask.size(); ++i)
19870         Mask[i] = i * 2;
19871       return DAG.getNode(ISD::TRUNCATE, dl, VT,
19872                          DAG.getVectorShuffle(
19873                              MidVT, dl,
19874                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19875                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19876     }
19877 
19878     // Optimize two large shifts and a combine into a single combine and shift
19879     // For AArch64 architectures, sequences like the following:
19880     //
19881     //     ushr    v0.4s, v0.4s, #20
19882     //     ushr    v1.4s, v1.4s, #20
19883     //     uzp1    v0.8h, v0.8h, v1.8h
19884     //
19885     // Can be optimized to:
19886     //
19887     //     uzp2    v0.8h, v0.8h, v1.8h
19888     //     ushr    v0.8h, v0.8h, #4
19889     //
19890     // This optimization reduces instruction count.
19891     if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
19892         N00->getOperand(1) == N10->getOperand(1)) {
19893       SDValue N000 = N00->getOperand(0);
19894       SDValue N100 = N10->getOperand(0);
19895       uint64_t N001ConstVal = N00->getConstantOperandVal(1),
19896                N101ConstVal = N10->getConstantOperandVal(1),
19897                NScalarSize = N->getValueType(0).getScalarSizeInBits();
19898 
19899       if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
19900         N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
19901         N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
19902         SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
19903         SDValue NewShiftConstant =
19904             DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
19905 
19906         return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
19907       }
19908     }
19909   }
19910 
19911   if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19912       N->getOperand(0).getValueType() == MVT::v2i16 ||
19913       N->getOperand(0).getValueType() == MVT::v2i8) {
19914     EVT SrcVT = N->getOperand(0).getValueType();
19915     // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19916     // loads to prevent having to go through the v4i8 load legalization that
19917     // needs to extend each element into a larger type.
19918     if (N->getNumOperands() % 2 == 0 &&
19919         all_of(N->op_values(), [SrcVT](SDValue V) {
19920           if (V.getValueType() != SrcVT)
19921             return false;
19922           if (V.isUndef())
19923             return true;
19924           LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19925           return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19926                  LD->getExtensionType() == ISD::NON_EXTLOAD;
19927         })) {
19928       EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19929       EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19930       SmallVector<SDValue> Ops;
19931 
19932       for (unsigned i = 0; i < N->getNumOperands(); i++) {
19933         SDValue V = N->getOperand(i);
19934         if (V.isUndef())
19935           Ops.push_back(DAG.getUNDEF(FVT));
19936         else {
19937           LoadSDNode *LD = cast<LoadSDNode>(V);
19938           SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19939                                         LD->getBasePtr(), LD->getMemOperand());
19940           DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19941           Ops.push_back(NewLoad);
19942         }
19943       }
19944       return DAG.getBitcast(N->getValueType(0),
19945                             DAG.getBuildVector(NVT, dl, Ops));
19946     }
19947   }
19948 
19949   // Canonicalise concat_vectors to replace concatenations of truncated nots
19950   // with nots of concatenated truncates. This in some cases allows for multiple
19951   // redundant negations to be eliminated.
19952   //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
19953   //                  (v4i16 (truncate (not (v4i32)))))
19954   // ->
19955   //  (not (concat_vectors (v4i16 (truncate (v4i32))),
19956   //                       (v4i16 (truncate (v4i32)))))
19957   if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19958       N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19959       N->isOnlyUserOf(N1.getNode())) {
19960     auto isBitwiseVectorNegate = [](SDValue V) {
19961       return V->getOpcode() == ISD::XOR &&
19962              ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19963     };
19964     SDValue N00 = N0->getOperand(0);
19965     SDValue N10 = N1->getOperand(0);
19966     if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
19967         isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
19968       return DAG.getNOT(
19969           dl,
19970           DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19971                       DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
19972                                   N00->getOperand(0)),
19973                       DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
19974                                   N10->getOperand(0))),
19975           VT);
19976     }
19977   }
19978 
19979   // Wait till after everything is legalized to try this. That way we have
19980   // legal vector types and such.
19981   if (DCI.isBeforeLegalizeOps())
19982     return SDValue();
19983 
19984   // Optimise concat_vectors of two identical binops with a 128-bit destination
19985   // size, combining into a binop of two concats of the source vectors, e.g.:
19986   // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19987   if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19988       DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
19989       N1->hasOneUse()) {
19990     SDValue N00 = N0->getOperand(0);
19991     SDValue N01 = N0->getOperand(1);
19992     SDValue N10 = N1->getOperand(0);
19993     SDValue N11 = N1->getOperand(1);
19994 
19995     if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19996       SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
19997       SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
19998       return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
19999     }
20000   }
20001 
20002   auto IsRSHRN = [](SDValue Shr) {
20003     if (Shr.getOpcode() != AArch64ISD::VLSHR)
20004       return false;
20005     SDValue Op = Shr.getOperand(0);
20006     EVT VT = Op.getValueType();
20007     unsigned ShtAmt = Shr.getConstantOperandVal(1);
20008     if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20009       return false;
20010 
20011     APInt Imm;
20012     if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20013       Imm = APInt(VT.getScalarSizeInBits(),
20014                   Op.getOperand(1).getConstantOperandVal(0)
20015                       << Op.getOperand(1).getConstantOperandVal(1));
20016     else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20017              isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20018       Imm = APInt(VT.getScalarSizeInBits(),
20019                   Op.getOperand(1).getConstantOperandVal(0));
20020     else
20021       return false;
20022 
20023     if (Imm != 1ULL << (ShtAmt - 1))
20024       return false;
20025     return true;
20026   };
20027 
20028   // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20029   if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20030       ((IsRSHRN(N1) &&
20031         N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20032        N1.isUndef())) {
20033     SDValue X = N0.getOperand(0).getOperand(0);
20034     SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20035                              : N1.getOperand(0).getOperand(0);
20036     EVT BVT =
20037         X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20038     SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
20039     SDValue Add = DAG.getNode(
20040         ISD::ADD, dl, BVT, CC,
20041         DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
20042     SDValue Shr =
20043         DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
20044     return Shr;
20045   }
20046 
20047   // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20048   if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20049       N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20050       N0.getOperand(1) == N1.getOperand(1)) {
20051     SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
20052                              DAG.getUNDEF(N0.getValueType()));
20053     SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
20054                              DAG.getUNDEF(N0.getValueType()));
20055     return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
20056   }
20057 
20058   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20059   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20060   // canonicalise to that.
20061   if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20062     assert(VT.getScalarSizeInBits() == 64);
20063     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
20064                        DAG.getConstant(0, dl, MVT::i64));
20065   }
20066 
20067   // Canonicalise concat_vectors so that the right-hand vector has as few
20068   // bit-casts as possible before its real operation. The primary matching
20069   // destination for these operations will be the narrowing "2" instructions,
20070   // which depend on the operation being performed on this right-hand vector.
20071   // For example,
20072   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
20073   // becomes
20074   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20075 
20076   if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20077     return SDValue();
20078   SDValue RHS = N1->getOperand(0);
20079   MVT RHSTy = RHS.getValueType().getSimpleVT();
20080   // If the RHS is not a vector, this is not the pattern we're looking for.
20081   if (!RHSTy.isVector())
20082     return SDValue();
20083 
20084   LLVM_DEBUG(
20085       dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20086 
20087   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20088                                   RHSTy.getVectorNumElements() * 2);
20089   return DAG.getNode(ISD::BITCAST, dl, VT,
20090                      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
20091                                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
20092                                  RHS));
20093 }
20094 
20095 static SDValue
20096 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20097                                SelectionDAG &DAG) {
20098   if (DCI.isBeforeLegalizeOps())
20099     return SDValue();
20100 
20101   EVT VT = N->getValueType(0);
20102   if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20103     return SDValue();
20104 
20105   SDValue V = N->getOperand(0);
20106 
20107   // NOTE: This combine exists in DAGCombiner, but that version's legality check
20108   // blocks this combine because the non-const case requires custom lowering.
20109   //
20110   // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20111   if (V.getOpcode() == ISD::SPLAT_VECTOR)
20112     if (isa<ConstantSDNode>(V.getOperand(0)))
20113       return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20114 
20115   return SDValue();
20116 }
20117 
20118 static SDValue
20119 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20120                               SelectionDAG &DAG) {
20121   SDLoc DL(N);
20122   SDValue Vec = N->getOperand(0);
20123   SDValue SubVec = N->getOperand(1);
20124   uint64_t IdxVal = N->getConstantOperandVal(2);
20125   EVT VecVT = Vec.getValueType();
20126   EVT SubVT = SubVec.getValueType();
20127 
20128   // Only do this for legal fixed vector types.
20129   if (!VecVT.isFixedLengthVector() ||
20130       !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20131       !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20132     return SDValue();
20133 
20134   // Ignore widening patterns.
20135   if (IdxVal == 0 && Vec.isUndef())
20136     return SDValue();
20137 
20138   // Subvector must be half the width and an "aligned" insertion.
20139   unsigned NumSubElts = SubVT.getVectorNumElements();
20140   if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20141       (IdxVal != 0 && IdxVal != NumSubElts))
20142     return SDValue();
20143 
20144   // Fold insert_subvector -> concat_vectors
20145   // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20146   // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20147   SDValue Lo, Hi;
20148   if (IdxVal == 0) {
20149     Lo = SubVec;
20150     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20151                      DAG.getVectorIdxConstant(NumSubElts, DL));
20152   } else {
20153     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20154                      DAG.getVectorIdxConstant(0, DL));
20155     Hi = SubVec;
20156   }
20157   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20158 }
20159 
20160 static SDValue tryCombineFixedPointConvert(SDNode *N,
20161                                            TargetLowering::DAGCombinerInfo &DCI,
20162                                            SelectionDAG &DAG) {
20163   // Wait until after everything is legalized to try this. That way we have
20164   // legal vector types and such.
20165   if (DCI.isBeforeLegalizeOps())
20166     return SDValue();
20167   // Transform a scalar conversion of a value from a lane extract into a
20168   // lane extract of a vector conversion. E.g., from foo1 to foo2:
20169   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20170   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20171   //
20172   // The second form interacts better with instruction selection and the
20173   // register allocator to avoid cross-class register copies that aren't
20174   // coalescable due to a lane reference.
20175 
20176   // Check the operand and see if it originates from a lane extract.
20177   SDValue Op1 = N->getOperand(1);
20178   if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20179     return SDValue();
20180 
20181   // Yep, no additional predication needed. Perform the transform.
20182   SDValue IID = N->getOperand(0);
20183   SDValue Shift = N->getOperand(2);
20184   SDValue Vec = Op1.getOperand(0);
20185   SDValue Lane = Op1.getOperand(1);
20186   EVT ResTy = N->getValueType(0);
20187   EVT VecResTy;
20188   SDLoc DL(N);
20189 
20190   // The vector width should be 128 bits by the time we get here, even
20191   // if it started as 64 bits (the extract_vector handling will have
20192   // done so). Bail if it is not.
20193   if (Vec.getValueSizeInBits() != 128)
20194     return SDValue();
20195 
20196   if (Vec.getValueType() == MVT::v4i32)
20197     VecResTy = MVT::v4f32;
20198   else if (Vec.getValueType() == MVT::v2i64)
20199     VecResTy = MVT::v2f64;
20200   else
20201     return SDValue();
20202 
20203   SDValue Convert =
20204       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20205   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20206 }
20207 
20208 // AArch64 high-vector "long" operations are formed by performing the non-high
20209 // version on an extract_subvector of each operand which gets the high half:
20210 //
20211 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20212 //
20213 // However, there are cases which don't have an extract_high explicitly, but
20214 // have another operation that can be made compatible with one for free. For
20215 // example:
20216 //
20217 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
20218 //
20219 // This routine does the actual conversion of such DUPs, once outer routines
20220 // have determined that everything else is in order.
20221 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20222 // similarly here.
20223 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20224   MVT VT = N.getSimpleValueType();
20225   if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20226       N.getConstantOperandVal(1) == 0)
20227     N = N.getOperand(0);
20228 
20229   switch (N.getOpcode()) {
20230   case AArch64ISD::DUP:
20231   case AArch64ISD::DUPLANE8:
20232   case AArch64ISD::DUPLANE16:
20233   case AArch64ISD::DUPLANE32:
20234   case AArch64ISD::DUPLANE64:
20235   case AArch64ISD::MOVI:
20236   case AArch64ISD::MOVIshift:
20237   case AArch64ISD::MOVIedit:
20238   case AArch64ISD::MOVImsl:
20239   case AArch64ISD::MVNIshift:
20240   case AArch64ISD::MVNImsl:
20241     break;
20242   default:
20243     // FMOV could be supported, but isn't very useful, as it would only occur
20244     // if you passed a bitcast'd floating point immediate to an eligible long
20245     // integer op (addl, smull, ...).
20246     return SDValue();
20247   }
20248 
20249   if (!VT.is64BitVector())
20250     return SDValue();
20251 
20252   SDLoc DL(N);
20253   unsigned NumElems = VT.getVectorNumElements();
20254   if (N.getValueType().is64BitVector()) {
20255     MVT ElementTy = VT.getVectorElementType();
20256     MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20257     N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20258   }
20259 
20260   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20261                      DAG.getConstant(NumElems, DL, MVT::i64));
20262 }
20263 
20264 static bool isEssentiallyExtractHighSubvector(SDValue N) {
20265   if (N.getOpcode() == ISD::BITCAST)
20266     N = N.getOperand(0);
20267   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20268     return false;
20269   if (N.getOperand(0).getValueType().isScalableVector())
20270     return false;
20271   return N.getConstantOperandAPInt(1) ==
20272          N.getOperand(0).getValueType().getVectorNumElements() / 2;
20273 }
20274 
20275 /// Helper structure to keep track of ISD::SET_CC operands.
20276 struct GenericSetCCInfo {
20277   const SDValue *Opnd0;
20278   const SDValue *Opnd1;
20279   ISD::CondCode CC;
20280 };
20281 
20282 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20283 struct AArch64SetCCInfo {
20284   const SDValue *Cmp;
20285   AArch64CC::CondCode CC;
20286 };
20287 
20288 /// Helper structure to keep track of SetCC information.
20289 union SetCCInfo {
20290   GenericSetCCInfo Generic;
20291   AArch64SetCCInfo AArch64;
20292 };
20293 
20294 /// Helper structure to be able to read SetCC information. If the IsAArch64
20295 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20296 /// GenericSetCCInfo.
20297 struct SetCCInfoAndKind {
20298   SetCCInfo Info;
20299   bool IsAArch64;
20300 };
20301 
20302 /// Check whether or not \p Op is a SET_CC operation, either a generic or
20303 /// an AArch64 lowered one.
20305 /// \p SetCCInfo is filled accordingly.
20306 /// \post SetCCInfo is meaningful only when this function returns true.
20307 /// \return True when Op is a kind of SET_CC operation.
20308 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20309   // If this is a setcc, this is straightforward.
20310   if (Op.getOpcode() == ISD::SETCC) {
20311     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20312     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20313     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20314     SetCCInfo.IsAArch64 = false;
20315     return true;
20316   }
20317   // Otherwise, check if this is a matching csel instruction.
20318   // In other words:
20319   // - csel 1, 0, cc
20320   // - csel 0, 1, !cc
20321   if (Op.getOpcode() != AArch64ISD::CSEL)
20322     return false;
20323   // Set the information about the operands.
20324   // TODO: we want the operands of the Cmp not the csel
20325   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20326   SetCCInfo.IsAArch64 = true;
20327   SetCCInfo.Info.AArch64.CC =
20328       static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20329 
20330   // Check that the operands match the constraints:
20331   // (1) Both operands must be constants.
20332   // (2) One must be 1 and the other must be 0.
20333   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20334   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20335 
20336   // Check (1).
20337   if (!TValue || !FValue)
20338     return false;
20339 
20340   // Check (2).
20341   if (!TValue->isOne()) {
20342     // Update the comparison when we are interested in !cc.
20343     std::swap(TValue, FValue);
20344     SetCCInfo.Info.AArch64.CC =
20345         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
20346   }
20347   return TValue->isOne() && FValue->isZero();
20348 }
20349 
20350 // Returns true if Op is setcc or zext of setcc.
20351 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20352   if (isSetCC(Op, Info))
20353     return true;
20354   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20355     isSetCC(Op->getOperand(0), Info));
20356 }
20357 
20358 // The folding we want to perform is:
20359 // (add x, [zext] (setcc cc ...))
20360 //   -->
20361 // (csel x, (add x, 1), !cc ...)
20362 //
20363 // The latter will get matched to a CSINC instruction.
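      // For example (a rough sketch, not the exact DAG): for "x + (a < b)" the
      // setcc becomes a compare and the +0/+1 becomes a conditional increment
      // on the inverted condition, i.e. a CMP followed by a single CSINC.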
20364 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
20365   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20366   SDValue LHS = Op->getOperand(0);
20367   SDValue RHS = Op->getOperand(1);
20368   SetCCInfoAndKind InfoAndKind;
20369 
20370   // If both operands are a SET_CC, then we don't want to perform this
20371   // folding and create another csel as this results in more instructions
20372   // (and higher register usage).
20373   if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
20374       isSetCCOrZExtSetCC(RHS, InfoAndKind))
20375     return SDValue();
20376 
20377   // If neither operand is a SET_CC, give up.
20378   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
20379     std::swap(LHS, RHS);
20380     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
20381       return SDValue();
20382   }
20383 
20384   // FIXME: This could be generalized to work for FP comparisons.
20385   EVT CmpVT = InfoAndKind.IsAArch64
20386                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
20387                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
20388   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20389     return SDValue();
20390 
20391   SDValue CCVal;
20392   SDValue Cmp;
20393   SDLoc dl(Op);
20394   if (InfoAndKind.IsAArch64) {
20395     CCVal = DAG.getConstant(
20396         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
20397         MVT::i32);
20398     Cmp = *InfoAndKind.Info.AArch64.Cmp;
20399   } else
20400     Cmp = getAArch64Cmp(
20401         *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
20402         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
20403         dl);
20404 
20405   EVT VT = Op->getValueType(0);
20406   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
20407   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
20408 }
20409 
20410 // ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
20411 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
20412   EVT VT = N->getValueType(0);
20413   // Only scalar integer and vector types.
20414   if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20415     return SDValue();
20416 
20417   SDValue LHS = N->getOperand(0);
20418   SDValue RHS = N->getOperand(1);
20419   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20420       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20421     return SDValue();
20422 
20423   auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
20424   auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
20425   if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20426     return SDValue();
20427 
20428   SDValue Op1 = LHS->getOperand(0);
20429   SDValue Op2 = RHS->getOperand(0);
20430   EVT OpVT1 = Op1.getValueType();
20431   EVT OpVT2 = Op2.getValueType();
20432   if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20433       Op2.getOpcode() != AArch64ISD::UADDV ||
20434       OpVT1.getVectorElementType() != VT)
20435     return SDValue();
20436 
20437   SDValue Val1 = Op1.getOperand(0);
20438   SDValue Val2 = Op2.getOperand(0);
20439   EVT ValVT = Val1->getValueType(0);
20440   SDLoc DL(N);
20441   SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
20442   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
20443                      DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
20444                      DAG.getConstant(0, DL, MVT::i64));
20445 }
20446 
20447 /// Perform the scalar expression combine in the form of:
20448 ///   CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20449 ///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
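      /// For instance, with c = 5: CSEL(5, 1, cc) + b selects between b + 5 and
      /// b + 1, which is exactly CSINC(b + 5, b, cc), since the false arm of a
      /// CSINC is its second operand plus one.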
20450 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20451   EVT VT = N->getValueType(0);
20452   if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20453     return SDValue();
20454 
20455   SDValue LHS = N->getOperand(0);
20456   SDValue RHS = N->getOperand(1);
20457 
20458   // Handle commutativity.
20459   if (LHS.getOpcode() != AArch64ISD::CSEL &&
20460       LHS.getOpcode() != AArch64ISD::CSNEG) {
20461     std::swap(LHS, RHS);
20462     if (LHS.getOpcode() != AArch64ISD::CSEL &&
20463         LHS.getOpcode() != AArch64ISD::CSNEG) {
20464       return SDValue();
20465     }
20466   }
20467 
20468   if (!LHS.hasOneUse())
20469     return SDValue();
20470 
20471   AArch64CC::CondCode AArch64CC =
20472       static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
20473 
20474   // The CSEL should include a constant one operand, and the CSNEG should
20475   // include a one or a negative-one operand.
20476   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
20477   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
20478   if (!CTVal || !CFVal)
20479     return SDValue();
20480 
20481   if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20482         (CTVal->isOne() || CFVal->isOne())) &&
20483       !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20484         (CTVal->isOne() || CFVal->isAllOnes())))
20485     return SDValue();
20486 
20487   // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20488   if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20489       !CFVal->isOne()) {
20490     std::swap(CTVal, CFVal);
20491     AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20492   }
20493 
20494   SDLoc DL(N);
20495   // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20496   if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20497       !CFVal->isAllOnes()) {
20498     APInt C = -1 * CFVal->getAPIntValue();
20499     CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
20500     CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
20501     AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20502   }
20503 
20504   // It might be neutral for larger constants, as the immediate needs to be
20505   // materialized in a register.
20506   APInt ADDC = CTVal->getAPIntValue();
20507   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20508   if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20509     return SDValue();
20510 
20511   assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20512           (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20513          "Unexpected constant value");
20514 
20515   SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
20516   SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
20517   SDValue Cmp = LHS.getOperand(3);
20518 
20519   return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
20520 }
20521 
20522 // ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)
20523 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
20524   EVT VT = N->getValueType(0);
20525   if (N->getOpcode() != ISD::ADD)
20526     return SDValue();
20527 
20528   SDValue Dot = N->getOperand(0);
20529   SDValue A = N->getOperand(1);
20530   // Handle commutativity
20531   auto isZeroDot = [](SDValue Dot) {
20532     return (Dot.getOpcode() == AArch64ISD::UDOT ||
20533             Dot.getOpcode() == AArch64ISD::SDOT) &&
20534            isZerosVector(Dot.getOperand(0).getNode());
20535   };
20536   if (!isZeroDot(Dot))
20537     std::swap(Dot, A);
20538   if (!isZeroDot(Dot))
20539     return SDValue();
20540 
20541   return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
20542                      Dot.getOperand(2));
20543 }
20544 
20545 static bool isNegatedInteger(SDValue Op) {
20546   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
20547 }
20548 
20549 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
20550   SDLoc DL(Op);
20551   EVT VT = Op.getValueType();
20552   SDValue Zero = DAG.getConstant(0, DL, VT);
20553   return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
20554 }
20555 
20556 // Try to fold
20557 //
20558 // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20559 //
20560 // The folding helps csel to be matched with csneg without generating a
20561 // redundant neg instruction, which includes the negation of the csel
20562 // expansion of an abs node lowered by lowerABS.
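      // For example, (sub 0, (csel (sub 0, x), y, cc, flags)) becomes
      // (csel x, (sub 0, y), cc, flags), which can then be selected as a csneg
      // rather than a csel followed by a separate negate.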
20563 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
20564   if (!isNegatedInteger(SDValue(N, 0)))
20565     return SDValue();
20566 
20567   SDValue CSel = N->getOperand(1);
20568   if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20569     return SDValue();
20570 
20571   SDValue N0 = CSel.getOperand(0);
20572   SDValue N1 = CSel.getOperand(1);
20573 
20574   // If neither of them is a negation, the fold is not worthwhile, as it
20575   // introduces two additional negations while removing only one.
20576   if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
20577     return SDValue();
20578 
20579   SDValue N0N = getNegatedInteger(N0, DAG);
20580   SDValue N1N = getNegatedInteger(N1, DAG);
20581 
20582   SDLoc DL(N);
20583   EVT VT = CSel.getValueType();
20584   return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
20585                      CSel.getOperand(3));
20586 }
20587 
20588 // The basic add/sub long vector instructions have variants with "2" on the end
20589 // which act on the high-half of their inputs. They are normally matched by
20590 // patterns like:
20591 //
20592 // (add (zeroext (extract_high LHS)),
20593 //      (zeroext (extract_high RHS)))
20594 // -> uaddl2 vD, vN, vM
20595 //
20596 // However, if one of the extracts is something like a duplicate, this
20597 // instruction can still be used profitably. This function puts the DAG into a
20598 // more appropriate form for those patterns to trigger.
20599 static SDValue performAddSubLongCombine(SDNode *N,
20600                                         TargetLowering::DAGCombinerInfo &DCI) {
20601   SelectionDAG &DAG = DCI.DAG;
20602   if (DCI.isBeforeLegalizeOps())
20603     return SDValue();
20604 
20605   MVT VT = N->getSimpleValueType(0);
20606   if (!VT.is128BitVector()) {
20607     if (N->getOpcode() == ISD::ADD)
20608       return performSetccAddFolding(N, DAG);
20609     return SDValue();
20610   }
20611 
20612   // Make sure both branches are extended in the same way.
20613   SDValue LHS = N->getOperand(0);
20614   SDValue RHS = N->getOperand(1);
20615   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20616        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20617       LHS.getOpcode() != RHS.getOpcode())
20618     return SDValue();
20619 
20620   unsigned ExtType = LHS.getOpcode();
20621 
20622   // It's not worth doing unless at least one of the inputs is already an
20623   // extract, but we don't know which it will be, so we have to try both.
20624   if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
20625     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
20626     if (!RHS.getNode())
20627       return SDValue();
20628 
20629     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
20630   } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
20631     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
20632     if (!LHS.getNode())
20633       return SDValue();
20634 
20635     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
20636   }
20637 
20638   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
20639 }
20640 
20641 static bool isCMP(SDValue Op) {
20642   return Op.getOpcode() == AArch64ISD::SUBS &&
20643          !Op.getNode()->hasAnyUseOfValue(0);
20644 }
20645 
20646 // (CSEL 1 0 CC Cond) => CC
20647 // (CSEL 0 1 CC Cond) => !CC
20648 static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20649   if (Op.getOpcode() != AArch64ISD::CSEL)
20650     return std::nullopt;
20651   auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20652   if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20653     return std::nullopt;
20654   SDValue OpLHS = Op.getOperand(0);
20655   SDValue OpRHS = Op.getOperand(1);
20656   if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
20657     return CC;
20658   if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
20659     return getInvertedCondCode(CC);
20660 
20661   return std::nullopt;
20662 }
20663 
20664 // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20665 // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
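      // In other words, when the compare exists only to re-materialise the carry
      // produced by a CSET, it can be bypassed and the CSET's flags operand fed
      // straight back into the ADC/SBC.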
20666 static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20667   SDValue CmpOp = Op->getOperand(2);
20668   if (!isCMP(CmpOp))
20669     return SDValue();
20670 
20671   if (IsAdd) {
20672     if (!isOneConstant(CmpOp.getOperand(1)))
20673       return SDValue();
20674   } else {
20675     if (!isNullConstant(CmpOp.getOperand(0)))
20676       return SDValue();
20677   }
20678 
20679   SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
20680   auto CC = getCSETCondCode(CsetOp);
20681   if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20682     return SDValue();
20683 
20684   return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
20685                      Op->getOperand(0), Op->getOperand(1),
20686                      CsetOp.getOperand(3));
20687 }
20688 
20689 // (ADC x 0 cond) => (CINC x HS cond)
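      // That is, x + 0 + carry is just "x + carry"; the CSINC built below uses
      // the inverted condition (LO), giving x when the carry is clear and x + 1
      // when it is set.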
20690 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20691   SDValue LHS = N->getOperand(0);
20692   SDValue RHS = N->getOperand(1);
20693   SDValue Cond = N->getOperand(2);
20694 
20695   if (!isNullConstant(RHS))
20696     return SDValue();
20697 
20698   EVT VT = N->getValueType(0);
20699   SDLoc DL(N);
20700 
20701   // (CINC x cc cond) <=> (CSINC x x !cc cond)
20702   SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
20703   return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
20704 }
20705 
20706 static SDValue performBuildVectorCombine(SDNode *N,
20707                                          TargetLowering::DAGCombinerInfo &DCI,
20708                                          SelectionDAG &DAG) {
20709   SDLoc DL(N);
20710   EVT VT = N->getValueType(0);
20711 
20712   if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
20713       (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20714     SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
20715             Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
20716     if (Elt0->getOpcode() == ISD::FP_ROUND &&
20717         Elt1->getOpcode() == ISD::FP_ROUND &&
20718         isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20719         isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20720         Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
20721         Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20722         Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20723         // Constant index.
20724         isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20725         isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20726         Elt0->getOperand(0)->getOperand(0) ==
20727             Elt1->getOperand(0)->getOperand(0) &&
20728         Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
20729         Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
20730       SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
20731       if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20732         SDValue HighLanes;
20733         if (Elt2->getOpcode() == ISD::UNDEF &&
20734             Elt3->getOpcode() == ISD::UNDEF) {
20735           HighLanes = DAG.getUNDEF(MVT::v2f32);
20736         } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20737                    Elt3->getOpcode() == ISD::FP_ROUND &&
20738                    isa<ConstantSDNode>(Elt2->getOperand(1)) &&
20739                    isa<ConstantSDNode>(Elt3->getOperand(1)) &&
20740                    Elt2->getConstantOperandVal(1) ==
20741                        Elt3->getConstantOperandVal(1) &&
20742                    Elt2->getOperand(0)->getOpcode() ==
20743                        ISD::EXTRACT_VECTOR_ELT &&
20744                    Elt3->getOperand(0)->getOpcode() ==
20745                        ISD::EXTRACT_VECTOR_ELT &&
20746                    // Constant index.
20747                    isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
20748                    isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
20749                    Elt2->getOperand(0)->getOperand(0) ==
20750                        Elt3->getOperand(0)->getOperand(0) &&
20751                    Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
20752                    Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
20753           SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
20754           HighLanes =
20755               DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
20756         }
20757         if (HighLanes) {
20758           SDValue DoubleToSingleSticky =
20759               DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
20760           SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
20761                                        DoubleToSingleSticky, HighLanes);
20762           return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
20763                              Elt0->getOperand(1));
20764         }
20765       }
20766     }
20767   }
20768 
20769   if (VT == MVT::v2f64) {
20770     SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20771     if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20772         Elt1->getOpcode() == ISD::FP_EXTEND &&
20773         Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20774         Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20775         Elt0->getOperand(0)->getOperand(0) ==
20776             Elt1->getOperand(0)->getOperand(0) &&
20777         // Constant index.
20778         isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20779         isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20780         Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
20781             Elt1->getOperand(0)->getConstantOperandVal(1) &&
20782         // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20783         // ResultType's known minimum vector length.
20784         Elt0->getOperand(0)->getConstantOperandVal(1) %
20785                 VT.getVectorMinNumElements() ==
20786             0) {
20787       SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
20788       if (SrcVec.getValueType() == MVT::v4f16 ||
20789           SrcVec.getValueType() == MVT::v4bf16) {
20790         SDValue HalfToSingle =
20791             DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
20792         SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
20793         SDValue Extract = DAG.getNode(
20794             ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
20795             HalfToSingle, SubvectorIdx);
20796         return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
20797       }
20798     }
20799   }
20800 
20801   // A build vector of two extracted elements is equivalent to an
20802   // extract subvector where the inner vector is any-extended to the
20803   // extract_vector_elt VT.
20804   //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20805   //                  (extract_elt_iXX_to_i32 vec Idx+1))
20806   // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
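        // For instance (a small sketch of the v2i32 case handled below):
        //    (build_vector (i32 (extract_elt v4i16:vec, 2)),
        //                  (i32 (extract_elt v4i16:vec, 3)))
        // => (extract_subvector (v4i32 (any_extend vec)), 2)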
20807 
20808   // For now, only consider the v2i32 case, which arises as a result of
20809   // legalization.
20810   if (VT != MVT::v2i32)
20811     return SDValue();
20812 
20813   SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20814   // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20815   if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20816       Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20817       // Constant index.
20818       isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20819       isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20820       // Both EXTRACT_VECTOR_ELT from same vector...
20821       Elt0->getOperand(0) == Elt1->getOperand(0) &&
20822       // ... and contiguous. First element's index +1 == second element's index.
20823       Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
20824       // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20825       // ResultType's known minimum vector length.
20826       Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
20827     SDValue VecToExtend = Elt0->getOperand(0);
20828     EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
20829     if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
20830       return SDValue();
20831 
20832     SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
20833 
20834     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
20835     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
20836                        SubvectorIdx);
20837   }
20838 
20839   return SDValue();
20840 }
20841 
20842 static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
20843                                       TargetLowering::DAGCombinerInfo &DCI) {
20844   SDLoc DL(N);
20845   EVT VT = N->getValueType(0);
20846   SDValue N0 = N->getOperand(0);
20847   if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20848       N0.getOpcode() == AArch64ISD::DUP) {
20849     SDValue Op = N0.getOperand(0);
20850     if (VT.getScalarType() == MVT::i32 &&
20851         N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20852       Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
20853     return DAG.getNode(N0.getOpcode(), DL, VT, Op);
20854   }
20855 
20856   // Performing the following combine produces a preferable form for ISEL.
20857   // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
20858   if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20859       N0.hasOneUse()) {
20860     SDValue Op = N0.getOperand(0);
20861     SDValue ExtractIndexNode = N0.getOperand(1);
20862     if (!isa<ConstantSDNode>(ExtractIndexNode))
20863       return SDValue();
20864 
20865     // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20866     // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20867     assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20868            "Unexpected legalisation result!");
20869 
20870     EVT SrcVectorType = Op.getValueType();
20871     // We also assume that SrcVectorType cannot be a V64 (see
20872     // LowerEXTRACT_VECTOR_ELT).
20873     assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20874            "Unexpected legalisation result!");
20875 
20876     unsigned ExtractIndex =
20877         cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
20878     MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20879 
20880     Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
20881     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
20882                        DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
20883   }
20884 
20885   return SDValue();
20886 }
20887 
20888 // Check whether a node is an extend or shift operand.
20889 static bool isExtendOrShiftOperand(SDValue N) {
20890   unsigned Opcode = N.getOpcode();
20891   if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20892     EVT SrcVT;
20893     if (Opcode == ISD::SIGN_EXTEND_INREG)
20894       SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20895     else
20896       SrcVT = N.getOperand(0).getValueType();
20897 
20898     return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20899   } else if (Opcode == ISD::AND) {
20900     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20901     if (!CSD)
20902       return false;
20903     uint64_t AndMask = CSD->getZExtValue();
20904     return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20905   } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20906     return isa<ConstantSDNode>(N.getOperand(1));
20907   }
20908 
20909   return false;
20910 }
20911 
20912 // (N - Y) + Z --> (Z - Y) + N
20913 // when N is an extend or shift operand
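      // The intent is to keep the one-use extend/shift as the right-hand operand
      // of the final ADD, where it can be folded into AArch64's extended- or
      // shifted-register add forms rather than computed separately.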
20914 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20915                                          SelectionDAG &DAG) {
20916   auto IsOneUseExtend = [](SDValue N) {
20917     return N.hasOneUse() && isExtendOrShiftOperand(N);
20918   };
20919 
20920   // DAGCombiner will revert the combination when Z is a constant, causing
20921   // an infinite loop, so don't enable the combination when Z is constant.
20922   // If Z is a one-use extend or shift operand, we also can't do the
20923   // optimization, as it would likewise fall into an infinite loop.
20924   if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20925     return SDValue();
20926 
20927   if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20928     return SDValue();
20929 
20930   SDValue Shift = SUB.getOperand(0);
20931   if (!IsOneUseExtend(Shift))
20932     return SDValue();
20933 
20934   SDLoc DL(N);
20935   EVT VT = N->getValueType(0);
20936 
20937   SDValue Y = SUB.getOperand(1);
20938   SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20939   return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20940 }
20941 
20942 static SDValue performAddCombineForShiftedOperands(SDNode *N,
20943                                                    SelectionDAG &DAG) {
20944   // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20945   // commutative.
20946   if (N->getOpcode() != ISD::ADD)
20947     return SDValue();
20948 
20949   // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20950   // shifted register is only available for i32 and i64.
20951   EVT VT = N->getValueType(0);
20952   if (VT != MVT::i32 && VT != MVT::i64)
20953     return SDValue();
20954 
20955   SDLoc DL(N);
20956   SDValue LHS = N->getOperand(0);
20957   SDValue RHS = N->getOperand(1);
20958 
20959   if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20960     return Val;
20961   if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20962     return Val;
20963 
20964   uint64_t LHSImm = 0, RHSImm = 0;
20965   // If both operands are shifted by an immediate and the shift amount is not
20966   // greater than 4 for one operand, swap LHS and RHS to put the operand with
20967   // the smaller shift amount on the RHS.
20968   //
20969   // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
20970   // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20971   // with LSL (shift > 4). For the rest of processors, this is no-op for
20972   // performance or correctness.
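        // For example, (add (shl a, 2), (shl b, 6)) is rewritten so that the
        // "lsl #2" operand ends up on the RHS, where it can be folded into the
        // add's shifted-register form on the cores listed above.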
20973   if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
20974       isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
20975       RHSImm > 4 && LHS.hasOneUse())
20976     return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
20977 
20978   return SDValue();
20979 }
20980 
20981 // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
20982 // This reassociates it back to allow the creation of more mls instructions.
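      // i.e. sub(x, add(mul(a, b), mul(c, d))) is rebuilt as
      // sub(sub(x, mul(a, b)), mul(c, d)), so that each multiply feeds its own
      // subtract and can be selected as an mls.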
20983 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
20984   if (N->getOpcode() != ISD::SUB)
20985     return SDValue();
20986 
20987   SDValue Add = N->getOperand(1);
20988   SDValue X = N->getOperand(0);
20989   if (Add.getOpcode() != ISD::ADD)
20990     return SDValue();
20991 
20992   if (!Add.hasOneUse())
20993     return SDValue();
20994   if (DAG.isConstantIntBuildVectorOrConstantInt(X))
20995     return SDValue();
20996 
20997   SDValue M1 = Add.getOperand(0);
20998   SDValue M2 = Add.getOperand(1);
20999   if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21000       M1.getOpcode() != AArch64ISD::UMULL)
21001     return SDValue();
21002   if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21003       M2.getOpcode() != AArch64ISD::UMULL)
21004     return SDValue();
21005 
21006   EVT VT = N->getValueType(0);
21007   SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21008   return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21009 }
21010 
21011 // Combine into mla/mls.
21012 // This works on the patterns of:
21013 //   add v1, (mul v2, v3)
21014 //   sub v1, (mul v2, v3)
21015 // for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21016 // It will transform the add/sub to a scalable version, so that we can
21017 // make use of SVE's MLA/MLS that will be generated for that pattern.
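      // (The fixed-length operand is wrapped into a scalable container, combined
      //  with the predicated MUL, and the result converted back to a fixed-length
      //  vector afterwards.)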
21018 static SDValue
21019 performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21020   SelectionDAG &DAG = DCI.DAG;
21021   // Make sure that the types are legal
21022   if (!DCI.isAfterLegalizeDAG())
21023     return SDValue();
21024   // Before using SVE's features, check first if it's available.
21025   if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21026     return SDValue();
21027 
21028   if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21029     return SDValue();
21030 
21031   if (!N->getValueType(0).isFixedLengthVector())
21032     return SDValue();
21033 
21034   auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21035     if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21036       return SDValue();
21037 
21038     if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21039       return SDValue();
21040 
21041     SDValue MulValue = Op1->getOperand(0);
21042     if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21043       return SDValue();
21044 
21045     if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21046       return SDValue();
21047 
21048     EVT ScalableVT = MulValue.getValueType();
21049     if (!ScalableVT.isScalableVector())
21050       return SDValue();
21051 
21052     SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21053     SDValue NewValue =
21054         DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21055     return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21056   };
21057 
21058   if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21059     return res;
21060   else if (N->getOpcode() == ISD::ADD)
21061     return performOpt(N->getOperand(1), N->getOperand(0));
21062 
21063   return SDValue();
21064 }
21065 
21066 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21067 // help, for example, to produce ssra from sshr+add.
21068 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
21069   EVT VT = N->getValueType(0);
21070   if (VT != MVT::i64 ||
21071       DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21072     return SDValue();
21073   SDValue Op0 = N->getOperand(0);
21074   SDValue Op1 = N->getOperand(1);
21075 
21076   // At least one of the operands should be an extract, and the other should be
21077   // something that is easy to convert to v1i64 type (in this case a load).
21078   if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21079       Op0.getOpcode() != ISD::LOAD)
21080     return SDValue();
21081   if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21082       Op1.getOpcode() != ISD::LOAD)
21083     return SDValue();
21084 
21085   SDLoc DL(N);
21086   if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21087       Op0.getOperand(0).getValueType() == MVT::v1i64) {
21088     Op0 = Op0.getOperand(0);
21089     Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21090   } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21091              Op1.getOperand(0).getValueType() == MVT::v1i64) {
21092     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21093     Op1 = Op1.getOperand(0);
21094   } else
21095     return SDValue();
21096 
21097   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21098                      DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21099                      DAG.getConstant(0, DL, MVT::i64));
21100 }
21101 
21102 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21103   SDValue BV = peekThroughOneUseBitcasts(B);
21104   if (!BV->hasOneUse())
21105     return false;
21106   if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21107     if (!Ld || !Ld->isSimple())
21108       return false;
21109     Loads.push_back(Ld);
21110     return true;
21111   } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21112              BV.getOpcode() == ISD::CONCAT_VECTORS) {
21113     for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21114       auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21115       if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21116         return false;
21117       Loads.push_back(Ld);
21118     }
21119     return true;
21120   } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21121     // Try to find a tree of shuffles and concats from how IR shuffles of loads
21122     // are lowered. Note that this only comes up because we do not always visit
21123     // operands before uses. After that is fixed this can be removed and in the
21124     // meantime this is fairly specific to the lowering we expect from IR.
21125     // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21126     //   t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21127     //     t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21128     //       t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21129     //       t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21130     //     t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21131     //       t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21132     //   t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21133     //     t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21134     if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21135         B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21136         B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21137         B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21138         B.getOperand(1).getNumOperands() != 4)
21139       return false;
21140     auto SV1 = cast<ShuffleVectorSDNode>(B);
21141     auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21142     int NumElts = B.getValueType().getVectorNumElements();
21143     int NumSubElts = NumElts / 4;
21144     for (int I = 0; I < NumSubElts; I++) {
21145       // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21146       if (SV1->getMaskElt(I) != I ||
21147           SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21148           SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21149           SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21150         return false;
21151       // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21152       if (SV2->getMaskElt(I) != I ||
21153           SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21154           SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21155         return false;
21156     }
21157     auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21158     auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21159     auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21160     auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21161     if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21162         !Ld2->isSimple() || !Ld3->isSimple())
21163       return false;
21164     Loads.push_back(Ld0);
21165     Loads.push_back(Ld1);
21166     Loads.push_back(Ld2);
21167     Loads.push_back(Ld3);
21168     return true;
21169   }
21170   return false;
21171 }
21172 
21173 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21174                                             SelectionDAG &DAG,
21175                                             unsigned &NumSubLoads) {
21176   if (!Op0.hasOneUse() || !Op1.hasOneUse())
21177     return false;
21178 
21179   SmallVector<LoadSDNode *> Loads0, Loads1;
21180   if (isLoadOrMultipleLoads(Op0, Loads0) &&
21181       isLoadOrMultipleLoads(Op1, Loads1)) {
21182     if (NumSubLoads && Loads0.size() != NumSubLoads)
21183       return false;
21184     NumSubLoads = Loads0.size();
21185     return Loads0.size() == Loads1.size() &&
21186            all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21187              unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21188              return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21189                     DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21190                                                        Size / 8, 1);
21191            });
21192   }
21193 
21194   if (Op0.getOpcode() != Op1.getOpcode())
21195     return false;
21196 
21197   switch (Op0.getOpcode()) {
21198   case ISD::ADD:
21199   case ISD::SUB:
21200     return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21201                                            DAG, NumSubLoads) &&
21202            areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
21203                                            DAG, NumSubLoads);
21204   case ISD::SIGN_EXTEND:
21205   case ISD::ANY_EXTEND:
21206   case ISD::ZERO_EXTEND:
21207     EVT XVT = Op0.getOperand(0).getValueType();
21208     if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21209         XVT.getScalarSizeInBits() != 32)
21210       return false;
21211     return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21212                                            DAG, NumSubLoads);
21213   }
21214   return false;
21215 }
21216 
21217 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21218 // into a single load of twice the size, from which we extract the bottom and
21219 // top parts so that the shl can use a shll2 instruction. The two loads in that
21220 // example can also be larger trees of instructions, which are identical except
21221 // for the leaves which are all loads offset from the LHS, including
21222 // buildvectors of multiple loads. For example the RHS tree could be
21223 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
21224 // Whilst it can be common for the larger loads to replace LDP instructions
21225 // (which doesn't gain anything on its own), the larger loads can help create
21226 // more efficient code, and in buildvectors prevent the need for ld1 lane
21227 // inserts which can be slower than normal loads.
21228 static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21229   EVT VT = N->getValueType(0);
21230   if (!VT.isFixedLengthVector() ||
21231       (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21232        VT.getScalarSizeInBits() != 64))
21233     return SDValue();
21234 
21235   SDValue Other = N->getOperand(0);
21236   SDValue Shift = N->getOperand(1);
21237   if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21238     std::swap(Shift, Other);
21239   APInt ShiftAmt;
21240   if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21241       !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21242     return SDValue();
21243 
21244   if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21245       !ISD::isExtOpcode(Other.getOpcode()) ||
21246       Shift.getOperand(0).getOperand(0).getValueType() !=
21247           Other.getOperand(0).getValueType() ||
21248       !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21249     return SDValue();
21250 
21251   SDValue Op0 = Other.getOperand(0);
21252   SDValue Op1 = Shift.getOperand(0).getOperand(0);
21253 
21254   unsigned NumSubLoads = 0;
21255   if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21256     return SDValue();
21257 
21258   // Attempt to rule out some unprofitable cases using heuristics (some working
21259   // around suboptimal code generation), notably if the extend would not be able
21260   // to use ushll2 instructions as the types are not large enough. Otherwise zips
21261   // will need to be created, which can increase the instruction count.
21262   unsigned NumElts = Op0.getValueType().getVectorNumElements();
21263   unsigned NumSubElts = NumElts / NumSubLoads;
21264   if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21265       (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21266        Op0.getValueType().getSizeInBits() < 128 &&
21267        !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
21268     return SDValue();
21269 
21270   // Recreate the tree with the new combined loads.
21271   std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21272       [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21273         EVT DVT =
21274             Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
21275 
21276         SmallVector<LoadSDNode *> Loads0, Loads1;
21277         if (isLoadOrMultipleLoads(Op0, Loads0) &&
21278             isLoadOrMultipleLoads(Op1, Loads1)) {
21279           EVT LoadVT = EVT::getVectorVT(
21280               *DAG.getContext(), Op0.getValueType().getScalarType(),
21281               Op0.getValueType().getVectorNumElements() / Loads0.size());
21282           EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21283 
21284           SmallVector<SDValue> NewLoads;
21285           for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
21286             SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
21287                                        L0->getBasePtr(), L0->getPointerInfo(),
21288                                        L0->getOriginalAlign());
21289             DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
21290             DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
21291             NewLoads.push_back(Load);
21292           }
21293           return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
21294         }
21295 
21296         SmallVector<SDValue> Ops;
21297         for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
21298           Ops.push_back(GenCombinedTree(O0, O1, DAG));
21299         return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
21300       };
21301   SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21302 
21303   SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21304   int Hi = NumSubElts, Lo = 0;
21305   for (unsigned i = 0; i < NumSubLoads; i++) {
21306     for (unsigned j = 0; j < NumSubElts; j++) {
21307       LowMask[i * NumSubElts + j] = Lo++;
21308       HighMask[i * NumSubElts + j] = Hi++;
21309     }
21310     Lo += NumSubElts;
21311     Hi += NumSubElts;
21312   }
21313   SDLoc DL(N);
21314   SDValue Ext0, Ext1;
21315   // Extract the top and bottom lanes, then extend the result. Alternatively,
21316   // extend the result first and then extract the lanes if the two extend
21317   // opcodes match, as that produces slightly smaller code.
21318   if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
21319     SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
21320                                NewOp, DAG.getConstant(0, DL, MVT::i64));
21321     SDValue SubH =
21322         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
21323                     DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21324     SDValue Extr0 =
21325         DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
21326     SDValue Extr1 =
21327         DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
21328     Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
21329     Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
21330   } else {
21331     EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
21332     SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
21333     SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21334                                DAG.getConstant(0, DL, MVT::i64));
21335     SDValue SubH =
21336         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21337                     DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21338     Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
21339     Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
21340   }
21341   SDValue NShift =
21342       DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
21343   return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
21344 }
21345 
21346 static SDValue performAddSubCombine(SDNode *N,
21347                                     TargetLowering::DAGCombinerInfo &DCI) {
21348   // Try to change sum of two reductions.
21349   if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
21350     return Val;
21351   if (SDValue Val = performAddDotCombine(N, DCI.DAG))
21352     return Val;
21353   if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
21354     return Val;
21355   if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
21356     return Val;
21357   if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
21358     return Val;
21359   if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
21360     return Val;
21361   if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
21362     return Val;
21363   if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21364     return Val;
21365   if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
21366     return Val;
21367 
21368   if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
21369     return Val;
21370 
21371   return performAddSubLongCombine(N, DCI);
21372 }
21373 
21374 // Massage DAGs that can use the high-half "long" operations into something
21375 // isel will recognize better. E.g.
21376 //
21377 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21378 //   (aarch64_neon_umull (extract_high (v2i64 vec))
21379 //                       (extract_high (v2i64 (dup128 scalar))))
21380 //
21381 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
21382                                        TargetLowering::DAGCombinerInfo &DCI,
21383                                        SelectionDAG &DAG) {
21384   if (DCI.isBeforeLegalizeOps())
21385     return SDValue();
21386 
21387   SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
21388   SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
21389   assert(LHS.getValueType().is64BitVector() &&
21390          RHS.getValueType().is64BitVector() &&
21391          "unexpected shape for long operation");
21392 
21393   // Either node could be a DUP, but it's not worth doing both of them (you
21394   // might just as well use the non-high version), so look for a corresponding
21395   // extract operation on the other "wing".
21396   if (isEssentiallyExtractHighSubvector(LHS)) {
21397     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
21398     if (!RHS.getNode())
21399       return SDValue();
21400   } else if (isEssentiallyExtractHighSubvector(RHS)) {
21401     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
21402     if (!LHS.getNode())
21403       return SDValue();
21404   } else
21405     return SDValue();
21406 
21407   if (IID == Intrinsic::not_intrinsic)
21408     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
21409 
21410   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
21411                      N->getOperand(0), LHS, RHS);
21412 }
21413 
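      // Try to convert a NEON shift-by-immediate intrinsic whose shift amount is a
      // constant (or constant splat) into the equivalent AArch64ISD shift node, or
      // remove the shift entirely when the amount is zero (except for sqshlu).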
21414 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21415   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
21416   unsigned ElemBits = ElemTy.getSizeInBits();
21417 
21418   int64_t ShiftAmount;
21419   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
21420     APInt SplatValue, SplatUndef;
21421     unsigned SplatBitSize;
21422     bool HasAnyUndefs;
21423     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21424                               HasAnyUndefs, ElemBits) ||
21425         SplatBitSize != ElemBits)
21426       return SDValue();
21427 
21428     ShiftAmount = SplatValue.getSExtValue();
21429   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
21430     ShiftAmount = CVN->getSExtValue();
21431   } else
21432     return SDValue();
21433 
21434   // If the shift amount is zero, remove the shift intrinsic.
21435   if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21436     return N->getOperand(1);
21437 
21438   unsigned Opcode;
21439   bool IsRightShift;
21440   switch (IID) {
21441   default:
21442     llvm_unreachable("Unknown shift intrinsic");
21443   case Intrinsic::aarch64_neon_sqshl:
21444     Opcode = AArch64ISD::SQSHL_I;
21445     IsRightShift = false;
21446     break;
21447   case Intrinsic::aarch64_neon_uqshl:
21448     Opcode = AArch64ISD::UQSHL_I;
21449     IsRightShift = false;
21450     break;
21451   case Intrinsic::aarch64_neon_srshl:
21452     Opcode = AArch64ISD::SRSHR_I;
21453     IsRightShift = true;
21454     break;
21455   case Intrinsic::aarch64_neon_urshl:
21456     Opcode = AArch64ISD::URSHR_I;
21457     IsRightShift = true;
21458     break;
21459   case Intrinsic::aarch64_neon_sqshlu:
21460     Opcode = AArch64ISD::SQSHLU_I;
21461     IsRightShift = false;
21462     break;
21463   case Intrinsic::aarch64_neon_sshl:
21464   case Intrinsic::aarch64_neon_ushl:
21465     // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
21466     // left shift in that case. For negative shift amounts we can use VASHR or
21467     // VLSHR as appropriate.
21468     if (ShiftAmount < 0) {
21469       Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21470                                                    : AArch64ISD::VLSHR;
21471       ShiftAmount = -ShiftAmount;
21472     } else
21473       Opcode = AArch64ISD::VSHL;
21474     IsRightShift = false;
21475     break;
21476   }
21477 
21478   EVT VT = N->getValueType(0);
21479   SDValue Op = N->getOperand(1);
21480   SDLoc dl(N);
21481   if (VT == MVT::i64) {
21482     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
21483     VT = MVT::v1i64;
21484   }
21485 
21486   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21487     Op = DAG.getNode(Opcode, dl, VT, Op,
21488                      DAG.getConstant(-ShiftAmount, dl, MVT::i32));
21489     if (N->getValueType(0) == MVT::i64)
21490       Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21491                        DAG.getConstant(0, dl, MVT::i64));
21492     return Op;
21493   } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21494     Op = DAG.getNode(Opcode, dl, VT, Op,
21495                      DAG.getConstant(ShiftAmount, dl, MVT::i32));
21496     if (N->getValueType(0) == MVT::i64)
21497       Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21498                        DAG.getConstant(0, dl, MVT::i64));
21499     return Op;
21500   }
21501 
21502   return SDValue();
21503 }
21504 
21505 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
21506 // the intrinsics must be legal and take an i32, this means there's almost
21507 // certainly going to be a zext in the DAG which we can eliminate.
21508 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21509   SDValue AndN = N->getOperand(2);
21510   if (AndN.getOpcode() != ISD::AND)
21511     return SDValue();
21512 
21513   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
21514   if (!CMask || CMask->getZExtValue() != Mask)
21515     return SDValue();
21516 
21517   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
21518                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
21519 }
21520 
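      // Lower an across-lanes reduction intrinsic to the corresponding AArch64ISD
      // node, which leaves the reduction result in lane 0 of a vector, and extract
      // that lane as the scalar result.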
21521 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
21522                                            SelectionDAG &DAG) {
21523   SDLoc dl(N);
21524   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
21525                      DAG.getNode(Opc, dl,
21526                                  N->getOperand(1).getSimpleValueType(),
21527                                  N->getOperand(1)),
21528                      DAG.getConstant(0, dl, MVT::i64));
21529 }
21530 
21531 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
21532   SDLoc DL(N);
21533   SDValue Op1 = N->getOperand(1);
21534   SDValue Op2 = N->getOperand(2);
21535   EVT ScalarTy = Op2.getValueType();
21536   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21537     ScalarTy = MVT::i32;
21538 
21539   // Lower index_vector(base, step) to step_vector * splat(step) + splat(base).
21540   SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
21541   SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
21542   SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
21543   SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
21544   return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
21545 }
21546 
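      // Lower aarch64_sve_dup to DUP_MERGE_PASSTHRU, promoting i8/i16 scalar
      // operands to i32 first.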
21547 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
21548   SDLoc dl(N);
21549   SDValue Scalar = N->getOperand(3);
21550   EVT ScalarTy = Scalar.getValueType();
21551 
21552   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21553     Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
21554 
21555   SDValue Passthru = N->getOperand(1);
21556   SDValue Pred = N->getOperand(2);
21557   return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
21558                      Pred, Scalar, Passthru);
21559 }
21560 
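      // Lower aarch64_sve_ext by bitcasting the operands to bytes, scaling the
      // index by the element size and emitting a byte-wise AArch64ISD::EXT.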
21561 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
21562   SDLoc dl(N);
21563   LLVMContext &Ctx = *DAG.getContext();
21564   EVT VT = N->getValueType(0);
21565 
21566   assert(VT.isScalableVector() && "Expected a scalable vector.");
21567 
21568   // Current lowering only supports the SVE-ACLE types.
21569   if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
21570     return SDValue();
21571 
21572   unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
21573   unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
21574   EVT ByteVT =
21575       EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
21576 
21577   // Convert everything to the domain of EXT (i.e. bytes).
21578   SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
21579   SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
21580   SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
21581                             DAG.getConstant(ElemSize, dl, MVT::i32));
21582 
21583   SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
21584   return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
21585 }
21586 
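      // Convert an SVE wide compare whose 64-bit comparator is a splat of a small
      // immediate into a SETCC_MERGE_ZERO against a splat of that immediate in the
      // element type of the other operand.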
21587 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
21588                                         TargetLowering::DAGCombinerInfo &DCI,
21589                                         SelectionDAG &DAG) {
21590   if (DCI.isBeforeLegalize())
21591     return SDValue();
21592 
21593   SDValue Comparator = N->getOperand(3);
21594   if (Comparator.getOpcode() == AArch64ISD::DUP ||
21595       Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
21596     unsigned IID = getIntrinsicID(N);
21597     EVT VT = N->getValueType(0);
21598     EVT CmpVT = N->getOperand(2).getValueType();
21599     SDValue Pred = N->getOperand(1);
21600     SDValue Imm;
21601     SDLoc DL(N);
21602 
21603     switch (IID) {
21604     default:
21605       llvm_unreachable("Called with wrong intrinsic!");
21606       break;
21607 
21608     // Signed comparisons
21609     case Intrinsic::aarch64_sve_cmpeq_wide:
21610     case Intrinsic::aarch64_sve_cmpne_wide:
21611     case Intrinsic::aarch64_sve_cmpge_wide:
21612     case Intrinsic::aarch64_sve_cmpgt_wide:
21613     case Intrinsic::aarch64_sve_cmplt_wide:
21614     case Intrinsic::aarch64_sve_cmple_wide: {
21615       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21616         int64_t ImmVal = CN->getSExtValue();
21617         if (ImmVal >= -16 && ImmVal <= 15)
21618           Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
21619         else
21620           return SDValue();
21621       }
21622       break;
21623     }
21624     // Unsigned comparisons
21625     case Intrinsic::aarch64_sve_cmphs_wide:
21626     case Intrinsic::aarch64_sve_cmphi_wide:
21627     case Intrinsic::aarch64_sve_cmplo_wide:
21628     case Intrinsic::aarch64_sve_cmpls_wide: {
21629       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21630         uint64_t ImmVal = CN->getZExtValue();
21631         if (ImmVal <= 127)
21632           Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
21633         else
21634           return SDValue();
21635       }
21636       break;
21637     }
21638     }
21639 
21640     if (!Imm)
21641       return SDValue();
21642 
21643     SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
21644     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
21645                        N->getOperand(2), Splat, DAG.getCondCode(CC));
21646   }
21647 
21648   return SDValue();
21649 }
21650 
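      // Emit the equivalent of a predicate test: reinterpret the operands as
      // nxv16i1 if necessary, set the condition flags with PTEST/PTEST_ANY and
      // materialise the requested condition as 0 or 1 via CSEL.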
21651 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21652                         AArch64CC::CondCode Cond) {
21653   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21654 
21655   SDLoc DL(Op);
21656   assert(Op.getValueType().isScalableVector() &&
21657          TLI.isTypeLegal(Op.getValueType()) &&
21658          "Expected legal scalable vector type!");
21659   assert(Op.getValueType() == Pg.getValueType() &&
21660          "Expected same type for PTEST operands");
21661 
21662   // Ensure the target-specific opcodes use a legal type.
21663   EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
21664   SDValue TVal = DAG.getConstant(1, DL, OutVT);
21665   SDValue FVal = DAG.getConstant(0, DL, OutVT);
21666 
21667   // Ensure operands have type nxv16i1.
21668   if (Op.getValueType() != MVT::nxv16i1) {
21669     if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
21670         isZeroingInactiveLanes(Op))
21671       Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
21672     else
21673       Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
21674     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
21675   }
21676 
21677   // Set condition code (CC) flags.
21678   SDValue Test = DAG.getNode(
21679       Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21680       DL, MVT::i32, Pg, Op);
21681 
21682   // Convert CC to integer based on requested condition.
21683   // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21684   SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
21685   SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
21686   return DAG.getZExtOrTrunc(Res, DL, VT);
21687 }
21688 
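      // Lower an SVE integer reduction intrinsic to its predicated AArch64ISD form
      // and extract the scalar result from lane 0 of the reduction vector.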
21689 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21690                                       SelectionDAG &DAG) {
21691   SDLoc DL(N);
21692 
21693   SDValue Pred = N->getOperand(1);
21694   SDValue VecToReduce = N->getOperand(2);
21695 
21696   // NOTE: The integer reduction's result type is not always linked to the
21697   // operand's element type so we construct it from the intrinsic's result type.
21698   EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
21699   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21700 
21701   // SVE reductions set the whole vector register with the first element
21702   // containing the reduction result, which we'll now extract.
21703   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21704   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21705                      Zero);
21706 }
21707 
21708 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21709                                      SelectionDAG &DAG) {
21710   SDLoc DL(N);
21711 
21712   SDValue Pred = N->getOperand(1);
21713   SDValue VecToReduce = N->getOperand(2);
21714 
21715   EVT ReduceVT = VecToReduce.getValueType();
21716   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21717 
21718   // SVE reductions set the whole vector register with the first element
21719   // containing the reduction result, which we'll now extract.
21720   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21721   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21722                      Zero);
21723 }
21724 
21725 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21726                                             SelectionDAG &DAG) {
21727   SDLoc DL(N);
21728 
21729   SDValue Pred = N->getOperand(1);
21730   SDValue InitVal = N->getOperand(2);
21731   SDValue VecToReduce = N->getOperand(3);
21732   EVT ReduceVT = VecToReduce.getValueType();
21733 
21734   // Ordered reductions use the first lane of the result vector as the
21735   // reduction's initial value.
21736   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21737   InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
21738                         DAG.getUNDEF(ReduceVT), InitVal, Zero);
21739 
21740   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
21741 
21742   // SVE reductions set the whole vector register with the first element
21743   // containing the reduction result, which we'll now extract.
21744   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21745                      Zero);
21746 }
21747 
21748 // If a merged operation has no inactive lanes we can relax it to a predicated
21749 // or unpredicated operation, which potentially allows better isel (perhaps
21750 // using immediate forms) or relaxing register reuse requirements.
21751 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21752                                        SelectionDAG &DAG, bool UnpredOp = false,
21753                                        bool SwapOperands = false) {
21754   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21755   assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21756   SDValue Pg = N->getOperand(1);
21757   SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
21758   SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
21759 
21760   // ISD way to specify an all active predicate.
21761   if (isAllActivePredicate(DAG, Pg)) {
21762     if (UnpredOp)
21763       return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
21764 
21765     return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
21766   }
21767 
21768   // FUTURE: SplatVector(true)
21769   return SDValue();
21770 }
21771 
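      // With SVE2p1, a whilelo whose only uses extract its low and high halves via
      // extract_subvector can be replaced by the two-result
      // aarch64_sve_whilelo_x2 intrinsic, feeding each half directly.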
21772 static SDValue tryCombineWhileLo(SDNode *N,
21773                                  TargetLowering::DAGCombinerInfo &DCI,
21774                                  const AArch64Subtarget *Subtarget) {
21775   if (DCI.isBeforeLegalize())
21776     return SDValue();
21777 
21778   if (!Subtarget->hasSVE2p1())
21779     return SDValue();
21780 
21781   if (!N->hasNUsesOfValue(2, 0))
21782     return SDValue();
21783 
21784   const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
21785   if (HalfSize < 2)
21786     return SDValue();
21787 
21788   auto It = N->user_begin();
21789   SDNode *Lo = *It++;
21790   SDNode *Hi = *It;
21791 
21792   if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
21793       Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
21794     return SDValue();
21795 
21796   uint64_t OffLo = Lo->getConstantOperandVal(1);
21797   uint64_t OffHi = Hi->getConstantOperandVal(1);
21798 
21799   if (OffLo > OffHi) {
21800     std::swap(Lo, Hi);
21801     std::swap(OffLo, OffHi);
21802   }
21803 
21804   if (OffLo != 0 || OffHi != HalfSize)
21805     return SDValue();
21806 
21807   EVT HalfVec = Lo->getValueType(0);
21808   if (HalfVec != Hi->getValueType(0) ||
21809       HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
21810     return SDValue();
21811 
21812   SelectionDAG &DAG = DCI.DAG;
21813   SDLoc DL(N);
21814   SDValue ID =
21815       DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
21816   SDValue Idx = N->getOperand(1);
21817   SDValue TC = N->getOperand(2);
21818   if (Idx.getValueType() != MVT::i64) {
21819     Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
21820     TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
21821   }
21822   auto R =
21823       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
21824                   {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
21825 
21826   DCI.CombineTo(Lo, R.getValue(0));
21827   DCI.CombineTo(Hi, R.getValue(1));
21828 
21829   return SDValue(N, 0);
21830 }
21831 
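      // Try to lower a partial_reduce_add of extended (or extended-and-multiplied)
      // operands into SDOT/UDOT/USDOT, using an i32 dot product plus an extension
      // for the (nx)v16i8 -> (nx)v4i64 cases.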
21832 SDValue tryLowerPartialReductionToDot(SDNode *N,
21833                                       const AArch64Subtarget *Subtarget,
21834                                       SelectionDAG &DAG) {
21835 
21836   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21837          getIntrinsicID(N) ==
21838              Intrinsic::experimental_vector_partial_reduce_add &&
21839          "Expected a partial reduction node");
21840 
21841   bool Scalable = N->getValueType(0).isScalableVector();
21842   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
21843     return SDValue();
21844   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
21845     return SDValue();
21846 
21847   SDLoc DL(N);
21848 
21849   SDValue Op2 = N->getOperand(2);
21850   unsigned Op2Opcode = Op2->getOpcode();
21851   SDValue MulOpLHS, MulOpRHS;
21852   bool MulOpLHSIsSigned, MulOpRHSIsSigned;
21853   if (ISD::isExtOpcode(Op2Opcode)) {
21854     MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
21855     MulOpLHS = Op2->getOperand(0);
21856     MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
21857   } else if (Op2Opcode == ISD::MUL) {
21858     SDValue ExtMulOpLHS = Op2->getOperand(0);
21859     SDValue ExtMulOpRHS = Op2->getOperand(1);
21860 
21861     unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
21862     unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
21863     if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
21864         !ISD::isExtOpcode(ExtMulOpRHSOpcode))
21865       return SDValue();
21866 
21867     MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
21868     MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
21869 
21870     MulOpLHS = ExtMulOpLHS->getOperand(0);
21871     MulOpRHS = ExtMulOpRHS->getOperand(0);
21872 
21873     if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
21874       return SDValue();
21875   } else
21876     return SDValue();
21877 
21878   SDValue Acc = N->getOperand(1);
21879   EVT ReducedVT = N->getValueType(0);
21880   EVT MulSrcVT = MulOpLHS.getValueType();
21881 
21882   // Dot products operate on chunks of four elements, so there must be four
21883   // times as many elements in the wide type.
21884   if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
21885       !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
21886       !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
21887       !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
21888       !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
21889       !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
21890     return SDValue();
21891 
21892   // If the extensions are mixed, we should lower it to a usdot instead
21893   unsigned Opcode = 0;
21894   if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
21895     if (!Subtarget->hasMatMulInt8())
21896       return SDValue();
21897 
21898     bool Scalable = N->getValueType(0).isScalableVT();
21899     // There's no nxv2i64 version of usdot
21900     if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
21901       return SDValue();
21902 
21903     Opcode = AArch64ISD::USDOT;
21904     // USDOT expects the signed operand to be last
21905     if (!MulOpRHSIsSigned)
21906       std::swap(MulOpLHS, MulOpRHS);
21907   } else
21908     Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
21909 
21910   // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
21911   // product followed by a zero / sign extension
21912   if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
21913       (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
21914     EVT ReducedVTI32 =
21915         (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
21916 
21917     SDValue DotI32 =
21918         DAG.getNode(Opcode, DL, ReducedVTI32,
21919                     DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
21920     SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
21921     return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
21922   }
21923 
21924   return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
21925 }
21926 
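      // Try to lower a partial_reduce_add of a single extend into a pair of SVE2
      // wide adds (SADDWB/SADDWT or UADDWB/UADDWT).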
21927 SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
21928                                           const AArch64Subtarget *Subtarget,
21929                                           SelectionDAG &DAG) {
21930 
21931   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21932          getIntrinsicID(N) ==
21933              Intrinsic::experimental_vector_partial_reduce_add &&
21934          "Expected a partial reduction node");
21935 
21936   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
21937     return SDValue();
21938 
21939   SDLoc DL(N);
21940 
21941   if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
21942     return SDValue();
21943   SDValue Acc = N->getOperand(1);
21944   SDValue Ext = N->getOperand(2);
21945   EVT AccVT = Acc.getValueType();
21946   EVT ExtVT = Ext.getValueType();
21947   if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
21948     return SDValue();
21949 
21950   SDValue ExtOp = Ext->getOperand(0);
21951   EVT ExtOpVT = ExtOp.getValueType();
21952 
21953   if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
21954       !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
21955       !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
21956     return SDValue();
21957 
21958   bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
21959   unsigned BottomOpcode =
21960       ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
21961   unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
21962   SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
21963   return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
21964 }
21965 
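      // DAG combine for INTRINSIC_WO_CHAIN nodes, mapping many NEON and SVE
      // intrinsics onto generic ISD or AArch64ISD nodes.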
21966 static SDValue performIntrinsicCombine(SDNode *N,
21967                                        TargetLowering::DAGCombinerInfo &DCI,
21968                                        const AArch64Subtarget *Subtarget) {
21969   SelectionDAG &DAG = DCI.DAG;
21970   unsigned IID = getIntrinsicID(N);
21971   switch (IID) {
21972   default:
21973     break;
21974   case Intrinsic::experimental_vector_partial_reduce_add: {
21975     if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
21976       return Dot;
21977     if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
21978       return WideAdd;
21979     return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
21980                                    N->getOperand(1), N->getOperand(2));
21981   }
21982   case Intrinsic::aarch64_neon_vcvtfxs2fp:
21983   case Intrinsic::aarch64_neon_vcvtfxu2fp:
21984     return tryCombineFixedPointConvert(N, DCI, DAG);
21985   case Intrinsic::aarch64_neon_saddv:
21986     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
21987   case Intrinsic::aarch64_neon_uaddv:
21988     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
21989   case Intrinsic::aarch64_neon_sminv:
21990     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
21991   case Intrinsic::aarch64_neon_uminv:
21992     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
21993   case Intrinsic::aarch64_neon_smaxv:
21994     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
21995   case Intrinsic::aarch64_neon_umaxv:
21996     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
21997   case Intrinsic::aarch64_neon_fmax:
21998     return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
21999                        N->getOperand(1), N->getOperand(2));
22000   case Intrinsic::aarch64_neon_fmin:
22001     return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22002                        N->getOperand(1), N->getOperand(2));
22003   case Intrinsic::aarch64_neon_fmaxnm:
22004     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22005                        N->getOperand(1), N->getOperand(2));
22006   case Intrinsic::aarch64_neon_fminnm:
22007     return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22008                        N->getOperand(1), N->getOperand(2));
22009   case Intrinsic::aarch64_neon_smull:
22010     return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22011                        N->getOperand(1), N->getOperand(2));
22012   case Intrinsic::aarch64_neon_umull:
22013     return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22014                        N->getOperand(1), N->getOperand(2));
22015   case Intrinsic::aarch64_neon_pmull:
22016     return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22017                        N->getOperand(1), N->getOperand(2));
22018   case Intrinsic::aarch64_neon_sqdmull:
22019     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22020   case Intrinsic::aarch64_neon_sqshl:
22021   case Intrinsic::aarch64_neon_uqshl:
22022   case Intrinsic::aarch64_neon_sqshlu:
22023   case Intrinsic::aarch64_neon_srshl:
22024   case Intrinsic::aarch64_neon_urshl:
22025   case Intrinsic::aarch64_neon_sshl:
22026   case Intrinsic::aarch64_neon_ushl:
22027     return tryCombineShiftImm(IID, N, DAG);
22028   case Intrinsic::aarch64_neon_sabd:
22029     return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22030                        N->getOperand(1), N->getOperand(2));
22031   case Intrinsic::aarch64_neon_uabd:
22032     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22033                        N->getOperand(1), N->getOperand(2));
22034   case Intrinsic::aarch64_crc32b:
22035   case Intrinsic::aarch64_crc32cb:
22036     return tryCombineCRC32(0xff, N, DAG);
22037   case Intrinsic::aarch64_crc32h:
22038   case Intrinsic::aarch64_crc32ch:
22039     return tryCombineCRC32(0xffff, N, DAG);
22040   case Intrinsic::aarch64_sve_saddv:
22041     // There is no i64 version of SADDV because the sign is irrelevant.
22042     if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
22043       return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22044     else
22045       return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22046   case Intrinsic::aarch64_sve_uaddv:
22047     return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22048   case Intrinsic::aarch64_sve_smaxv:
22049     return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22050   case Intrinsic::aarch64_sve_umaxv:
22051     return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22052   case Intrinsic::aarch64_sve_sminv:
22053     return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22054   case Intrinsic::aarch64_sve_uminv:
22055     return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22056   case Intrinsic::aarch64_sve_orv:
22057     return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22058   case Intrinsic::aarch64_sve_eorv:
22059     return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22060   case Intrinsic::aarch64_sve_andv:
22061     return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22062   case Intrinsic::aarch64_sve_index:
22063     return LowerSVEIntrinsicIndex(N, DAG);
22064   case Intrinsic::aarch64_sve_dup:
22065     return LowerSVEIntrinsicDUP(N, DAG);
22066   case Intrinsic::aarch64_sve_dup_x:
22067     return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22068                        N->getOperand(1));
22069   case Intrinsic::aarch64_sve_ext:
22070     return LowerSVEIntrinsicEXT(N, DAG);
22071   case Intrinsic::aarch64_sve_mul_u:
22072     return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22073                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22074   case Intrinsic::aarch64_sve_smulh_u:
22075     return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22076                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22077   case Intrinsic::aarch64_sve_umulh_u:
22078     return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22079                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22080   case Intrinsic::aarch64_sve_smin_u:
22081     return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22082                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22083   case Intrinsic::aarch64_sve_umin_u:
22084     return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22085                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22086   case Intrinsic::aarch64_sve_smax_u:
22087     return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22088                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22089   case Intrinsic::aarch64_sve_umax_u:
22090     return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22091                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22092   case Intrinsic::aarch64_sve_lsl_u:
22093     return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22094                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22095   case Intrinsic::aarch64_sve_lsr_u:
22096     return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22097                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22098   case Intrinsic::aarch64_sve_asr_u:
22099     return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22100                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22101   case Intrinsic::aarch64_sve_fadd_u:
22102     return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22103                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22104   case Intrinsic::aarch64_sve_fdiv_u:
22105     return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22106                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22107   case Intrinsic::aarch64_sve_fmax_u:
22108     return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22109                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22110   case Intrinsic::aarch64_sve_fmaxnm_u:
22111     return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22112                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22113   case Intrinsic::aarch64_sve_fmla_u:
22114     return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22115                        N->getOperand(1), N->getOperand(3), N->getOperand(4),
22116                        N->getOperand(2));
22117   case Intrinsic::aarch64_sve_fmin_u:
22118     return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22119                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22120   case Intrinsic::aarch64_sve_fminnm_u:
22121     return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22122                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22123   case Intrinsic::aarch64_sve_fmul_u:
22124     return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22125                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22126   case Intrinsic::aarch64_sve_fsub_u:
22127     return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22128                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22129   case Intrinsic::aarch64_sve_add_u:
22130     return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22131                        N->getOperand(3));
22132   case Intrinsic::aarch64_sve_sub_u:
22133     return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22134                        N->getOperand(3));
22135   case Intrinsic::aarch64_sve_subr:
22136     return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22137   case Intrinsic::aarch64_sve_and_u:
22138     return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22139                        N->getOperand(3));
22140   case Intrinsic::aarch64_sve_bic_u:
22141     return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22142                        N->getOperand(2), N->getOperand(3));
22143   case Intrinsic::aarch64_sve_saddwb:
22144     return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22145                        N->getOperand(1), N->getOperand(2));
22146   case Intrinsic::aarch64_sve_saddwt:
22147     return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22148                        N->getOperand(1), N->getOperand(2));
22149   case Intrinsic::aarch64_sve_uaddwb:
22150     return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22151                        N->getOperand(1), N->getOperand(2));
22152   case Intrinsic::aarch64_sve_uaddwt:
22153     return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22154                        N->getOperand(1), N->getOperand(2));
22155   case Intrinsic::aarch64_sve_eor_u:
22156     return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22157                        N->getOperand(3));
22158   case Intrinsic::aarch64_sve_orr_u:
22159     return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22160                        N->getOperand(3));
22161   case Intrinsic::aarch64_sve_sabd_u:
22162     return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22163                        N->getOperand(2), N->getOperand(3));
22164   case Intrinsic::aarch64_sve_uabd_u:
22165     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22166                        N->getOperand(2), N->getOperand(3));
22167   case Intrinsic::aarch64_sve_sdiv_u:
22168     return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22169                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22170   case Intrinsic::aarch64_sve_udiv_u:
22171     return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22172                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22173   case Intrinsic::aarch64_sve_sqadd:
22174     return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22175   case Intrinsic::aarch64_sve_sqsub_u:
22176     return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22177                        N->getOperand(2), N->getOperand(3));
22178   case Intrinsic::aarch64_sve_uqadd:
22179     return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22180   case Intrinsic::aarch64_sve_uqsub_u:
22181     return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22182                        N->getOperand(2), N->getOperand(3));
22183   case Intrinsic::aarch64_sve_sqadd_x:
22184     return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22185                        N->getOperand(1), N->getOperand(2));
22186   case Intrinsic::aarch64_sve_sqsub_x:
22187     return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22188                        N->getOperand(1), N->getOperand(2));
22189   case Intrinsic::aarch64_sve_uqadd_x:
22190     return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22191                        N->getOperand(1), N->getOperand(2));
22192   case Intrinsic::aarch64_sve_uqsub_x:
22193     return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22194                        N->getOperand(1), N->getOperand(2));
22195   case Intrinsic::aarch64_sve_asrd:
22196     return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22197                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22198   case Intrinsic::aarch64_sve_cmphs:
22199     if (!N->getOperand(2).getValueType().isFloatingPoint())
22200       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22201                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
22202                          N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22203     break;
22204   case Intrinsic::aarch64_sve_cmphi:
22205     if (!N->getOperand(2).getValueType().isFloatingPoint())
22206       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22207                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
22208                          N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22209     break;
22210   case Intrinsic::aarch64_sve_fcmpge:
22211   case Intrinsic::aarch64_sve_cmpge:
22212     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22213                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
22214                        N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22215     break;
22216   case Intrinsic::aarch64_sve_fcmpgt:
22217   case Intrinsic::aarch64_sve_cmpgt:
22218     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22219                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
22220                        N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22221     break;
22222   case Intrinsic::aarch64_sve_fcmpeq:
22223   case Intrinsic::aarch64_sve_cmpeq:
22224     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22225                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
22226                        N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22227     break;
22228   case Intrinsic::aarch64_sve_fcmpne:
22229   case Intrinsic::aarch64_sve_cmpne:
22230     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22231                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
22232                        N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22233     break;
22234   case Intrinsic::aarch64_sve_fcmpuo:
22235     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22236                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
22237                        N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22238     break;
22239   case Intrinsic::aarch64_sve_fadda:
22240     return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22241   case Intrinsic::aarch64_sve_faddv:
22242     return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22243   case Intrinsic::aarch64_sve_fmaxnmv:
22244     return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22245   case Intrinsic::aarch64_sve_fmaxv:
22246     return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22247   case Intrinsic::aarch64_sve_fminnmv:
22248     return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22249   case Intrinsic::aarch64_sve_fminv:
22250     return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22251   case Intrinsic::aarch64_sve_sel:
22252     return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22253                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
22254   case Intrinsic::aarch64_sve_cmpeq_wide:
22255     return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22256   case Intrinsic::aarch64_sve_cmpne_wide:
22257     return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22258   case Intrinsic::aarch64_sve_cmpge_wide:
22259     return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22260   case Intrinsic::aarch64_sve_cmpgt_wide:
22261     return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22262   case Intrinsic::aarch64_sve_cmplt_wide:
22263     return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22264   case Intrinsic::aarch64_sve_cmple_wide:
22265     return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22266   case Intrinsic::aarch64_sve_cmphs_wide:
22267     return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22268   case Intrinsic::aarch64_sve_cmphi_wide:
22269     return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22270   case Intrinsic::aarch64_sve_cmplo_wide:
22271     return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22272   case Intrinsic::aarch64_sve_cmpls_wide:
22273     return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22274   case Intrinsic::aarch64_sve_ptest_any:
22275     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22276                     AArch64CC::ANY_ACTIVE);
22277   case Intrinsic::aarch64_sve_ptest_first:
22278     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22279                     AArch64CC::FIRST_ACTIVE);
22280   case Intrinsic::aarch64_sve_ptest_last:
22281     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22282                     AArch64CC::LAST_ACTIVE);
22283   case Intrinsic::aarch64_sve_whilelo:
22284     return tryCombineWhileLo(N, DCI, Subtarget);
22285   }
22286   return SDValue();
22287 }
22288 
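      // Loads, masked loads and all-zero splat vectors can be extended for free
      // (e.g. by using an extending load), so they are cheap to extend.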
22289 static bool isCheapToExtend(const SDValue &N) {
22290   unsigned OC = N->getOpcode();
22291   return OC == ISD::LOAD || OC == ISD::MLOAD ||
22292          ISD::isConstantSplatVectorAllZeros(N.getNode());
22293 }
22294 
22295 static SDValue
22296 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22297                               SelectionDAG &DAG) {
22298   // If we have (sext (setcc A B)) and A and B are cheap to extend,
22299   // we can move the sext into the arguments and have the same result. For
22300   // example, if A and B are both loads, we can make those extending loads and
22301   // avoid an extra instruction. This pattern appears often in VLS code
22302   // generation, where the inputs to the setcc have a different size from the
22303   // instruction that wants to use the result of the setcc.
22304   assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22305          N->getOperand(0)->getOpcode() == ISD::SETCC);
22306   const SDValue SetCC = N->getOperand(0);
22307 
22308   const SDValue CCOp0 = SetCC.getOperand(0);
22309   const SDValue CCOp1 = SetCC.getOperand(1);
22310   if (!CCOp0->getValueType(0).isInteger() ||
22311       !CCOp1->getValueType(0).isInteger())
22312     return SDValue();
22313 
22314   ISD::CondCode Code =
22315       cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22316 
22317   ISD::NodeType ExtType =
22318       isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22319 
22320   if (isCheapToExtend(SetCC.getOperand(0)) &&
22321       isCheapToExtend(SetCC.getOperand(1))) {
22322     const SDValue Ext1 =
22323         DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22324     const SDValue Ext2 =
22325         DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22326 
22327     return DAG.getSetCC(
22328         SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22329         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22330   }
22331 
22332   return SDValue();
22333 }
22334 
22335 // Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22336 // This comes from interleaved vectorization. It is performed late to capture
22337 // uitofp converts too.
22338 static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22339                                                      SelectionDAG &DAG) {
22340   EVT VT = N->getValueType(0);
22341   if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22342       N->getOpcode() != ISD::ZERO_EXTEND ||
22343       N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22344     return SDValue();
22345 
22346   unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22347   if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22348     return SDValue();
22349 
22350   EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22351   auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22352   if (!Shuffle ||
22353       InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22354       InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22355     return SDValue();
22356 
22357   unsigned Idx;
22358   bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22359       Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22360   // An undef interleave shuffle can come up after other canonicalizations,
22361   // where the shuffle has been converted to
22362   //   zext(extract(shuffle b, undef, [u,u,0,4]))
22363   bool IsUndefDeInterleave = false;
22364   if (!IsDeInterleave)
22365     IsUndefDeInterleave =
22366         Shuffle->getOperand(1).isUndef() &&
22367         ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22368             Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22369                                      VT.getVectorNumElements() / 2),
22370             4, Idx);
22371   if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22372     return SDValue();
22373   SDLoc DL(N);
22374   SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22375                             Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22376   SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22377                             Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22378   SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22379                             VT, BC1, BC2);
22380   if ((Idx & 1) == 1)
22381     UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22382                       DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22383   return DAG.getNode(
22384       ISD::AND, DL, VT, UZP,
22385       DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22386 }
22387 
22388 // This comes up similarly to the above when lowering deinterleaving shuffles
22389 // from zexts. In the general case we have legalized the operations to
22390 // zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22391 // the extract is of the low half and the uzp is uzp1. An extra shift is needed
22392 // if the uzp was uzp2, to grab the upper half. Due to the combine above there
22393 // could also be an existing and / shift that can be combined in, either before
22394 // or after the extract.
22395 static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22396   EVT VT = N->getValueType(0);
22397   if (N->getOpcode() != ISD::ZERO_EXTEND ||
22398       (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22399     return SDValue();
22400 
22401   SDValue Op = N->getOperand(0);
22402   unsigned ExtOffset = (unsigned)-1;
22403   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22404     ExtOffset = Op.getConstantOperandVal(1);
22405     Op = Op.getOperand(0);
22406   }
22407 
22408   unsigned Shift = 0;
22409   APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
22410                                     Op.getValueType().getScalarSizeInBits());
22411 
22412   if (Op.getOpcode() == AArch64ISD::VLSHR) {
22413     Shift = Op.getConstantOperandVal(1);
22414     Op = Op.getOperand(0);
22415     Mask = Mask.lshr(Shift);
22416   }
22417   if (Op.getOpcode() == ISD::AND &&
22418       ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
22419     Op = Op.getOperand(0);
22420     Mask = Mask.zext(VT.getScalarSizeInBits());
22421   } else if (Op.getOpcode() == AArch64ISD::BICi) {
22422     Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22423                   Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
22424     Mask = Mask.zext(VT.getScalarSizeInBits());
22425     Op = Op.getOperand(0);
22426   }
22427 
22428   if (ExtOffset == (unsigned)-1) {
22429     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22430       ExtOffset = Op.getConstantOperandVal(1);
22431       Op = Op.getOperand(0);
22432     } else
22433       return SDValue();
22434   }
22435   if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22436     return SDValue();
22437 
22438   if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22439     return SDValue();
22440   if (Op.getOpcode() == AArch64ISD::UZP2)
22441     Shift += VT.getScalarSizeInBits() / 2;
22442 
22443   SDLoc DL(N);
22444   SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22445                            Op.getOperand(ExtOffset == 0 ? 0 : 1));
22446   if (Shift != 0)
22447     BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
22448                      DAG.getConstant(Shift, DL, MVT::i32));
22449   return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
22450 }
22451 
22452 static SDValue performExtendCombine(SDNode *N,
22453                                     TargetLowering::DAGCombinerInfo &DCI,
22454                                     SelectionDAG &DAG) {
22455   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22456   // we can convert that DUP into another extract_high (of a bigger DUP), which
22457   // helps the backend to decide that an sabdl2 would be useful, saving a real
22458   // extract_high operation.
22459   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22460       N->getOperand(0).getValueType().is64BitVector() &&
22461       (N->getOperand(0).getOpcode() == ISD::ABDU ||
22462        N->getOperand(0).getOpcode() == ISD::ABDS)) {
22463     SDNode *ABDNode = N->getOperand(0).getNode();
22464     SDValue NewABD =
22465         tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
22466     if (!NewABD.getNode())
22467       return SDValue();
22468 
22469     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
22470   }
22471 
22472   if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
22473     return R;
22474   if (SDValue R = performZExtUZPCombine(N, DAG))
22475     return R;
22476 
22477   if (N->getValueType(0).isFixedLengthVector() &&
22478       N->getOpcode() == ISD::SIGN_EXTEND &&
22479       N->getOperand(0)->getOpcode() == ISD::SETCC)
22480     return performSignExtendSetCCCombine(N, DCI, DAG);
22481 
22482   // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22483   // that the top half of the result register must be unused, due to the
22484   // any_extend. This means that we can replace this pattern with (rev16
22485   // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22486   // ...)), which is what this pattern would otherwise be lowered to.
22487   // Only apply this optimisation if the any_extend in the original pattern is
22488   // to i32 or i64, because this type will become the input type to REV16 in
22489   // the new pattern, so it must be a legitimate REV16 input type.
22490   SDValue Bswap = N->getOperand(0);
22491   if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22492       Bswap.getValueType() == MVT::i16 &&
22493       (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
22494     SDLoc DL(N);
22495     SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
22496                                        Bswap->getOperand(0));
22497     return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
22498                        NewAnyExtend);
22499   }
22500 
22501   return SDValue();
22502 }
22503 
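      // Replace a store of a splatted vector with NumVecElts scalar stores of the
      // splatted value, which are expected to be merged into store-pair
      // instructions.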
22504 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22505                                SDValue SplatVal, unsigned NumVecElts) {
22506   assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22507   Align OrigAlignment = St.getAlign();
22508   unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22509 
22510   // Create scalar stores. This is at least as good as the code sequence for a
22511   // split unaligned store which is a dup.s, ext.b, and two stores.
22512   // Most of the time the three stores should be replaced by store pair
22513   // instructions (stp).
22514   SDLoc DL(&St);
22515   SDValue BasePtr = St.getBasePtr();
22516   uint64_t BaseOffset = 0;
22517 
22518   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22519   SDValue NewST1 =
22520       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
22521                    OrigAlignment, St.getMemOperand()->getFlags());
22522 
22523   // As this is in ISel, we will not merge this add, which may degrade results.
22524   if (BasePtr->getOpcode() == ISD::ADD &&
22525       isa<ConstantSDNode>(BasePtr->getOperand(1))) {
22526     BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
22527     BasePtr = BasePtr->getOperand(0);
22528   }
22529 
22530   unsigned Offset = EltOffset;
22531   while (--NumVecElts) {
22532     Align Alignment = commonAlignment(OrigAlignment, Offset);
22533     SDValue OffsetPtr =
22534         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22535                     DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
22536     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
22537                           PtrInfo.getWithOffset(Offset), Alignment,
22538                           St.getMemOperand()->getFlags());
22539     Offset += EltOffset;
22540   }
22541   return NewST1;
22542 }
22543 
22544 // Returns an SVE type that ContentTy can be trivially sign or zero extended
22545 // into.
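      // For example (illustrative), nxv4i16 and nxv4f32 are both held in an
      // nxv4i32 container, i.e. one 32-bit container lane per element.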
22546 static MVT getSVEContainerType(EVT ContentTy) {
22547   assert(ContentTy.isSimple() && "No SVE containers for extended types");
22548 
22549   switch (ContentTy.getSimpleVT().SimpleTy) {
22550   default:
22551     llvm_unreachable("No known SVE container for this MVT type");
22552   case MVT::nxv2i8:
22553   case MVT::nxv2i16:
22554   case MVT::nxv2i32:
22555   case MVT::nxv2i64:
22556   case MVT::nxv2f32:
22557   case MVT::nxv2f64:
22558     return MVT::nxv2i64;
22559   case MVT::nxv4i8:
22560   case MVT::nxv4i16:
22561   case MVT::nxv4i32:
22562   case MVT::nxv4f32:
22563     return MVT::nxv4i32;
22564   case MVT::nxv8i8:
22565   case MVT::nxv8i16:
22566   case MVT::nxv8f16:
22567   case MVT::nxv8bf16:
22568     return MVT::nxv8i16;
22569   case MVT::nxv16i8:
22570     return MVT::nxv16i8;
22571   }
22572 }
22573 
22574 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22575   SDLoc DL(N);
22576   EVT VT = N->getValueType(0);
22577 
22578   if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
22579     return SDValue();
22580 
22581   EVT ContainerVT = VT;
22582   if (ContainerVT.isInteger())
22583     ContainerVT = getSVEContainerType(ContainerVT);
22584 
22585   SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
22586   SDValue Ops[] = { N->getOperand(0), // Chain
22587                     N->getOperand(2), // Pg
22588                     N->getOperand(3), // Base
22589                     DAG.getValueType(VT) };
22590 
22591   SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
22592   SDValue LoadChain = SDValue(Load.getNode(), 1);
22593 
22594   if (ContainerVT.isInteger() && (VT != ContainerVT))
22595     Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
22596 
22597   return DAG.getMergeValues({ Load, LoadChain }, DL);
22598 }
22599 
22600 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
22601   SDLoc DL(N);
22602   EVT VT = N->getValueType(0);
22603   EVT PtrTy = N->getOperand(3).getValueType();
22604 
22605   EVT LoadVT = VT;
22606   if (VT.isFloatingPoint())
22607     LoadVT = VT.changeTypeToInteger();
22608 
22609   auto *MINode = cast<MemIntrinsicSDNode>(N);
22610   SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
22611   SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
22612                                 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
22613                                 MINode->getOperand(2), PassThru,
22614                                 MINode->getMemoryVT(), MINode->getMemOperand(),
22615                                 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
22616 
22617   if (VT.isFloatingPoint()) {
22618     SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
22619     return DAG.getMergeValues(Ops, DL);
22620   }
22621 
22622   return L;
22623 }
22624 
22625 template <unsigned Opcode>
22626 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
22627   static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
22628                     Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
22629                 "Unsupported opcode.");
22630   SDLoc DL(N);
22631   EVT VT = N->getValueType(0);
22632 
22633   EVT LoadVT = VT;
22634   if (VT.isFloatingPoint())
22635     LoadVT = VT.changeTypeToInteger();
22636 
22637   SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
22638   SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
22639   SDValue LoadChain = SDValue(Load.getNode(), 1);
22640 
22641   if (VT.isFloatingPoint())
22642     Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
22643 
22644   return DAG.getMergeValues({Load, LoadChain}, DL);
22645 }
22646 
22647 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
22648   SDLoc DL(N);
22649   SDValue Data = N->getOperand(2);
22650   EVT DataVT = Data.getValueType();
22651   EVT HwSrcVt = getSVEContainerType(DataVT);
22652   SDValue InputVT = DAG.getValueType(DataVT);
22653 
22654   if (DataVT.isFloatingPoint())
22655     InputVT = DAG.getValueType(HwSrcVt);
22656 
22657   SDValue SrcNew;
22658   if (Data.getValueType().isFloatingPoint())
22659     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
22660   else
22661     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
22662 
22663   SDValue Ops[] = { N->getOperand(0), // Chain
22664                     SrcNew,
22665                     N->getOperand(4), // Base
22666                     N->getOperand(3), // Pg
22667                     InputVT
22668                   };
22669 
22670   return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
22671 }
22672 
22673 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
22674   SDLoc DL(N);
22675 
22676   SDValue Data = N->getOperand(2);
22677   EVT DataVT = Data.getValueType();
22678   EVT PtrTy = N->getOperand(4).getValueType();
22679 
22680   if (DataVT.isFloatingPoint())
22681     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
22682 
22683   auto *MINode = cast<MemIntrinsicSDNode>(N);
22684   return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
22685                             DAG.getUNDEF(PtrTy), MINode->getOperand(3),
22686                             MINode->getMemoryVT(), MINode->getMemOperand(),
22687                             ISD::UNINDEXED, false, false);
22688 }
22689 
22690 /// Replace a store of a splat of zeros by scalar stores of WZR/XZR.  The
22691 /// load/store optimizer pass will merge them into store pair instructions.
22692 /// This should be better than a movi to create the vector zero followed by a
22693 /// vector store if the zero constant is not re-used, since one instruction and
22694 /// one register live range will be removed.
22695 ///
22696 /// For example, the final generated code should be:
22697 ///
22698 ///   stp xzr, xzr, [x0]
22699 ///
22700 /// instead of:
22701 ///
22702 ///   movi v0.2d, #0
22703 ///   str q0, [x0]
22704 ///
22705 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22706   SDValue StVal = St.getValue();
22707   EVT VT = StVal.getValueType();
22708 
22709   // Avoid scalarizing zero splat stores for scalable vectors.
22710   if (VT.isScalableVector())
22711     return SDValue();
22712 
22713   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
22714   // 2, 3 or 4 i32 elements.
22715   int NumVecElts = VT.getVectorNumElements();
22716   if (!(((NumVecElts == 2 || NumVecElts == 3) &&
22717          VT.getVectorElementType().getSizeInBits() == 64) ||
22718         ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
22719          VT.getVectorElementType().getSizeInBits() == 32)))
22720     return SDValue();
22721 
22722   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
22723     return SDValue();
22724 
22725   // If the zero constant has more than one use then the vector store could be
22726   // better since the constant mov will be amortized and stp q instructions
22727   // should be able to be formed.
22728   if (!StVal.hasOneUse())
22729     return SDValue();
22730 
22731   // If the store is truncating then it's going down to i16 or smaller, which
22732   // means it can be implemented in a single store anyway.
22733   if (St.isTruncatingStore())
22734     return SDValue();
22735 
22736   // If the immediate offset of the address operand is too large for the stp
22737   // instruction, then bail out.
22738   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
22739     int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
22740     if (Offset < -512 || Offset > 504)
22741       return SDValue();
22742   }
22743 
22744   for (int I = 0; I < NumVecElts; ++I) {
22745     SDValue EltVal = StVal.getOperand(I);
22746     if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
22747       return SDValue();
22748   }
22749 
22750   // Use a CopyFromReg WZR/XZR here to prevent
22751   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
22752   SDLoc DL(&St);
22753   unsigned ZeroReg;
22754   EVT ZeroVT;
22755   if (VT.getVectorElementType().getSizeInBits() == 32) {
22756     ZeroReg = AArch64::WZR;
22757     ZeroVT = MVT::i32;
22758   } else {
22759     ZeroReg = AArch64::XZR;
22760     ZeroVT = MVT::i64;
22761   }
22762   SDValue SplatVal =
22763       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
22764   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22765 }
22766 
22767 /// Replace a store of a splatted scalar by scalar stores of the scalar value.
22768 /// The load/store optimizer pass will merge them into store pair instructions.
22769 /// This has better performance than a splat of the scalar followed by a split
22770 /// vector store. Even if the stores are not merged, it is four stores versus a
22771 /// dup followed by an ext.b and two stores.
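      /// For example (illustrative), a v4i32 splat of w1 stored at [x0] should end
      /// up as:
      ///
      ///   stp w1, w1, [x0]
      ///   stp w1, w1, [x0, #8]
      ///
      /// rather than a dup into a vector register followed by a q-register store.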
22772 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22773   SDValue StVal = St.getValue();
22774   EVT VT = StVal.getValueType();
22775 
22776   // Don't replace floating point stores, they possibly won't be transformed to
22777   // stp because of the store pair suppress pass.
22778   if (VT.isFloatingPoint())
22779     return SDValue();
22780 
22781   // We can express a splat as store pair(s) for 2 or 4 elements.
22782   unsigned NumVecElts = VT.getVectorNumElements();
22783   if (NumVecElts != 4 && NumVecElts != 2)
22784     return SDValue();
22785 
22786   // If the store is truncating then it's going down to i16 or smaller, which
22787   // means it can be implemented in a single store anyway.
22788   if (St.isTruncatingStore())
22789     return SDValue();
22790 
22791   // Check that this is a splat.
22792   // Make sure that each of the relevant vector element locations are inserted
22793   // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
22794   std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
22795   SDValue SplatVal;
22796   for (unsigned I = 0; I < NumVecElts; ++I) {
22797     // Check for insert vector elements.
22798     if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
22799       return SDValue();
22800 
22801     // Check that same value is inserted at each vector element.
22802     if (I == 0)
22803       SplatVal = StVal.getOperand(1);
22804     else if (StVal.getOperand(1) != SplatVal)
22805       return SDValue();
22806 
22807     // Check insert element index.
22808     ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
22809     if (!CIndex)
22810       return SDValue();
22811     uint64_t IndexVal = CIndex->getZExtValue();
22812     if (IndexVal >= NumVecElts)
22813       return SDValue();
22814     IndexNotInserted.reset(IndexVal);
22815 
22816     StVal = StVal.getOperand(0);
22817   }
22818   // Check that all vector element locations were inserted to.
22819   if (IndexNotInserted.any())
22820     return SDValue();
22821 
22822   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22823 }
22824 
22825 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22826                            SelectionDAG &DAG,
22827                            const AArch64Subtarget *Subtarget) {
22828 
22829   StoreSDNode *S = cast<StoreSDNode>(N);
22830   if (S->isVolatile() || S->isIndexed())
22831     return SDValue();
22832 
22833   SDValue StVal = S->getValue();
22834   EVT VT = StVal.getValueType();
22835 
22836   if (!VT.isFixedLengthVector())
22837     return SDValue();
22838 
22839   // If we get a splat of zeros, convert this vector store to a store of
22840   // scalars. They will be merged into store pairs of xzr thereby removing one
22841   // instruction and one register.
22842   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
22843     return ReplacedZeroSplat;
22844 
22845   // FIXME: The logic for deciding if an unaligned store should be split should
22846   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
22847   // a call to that function here.
22848 
22849   if (!Subtarget->isMisaligned128StoreSlow())
22850     return SDValue();
22851 
22852   // Don't split at -Oz.
22853   if (DAG.getMachineFunction().getFunction().hasMinSize())
22854     return SDValue();
22855 
22856   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
22857   // those up regresses performance on micro-benchmarks and olden/bh.
22858   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
22859     return SDValue();
22860 
22861   // Split unaligned 16B stores. They are terrible for performance.
22862   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
22863   // extensions can use this to mark that it does not want splitting to happen
22864   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
22865   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
22866   if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
22867       S->getAlign() <= Align(2))
22868     return SDValue();
22869 
22870   // If we get a splat of a scalar, convert this vector store to a store of
22871   // scalars. They will be merged into store pairs thereby removing two
22872   // instructions.
22873   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
22874     return ReplacedSplat;
22875 
22876   SDLoc DL(S);
22877 
22878   // Split VT into two.
22879   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
22880   unsigned NumElts = HalfVT.getVectorNumElements();
22881   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22882                                    DAG.getConstant(0, DL, MVT::i64));
22883   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22884                                    DAG.getConstant(NumElts, DL, MVT::i64));
22885   SDValue BasePtr = S->getBasePtr();
22886   SDValue NewST1 =
22887       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
22888                    S->getAlign(), S->getMemOperand()->getFlags());
22889   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22890                                   DAG.getConstant(8, DL, MVT::i64));
22891   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
22892                       S->getPointerInfo(), S->getAlign(),
22893                       S->getMemOperand()->getFlags());
22894 }
22895 
22896 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22897   assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22898 
22899   // splice(pg, op1, undef) -> op1
22900   if (N->getOperand(2).isUndef())
22901     return N->getOperand(1);
22902 
22903   return SDValue();
22904 }
22905 
22906 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22907                                     const AArch64Subtarget *Subtarget) {
22908   assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22909           N->getOpcode() == AArch64ISD::UUNPKLO) &&
22910          "Unexpected Opcode!");
22911 
22912   // uunpklo/hi undef -> undef
22913   if (N->getOperand(0).isUndef())
22914     return DAG.getUNDEF(N->getValueType(0));
22915 
22916   // If this is a masked load followed by an UUNPKLO, fold this into a masked
22917   // extending load.  We can do this even if this is already a masked
22918   // {z,}extload.
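        // For example (illustrative, using ad-hoc notation):
        //   (nxv2i64 (uunpklo (nxv4i32 (masked_load ptr, (ptrue pat), zero))))
        // can become a single zero-extending masked load
        //   (nxv2i64 (zext_masked_load ptr, (ptrue pat), zero))
        // provided the doubled element size times the pattern's element count
        // still fits within the minimum SVE vector length.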
22919   if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
22920       N->getOpcode() == AArch64ISD::UUNPKLO) {
22921     MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
22922     SDValue Mask = MLD->getMask();
22923     SDLoc DL(N);
22924 
22925     if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22926         SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22927         (MLD->getPassThru()->isUndef() ||
22928          isZerosVector(MLD->getPassThru().getNode()))) {
22929       unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22930       unsigned PgPattern = Mask->getConstantOperandVal(0);
22931       EVT VT = N->getValueType(0);
22932 
22933       // Ensure we can double the size of the predicate pattern
22934       unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22935       if (NumElts &&
22936           NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22937         Mask =
22938             getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
22939         SDValue PassThru = DAG.getConstant(0, DL, VT);
22940         SDValue NewLoad = DAG.getMaskedLoad(
22941             VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
22942             PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
22943             MLD->getAddressingMode(), ISD::ZEXTLOAD);
22944 
22945         DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
22946 
22947         return NewLoad;
22948       }
22949     }
22950   }
22951 
22952   return SDValue();
22953 }
22954 
22955 static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
22956   if (N->getOpcode() != AArch64ISD::UZP1)
22957     return false;
22958   SDValue Op0 = N->getOperand(0);
22959   EVT SrcVT = Op0->getValueType(0);
22960   EVT DstVT = N->getValueType(0);
22961   return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
22962          (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
22963          (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
22964 }
22965 
22966 // Try to combine rounding shifts where the operands come from an extend, and
22967 // the result is truncated and combined into one vector.
22968 //   uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
22969 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
22970   assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
22971   SDValue Op0 = N->getOperand(0);
22972   SDValue Op1 = N->getOperand(1);
22973   EVT ResVT = N->getValueType(0);
22974 
22975   unsigned RshOpc = Op0.getOpcode();
22976   if (RshOpc != AArch64ISD::RSHRNB_I)
22977     return SDValue();
22978 
22979   // Same op code and imm value?
22980   SDValue ShiftValue = Op0.getOperand(1);
22981   if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
22982     return SDValue();
22983 
22984   // Same unextended operand value?
22985   SDValue Lo = Op0.getOperand(0);
22986   SDValue Hi = Op1.getOperand(0);
22987   if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
22988       Hi.getOpcode() != AArch64ISD::UUNPKHI)
22989     return SDValue();
22990   SDValue OrigArg = Lo.getOperand(0);
22991   if (OrigArg != Hi.getOperand(0))
22992     return SDValue();
22993 
22994   SDLoc DL(N);
22995   return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
22996                      getPredicateForVector(DAG, DL, ResVT), OrigArg,
22997                      ShiftValue);
22998 }
22999 
23000 // Try to simplify:
23001 //    t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23002 //    t2 = nxv8i16 srl(t1, ShiftValue)
23003 // to
23004 //    t1 = nxv8i16 rshrnb(X, shiftvalue).
23005 // rshrnb will zero the top half bits of each element. Therefore, this combine
23006 // should only be performed when a following instruction with the rshrnb
23007 // as an operand does not care about the top half of each element. For example,
23008 // a uzp1 or a truncating store.
23009 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23010                                          const AArch64Subtarget *Subtarget) {
23011   EVT VT = Srl->getValueType(0);
23012   if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23013     return SDValue();
23014 
23015   EVT ResVT;
23016   if (VT == MVT::nxv8i16)
23017     ResVT = MVT::nxv16i8;
23018   else if (VT == MVT::nxv4i32)
23019     ResVT = MVT::nxv8i16;
23020   else if (VT == MVT::nxv2i64)
23021     ResVT = MVT::nxv4i32;
23022   else
23023     return SDValue();
23024 
23025   SDLoc DL(Srl);
23026   unsigned ShiftValue;
23027   SDValue RShOperand;
23028   if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23029     return SDValue();
23030   SDValue Rshrnb = DAG.getNode(
23031       AArch64ISD::RSHRNB_I, DL, ResVT,
23032       {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23033   return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23034 }
23035 
23036 static SDValue isNVCastToHalfWidthElements(SDValue V) {
23037   if (V.getOpcode() != AArch64ISD::NVCAST)
23038     return SDValue();
23039 
23040   SDValue Op = V.getOperand(0);
23041   if (V.getValueType().getVectorElementCount() !=
23042       Op.getValueType().getVectorElementCount() * 2)
23043     return SDValue();
23044 
23045   return Op;
23046 }
23047 
23048 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23049                                  const AArch64Subtarget *Subtarget) {
23050   SDLoc DL(N);
23051   SDValue Op0 = N->getOperand(0);
23052   SDValue Op1 = N->getOperand(1);
23053   EVT ResVT = N->getValueType(0);
23054 
23055   // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23056   if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23057       Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23058       Op0.getOperand(0) == Op1.getOperand(0)) {
23059 
23060     SDValue SourceVec = Op0.getOperand(0);
23061     uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23062     uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23063     uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23064     if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23065       EVT OpVT = Op0.getOperand(1).getValueType();
23066       EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23067       SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23068                                 DAG.getUNDEF(WidenedResVT));
23069       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23070                          DAG.getConstant(0, DL, OpVT));
23071     }
23072   }
23073 
23074   // Following optimizations only work with uzp1.
23075   if (N->getOpcode() == AArch64ISD::UZP2)
23076     return SDValue();
23077 
23078   // uzp1(x, undef) -> concat(truncate(x), undef)
23079   if (Op1.getOpcode() == ISD::UNDEF) {
23080     EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23081     switch (ResVT.getSimpleVT().SimpleTy) {
23082     default:
23083       break;
23084     case MVT::v16i8:
23085       BCVT = MVT::v8i16;
23086       HalfVT = MVT::v8i8;
23087       break;
23088     case MVT::v8i16:
23089       BCVT = MVT::v4i32;
23090       HalfVT = MVT::v4i16;
23091       break;
23092     case MVT::v4i32:
23093       BCVT = MVT::v2i64;
23094       HalfVT = MVT::v2i32;
23095       break;
23096     }
23097     if (BCVT != MVT::Other) {
23098       SDValue BC = DAG.getBitcast(BCVT, Op0);
23099       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23100       return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23101                          DAG.getUNDEF(HalfVT));
23102     }
23103   }
23104 
23105   if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23106     return Urshr;
23107 
23108   if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23109     if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23110       Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23111       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23112     }
23113   }
23114 
23115   if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23116     if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23117       Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23118       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23119     }
23120   }
23121 
23122   // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23123   if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23124     if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23125       if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23126         SDValue X = PreCast.getOperand(0).getOperand(0);
23127         return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23128       }
23129     }
23130   }
23131 
23132   // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23133   if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23134     if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23135       if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23136         SDValue Z = PreCast.getOperand(0).getOperand(1);
23137         return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23138       }
23139     }
23140   }
23141 
23142   // These optimizations only work on little endian.
23143   if (!DAG.getDataLayout().isLittleEndian())
23144     return SDValue();
23145 
23146   // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23147   // Example:
23148   // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23149   // to
23150   // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23151   if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23152       Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23153     if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23154       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23155                          Op1.getOperand(0));
23156     }
23157   }
23158 
23159   if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23160     return SDValue();
23161 
23162   SDValue SourceOp0 = peekThroughBitcasts(Op0);
23163   SDValue SourceOp1 = peekThroughBitcasts(Op1);
23164 
23165   // truncating uzp1(x, y) -> xtn(concat (x, y))
23166   if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23167     EVT Op0Ty = SourceOp0.getValueType();
23168     if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23169         (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23170       SDValue Concat =
23171           DAG.getNode(ISD::CONCAT_VECTORS, DL,
23172                       Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23173                       SourceOp0, SourceOp1);
23174       return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23175     }
23176   }
23177 
23178   // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23179   if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23180       SourceOp1.getOpcode() != ISD::TRUNCATE)
23181     return SDValue();
23182   SourceOp0 = SourceOp0.getOperand(0);
23183   SourceOp1 = SourceOp1.getOperand(0);
23184 
23185   if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23186       !SourceOp0.getValueType().isSimple())
23187     return SDValue();
23188 
23189   EVT ResultTy;
23190 
23191   switch (SourceOp0.getSimpleValueType().SimpleTy) {
23192   case MVT::v2i64:
23193     ResultTy = MVT::v4i32;
23194     break;
23195   case MVT::v4i32:
23196     ResultTy = MVT::v8i16;
23197     break;
23198   case MVT::v8i16:
23199     ResultTy = MVT::v16i8;
23200     break;
23201   default:
23202     return SDValue();
23203   }
23204 
23205   SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23206   SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23207   SDValue UzpResult =
23208       DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23209 
23210   EVT BitcastResultTy;
23211 
23212   switch (ResVT.getSimpleVT().SimpleTy) {
23213   case MVT::v2i32:
23214     BitcastResultTy = MVT::v2i64;
23215     break;
23216   case MVT::v4i16:
23217     BitcastResultTy = MVT::v4i32;
23218     break;
23219   case MVT::v8i8:
23220     BitcastResultTy = MVT::v8i16;
23221     break;
23222   default:
23223     llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23224   }
23225 
23226   return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23227                      DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23228 }
23229 
23230 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23231   unsigned Opc = N->getOpcode();
23232 
23233   assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
23234            Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
23235           (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
23236            Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
23237          "Invalid opcode.");
23238 
23239   const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23240                       Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23241   const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23242                       Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23243   const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23244                         Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23245                         Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23246                         Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23247 
23248   SDLoc DL(N);
23249   SDValue Chain = N->getOperand(0);
23250   SDValue Pg = N->getOperand(1);
23251   SDValue Base = N->getOperand(2);
23252   SDValue Offset = N->getOperand(3);
23253   SDValue Ty = N->getOperand(4);
23254 
23255   EVT ResVT = N->getValueType(0);
23256 
23257   const auto OffsetOpc = Offset.getOpcode();
23258   const bool OffsetIsZExt =
23259       OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23260   const bool OffsetIsSExt =
23261       OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23262 
23263   // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
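        // For example (illustrative), a gather whose offsets come through
        //   (sign_extend_inreg_merge_passthru pg, offs, nxvNi32)
        // with the same predicate as the load can instead use the SXTW form of
        // the gather directly on the unextended 32-bit offsets.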
23264   if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23265     SDValue ExtPg = Offset.getOperand(0);
23266     VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23267     EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23268 
23269     // If the predicate for the sign- or zero-extended offset is the
23270     // same as the predicate used for this load and the sign-/zero-extension
23271     // was from 32 bits...
23272     if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23273       SDValue UnextendedOffset = Offset.getOperand(1);
23274 
23275       unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23276       if (Signed)
23277         NewOpc = getSignExtendedGatherOpcode(NewOpc);
23278 
23279       return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23280                          {Chain, Pg, Base, UnextendedOffset, Ty});
23281     }
23282   }
23283 
23284   return SDValue();
23285 }
23286 
23287 /// Optimize a vector shift instruction and its operand if shifted out
23288 /// bits are not used.
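      /// For example (illustrative), for (VLSHR X, 8) on v8i16 only the top 8 bits
      /// of each lane of X feed the result, so the low 8 bits of X are not demanded
      /// and operations that only affect them can be simplified away.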
23289 static SDValue performVectorShiftCombine(SDNode *N,
23290                                          const AArch64TargetLowering &TLI,
23291                                          TargetLowering::DAGCombinerInfo &DCI) {
23292   assert(N->getOpcode() == AArch64ISD::VASHR ||
23293          N->getOpcode() == AArch64ISD::VLSHR);
23294 
23295   SDValue Op = N->getOperand(0);
23296   unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23297 
23298   unsigned ShiftImm = N->getConstantOperandVal(1);
23299   assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23300 
23301   // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
23302   if (N->getOpcode() == AArch64ISD::VASHR &&
23303       Op.getOpcode() == AArch64ISD::VSHL &&
23304       N->getOperand(1) == Op.getOperand(1))
23305     if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23306       return Op.getOperand(0);
23307 
23308   // If the shift is exact, the shifted out bits matter.
23309   if (N->getFlags().hasExact())
23310     return SDValue();
23311 
23312   APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23313   APInt DemandedMask = ~ShiftedOutBits;
23314 
23315   if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23316     return SDValue(N, 0);
23317 
23318   return SDValue();
23319 }
23320 
23321 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23322   // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23323   // This transform works in partnership with performSetCCPunpkCombine to
23324   // remove unnecessary transfer of predicates into standard registers and back
23325   if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23326       N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23327           MVT::i1) {
23328     SDValue CC = N->getOperand(0)->getOperand(0);
23329     auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23330     SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23331                                DAG.getVectorIdxConstant(0, SDLoc(N)));
23332     return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23333   }
23334 
23335   return SDValue();
23336 }
23337 
23338 /// Target-specific DAG combine function for post-increment LD1 (lane) and
23339 /// post-increment LD1R.
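      /// For example (illustrative, ad-hoc pseudo-code), a scalar load that only
      /// feeds a DUP (or a lane insert) and whose address is separately incremented
      /// by the element size:
      ///   x = load p;  v = dup x;  p = add p, #elt-bytes
      /// can be replaced by a single post-incremented LD1R / LD1 (lane) producing
      /// both the vector result and the updated pointer.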
23340 static SDValue performPostLD1Combine(SDNode *N,
23341                                      TargetLowering::DAGCombinerInfo &DCI,
23342                                      bool IsLaneOp) {
23343   if (DCI.isBeforeLegalizeOps())
23344     return SDValue();
23345 
23346   SelectionDAG &DAG = DCI.DAG;
23347   EVT VT = N->getValueType(0);
23348 
23349   if (!VT.is128BitVector() && !VT.is64BitVector())
23350     return SDValue();
23351 
23352   // If it is not a LOAD, we cannot do such a combine.
23353   unsigned LoadIdx = IsLaneOp ? 1 : 0;
23354   LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23355   if (!LD)
23356     return SDValue();
23357 
23358   // If the Generic combiner already helped form a pre- or post-indexed load,
23359   // skip forming one here.
23360   if (LD->isIndexed())
23361     return SDValue();
23362 
23363   // The vector lane must be a constant in the LD1LANE opcode.
23364   SDValue Lane;
23365   if (IsLaneOp) {
23366     Lane = N->getOperand(2);
23367     auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23368     if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23369       return SDValue();
23370   }
23371 
23372   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23373   EVT MemVT = LoadSDN->getMemoryVT();
23374   // Check if memory operand is the same type as the vector element.
23375   if (MemVT != VT.getVectorElementType())
23376     return SDValue();
23377 
23378   // Check if there are other uses. If so, do not combine as it will introduce
23379   // an extra load.
23380   for (SDUse &U : LD->uses()) {
23381     if (U.getResNo() == 1) // Ignore uses of the chain result.
23382       continue;
23383     if (U.getUser() != N)
23384       return SDValue();
23385   }
23386 
23387   // If there is one use and it can splat the value, prefer that operation.
23388   // TODO: This could be expanded to more operations if they reliably use the
23389   // index variants.
23390   if (N->hasOneUse()) {
23391     unsigned UseOpc = N->user_begin()->getOpcode();
23392     if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23393       return SDValue();
23394   }
23395 
23396   SDValue Addr = LD->getOperand(1);
23397   SDValue Vector = N->getOperand(0);
23398   // Search for a use of the address operand that is an increment.
23399   for (SDUse &Use : Addr->uses()) {
23400     SDNode *User = Use.getUser();
23401     if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23402       continue;
23403 
23404     // If the increment is a constant, it must match the memory ref size.
23405     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23406     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23407       uint32_t IncVal = CInc->getZExtValue();
23408       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23409       if (IncVal != NumBytes)
23410         continue;
23411       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23412     }
23413 
23414     // To avoid constructing a cycle, make sure that neither the load nor the
23415     // add is a predecessor of the other or of the Vector.
23416     SmallPtrSet<const SDNode *, 32> Visited;
23417     SmallVector<const SDNode *, 16> Worklist;
23418     Visited.insert(Addr.getNode());
23419     Worklist.push_back(User);
23420     Worklist.push_back(LD);
23421     Worklist.push_back(Vector.getNode());
23422     if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
23423         SDNode::hasPredecessorHelper(User, Visited, Worklist))
23424       continue;
23425 
23426     SmallVector<SDValue, 8> Ops;
23427     Ops.push_back(LD->getOperand(0));  // Chain
23428     if (IsLaneOp) {
23429       Ops.push_back(Vector);           // The vector to be inserted
23430       Ops.push_back(Lane);             // The lane to be inserted in the vector
23431     }
23432     Ops.push_back(Addr);
23433     Ops.push_back(Inc);
23434 
23435     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23436     SDVTList SDTys = DAG.getVTList(Tys);
23437     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23438     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
23439                                            MemVT,
23440                                            LoadSDN->getMemOperand());
23441 
23442     // Update the uses.
23443     SDValue NewResults[] = {
23444         SDValue(LD, 0),            // The result of load
23445         SDValue(UpdN.getNode(), 2) // Chain
23446     };
23447     DCI.CombineTo(LD, NewResults);
23448     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
23449     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
23450 
23451     break;
23452   }
23453   return SDValue();
23454 }
23455 
23456 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23457 /// address translation.
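      /// For example (illustrative), masking off the tag byte before a memory
      /// access, (and addr, 0x00ffffffffffffff), is redundant because only bits
      /// [55:0] are demanded, so the mask can be removed and addr used directly.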
23458 static bool performTBISimplification(SDValue Addr,
23459                                      TargetLowering::DAGCombinerInfo &DCI,
23460                                      SelectionDAG &DAG) {
23461   APInt DemandedMask = APInt::getLowBitsSet(64, 56);
23462   KnownBits Known;
23463   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23464                                         !DCI.isBeforeLegalizeOps());
23465   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23466   if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
23467     DCI.CommitTargetLoweringOpt(TLO);
23468     return true;
23469   }
23470   return false;
23471 }
23472 
23473 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23474   assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23475          "Expected STORE dag node in input!");
23476 
23477   if (auto Store = dyn_cast<StoreSDNode>(N)) {
23478     if (!Store->isTruncatingStore() || Store->isIndexed())
23479       return SDValue();
23480     SDValue Ext = Store->getValue();
23481     auto ExtOpCode = Ext.getOpcode();
23482     if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23483         ExtOpCode != ISD::ANY_EXTEND)
23484       return SDValue();
23485     SDValue Orig = Ext->getOperand(0);
23486     if (Store->getMemoryVT() != Orig.getValueType())
23487       return SDValue();
23488     return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
23489                         Store->getBasePtr(), Store->getMemOperand());
23490   }
23491 
23492   return SDValue();
23493 }
23494 
23495 // A custom combine to lower load <3 x i8> as the more efficient sequence
23496 // below:
23497 //    ldrb wX, [x0, #2]
23498 //    ldrh wY, [x0]
23499 //    orr wX, wY, wX, lsl #16
23500 //    fmov s0, wX
23501 //
23502 // Note that an alternative sequence with even fewer (although usually more
23503 // complex/expensive) instructions would be:
23504 //   ld1r.4h { v0 }, [x0], #2
23505 //   ld1.b { v0 }[2], [x0]
23506 //
23507 // Generating this sequence unfortunately results in noticeably worse codegen
23508 // for code that extends the loaded v3i8, due to legalization breaking vector
23509 // shuffle detection in a way that is very difficult to work around.
23510 // TODO: Revisit once v3i8 legalization has been improved in general.
23511 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23512   EVT MemVT = LD->getMemoryVT();
23513   if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
23514       LD->getOriginalAlign() >= 4)
23515     return SDValue();
23516 
23517   SDLoc DL(LD);
23518   MachineFunction &MF = DAG.getMachineFunction();
23519   SDValue Chain = LD->getChain();
23520   SDValue BasePtr = LD->getBasePtr();
23521   MachineMemOperand *MMO = LD->getMemOperand();
23522   assert(LD->getOffset().isUndef() && "undef offset expected");
23523 
23524   // Load 2 x i8, then 1 x i8.
23525   SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
23526   TypeSize Offset2 = TypeSize::getFixed(2);
23527   SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
23528                            DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
23529                            MF.getMachineMemOperand(MMO, 2, 1));
23530 
23531   // Extend to i32.
23532   SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
23533   SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
23534 
23535   // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23536   SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
23537                             DAG.getConstant(16, DL, MVT::i32));
23538   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
23539   SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
23540 
23541   // Extract v3i8 again.
23542   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
23543                                 DAG.getConstant(0, DL, MVT::i64));
23544   SDValue TokenFactor = DAG.getNode(
23545       ISD::TokenFactor, DL, MVT::Other,
23546       {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
23547   return DAG.getMergeValues({Extract, TokenFactor}, DL);
23548 }
23549 
23550 // Perform TBI simplification if supported by the target and try to break up
23551 // non-temporal loads larger than 256 bits for odd types, so that 256-bit LDNP
23552 // load instructions can be selected.
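      // For example (illustrative), a 320-bit non-temporal v20i16 load can be split
      // into one 256-bit v16i16 load plus a 64-bit v4i16 load; the 256-bit part can
      // then be selected as an LDNP of two Q registers.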
23553 static SDValue performLOADCombine(SDNode *N,
23554                                   TargetLowering::DAGCombinerInfo &DCI,
23555                                   SelectionDAG &DAG,
23556                                   const AArch64Subtarget *Subtarget) {
23557   if (Subtarget->supportsAddressTopByteIgnored())
23558     performTBISimplification(N->getOperand(1), DCI, DAG);
23559 
23560   LoadSDNode *LD = cast<LoadSDNode>(N);
23561   if (LD->isVolatile() || !Subtarget->isLittleEndian())
23562     return SDValue(N, 0);
23563 
23564   if (SDValue Res = combineV3I8LoadExt(LD, DAG))
23565     return Res;
23566 
23567   if (!LD->isNonTemporal())
23568     return SDValue(N, 0);
23569 
23570   EVT MemVT = LD->getMemoryVT();
23571   if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
23572       MemVT.getSizeInBits() % 256 == 0 ||
23573       256 % MemVT.getScalarSizeInBits() != 0)
23574     return SDValue(N, 0);
23575 
23576   SDLoc DL(LD);
23577   SDValue Chain = LD->getChain();
23578   SDValue BasePtr = LD->getBasePtr();
23579   SDNodeFlags Flags = LD->getFlags();
23580   SmallVector<SDValue, 4> LoadOps;
23581   SmallVector<SDValue, 4> LoadOpsChain;
23582   // Replace any non-temporal load over 256 bits with a series of 256-bit loads
23583   // plus a final scalar/vector load of fewer than 256 bits. This way we can
23584   // utilize 256-bit loads and reduce the number of load instructions generated.
23585   MVT NewVT =
23586       MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23587                        256 / MemVT.getVectorElementType().getSizeInBits());
23588   unsigned Num256Loads = MemVT.getSizeInBits() / 256;
23589   // Create all 256-bit loads, starting from offset 0 up to (Num256Loads - 1) * 32 bytes.
23590   for (unsigned I = 0; I < Num256Loads; I++) {
23591     unsigned PtrOffset = I * 32;
23592     SDValue NewPtr = DAG.getMemBasePlusOffset(
23593         BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23594     Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23595     SDValue NewLoad = DAG.getLoad(
23596         NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
23597         NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
23598     LoadOps.push_back(NewLoad);
23599     LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
23600   }
23601 
23602   // Process the remaining bits of the load operation.
23603   // This is done by creating an UNDEF vector to match the size of the
23604   // 256-bit loads and inserting the remaining load into it. We extract the
23605   // original load type at the end using an EXTRACT_SUBVECTOR instruction.
23606   unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
23607   unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
23608   MVT RemainingVT = MVT::getVectorVT(
23609       MemVT.getVectorElementType().getSimpleVT(),
23610       BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
23611   SDValue NewPtr = DAG.getMemBasePlusOffset(
23612       BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23613   Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23614   SDValue RemainingLoad =
23615       DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
23616                   LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
23617                   LD->getMemOperand()->getFlags(), LD->getAAInfo());
23618   SDValue UndefVector = DAG.getUNDEF(NewVT);
23619   SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
23620   SDValue ExtendedRemainingLoad =
23621       DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
23622                   {UndefVector, RemainingLoad, InsertIdx});
23623   LoadOps.push_back(ExtendedRemainingLoad);
23624   LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
23625   EVT ConcatVT =
23626       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
23627                        LoadOps.size() * NewVT.getVectorNumElements());
23628   SDValue ConcatVectors =
23629       DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
23630   // Extract the original vector type size.
23631   SDValue ExtractSubVector =
23632       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
23633                   {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
23634   SDValue TokenFactor =
23635       DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
23636   return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
23637 }
23638 
23639 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
23640   EVT VecVT = Op.getValueType();
23641   assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
23642          "Need boolean vector type.");
23643 
23644   if (Depth > 3)
23645     return MVT::INVALID_SIMPLE_VALUE_TYPE;
23646 
23647   // We can get the base type from a vector compare or truncate.
23648   if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
23649     return Op.getOperand(0).getValueType();
23650 
23651   // If an operand is a bool vector, continue looking.
23652   EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
23653   for (SDValue Operand : Op->op_values()) {
23654     if (Operand.getValueType() != VecVT)
23655       continue;
23656 
23657     EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
23658     if (!BaseVT.isSimple())
23659       BaseVT = OperandVT;
23660     else if (OperandVT != BaseVT)
23661       return MVT::INVALID_SIMPLE_VALUE_TYPE;
23662   }
23663 
23664   return BaseVT;
23665 }
23666 
23667 // When converting a <N x iX> vector to <N x i1> to store or use as a scalar
23668 // iN, we can use a trick that extracts the i^th bit from the i^th element and
23669 // then performs a vector add to get a scalar bitmask. This requires that each
23670 // element's bits are either all 1 or all 0.
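      // For example (illustrative), for a v4i32 comparison result whose lanes are
      // all-ones or all-zeros, AND-ing with the constant vector {1, 2, 4, 8} keeps
      // bit i in lane i, and a horizontal add (VECREDUCE_ADD) then yields the
      // 4-bit scalar mask.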
23671 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
23672   SDLoc DL(N);
23673   SDValue ComparisonResult(N, 0);
23674   EVT VecVT = ComparisonResult.getValueType();
23675   assert(VecVT.isVector() && "Must be a vector type");
23676 
23677   unsigned NumElts = VecVT.getVectorNumElements();
23678   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
23679     return SDValue();
23680 
23681   if (VecVT.getVectorElementType() != MVT::i1 &&
23682       !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
23683     return SDValue();
23684 
23685   // If we can find the original types to work on instead of a vector of i1,
23686   // we can avoid extend/extract conversion instructions.
23687   if (VecVT.getVectorElementType() == MVT::i1) {
23688     VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
23689     if (!VecVT.isSimple()) {
23690       unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
23691       VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
23692     }
23693   }
23694   VecVT = VecVT.changeVectorElementTypeToInteger();
23695 
23696   // Large vectors don't map directly to this conversion, so to avoid too many
23697   // edge cases, we don't apply it here. The conversion will likely still be
23698   // applied later via multiple smaller vectors, whose results are concatenated.
23699   if (VecVT.getSizeInBits() > 128)
23700     return SDValue();
23701 
23702   // Ensure that all elements' bits are either 0s or 1s.
23703   ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
23704 
23705   SmallVector<SDValue, 16> MaskConstants;
23706   if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
23707       VecVT == MVT::v16i8) {
23708     // v16i8 is a special case, as we have 16 entries but only 8 positional bits
23709     // per entry. We split it into two halves, apply the mask, zip the halves to
23710     // create 8x 16-bit values, and then perform the vector reduce.
23711     for (unsigned Half = 0; Half < 2; ++Half) {
23712       for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
23713         MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
23714       }
23715     }
23716     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23717     SDValue RepresentativeBits =
23718         DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23719 
23720     SDValue UpperRepresentativeBits =
23721         DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
23722                     RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
23723     SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
23724                                  RepresentativeBits, UpperRepresentativeBits);
23725     Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
23726     return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
23727   }
23728 
23729   // All other vector sizes.
23730   unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
23731   for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
23732     MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
23733   }
23734 
23735   SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23736   SDValue RepresentativeBits =
23737       DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23738   EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
23739       NumElts, VecVT.getVectorElementType().getSizeInBits()));
23740   return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
23741 }
23742 
23743 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
23744                                                  StoreSDNode *Store) {
23745   if (!Store->isTruncatingStore())
23746     return SDValue();
23747 
23748   SDLoc DL(Store);
23749   SDValue VecOp = Store->getValue();
23750   EVT VT = VecOp.getValueType();
23751   EVT MemVT = Store->getMemoryVT();
23752 
23753   if (!MemVT.isVector() || !VT.isVector() ||
23754       MemVT.getVectorElementType() != MVT::i1)
23755     return SDValue();
23756 
23757   // If we are storing a vector that we are currently building, let
23758   // `scalarizeVectorStore()` handle this more efficiently.
23759   if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
23760     return SDValue();
23761 
23762   VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
23763   SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
23764   if (!VectorBits)
23765     return SDValue();
23766 
23767   EVT StoreVT =
23768       EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
23769   SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
23770   return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
23771                       Store->getMemOperand());
23772 }
23773 
23774 bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
23775   return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
23776          (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
23777          (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
23778 }
23779 
23780 // Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
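      // For example (illustrative), a truncating store of a <3 x i32> value to
      // <3 x i8> memory is emitted as three single-byte stores of lanes 2, 1 and 0
      // (at offsets #2, #1 and #0) of the truncated value, sidestepping the
      // awkward v3i8 store type.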
23781 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
23782                                    const AArch64Subtarget *Subtarget) {
23783   SDValue Value = ST->getValue();
23784   EVT ValueVT = Value.getValueType();
23785 
23786   if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23787       Value.getOpcode() != ISD::TRUNCATE ||
23788       ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
23789     return SDValue();
23790 
23791   assert(ST->getOffset().isUndef() && "undef offset expected");
23792   SDLoc DL(ST);
23793   auto WideVT = EVT::getVectorVT(
23794       *DAG.getContext(),
23795       Value->getOperand(0).getValueType().getVectorElementType(), 4);
23796   SDValue UndefVector = DAG.getUNDEF(WideVT);
23797   SDValue WideTrunc = DAG.getNode(
23798       ISD::INSERT_SUBVECTOR, DL, WideVT,
23799       {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
23800   SDValue Cast = DAG.getNode(
23801       ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
23802       WideTrunc);
23803 
23804   MachineFunction &MF = DAG.getMachineFunction();
23805   SDValue Chain = ST->getChain();
23806   MachineMemOperand *MMO = ST->getMemOperand();
23807   unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
23808   SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23809                            DAG.getConstant(2 * IdxScale, DL, MVT::i64));
23810   TypeSize Offset2 = TypeSize::getFixed(2);
23811   SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
23812   Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
23813 
23814   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23815                            DAG.getConstant(1 * IdxScale, DL, MVT::i64));
23816   TypeSize Offset1 = TypeSize::getFixed(1);
23817   SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
23818   Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
23819 
23820   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23821                            DAG.getConstant(0, DL, MVT::i64));
23822   Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
23823                        MF.getMachineMemOperand(MMO, 0, 1));
23824   return Chain;
23825 }
23826 
23827 static SDValue performSTORECombine(SDNode *N,
23828                                    TargetLowering::DAGCombinerInfo &DCI,
23829                                    SelectionDAG &DAG,
23830                                    const AArch64Subtarget *Subtarget) {
23831   StoreSDNode *ST = cast<StoreSDNode>(N);
23832   SDValue Chain = ST->getChain();
23833   SDValue Value = ST->getValue();
23834   SDValue Ptr = ST->getBasePtr();
23835   EVT ValueVT = Value.getValueType();
23836 
23837   auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
23838     EVT EltVT = VT.getVectorElementType();
23839     return EltVT == MVT::f32 || EltVT == MVT::f64;
23840   };
23841 
23842   if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
23843     return Res;
23844 
23845   // If this is an FP_ROUND followed by a store, fold this into a truncating
23846   // store. We can do this even if this is already a truncstore.
23847   // We purposefully don't care about legality of the nodes here as we know
23848   // they can be split down into something legal.
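  // For example (illustrative, when fixed-length SVE lowering is in use and
  // the vector is at least as wide as the configured minimum SVE vector
  // length): a store of (fp_round <8 x double> %x to <8 x float>) becomes a
  // single truncating store of %x with memory type v8f32.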
23849   if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
23850       Value.getNode()->hasOneUse() && ST->isUnindexed() &&
23851       Subtarget->useSVEForFixedLengthVectors() &&
23852       ValueVT.isFixedLengthVector() &&
23853       ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
23854       hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
23855     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
23856                              ST->getMemoryVT(), ST->getMemOperand());
23857 
23858   if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
23859     return Split;
23860 
23861   if (Subtarget->supportsAddressTopByteIgnored() &&
23862       performTBISimplification(N->getOperand(2), DCI, DAG))
23863     return SDValue(N, 0);
23864 
23865   if (SDValue Store = foldTruncStoreOfExt(DAG, N))
23866     return Store;
23867 
23868   if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
23869     return Store;
23870 
23871   if (ST->isTruncatingStore()) {
23872     EVT StoreVT = ST->getMemoryVT();
23873     if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
23874       return SDValue();
23875     if (SDValue Rshrnb =
23876             trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
23877       return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
23878                                StoreVT, ST->getMemOperand());
23879     }
23880   }
23881 
23882   return SDValue();
23883 }
23884 
23885 static SDValue performMSTORECombine(SDNode *N,
23886                                     TargetLowering::DAGCombinerInfo &DCI,
23887                                     SelectionDAG &DAG,
23888                                     const AArch64Subtarget *Subtarget) {
23889   MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
23890   SDValue Value = MST->getValue();
23891   SDValue Mask = MST->getMask();
23892   SDLoc DL(N);
23893 
23894   // If this is a UZP1 followed by a masked store, fold this into a masked
23895   // truncating store.  We can do this even if this is already a masked
23896   // truncstore.
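  // For example (illustrative): when %x is an nxv8i16 value, the UZP1 of its
  // nxv16i8 bitcast selects the low byte of each i16 element, so the masked
  // store of that UZP1 result can instead be expressed as a truncating masked
  // store of %x itself, with the ptrue predicate widened to nxv8i1.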
23897   if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
23898       MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23899       Value.getValueType().isInteger()) {
23900     Value = Value.getOperand(0);
23901     if (Value.getOpcode() == ISD::BITCAST) {
23902       EVT HalfVT =
23903           Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
23904       EVT InVT = Value.getOperand(0).getValueType();
23905 
23906       if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
23907         unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23908         unsigned PgPattern = Mask->getConstantOperandVal(0);
23909 
23910         // Ensure we can double the size of the predicate pattern
23911         unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23912         if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
23913                            MinSVESize) {
23914           Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
23915                           PgPattern);
23916           return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
23917                                     MST->getBasePtr(), MST->getOffset(), Mask,
23918                                     MST->getMemoryVT(), MST->getMemOperand(),
23919                                     MST->getAddressingMode(),
23920                                     /*IsTruncating=*/true);
23921         }
23922       }
23923     }
23924   }
23925 
23926   if (MST->isTruncatingStore()) {
23927     EVT ValueVT = Value->getValueType(0);
23928     EVT MemVT = MST->getMemoryVT();
23929     if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
23930       return SDValue();
23931     if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
23932       return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
23933                                 MST->getOffset(), MST->getMask(),
23934                                 MST->getMemoryVT(), MST->getMemOperand(),
23935                                 MST->getAddressingMode(), true);
23936     }
23937   }
23938 
23939   return SDValue();
23940 }
23941 
23942 /// \return true if part of the index was folded into the Base.
23943 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
23944                               SDLoc DL, SelectionDAG &DAG) {
23945   // This function assumes a vector of i64 indices.
23946   EVT IndexVT = Index.getValueType();
23947   if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
23948     return false;
23949 
23950   // Simplify:
23951   //   BasePtr = Ptr
23952   //   Index = X + splat(Offset)
23953   // ->
23954   //   BasePtr = Ptr + Offset * scale.
23955   //   Index = X
23956   if (Index.getOpcode() == ISD::ADD) {
23957     if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
23958       Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23959       BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23960       Index = Index.getOperand(0);
23961       return true;
23962     }
23963   }
23964 
23965   // Simplify:
23966   //   BasePtr = Ptr
23967   //   Index = (X + splat(Offset)) << splat(Shift)
23968   // ->
23969   //   BasePtr = Ptr + (Offset << Shift) * scale
23970   //   Index = X << splat(Shift)
23971   if (Index.getOpcode() == ISD::SHL &&
23972       Index.getOperand(0).getOpcode() == ISD::ADD) {
23973     SDValue Add = Index.getOperand(0);
23974     SDValue ShiftOp = Index.getOperand(1);
23975     SDValue OffsetOp = Add.getOperand(1);
23976     if (auto Shift = DAG.getSplatValue(ShiftOp))
23977       if (auto Offset = DAG.getSplatValue(OffsetOp)) {
23978         Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
23979         Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23980         BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23981         Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
23982                             Add.getOperand(0), ShiftOp);
23983         return true;
23984       }
23985   }
23986 
23987   return false;
23988 }
23989 
23990 // Analyse the specified address, returning true if a more optimal addressing
23991 // mode is available. When returning true, all parameters are updated to
23992 // reflect their recommended values.
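// For example (illustrative): a gather whose index is
//   (step_vector 1) + splat(16)
// can fold the splat into the base pointer (BasePtr += 16 * Scale) and keep
// Index = step_vector 1, which may then be narrowed from i64 to i32 when the
// resulting offsets are known to fit.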
23993 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
23994                                      SDValue &BasePtr, SDValue &Index,
23995                                      SelectionDAG &DAG) {
23996   // Try to iteratively fold parts of the index into the base pointer to
23997   // simplify the index as much as possible.
23998   bool Changed = false;
23999   while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24000     Changed = true;
24001 
24002   // Only consider element types that are pointer sized as smaller types can
24003   // be easily promoted.
24004   EVT IndexVT = Index.getValueType();
24005   if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24006     return Changed;
24007 
24008   // Can indices be trivially shrunk?
24009   EVT DataVT = N->getOperand(1).getValueType();
24010   // Don't attempt to shrink the index for fixed vectors of 64-bit data since
24011   // it will later be re-extended to 64 bits in legalization.
24012   if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24013     return Changed;
24014   if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24015     EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24016     Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24017     return true;
24018   }
24019 
24020   // Match:
24021   //   Index = step(const)
24022   int64_t Stride = 0;
24023   if (Index.getOpcode() == ISD::STEP_VECTOR) {
24024     Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24025   }
24026   // Match:
24027   //   Index = step(const) << shift(const)
24028   else if (Index.getOpcode() == ISD::SHL &&
24029            Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24030     SDValue RHS = Index.getOperand(1);
24031     if (auto *Shift =
24032             dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24033       int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24034       Stride = Step << Shift->getZExtValue();
24035     }
24036   }
24037 
24038   // Return early if no supported pattern was found.
24039   if (Stride == 0)
24040     return Changed;
24041 
24042   if (Stride < std::numeric_limits<int32_t>::min() ||
24043       Stride > std::numeric_limits<int32_t>::max())
24044     return Changed;
24045 
24046   const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24047   unsigned MaxVScale =
24048       Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24049   int64_t LastElementOffset =
24050       IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24051 
24052   if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24053       LastElementOffset > std::numeric_limits<int32_t>::max())
24054     return Changed;
24055 
24056   EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24057   // The Stride is not scaled by 'Scale' explicitly here, because that scaling
24058   // happens in the gather/scatter addressing mode.
24059   Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24060   return true;
24061 }
24062 
24063 static SDValue performMaskedGatherScatterCombine(
24064     SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24065   if (!DCI.isBeforeLegalize())
24066     return SDValue();
24067   MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24068 
24069   SDLoc DL(MGS);
24070   SDValue Chain = MGS->getChain();
24071   SDValue Scale = MGS->getScale();
24072   SDValue Index = MGS->getIndex();
24073   SDValue Mask = MGS->getMask();
24074   SDValue BasePtr = MGS->getBasePtr();
24075   ISD::MemIndexType IndexType = MGS->getIndexType();
24076 
24077   if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24078     return SDValue();
24079 
24080   // A more optimal BasePtr/Index pair was found, so rebuild the node with it
24081   // to allow the use of an Index that's more legalisation friendly.
24082   if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24083     SDValue PassThru = MGT->getPassThru();
24084     SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24085     return DAG.getMaskedGather(
24086         DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24087         Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24088   }
24089   if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24090     SDValue Data = MSC->getValue();
24091     SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24092     return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24093                                 DL, Ops, MSC->getMemOperand(), IndexType,
24094                                 MSC->isTruncatingStore());
24095   }
24096   auto *HG = cast<MaskedHistogramSDNode>(MGS);
24097   SDValue Ops[] = {Chain, HG->getInc(), Mask,          BasePtr,
24098                    Index, Scale,        HG->getIntID()};
24099   return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24100                                 DL, Ops, HG->getMemOperand(), IndexType);
24101 }
24102 
24103 /// Target-specific DAG combine function for NEON load/store intrinsics
24104 /// to merge base address updates.
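/// For example (illustrative): an @llvm.aarch64.neon.ld2 of two 4 x i32
/// vectors from [x0] followed by "add x0, x0, #32" can be merged into the
/// post-indexed LD2post form, which writes the incremented address back to x0.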
24105 static SDValue performNEONPostLDSTCombine(SDNode *N,
24106                                           TargetLowering::DAGCombinerInfo &DCI,
24107                                           SelectionDAG &DAG) {
24108   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24109     return SDValue();
24110 
24111   unsigned AddrOpIdx = N->getNumOperands() - 1;
24112   SDValue Addr = N->getOperand(AddrOpIdx);
24113 
24114   // Search for a use of the address operand that is an increment.
24115   for (SDUse &Use : Addr->uses()) {
24116     SDNode *User = Use.getUser();
24117     if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24118       continue;
24119 
24120     // Check that the add is independent of the load/store.  Otherwise, folding
24121     // it would create a cycle.
24122     SmallPtrSet<const SDNode *, 32> Visited;
24123     SmallVector<const SDNode *, 16> Worklist;
24124     Visited.insert(Addr.getNode());
24125     Worklist.push_back(N);
24126     Worklist.push_back(User);
24127     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24128         SDNode::hasPredecessorHelper(User, Visited, Worklist))
24129       continue;
24130 
24131     // Find the new opcode for the updating load/store.
24132     bool IsStore = false;
24133     bool IsLaneOp = false;
24134     bool IsDupOp = false;
24135     unsigned NewOpc = 0;
24136     unsigned NumVecs = 0;
24137     unsigned IntNo = N->getConstantOperandVal(1);
24138     switch (IntNo) {
24139     default: llvm_unreachable("unexpected intrinsic for Neon base update");
24140     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
24141       NumVecs = 2; break;
24142     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
24143       NumVecs = 3; break;
24144     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
24145       NumVecs = 4; break;
24146     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
24147       NumVecs = 2; IsStore = true; break;
24148     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
24149       NumVecs = 3; IsStore = true; break;
24150     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
24151       NumVecs = 4; IsStore = true; break;
24152     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
24153       NumVecs = 2; break;
24154     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
24155       NumVecs = 3; break;
24156     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
24157       NumVecs = 4; break;
24158     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
24159       NumVecs = 2; IsStore = true; break;
24160     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
24161       NumVecs = 3; IsStore = true; break;
24162     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
24163       NumVecs = 4; IsStore = true; break;
24164     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
24165       NumVecs = 2; IsDupOp = true; break;
24166     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
24167       NumVecs = 3; IsDupOp = true; break;
24168     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
24169       NumVecs = 4; IsDupOp = true; break;
24170     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
24171       NumVecs = 2; IsLaneOp = true; break;
24172     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
24173       NumVecs = 3; IsLaneOp = true; break;
24174     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
24175       NumVecs = 4; IsLaneOp = true; break;
24176     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
24177       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24178     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
24179       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24180     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
24181       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24182     }
24183 
24184     EVT VecTy;
24185     if (IsStore)
24186       VecTy = N->getOperand(2).getValueType();
24187     else
24188       VecTy = N->getValueType(0);
24189 
24190     // If the increment is a constant, it must match the memory ref size.
24191     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24192     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24193       uint32_t IncVal = CInc->getZExtValue();
24194       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24195       if (IsLaneOp || IsDupOp)
24196         NumBytes /= VecTy.getVectorNumElements();
24197       if (IncVal != NumBytes)
24198         continue;
24199       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24200     }
24201     SmallVector<SDValue, 8> Ops;
24202     Ops.push_back(N->getOperand(0)); // Incoming chain
24203     // Load-lane and store operations take a vector list as input.
24204     if (IsLaneOp || IsStore)
24205       for (unsigned i = 2; i < AddrOpIdx; ++i)
24206         Ops.push_back(N->getOperand(i));
24207     Ops.push_back(Addr); // Base register
24208     Ops.push_back(Inc);
24209 
24210     // Return Types.
24211     EVT Tys[6];
24212     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24213     unsigned n;
24214     for (n = 0; n < NumResultVecs; ++n)
24215       Tys[n] = VecTy;
24216     Tys[n++] = MVT::i64;  // Type of write back register
24217     Tys[n] = MVT::Other;  // Type of the chain
24218     SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
24219 
24220     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
24221     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
24222                                            MemInt->getMemoryVT(),
24223                                            MemInt->getMemOperand());
24224 
24225     // Update the uses.
24226     std::vector<SDValue> NewResults;
24227     for (unsigned i = 0; i < NumResultVecs; ++i) {
24228       NewResults.push_back(SDValue(UpdN.getNode(), i));
24229     }
24230     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
24231     DCI.CombineTo(N, NewResults);
24232     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
24233 
24234     break;
24235   }
24236   return SDValue();
24237 }
24238 
24239 // Checks whether the value has the prescribed width and returns information
24240 // about its extension mode.
24241 static
24242 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24243   ExtType = ISD::NON_EXTLOAD;
24244   switch(V.getNode()->getOpcode()) {
24245   default:
24246     return false;
24247   case ISD::LOAD: {
24248     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
24249     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24250        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24251       ExtType = LoadNode->getExtensionType();
24252       return true;
24253     }
24254     return false;
24255   }
24256   case ISD::AssertSext: {
24257     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24258     if ((TypeNode->getVT() == MVT::i8 && width == 8)
24259        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24260       ExtType = ISD::SEXTLOAD;
24261       return true;
24262     }
24263     return false;
24264   }
24265   case ISD::AssertZext: {
24266     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24267     if ((TypeNode->getVT() == MVT::i8 && width == 8)
24268        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24269       ExtType = ISD::ZEXTLOAD;
24270       return true;
24271     }
24272     return false;
24273   }
24274   case ISD::Constant:
24275   case ISD::TargetConstant: {
24276     return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
24277            1LL << (width - 1);
24278   }
24279   }
24280 
24281   return true;
24282 }
24283 
24284 // This function does a whole lot of voodoo to determine if the tests are
24285 // equivalent with and without a mask. Essentially what happens is that given a
24286 // DAG resembling:
24287 //
24288 //  +-------------+ +-------------+ +-------------+ +-------------+
24289 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
24290 //  +-------------+ +-------------+ +-------------+ +-------------+
24291 //           |           |           |               |
24292 //           V           V           |    +----------+
24293 //          +-------------+  +----+  |    |
24294 //          |     ADD     |  |0xff|  |    |
24295 //          +-------------+  +----+  |    |
24296 //                  |           |    |    |
24297 //                  V           V    |    |
24298 //                 +-------------+   |    |
24299 //                 |     AND     |   |    |
24300 //                 +-------------+   |    |
24301 //                      |            |    |
24302 //                      +-----+      |    |
24303 //                            |      |    |
24304 //                            V      V    V
24305 //                           +-------------+
24306 //                           |     CMP     |
24307 //                           +-------------+
24308 //
24309 // The AND node may be safely removed for some combinations of inputs. In
24310 // particular we need to take into account the extension type of the Input,
24311 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
24312 // width of the input (this can work for inputs of any width; the above graph is
24313 // specific to 8 bits).
24314 //
24315 // The specific equations were worked out by generating output tables for each
24316 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
24317 // problem was simplified by working with 4 bit inputs, which means we only
24318 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24319 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
24320 // patterns present in both extensions (0,7). For every distinct set of
24321 // AddConstant and CompConstant bit patterns we can consider the masked and
24322 // unmasked versions to be equivalent if the result of this function is true for
24323 // all 16 distinct bit patterns for the current extension type of Input (w0).
24324 //
24325 //   sub      w8, w0, w1
24326 //   and      w10, w8, #0x0f
24327 //   cmp      w8, w2
24328 //   cset     w9, AArch64CC
24329 //   cmp      w10, w2
24330 //   cset     w11, AArch64CC
24331 //   cmp      w9, w11
24332 //   cset     w0, eq
24333 //   ret
24334 //
24335 // Since the above function shows when the outputs are equivalent, it defines
24336 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24337 // would be expensive to run during compiles. The equations below were written
24338 // in a test harness that confirmed they gave outputs equivalent to the above
24339 // function for all inputs, so they can be used to determine whether the removal
24340 // is legal instead.
24341 //
24342 // isEquivalentMaskless() is the test for whether the AND can be removed,
24343 // factored out of the DAG recognition because the DAG can take several forms.
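//
// For example (illustrative): with a zero-extended 8-bit input and
// AddConstant == 0, the ADD leaves the value in the range [0, 255], so the
// AND with 0xff cannot change it and the masked and unmasked comparisons are
// necessarily equivalent.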
24344 
24345 static bool isEquivalentMaskless(unsigned CC, unsigned width,
24346                                  ISD::LoadExtType ExtType, int AddConstant,
24347                                  int CompConstant) {
24348   // By being careful about our equations and only writing them in terms of
24349   // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
24350   // make them generally applicable to all bit widths.
24351   int MaxUInt = (1 << width);
24352 
24353   // For the purposes of these comparisons sign extending the type is
24354   // equivalent to zero extending the add and displacing it by half the integer
24355   // width. Provided we are careful and make sure our equations are valid over
24356   // the whole range we can just adjust the input and avoid writing equations
24357   // for sign extended inputs.
24358   if (ExtType == ISD::SEXTLOAD)
24359     AddConstant -= (1 << (width-1));
24360 
24361   switch(CC) {
24362   case AArch64CC::LE:
24363   case AArch64CC::GT:
24364     if ((AddConstant == 0) ||
24365         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24366         (AddConstant >= 0 && CompConstant < 0) ||
24367         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24368       return true;
24369     break;
24370   case AArch64CC::LT:
24371   case AArch64CC::GE:
24372     if ((AddConstant == 0) ||
24373         (AddConstant >= 0 && CompConstant <= 0) ||
24374         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24375       return true;
24376     break;
24377   case AArch64CC::HI:
24378   case AArch64CC::LS:
24379     if ((AddConstant >= 0 && CompConstant < 0) ||
24380         (AddConstant <= 0 && CompConstant >= -1 &&
24381          CompConstant < AddConstant + MaxUInt))
24382       return true;
24383     break;
24384   case AArch64CC::PL:
24385   case AArch64CC::MI:
24386     if ((AddConstant == 0) ||
24387         (AddConstant > 0 && CompConstant <= 0) ||
24388         (AddConstant < 0 && CompConstant <= AddConstant))
24389       return true;
24390     break;
24391   case AArch64CC::LO:
24392   case AArch64CC::HS:
24393     if ((AddConstant >= 0 && CompConstant <= 0) ||
24394         (AddConstant <= 0 && CompConstant >= 0 &&
24395          CompConstant <= AddConstant + MaxUInt))
24396       return true;
24397     break;
24398   case AArch64CC::EQ:
24399   case AArch64CC::NE:
24400     if ((AddConstant > 0 && CompConstant < 0) ||
24401         (AddConstant < 0 && CompConstant >= 0 &&
24402          CompConstant < AddConstant + MaxUInt) ||
24403         (AddConstant >= 0 && CompConstant >= 0 &&
24404          CompConstant >= AddConstant) ||
24405         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24406       return true;
24407     break;
24408   case AArch64CC::VS:
24409   case AArch64CC::VC:
24410   case AArch64CC::AL:
24411   case AArch64CC::NV:
24412     return true;
24413   case AArch64CC::Invalid:
24414     break;
24415   }
24416 
24417   return false;
24418 }
24419 
24420 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
24421 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
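// For example (illustrative): (x & 0xf0) >u 0xf is equivalent to
// ((x & 0xf0) != 0), because the masked value is a multiple of 16 and is
// therefore either 0 or greater than 0xf; the combine then emits an ANDS with
// 0xf0 and tests the NE condition instead.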
24422 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
24423                                         SDNode *AndNode, SelectionDAG &DAG,
24424                                         unsigned CCIndex, unsigned CmpIndex,
24425                                         unsigned CC) {
24426   ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
24427   if (!SubsC)
24428     return SDValue();
24429 
24430   APInt SubsAP = SubsC->getAPIntValue();
24431   if (CC == AArch64CC::HI) {
24432     if (!SubsAP.isMask())
24433       return SDValue();
24434   } else if (CC == AArch64CC::LO) {
24435     if (!SubsAP.isPowerOf2())
24436       return SDValue();
24437   } else
24438     return SDValue();
24439 
24440   ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
24441   if (!AndC)
24442     return SDValue();
24443 
24444   APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
24445 
24446   SDLoc DL(N);
24447   APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
24448   SDValue ANDS = DAG.getNode(
24449       AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
24450       DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
24451   SDValue AArch64_CC =
24452       DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
24453                       N->getOperand(CCIndex)->getValueType(0));
24454 
24455   // For now, only performCSELCombine and performBRCONDCombine call this
24456   // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
24457   // operands, so just initialize the operands directly to keep the code simple.
24458   // If another caller ever uses a different CCIndex or CmpIndex, this will need
24459   // to be rewritten with a loop.
24460   // TODO: Do we need to assert that the number of operands is 4 here?
24461   assert((CCIndex == 2 && CmpIndex == 3) &&
24462          "Expected CCIndex to be 2 and CmpIndex to be 3.");
24463   SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
24464                    ANDS.getValue(1)};
24465   return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
24466 }
24467 
24468 static
24469 SDValue performCONDCombine(SDNode *N,
24470                            TargetLowering::DAGCombinerInfo &DCI,
24471                            SelectionDAG &DAG, unsigned CCIndex,
24472                            unsigned CmpIndex) {
24473   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
24474   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
24475   unsigned CondOpcode = SubsNode->getOpcode();
24476 
24477   if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
24478       !SubsNode->hasOneUse())
24479     return SDValue();
24480 
24481   // There is a SUBS feeding this condition. Is it fed by a mask we can
24482   // use?
24483 
24484   SDNode *AndNode = SubsNode->getOperand(0).getNode();
24485   unsigned MaskBits = 0;
24486 
24487   if (AndNode->getOpcode() != ISD::AND)
24488     return SDValue();
24489 
24490   if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
24491                                              CmpIndex, CC))
24492     return Val;
24493 
24494   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
24495     uint32_t CNV = CN->getZExtValue();
24496     if (CNV == 255)
24497       MaskBits = 8;
24498     else if (CNV == 65535)
24499       MaskBits = 16;
24500   }
24501 
24502   if (!MaskBits)
24503     return SDValue();
24504 
24505   SDValue AddValue = AndNode->getOperand(0);
24506 
24507   if (AddValue.getOpcode() != ISD::ADD)
24508     return SDValue();
24509 
24510   // The basic DAG structure is correct; grab the inputs and validate them.
24511 
24512   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
24513   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
24514   SDValue SubsInputValue = SubsNode->getOperand(1);
24515 
24516   // The mask is present and all of the values originate from a smaller type,
24517   // so let's see if the mask is superfluous.
24518 
24519   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
24520       !isa<ConstantSDNode>(SubsInputValue.getNode()))
24521     return SDValue();
24522 
24523   ISD::LoadExtType ExtType;
24524 
24525   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
24526       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
24527       !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
24528     return SDValue();
24529 
24530   if (!isEquivalentMaskless(CC, MaskBits, ExtType,
24531                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
24532                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
24533     return SDValue();
24534 
24535   // The AND is not necessary, remove it.
24536 
24537   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
24538                                SubsNode->getValueType(1));
24539   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
24540 
24541   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
24542   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
24543 
24544   return SDValue(N, 0);
24545 }
24546 
24547 // Optimize compare with zero and branch.
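// For example (illustrative): (brcond (ne (subs x, 0)), dest) is folded into
// (cbnz x, dest), and the eq form into (cbz x, dest), provided only the flag
// result of the subs is used.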
24548 static SDValue performBRCONDCombine(SDNode *N,
24549                                     TargetLowering::DAGCombinerInfo &DCI,
24550                                     SelectionDAG &DAG) {
24551   MachineFunction &MF = DAG.getMachineFunction();
24552   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
24553   // will not be produced, as they are conditional branch instructions that do
24554   // not set flags.
24555   if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
24556     return SDValue();
24557 
24558   if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
24559     N = NV.getNode();
24560   SDValue Chain = N->getOperand(0);
24561   SDValue Dest = N->getOperand(1);
24562   SDValue CCVal = N->getOperand(2);
24563   SDValue Cmp = N->getOperand(3);
24564 
24565   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
24566   unsigned CC = CCVal->getAsZExtVal();
24567   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
24568     return SDValue();
24569 
24570   unsigned CmpOpc = Cmp.getOpcode();
24571   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
24572     return SDValue();
24573 
24574   // Only attempt folding if there is only one use of the flag and no use of the
24575   // value.
24576   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
24577     return SDValue();
24578 
24579   SDValue LHS = Cmp.getOperand(0);
24580   SDValue RHS = Cmp.getOperand(1);
24581 
24582   assert(LHS.getValueType() == RHS.getValueType() &&
24583          "Expected the value type to be the same for both operands!");
24584   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
24585     return SDValue();
24586 
24587   if (isNullConstant(LHS))
24588     std::swap(LHS, RHS);
24589 
24590   if (!isNullConstant(RHS))
24591     return SDValue();
24592 
24593   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
24594       LHS.getOpcode() == ISD::SRL)
24595     return SDValue();
24596 
24597   // Fold the compare into the branch instruction.
24598   SDValue BR;
24599   if (CC == AArch64CC::EQ)
24600     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24601   else
24602     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24603 
24604   // Do not add new nodes to DAG combiner worklist.
24605   DCI.CombineTo(N, BR, false);
24606 
24607   return SDValue();
24608 }
24609 
24610 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
24611   unsigned CC = N->getConstantOperandVal(2);
24612   SDValue SUBS = N->getOperand(3);
24613   SDValue Zero, CTTZ;
24614 
24615   if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
24616     Zero = N->getOperand(0);
24617     CTTZ = N->getOperand(1);
24618   } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
24619     Zero = N->getOperand(1);
24620     CTTZ = N->getOperand(0);
24621   } else
24622     return SDValue();
24623 
24624   if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
24625       (CTTZ.getOpcode() == ISD::TRUNCATE &&
24626        CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
24627     return SDValue();
24628 
24629   assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
24630          "Illegal type in CTTZ folding");
24631 
24632   if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
24633     return SDValue();
24634 
24635   SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
24636                   ? CTTZ.getOperand(0).getOperand(0)
24637                   : CTTZ.getOperand(0);
24638 
24639   if (X != SUBS.getOperand(0))
24640     return SDValue();
24641 
24642   unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
24643                           ? CTTZ.getOperand(0).getValueSizeInBits()
24644                           : CTTZ.getValueSizeInBits();
24645   SDValue BitWidthMinusOne =
24646       DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
24647   return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
24648                      BitWidthMinusOne);
24649 }
24650 
24651 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
24652 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
24653 // Where x and y are constants and x != y
24654 
24655 // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
24656 // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
24657 // Where x and y are constants and x != y
24658 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
24659   SDValue L = Op->getOperand(0);
24660   SDValue R = Op->getOperand(1);
24661   AArch64CC::CondCode OpCC =
24662       static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
24663 
24664   SDValue OpCmp = Op->getOperand(3);
24665   if (!isCMP(OpCmp))
24666     return SDValue();
24667 
24668   SDValue CmpLHS = OpCmp.getOperand(0);
24669   SDValue CmpRHS = OpCmp.getOperand(1);
24670 
24671   if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
24672     std::swap(CmpLHS, CmpRHS);
24673   else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
24674     return SDValue();
24675 
24676   SDValue X = CmpLHS->getOperand(0);
24677   SDValue Y = CmpLHS->getOperand(1);
24678   if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
24679     return SDValue();
24680   }
24681 
24682   // If one of the constants is an opaque constant, the X and Y SDNodes can
24683   // still differ while the underlying values are the same, so compare the
24684   // APInt values here to make sure the code is correct.
24685   ConstantSDNode *CX = cast<ConstantSDNode>(X);
24686   ConstantSDNode *CY = cast<ConstantSDNode>(Y);
24687   if (CX->getAPIntValue() == CY->getAPIntValue())
24688     return SDValue();
24689 
24690   AArch64CC::CondCode CC =
24691       static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
24692   SDValue Cond = CmpLHS->getOperand(3);
24693 
24694   if (CmpRHS == Y)
24695     CC = AArch64CC::getInvertedCondCode(CC);
24696   else if (CmpRHS != X)
24697     return SDValue();
24698 
24699   if (OpCC == AArch64CC::NE)
24700     CC = AArch64CC::getInvertedCondCode(CC);
24701   else if (OpCC != AArch64CC::EQ)
24702     return SDValue();
24703 
24704   SDLoc DL(Op);
24705   EVT VT = Op->getValueType(0);
24706 
24707   SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
24708   return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
24709 }
24710 
24711 // Reassociate the true/false expressions of a CSEL instruction to obtain a
24712 // common subexpression with the comparison instruction. For example, change
24713 // (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24714 // (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24715 // subexpression.
24716 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24717   SDValue SubsNode = N->getOperand(3);
24718   if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24719     return SDValue();
24720 
24721   SDValue CmpOpToMatch = SubsNode.getOperand(1);
24722   SDValue CmpOpOther = SubsNode.getOperand(0);
24723   EVT VT = N->getValueType(0);
24724 
24725   unsigned ExpectedOpcode;
24726   SDValue ExpectedOp;
24727   SDValue SubsOp;
24728   auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
24729   if (CmpOpConst) {
24730     ExpectedOpcode = ISD::ADD;
24731     ExpectedOp =
24732         DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24733                         CmpOpConst->getValueType(0));
24734     SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24735                              CmpOpConst->getValueType(0));
24736   } else {
24737     ExpectedOpcode = ISD::SUB;
24738     ExpectedOp = CmpOpToMatch;
24739     SubsOp = CmpOpToMatch;
24740   }
24741 
24742   // Get the operand that can be reassociated with the SUBS instruction.
24743   auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
24744     if (Op.getOpcode() != ExpectedOpcode)
24745       return SDValue();
24746     if (Op.getOperand(0).getOpcode() != ISD::ADD ||
24747         !Op.getOperand(0).hasOneUse())
24748       return SDValue();
24749     SDValue X = Op.getOperand(0).getOperand(0);
24750     SDValue Y = Op.getOperand(0).getOperand(1);
24751     if (X != CmpOpOther)
24752       std::swap(X, Y);
24753     if (X != CmpOpOther)
24754       return SDValue();
24755     if (ExpectedOp != Op.getOperand(1))
24756       return SDValue();
24757     return Y;
24758   };
24759 
24760   // Try the reassociation using the given constant and condition code.
24761   auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
24762                   SDValue SubsOp) {
24763     SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
24764     SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
24765     if (!TReassocOp && !FReassocOp)
24766       return SDValue();
24767 
24768     SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
24769                                  DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
24770 
24771     auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
24772       if (!ReassocOp)
24773         return N->getOperand(OpNum);
24774       SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
24775                                 NewCmp.getValue(0), ReassocOp);
24776       DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
24777       return Res;
24778     };
24779 
24780     SDValue TValReassoc = Reassociate(TReassocOp, 0);
24781     SDValue FValReassoc = Reassociate(FReassocOp, 1);
24782     return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
24783                        DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC),
24784                        NewCmp.getValue(1));
24785   };
24786 
24787   auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24788 
24789   // First, try to eliminate the compare instruction by searching for a
24790   // subtraction with the same constant.
24791   if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
24792     return R;
24793 
24794   if (!CmpOpConst) {
24795     // Try again with the operands of the SUBS instruction and the condition
24796     // swapped. Due to canonicalization, this only helps for non-constant
24797     // operands of the SUBS instruction.
24798     std::swap(CmpOpToMatch, CmpOpOther);
24799     if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
24800       return R;
24801     return SDValue();
24802   }
24803 
24804   if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
24805     return SDValue();
24806 
24807   // Next, search for a subtraction with a slightly different constant. By
24808   // adjusting the condition code, we can still eliminate the compare
24809   // instruction. Adjusting the constant is only valid if it does not result
24810   // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
24811   // Since such comparisons are trivially true/false, we should not encounter
24812   // them here but check for them nevertheless to be on the safe side.
24813   auto CheckedFold = [&](bool Check, APInt NewCmpConst,
24814                          AArch64CC::CondCode NewCC) {
24815     auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
24816                                       CmpOpConst->getValueType(0));
24817     auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
24818                                   CmpOpConst->getValueType(0));
24819     return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
24820   };
24821   switch (CC) {
24822   case AArch64CC::EQ:
24823   case AArch64CC::LS:
24824     return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24825                        CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
24826   case AArch64CC::NE:
24827   case AArch64CC::HI:
24828     return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24829                        CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
24830   case AArch64CC::LO:
24831     return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24832                        CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
24833   case AArch64CC::HS:
24834     return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24835                        CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
24836   case AArch64CC::LT:
24837     return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24838                        CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
24839   case AArch64CC::LE:
24840     return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24841                        CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
24842   case AArch64CC::GT:
24843     return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24844                        CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
24845   case AArch64CC::GE:
24846     return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24847                        CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
24848   default:
24849     return SDValue();
24850   }
24851 }
24852 
24853 // Optimize CSEL instructions
24854 static SDValue performCSELCombine(SDNode *N,
24855                                   TargetLowering::DAGCombinerInfo &DCI,
24856                                   SelectionDAG &DAG) {
24857   // CSEL x, x, cc -> x
24858   if (N->getOperand(0) == N->getOperand(1))
24859     return N->getOperand(0);
24860 
24861   if (SDValue R = foldCSELOfCSEL(N, DAG))
24862     return R;
24863 
24864   // Try to reassociate the true/false expressions so that we can do CSE with
24865   // a SUBS instruction used to perform the comparison.
24866   if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
24867     return R;
24868 
24869   // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
24870   // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
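  // The rationale (as a sketch): AArch64 lowers cttz as rbit + clz, which
  // yields the bit width for a zero input, and bitwidth & (bitwidth - 1) == 0
  // matches the zero the CSEL would otherwise select.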
24871   if (SDValue Folded = foldCSELofCTTZ(N, DAG))
24872     return Folded;
24873 
24874   // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
24875   // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
24876   SDValue Cond = N->getOperand(3);
24877   if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
24878       Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
24879       DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24880                         {Cond.getOperand(1), Cond.getOperand(0)}) &&
24881       !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24882                          {Cond.getOperand(0), Cond.getOperand(1)}) &&
24883       !isNullConstant(Cond.getOperand(1))) {
24884     AArch64CC::CondCode OldCond =
24885         static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24886     AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
24887     if (NewCond != AArch64CC::AL) {
24888       SDLoc DL(N);
24889       SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
24890                                 Cond.getOperand(1), Cond.getOperand(0));
24891       return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
24892                          N->getOperand(1),
24893                          DAG.getConstant(NewCond, DL, MVT::i32),
24894                          Sub.getValue(1));
24895     }
24896   }
24897 
24898   return performCONDCombine(N, DCI, DAG, 2, 3);
24899 }
24900 
24901 // Try to re-use an already extended operand of a vector SetCC feeding an
24902 // extended select. Doing so avoids requiring another full extension of the
24903 // SET_CC result when lowering the select.
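// For example (illustrative): if (sext v8i16 %x to v8i32) already exists in
// the DAG, then (vselect (setcc v8i16 %x, splat(C)), v8i32 %a, v8i32 %b) can
// perform the setcc on the v8i32 extended operands instead, so the i1 result
// no longer needs to be widened separately.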
24904 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
24905   EVT Op0MVT = Op->getOperand(0).getValueType();
24906   if (!Op0MVT.isVector() || Op->use_empty())
24907     return SDValue();
24908 
24909   // Make sure that all uses of Op are VSELECTs with matching result types where
24910   // the result type has a larger element type than the SetCC operand.
24911   SDNode *FirstUse = *Op->user_begin();
24912   if (FirstUse->getOpcode() != ISD::VSELECT)
24913     return SDValue();
24914   EVT UseMVT = FirstUse->getValueType(0);
24915   if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
24916     return SDValue();
24917   if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
24918         return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
24919       }))
24920     return SDValue();
24921 
24922   APInt V;
24923   if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
24924     return SDValue();
24925 
24926   SDLoc DL(Op);
24927   SDValue Op0ExtV;
24928   SDValue Op1ExtV;
24929   ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
24930   // Check if the first operand of the SET_CC is already extended. If it is,
24931   // split the SET_CC and re-use the extended version of the operand.
24932   SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
24933                                         Op->getOperand(0));
24934   SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
24935                                         Op->getOperand(0));
24936   if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
24937     Op0ExtV = SDValue(Op0SExt, 0);
24938     Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
24939   } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
24940     Op0ExtV = SDValue(Op0ZExt, 0);
24941     Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
24942   } else
24943     return SDValue();
24944 
24945   return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
24946                      Op0ExtV, Op1ExtV, Op->getOperand(2));
24947 }
24948 
24949 static SDValue
24950 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24951                                SelectionDAG &DAG) {
24952   SDValue Vec = N->getOperand(0);
24953   if (DCI.isBeforeLegalize() &&
24954       Vec.getValueType().getVectorElementType() == MVT::i1 &&
24955       Vec.getValueType().isFixedLengthVector() &&
24956       Vec.getValueType().isPow2VectorType()) {
24957     SDLoc DL(N);
24958     return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
24959                                   DAG);
24960   }
24961 
24962   return SDValue();
24963 }
24964 
24965 static SDValue performSETCCCombine(SDNode *N,
24966                                    TargetLowering::DAGCombinerInfo &DCI,
24967                                    SelectionDAG &DAG) {
24968   assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
24969   SDValue LHS = N->getOperand(0);
24970   SDValue RHS = N->getOperand(1);
24971   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
24972   SDLoc DL(N);
24973   EVT VT = N->getValueType(0);
24974 
24975   if (SDValue V = tryToWidenSetCCOperands(N, DAG))
24976     return V;
24977 
24978   // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
24979   if (Cond == ISD::SETNE && isOneConstant(RHS) &&
24980       LHS->getOpcode() == AArch64ISD::CSEL &&
24981       isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
24982       LHS->hasOneUse()) {
24983     // Invert CSEL's condition.
24984     auto OldCond =
24985         static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
24986     auto NewCond = getInvertedCondCode(OldCond);
24987 
24988     // csel 0, 1, !cond, X
24989     SDValue CSEL =
24990         DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
24991                     LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
24992                     LHS.getOperand(3));
24993     return DAG.getZExtOrTrunc(CSEL, DL, VT);
24994   }
24995 
24996   // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
24997   if (Cond == ISD::SETNE && isNullConstant(RHS) &&
24998       LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
24999       LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
25000       LHS->hasOneUse()) {
25001     EVT TstVT = LHS->getValueType(0);
25002     if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
25003       // This pattern gets optimized better in emitComparison.
25004       uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25005       SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25006                                 DAG.getSignedConstant(TstImm, DL, TstVT));
25007       return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25008     }
25009   }
25010 
25011   // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25012   //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25013   // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25014   //   ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
25015   if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25016       (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25017       (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
25018       LHS->getOpcode() == ISD::BITCAST) {
25019     EVT ToVT = LHS->getValueType(0);
25020     EVT FromVT = LHS->getOperand(0).getValueType();
25021     if (FromVT.isFixedLengthVector() &&
25022         FromVT.getVectorElementType() == MVT::i1) {
25023       bool IsNull = isNullConstant(RHS);
25024       LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25025                         DL, MVT::i1, LHS->getOperand(0));
25026       LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25027                         LHS);
25028       return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25029     }
25030   }
25031 
25032   // Try to perform the memcmp when the result is tested for [in]equality with 0
25033   if (SDValue V = performOrXorChainCombine(N, DAG))
25034     return V;
25035 
25036   return SDValue();
25037 }
25038 
25039 // Replace a flag-setting operator (eg ANDS) with the generic version
25040 // (eg AND) if the flag is unused.
25041 static SDValue performFlagSettingCombine(SDNode *N,
25042                                          TargetLowering::DAGCombinerInfo &DCI,
25043                                          unsigned GenericOpcode) {
25044   SDLoc DL(N);
25045   SDValue LHS = N->getOperand(0);
25046   SDValue RHS = N->getOperand(1);
25047   EVT VT = N->getValueType(0);
25048 
25049   // If the flag result isn't used, convert back to a generic opcode.
25050   if (!N->hasAnyUseOfValue(1)) {
25051     SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
25052     return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
25053                                   DL);
25054   }
25055 
25056   // Combine identical generic nodes into this node, re-using the result.
25057   if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25058           GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
25059     DCI.CombineTo(Generic, SDValue(N, 0));
25060 
25061   return SDValue();
25062 }
25063 
25064 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25065   // setcc_merge_zero pred
25066   //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25067   //   => extract_subvector (inner setcc_merge_zero)
25068   SDValue Pred = N->getOperand(0);
25069   SDValue LHS = N->getOperand(1);
25070   SDValue RHS = N->getOperand(2);
25071   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25072 
25073   if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
25074       LHS->getOpcode() != ISD::SIGN_EXTEND)
25075     return SDValue();
25076 
25077   SDValue Extract = LHS->getOperand(0);
25078   if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25079       Extract->getValueType(0) != N->getValueType(0) ||
25080       Extract->getConstantOperandVal(1) != 0)
25081     return SDValue();
25082 
25083   SDValue InnerSetCC = Extract->getOperand(0);
25084   if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25085     return SDValue();
25086 
25087   // By this point we've effectively got
25088   // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25089   // lanes are already zero then the trunc(sext()) sequence is redundant and we
25090   // can operate on A directly.
25091   SDValue InnerPred = InnerSetCC.getOperand(0);
25092   if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25093       InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25094       Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
25095       Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
25096       Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
25097     return Extract;
25098 
25099   return SDValue();
25100 }
25101 
25102 static SDValue
25103 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25104   assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25105          "Unexpected opcode!");
25106 
25107   SelectionDAG &DAG = DCI.DAG;
25108   SDValue Pred = N->getOperand(0);
25109   SDValue LHS = N->getOperand(1);
25110   SDValue RHS = N->getOperand(2);
25111   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25112 
25113   if (SDValue V = performSetCCPunpkCombine(N, DAG))
25114     return V;
25115 
25116   if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
25117       LHS->getOpcode() == ISD::SIGN_EXTEND &&
25118       LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
25119     //    setcc_merge_zero(
25120     //       pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25121     // => setcc_merge_zero(pred, ...)
25122     if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25123         LHS->getOperand(0)->getOperand(0) == Pred)
25124       return LHS->getOperand(0);
25125 
25126     //    setcc_merge_zero(
25127     //        all_active, extend(nxvNi1 ...), != splat(0))
25128     // -> nxvNi1 ...
25129     if (isAllActivePredicate(DAG, Pred))
25130       return LHS->getOperand(0);
25131 
25132     //    setcc_merge_zero(
25133     //        pred, extend(nxvNi1 ...), != splat(0))
25134     // -> nxvNi1 and(pred, ...)
25135     if (DCI.isAfterLegalizeDAG())
25136       // Do this after legalization to allow more folds on setcc_merge_zero
25137       // to be recognized.
25138       return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
25139                          LHS->getOperand(0), Pred);
25140   }
25141 
25142   return SDValue();
25143 }
25144 
25145 // Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
25146 // as well as whether the test should be inverted.  This code is required to
25147 // catch these cases (as opposed to standard DAG combines) because
25148 // AArch64ISD::TBZ is matched during legalization.
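// For example, (tbz (and (srl x, 3), 1), 0) is reduced to (tbz x, 3) by
// walking through the AND and SRL cases below.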
25149 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25150                                  SelectionDAG &DAG) {
25151 
25152   if (!Op->hasOneUse())
25153     return Op;
25154 
25155   // We don't handle undef/constant-fold cases below, as they should have
25156   // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25157   // etc.)
25158 
25159   // (tbz (trunc x), b) -> (tbz x, b)
25160   // This case is just here to enable more of the below cases to be caught.
25161   if (Op->getOpcode() == ISD::TRUNCATE &&
25162       Bit < Op->getValueType(0).getSizeInBits()) {
25163     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25164   }
25165 
25166   // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25167   if (Op->getOpcode() == ISD::ANY_EXTEND &&
25168       Bit < Op->getOperand(0).getValueSizeInBits()) {
25169     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25170   }
25171 
25172   if (Op->getNumOperands() != 2)
25173     return Op;
25174 
25175   auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
25176   if (!C)
25177     return Op;
25178 
25179   switch (Op->getOpcode()) {
25180   default:
25181     return Op;
25182 
25183   // (tbz (and x, m), b) -> (tbz x, b)
25184   case ISD::AND:
25185     if ((C->getZExtValue() >> Bit) & 1)
25186       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25187     return Op;
25188 
25189   // (tbz (shl x, c), b) -> (tbz x, b-c)
25190   case ISD::SHL:
25191     if (C->getZExtValue() <= Bit &&
25192         (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25193       Bit = Bit - C->getZExtValue();
25194       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25195     }
25196     return Op;
25197 
25198   // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25199   case ISD::SRA:
25200     Bit = Bit + C->getZExtValue();
25201     if (Bit >= Op->getValueType(0).getSizeInBits())
25202       Bit = Op->getValueType(0).getSizeInBits() - 1;
25203     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25204 
25205   // (tbz (srl x, c), b) -> (tbz x, b+c)
25206   case ISD::SRL:
25207     if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25208       Bit = Bit + C->getZExtValue();
25209       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25210     }
25211     return Op;
25212 
25213   // (tbz (xor x, -1), b) -> (tbnz x, b)
25214   case ISD::XOR:
25215     if ((C->getZExtValue() >> Bit) & 1)
25216       Invert = !Invert;
25217     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25218   }
25219 }
25220 
25221 // Optimize test single bit zero/non-zero and branch.
25222 static SDValue performTBZCombine(SDNode *N,
25223                                  TargetLowering::DAGCombinerInfo &DCI,
25224                                  SelectionDAG &DAG) {
25225   unsigned Bit = N->getConstantOperandVal(2);
25226   bool Invert = false;
25227   SDValue TestSrc = N->getOperand(1);
25228   SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
25229 
25230   if (TestSrc == NewTestSrc)
25231     return SDValue();
25232 
25233   unsigned NewOpc = N->getOpcode();
25234   if (Invert) {
25235     if (NewOpc == AArch64ISD::TBZ)
25236       NewOpc = AArch64ISD::TBNZ;
25237     else {
25238       assert(NewOpc == AArch64ISD::TBNZ);
25239       NewOpc = AArch64ISD::TBZ;
25240     }
25241   }
25242 
25243   SDLoc DL(N);
25244   return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
25245                      DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
25246 }
25247 
25248 // Swap vselect operands where doing so may allow a predicated operation to
25249 // perform the `sel` itself.
25250 //
25251 //     (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
25252 //  => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
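// Having the arithmetic result in the "true" position means it can be lowered
// as a predicated operation whose inactive lanes keep (a), so no separate
// select is required.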
25253 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
25254   auto SelectA = N->getOperand(1);
25255   auto SelectB = N->getOperand(2);
25256   auto NTy = N->getValueType(0);
25257 
25258   if (!NTy.isScalableVector())
25259     return SDValue();
25260   SDValue SetCC = N->getOperand(0);
25261   if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25262     return SDValue();
25263 
25264   switch (SelectB.getOpcode()) {
25265   default:
25266     return SDValue();
25267   case ISD::FMUL:
25268   case ISD::FSUB:
25269   case ISD::FADD:
25270     break;
25271   }
25272   if (SelectA != SelectB.getOperand(0))
25273     return SDValue();
25274 
25275   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
25276   ISD::CondCode InverseCC =
25277       ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
25278   auto InverseSetCC =
25279       DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
25280                    SetCC.getOperand(1), InverseCC);
25281 
25282   return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
25283                      {InverseSetCC, SelectB, SelectA});
25284 }
25285 
25286 // vselect (v1i1 setcc) ->
25287 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
25288 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
25289 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
25290 // such VSELECT.
25291 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
25292   if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25293     return SwapResult;
25294 
25295   SDValue N0 = N->getOperand(0);
25296   EVT CCVT = N0.getValueType();
25297 
25298   if (isAllActivePredicate(DAG, N0))
25299     return N->getOperand(1);
25300 
25301   if (isAllInactivePredicate(N0))
25302     return N->getOperand(2);
25303 
25304   // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
25305   // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
25306   // supported types.
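  // For example, (vselect (setgt x, splat(-1)), splat(1), splat(-1)) becomes
  // (or (sra x, eltbits - 1), splat(1)).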
25307   SDValue SetCC = N->getOperand(0);
25308   if (SetCC.getOpcode() == ISD::SETCC &&
25309       SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
25310     SDValue CmpLHS = SetCC.getOperand(0);
25311     EVT VT = CmpLHS.getValueType();
25312     SDNode *CmpRHS = SetCC.getOperand(1).getNode();
25313     SDNode *SplatLHS = N->getOperand(1).getNode();
25314     SDNode *SplatRHS = N->getOperand(2).getNode();
25315     APInt SplatLHSVal;
25316     if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
25317         VT.isSimple() &&
25318         is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
25319                                MVT::v2i32, MVT::v4i32, MVT::v2i64}),
25320                      VT.getSimpleVT().SimpleTy) &&
25321         ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
25322         SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
25323         ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
25324       unsigned NumElts = VT.getVectorNumElements();
25325       SmallVector<SDValue, 8> Ops(
25326           NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
25327                                    VT.getScalarType()));
25328       SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
25329 
25330       auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
25331       auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
25332       return Or;
25333     }
25334   }
25335 
25336   EVT CmpVT = N0.getOperand(0).getValueType();
25337   if (N0.getOpcode() != ISD::SETCC ||
25338       CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
25339       CCVT.getVectorElementType() != MVT::i1 ||
25340       CmpVT.getVectorElementType().isFloatingPoint())
25341     return SDValue();
25342 
25343   EVT ResVT = N->getValueType(0);
25344   // Only combine when the result type is of the same size as the compared
25345   // operands.
25346   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
25347     return SDValue();
25348 
25349   SDValue IfTrue = N->getOperand(1);
25350   SDValue IfFalse = N->getOperand(2);
25351   SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
25352                        N0.getOperand(0), N0.getOperand(1),
25353                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
25354   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
25355                      IfTrue, IfFalse);
25356 }
25357 
25358 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
25359 /// the compare-mask instructions rather than going via NZCV, even if LHS and
25360 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
25361 /// with a vector one followed by a DUP shuffle on the result.
25362 static SDValue performSelectCombine(SDNode *N,
25363                                     TargetLowering::DAGCombinerInfo &DCI) {
25364   SelectionDAG &DAG = DCI.DAG;
25365   SDValue N0 = N->getOperand(0);
25366   EVT ResVT = N->getValueType(0);
25367 
25368   if (N0.getOpcode() != ISD::SETCC)
25369     return SDValue();
25370 
25371   if (ResVT.isScalableVT())
25372     return SDValue();
25373 
25374   // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
25375   // scalar SetCCResultType. We also don't expect vectors, because we assume
25376   // that selects fed by vector SETCCs are canonicalized to VSELECT.
25377   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
25378          "Scalar-SETCC feeding SELECT has unexpected result type!");
25379 
25380   // If NumMaskElts == 0, the comparison is larger than the select result. The
25381   // largest real NEON comparison is 64 bits per lane, which means the result is
25382   // at most 32 bits and an illegal vector. Just bail out for now.
25383   EVT SrcVT = N0.getOperand(0).getValueType();
25384 
25385   // Don't try to do this optimization when the setcc itself has i1 operands.
25386   // There are no legal vectors of i1, so this would be pointless. v1f16 is
25387   // ruled out to prevent the creation of setccs that need to be scalarized.
25388   if (SrcVT == MVT::i1 ||
25389       (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
25390     return SDValue();
25391 
25392   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
25393   if (!ResVT.isVector() || NumMaskElts == 0)
25394     return SDValue();
25395 
25396   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
25397   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
25398 
25399   // Also bail out if the vector CCVT isn't the same size as ResVT.
25400   // This can happen if the SETCC operand size doesn't divide the ResVT size
25401   // (e.g., f64 vs v3f32).
25402   if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
25403     return SDValue();
25404 
25405   // Make sure we didn't create illegal types, if we're not supposed to.
25406   assert(DCI.isBeforeLegalize() ||
25407          DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
25408 
25409   // First perform a vector comparison, where lane 0 is the one we're interested
25410   // in.
25411   SDLoc DL(N0);
25412   SDValue LHS =
25413       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
25414   SDValue RHS =
25415       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
25416   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
25417 
25418   // Now duplicate the comparison mask we want across all other lanes.
25419   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
25420   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
25421   Mask = DAG.getNode(ISD::BITCAST, DL,
25422                      ResVT.changeVectorElementTypeToInteger(), Mask);
25423 
25424   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
25425 }
25426 
25427 static SDValue performDUPCombine(SDNode *N,
25428                                  TargetLowering::DAGCombinerInfo &DCI) {
25429   EVT VT = N->getValueType(0);
25430   SDLoc DL(N);
25431   // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
25432   // 128-bit vector version.
25433   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
25434     EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
25435     SmallVector<SDValue> Ops(N->ops());
25436     if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
25437                                              DCI.DAG.getVTList(LVT), Ops)) {
25438       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
25439                              DCI.DAG.getConstant(0, DL, MVT::i64));
25440     }
25441   }
25442 
25443   if (N->getOpcode() == AArch64ISD::DUP) {
25444     if (DCI.isAfterLegalizeDAG()) {
25445       // If scalar dup's operand is extract_vector_elt, try to combine them into
25446       // duplane. For example,
25447       //
25448       //    t21: i32 = extract_vector_elt t19, Constant:i64<0>
25449       //  t18: v4i32 = AArch64ISD::DUP t21
25450       //  ==>
25451       //  t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
25452       SDValue EXTRACT_VEC_ELT = N->getOperand(0);
25453       if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25454         if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
25455           unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
25456           return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
25457                                  EXTRACT_VEC_ELT.getOperand(1));
25458         }
25459       }
25460     }
25461 
25462     return performPostLD1Combine(N, DCI, false);
25463   }
25464 
25465   return SDValue();
25466 }
25467 
25468 /// Get rid of unnecessary NVCASTs (that don't change the type).
25469 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
25470   if (N->getValueType(0) == N->getOperand(0).getValueType())
25471     return N->getOperand(0);
25472   if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
25473     return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
25474                        N->getOperand(0).getOperand(0));
25475 
25476   return SDValue();
25477 }
25478 
25479 // If all users of the globaladdr are of the form (globaladdr + constant), find
25480 // the smallest constant, fold it into the globaladdr's offset and rewrite the
25481 // globaladdr as (globaladdr + constant) - constant.
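// For example, if the users are (add g, 8) and (add g, 12), g is rewritten as
// ((g + 8) - 8); later combines then fold the users to (g + 8) and
// (add (g + 8), 4).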
25482 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
25483                                            const AArch64Subtarget *Subtarget,
25484                                            const TargetMachine &TM) {
25485   auto *GN = cast<GlobalAddressSDNode>(N);
25486   if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
25487       AArch64II::MO_NO_FLAG)
25488     return SDValue();
25489 
25490   uint64_t MinOffset = -1ull;
25491   for (SDNode *N : GN->users()) {
25492     if (N->getOpcode() != ISD::ADD)
25493       return SDValue();
25494     auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
25495     if (!C)
25496       C = dyn_cast<ConstantSDNode>(N->getOperand(1));
25497     if (!C)
25498       return SDValue();
25499     MinOffset = std::min(MinOffset, C->getZExtValue());
25500   }
25501   uint64_t Offset = MinOffset + GN->getOffset();
25502 
25503   // Require that the new offset is larger than the existing one. Otherwise, we
25504   // can end up oscillating between two possible DAGs, for example,
25505   // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
25506   if (Offset <= uint64_t(GN->getOffset()))
25507     return SDValue();
25508 
25509   // Check whether folding this offset is legal. It must not go out of bounds of
25510   // the referenced object to avoid violating the code model, and must be
25511   // smaller than 2^20 because this is the largest offset expressible in all
25512   // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
25513   // stores an immediate signed 21 bit offset.)
25514   //
25515   // This check also prevents us from folding negative offsets, which will end
25516   // up being treated in the same way as large positive ones. They could also
25517   // cause code model violations, and aren't really common enough to matter.
25518   if (Offset >= (1 << 20))
25519     return SDValue();
25520 
25521   const GlobalValue *GV = GN->getGlobal();
25522   Type *T = GV->getValueType();
25523   if (!T->isSized() ||
25524       Offset > GV->getDataLayout().getTypeAllocSize(T))
25525     return SDValue();
25526 
25527   SDLoc DL(GN);
25528   SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
25529   return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
25530                      DAG.getConstant(MinOffset, DL, MVT::i64));
25531 }
25532 
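// With CSSC, fold (ctlz (bitreverse x)) -> (cttz x) so that a single CTZ
// instruction can be used instead of an RBIT + CLZ pair.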
25533 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
25534                                   const AArch64Subtarget *Subtarget) {
25535   SDValue BR = N->getOperand(0);
25536   if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
25537       !BR.getValueType().isScalarInteger())
25538     return SDValue();
25539 
25540   SDLoc DL(N);
25541   return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
25542 }
25543 
25544 // Turns the vector of indices into a vector of byte offsets by scaling Offset
25545 // by (BitWidth / 8).
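// For example, with 64-bit elements each index is shifted left by 3, i.e.
// multiplied by 8 bytes.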
25546 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
25547                                           SDLoc DL, unsigned BitWidth) {
25548   assert(Offset.getValueType().isScalableVector() &&
25549          "This method is only for scalable vectors of offsets");
25550 
25551   SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
25552   SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
25553 
25554   return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
25555 }
25556 
25557 /// Check if the value of \p OffsetInBytes can be used as an immediate for
25558 /// the gather load/prefetch and scatter store instructions with vector base and
25559 /// immediate offset addressing mode:
25560 ///
25561 ///      [<Zn>.[S|D]{, #<imm>}]
25562 ///
25563 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
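/// For example, with 4-byte elements the valid immediates are 0, 4, 8, ..., 124.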
25564 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
25565                                                   unsigned ScalarSizeInBytes) {
25566   // The immediate is not a multiple of the scalar size.
25567   if (OffsetInBytes % ScalarSizeInBytes)
25568     return false;
25569 
25570   // The immediate is out of range.
25571   if (OffsetInBytes / ScalarSizeInBytes > 31)
25572     return false;
25573 
25574   return true;
25575 }
25576 
25577 /// Check if the value of \p Offset represents a valid immediate for the SVE
25578 /// gather load/prefetch and scatter store instructions with vector base and
25579 /// immediate offset addressing mode:
25580 ///
25581 ///      [<Zn>.[S|D]{, #<imm>}]
25582 ///
25583 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25584 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
25585                                            unsigned ScalarSizeInBytes) {
25586   ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
25587   return OffsetConst && isValidImmForSVEVecImmAddrMode(
25588                             OffsetConst->getZExtValue(), ScalarSizeInBytes);
25589 }
25590 
25591 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
25592                                           unsigned Opcode,
25593                                           bool OnlyPackedOffsets = true) {
25594   const SDValue Src = N->getOperand(2);
25595   const EVT SrcVT = Src->getValueType(0);
25596   assert(SrcVT.isScalableVector() &&
25597          "Scatter stores are only possible for SVE vectors");
25598 
25599   SDLoc DL(N);
25600   MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
25601 
25602   // Make sure that source data will fit into an SVE register
25603   if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25604     return SDValue();
25605 
25606   // For FPs, ACLE only supports _packed_ single and double precision types.
25607   // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
25608   if (SrcElVT.isFloatingPoint())
25609     if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
25610         ((Opcode != AArch64ISD::SST1Q_PRED &&
25611           Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
25612          ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
25613       return SDValue();
25614 
25615   // Depending on the addressing mode, this is either a pointer or a vector of
25616   // pointers (that fits into one register)
25617   SDValue Base = N->getOperand(4);
25618   // Depending on the addressing mode, this is either a single offset or a
25619   // vector of offsets  (that fits into one register)
25620   SDValue Offset = N->getOperand(5);
25621 
25622   // For "scalar + vector of indices", just scale the indices. This applies to
25623   // non-temporal and quadword scatters, which have no addressing mode that
25624   // takes scaled indices.
25625   if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
25626     Offset =
25627         getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
25628     Opcode = AArch64ISD::SSTNT1_PRED;
25629   } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
25630     Offset =
25631         getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
25632     Opcode = AArch64ISD::SST1Q_PRED;
25633   }
25634 
25635   // In the case of non-temporal and quadword scatter stores there's only one
25636   // addressing mode: "vector + scalar", e.g.
25637   //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
25638   // Since we do have intrinsics that allow the arguments to be in a different
25639   // order, we may need to swap them to match the spec.
25640   if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
25641       Offset.getValueType().isVector())
25642     std::swap(Base, Offset);
25643 
25644   // SST1_IMM requires that the offset is an immediate that is:
25645   //    * a multiple of #SizeInBytes,
25646   //    * in the range [0, 31 x #SizeInBytes],
25647   // where #SizeInBytes is the size in bytes of the stored items. For
25648   // immediates outside that range and non-immediate scalar offsets use SST1 or
25649   // SST1_UXTW instead.
25650   if (Opcode == AArch64ISD::SST1_IMM_PRED) {
25651     if (!isValidImmForSVEVecImmAddrMode(Offset,
25652                                         SrcVT.getScalarSizeInBits() / 8)) {
25653       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25654         Opcode = AArch64ISD::SST1_UXTW_PRED;
25655       else
25656         Opcode = AArch64ISD::SST1_PRED;
25657 
25658       std::swap(Base, Offset);
25659     }
25660   }
25661 
25662   auto &TLI = DAG.getTargetLoweringInfo();
25663   if (!TLI.isTypeLegal(Base.getValueType()))
25664     return SDValue();
25665 
25666   // Some scatter store variants allow unpacked offsets, but only as nxv2i32
25667   // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
25668   // nxv2i64. Legalize accordingly.
25669   if (!OnlyPackedOffsets &&
25670       Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25671     Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25672 
25673   if (!TLI.isTypeLegal(Offset.getValueType()))
25674     return SDValue();
25675 
25676   // Source value type that is representable in hardware
25677   EVT HwSrcVt = getSVEContainerType(SrcVT);
25678 
25679   // Keep the original type of the input data to store - this is needed to be
25680   // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
25681   // FP values we want the integer equivalent, so just use HwSrcVt.
25682   SDValue InputVT = DAG.getValueType(SrcVT);
25683   if (SrcVT.isFloatingPoint())
25684     InputVT = DAG.getValueType(HwSrcVt);
25685 
25686   SDVTList VTs = DAG.getVTList(MVT::Other);
25687   SDValue SrcNew;
25688 
25689   if (Src.getValueType().isFloatingPoint())
25690     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
25691   else
25692     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
25693 
25694   SDValue Ops[] = {N->getOperand(0), // Chain
25695                    SrcNew,
25696                    N->getOperand(3), // Pg
25697                    Base,
25698                    Offset,
25699                    InputVT};
25700 
25701   return DAG.getNode(Opcode, DL, VTs, Ops);
25702 }
25703 
25704 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
25705                                         unsigned Opcode,
25706                                         bool OnlyPackedOffsets = true) {
25707   const EVT RetVT = N->getValueType(0);
25708   assert(RetVT.isScalableVector() &&
25709          "Gather loads are only possible for SVE vectors");
25710 
25711   SDLoc DL(N);
25712 
25713   // Make sure that the loaded data will fit into an SVE register
25714   if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25715     return SDValue();
25716 
25717   // Depending on the addressing mode, this is either a pointer or a vector of
25718   // pointers (that fits into one register)
25719   SDValue Base = N->getOperand(3);
25720   // Depending on the addressing mode, this is either a single offset or a
25721   // vector of offsets  (that fits into one register)
25722   SDValue Offset = N->getOperand(4);
25723 
25724   // For "scalar + vector of indices", scale the indices to obtain unscaled
25725   // offsets. This applies to non-temporal and quadword gathers, which do not
25726   // have an addressing mode with scaled offset.
25727   if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
25728     Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25729                                         RetVT.getScalarSizeInBits());
25730     Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
25731   } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
25732     Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25733                                         RetVT.getScalarSizeInBits());
25734     Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
25735   }
25736 
25737   // In the case of non-temporal gather loads and quadword gather loads there's
25738   // only one addressing mode: "vector + scalar", e.g.
25739   //   ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25740   // Since we do have intrinsics that allow the arguments to be in a different
25741   // order, we may need to swap them to match the spec.
25742   if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
25743        Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
25744       Offset.getValueType().isVector())
25745     std::swap(Base, Offset);
25746 
25747   // GLD{FF}1_IMM requires that the offset is an immediate that is:
25748   //    * a multiple of #SizeInBytes,
25749   //    * in the range [0, 31 x #SizeInBytes],
25750   // where #SizeInBytes is the size in bytes of the loaded items. For
25751   // immediates outside that range and non-immediate scalar offsets use
25752   // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
25753   if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
25754       Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
25755     if (!isValidImmForSVEVecImmAddrMode(Offset,
25756                                         RetVT.getScalarSizeInBits() / 8)) {
25757       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25758         Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25759                      ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
25760                      : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
25761       else
25762         Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25763                      ? AArch64ISD::GLD1_MERGE_ZERO
25764                      : AArch64ISD::GLDFF1_MERGE_ZERO;
25765 
25766       std::swap(Base, Offset);
25767     }
25768   }
25769 
25770   auto &TLI = DAG.getTargetLoweringInfo();
25771   if (!TLI.isTypeLegal(Base.getValueType()))
25772     return SDValue();
25773 
25774   // Some gather load variants allow unpacked offsets, but only as nxv2i32
25775   // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
25776   // nxv2i64. Legalize accordingly.
25777   if (!OnlyPackedOffsets &&
25778       Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25779     Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25780 
25781   // Return value type that is representable in hardware
25782   EVT HwRetVt = getSVEContainerType(RetVT);
25783 
25784   // Keep the original output value type around - this is needed to be able to
25785   // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
25786   // values we want the integer equivalent, so just use HwRetVT.
25787   SDValue OutVT = DAG.getValueType(RetVT);
25788   if (RetVT.isFloatingPoint())
25789     OutVT = DAG.getValueType(HwRetVt);
25790 
25791   SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
25792   SDValue Ops[] = {N->getOperand(0), // Chain
25793                    N->getOperand(2), // Pg
25794                    Base, Offset, OutVT};
25795 
25796   SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
25797   SDValue LoadChain = SDValue(Load.getNode(), 1);
25798 
25799   if (RetVT.isInteger() && (RetVT != HwRetVt))
25800     Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
25801 
25802   // If the original return value was FP, bitcast accordingly. Doing it here
25803   // means that we can avoid adding TableGen patterns for FPs.
25804   if (RetVT.isFloatingPoint())
25805     Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
25806 
25807   return DAG.getMergeValues({Load, LoadChain}, DL);
25808 }
25809 
25810 static SDValue
25811 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25812                               SelectionDAG &DAG) {
25813   SDLoc DL(N);
25814   SDValue Src = N->getOperand(0);
25815   unsigned Opc = Src->getOpcode();
25816 
25817   // Sign extend of an unsigned unpack -> signed unpack
25818   if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
25819 
25820     unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
25821                                                : AArch64ISD::SUNPKLO;
25822 
25823     // Push the sign extend to the operand of the unpack
25824     // This is necessary where, for example, the operand of the unpack
25825     // is another unpack:
25826     // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
25827     // ->
25828     // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
25829     // ->
25830     // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
25831     SDValue ExtOp = Src->getOperand(0);
25832     auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
25833     EVT EltTy = VT.getVectorElementType();
25834     (void)EltTy;
25835 
25836     assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
25837            "Sign extending from an invalid type");
25838 
25839     EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
25840 
25841     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
25842                               ExtOp, DAG.getValueType(ExtVT));
25843 
25844     return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
25845   }
25846 
25847   if (DCI.isBeforeLegalizeOps())
25848     return SDValue();
25849 
25850   if (!EnableCombineMGatherIntrinsics)
25851     return SDValue();
25852 
25853   // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
25854   // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
25855   unsigned NewOpc;
25856   unsigned MemVTOpNum = 4;
25857   switch (Opc) {
25858   case AArch64ISD::LD1_MERGE_ZERO:
25859     NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
25860     MemVTOpNum = 3;
25861     break;
25862   case AArch64ISD::LDNF1_MERGE_ZERO:
25863     NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
25864     MemVTOpNum = 3;
25865     break;
25866   case AArch64ISD::LDFF1_MERGE_ZERO:
25867     NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
25868     MemVTOpNum = 3;
25869     break;
25870   case AArch64ISD::GLD1_MERGE_ZERO:
25871     NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
25872     break;
25873   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
25874     NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
25875     break;
25876   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
25877     NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
25878     break;
25879   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
25880     NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
25881     break;
25882   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
25883     NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
25884     break;
25885   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
25886     NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
25887     break;
25888   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
25889     NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
25890     break;
25891   case AArch64ISD::GLDFF1_MERGE_ZERO:
25892     NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
25893     break;
25894   case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
25895     NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
25896     break;
25897   case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
25898     NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
25899     break;
25900   case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
25901     NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
25902     break;
25903   case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
25904     NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
25905     break;
25906   case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
25907     NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
25908     break;
25909   case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
25910     NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
25911     break;
25912   case AArch64ISD::GLDNT1_MERGE_ZERO:
25913     NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
25914     break;
25915   default:
25916     return SDValue();
25917   }
25918 
25919   EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
25920   EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
25921 
25922   if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
25923     return SDValue();
25924 
25925   EVT DstVT = N->getValueType(0);
25926   SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
25927 
25928   SmallVector<SDValue, 5> Ops;
25929   for (unsigned I = 0; I < Src->getNumOperands(); ++I)
25930     Ops.push_back(Src->getOperand(I));
25931 
25932   SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
25933   DCI.CombineTo(N, ExtLoad);
25934   DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
25935 
25936   // Return N so it doesn't get rechecked
25937   return SDValue(N, 0);
25938 }
25939 
25940 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
25941 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
25942 /// != nxv2i32) do not need legalization.
25943 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
25944   const unsigned OffsetPos = 4;
25945   SDValue Offset = N->getOperand(OffsetPos);
25946 
25947   // Not an unpacked vector, bail out.
25948   if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
25949     return SDValue();
25950 
25951   // Extend the unpacked offset vector to 64-bit lanes.
25952   SDLoc DL(N);
25953   Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
25954   SmallVector<SDValue, 5> Ops(N->ops());
25955   // Replace the offset operand with the 64-bit one.
25956   Ops[OffsetPos] = Offset;
25957 
25958   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
25959 }
25960 
25961 /// Combines a node carrying the intrinsic
25962 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
25963 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
25964 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
25965 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
25966 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
25967                                                unsigned ScalarSizeInBytes) {
25968   const unsigned ImmPos = 4, OffsetPos = 3;
25969   // No need to combine the node if the immediate is valid...
25970   if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
25971     return SDValue();
25972 
25973   // ...otherwise swap the offset base with the offset...
25974   SmallVector<SDValue, 5> Ops(N->ops());
25975   std::swap(Ops[ImmPos], Ops[OffsetPos]);
25976   // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
25977   // `aarch64_sve_prfb_gather_uxtw_index`.
25978   SDLoc DL(N);
25979   Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
25980                            MVT::i64);
25981 
25982   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
25983 }
25984 
25985 // Return true if the vector operation can guarantee only the first lane of its
25986 // result contains data, with all bits in other lanes set to zero.
25987 static bool isLanes1toNKnownZero(SDValue Op) {
25988   switch (Op.getOpcode()) {
25989   default:
25990     return false;
25991   case AArch64ISD::ANDV_PRED:
25992   case AArch64ISD::EORV_PRED:
25993   case AArch64ISD::FADDA_PRED:
25994   case AArch64ISD::FADDV_PRED:
25995   case AArch64ISD::FMAXNMV_PRED:
25996   case AArch64ISD::FMAXV_PRED:
25997   case AArch64ISD::FMINNMV_PRED:
25998   case AArch64ISD::FMINV_PRED:
25999   case AArch64ISD::ORV_PRED:
26000   case AArch64ISD::SADDV_PRED:
26001   case AArch64ISD::SMAXV_PRED:
26002   case AArch64ISD::SMINV_PRED:
26003   case AArch64ISD::UADDV_PRED:
26004   case AArch64ISD::UMAXV_PRED:
26005   case AArch64ISD::UMINV_PRED:
26006     return true;
26007   }
26008 }
26009 
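// Remove the explicit zeroing in (insert_vector_elt (zero vector),
// (extract_vector_elt X, 0), 0) when X is already known to have zeros in all
// lanes other than lane 0, in which case X can be used directly.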
26010 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26011   assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26012   SDValue InsertVec = N->getOperand(0);
26013   SDValue InsertElt = N->getOperand(1);
26014   SDValue InsertIdx = N->getOperand(2);
26015 
26016   // We only care about inserts into the first element...
26017   if (!isNullConstant(InsertIdx))
26018     return SDValue();
26019   // ...of a zero'd vector...
26020   if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
26021     return SDValue();
26022   // ...where the inserted data was previously extracted...
26023   if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26024     return SDValue();
26025 
26026   SDValue ExtractVec = InsertElt.getOperand(0);
26027   SDValue ExtractIdx = InsertElt.getOperand(1);
26028 
26029   // ...from the first element of a vector.
26030   if (!isNullConstant(ExtractIdx))
26031     return SDValue();
26032 
26033   // If we get here we are effectively trying to zero lanes 1-N of a vector.
26034 
26035   // Ensure there's no type conversion going on.
26036   if (N->getValueType(0) != ExtractVec.getValueType())
26037     return SDValue();
26038 
26039   if (!isLanes1toNKnownZero(ExtractVec))
26040     return SDValue();
26041 
26042   // The explicit zeroing is redundant.
26043   return ExtractVec;
26044 }
26045 
26046 static SDValue
26047 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26048   if (SDValue Res = removeRedundantInsertVectorElt(N))
26049     return Res;
26050 
26051   return performPostLD1Combine(N, DCI, true);
26052 }
26053 
26054 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26055                                       TargetLowering::DAGCombinerInfo &DCI,
26056                                       const AArch64Subtarget *Subtarget) {
26057   SDValue N0 = N->getOperand(0);
26058   EVT VT = N->getValueType(0);
26059 
26060   // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
26061   if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26062     return SDValue();
26063 
26064   auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26065     EVT EltVT = VT.getVectorElementType();
26066     return EltVT == MVT::f32 || EltVT == MVT::f64;
26067   };
26068 
26069   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26070   // We purposefully don't care about legality of the nodes here as we know
26071   // they can be split down into something legal.
26072   if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
26073       N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26074       VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26075       VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26076     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
26077     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
26078                                      LN0->getChain(), LN0->getBasePtr(),
26079                                      N0.getValueType(), LN0->getMemOperand());
26080     DCI.CombineTo(N, ExtLoad);
26081     DCI.CombineTo(
26082         N0.getNode(),
26083         DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
26084                     DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
26085         ExtLoad.getValue(1));
26086     return SDValue(N, 0); // Return N so it doesn't get rechecked!
26087   }
26088 
26089   return SDValue();
26090 }
26091 
26092 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
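// Without SVE2 or SME there is no BSL instruction for scalable vectors, so
// expand (bsp mask, in1, in2) to (or (and mask, in1), (and (not mask), in2)).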
26093                                       const AArch64Subtarget *Subtarget) {
26094   EVT VT = N->getValueType(0);
26095 
26096   // Don't expand for NEON, SVE2 or SME
26097   if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26098     return SDValue();
26099 
26100   SDLoc DL(N);
26101 
26102   SDValue Mask = N->getOperand(0);
26103   SDValue In1 = N->getOperand(1);
26104   SDValue In2 = N->getOperand(2);
26105 
26106   SDValue InvMask = DAG.getNOT(DL, Mask, VT);
26107   SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
26108   SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
26109   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
26110 }
26111 
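// Fold duplane128(insert_subvector(undef, bitcast(X), 0), 0) by performing the
// insert and duplication in X's own packed SVE type and bitcasting the result,
// so the bitcast no longer sits between the insert and the duplication.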
26112 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
26113   EVT VT = N->getValueType(0);
26114 
26115   SDValue Insert = N->getOperand(0);
26116   if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26117     return SDValue();
26118 
26119   if (!Insert.getOperand(0).isUndef())
26120     return SDValue();
26121 
26122   uint64_t IdxInsert = Insert.getConstantOperandVal(2);
26123   uint64_t IdxDupLane = N->getConstantOperandVal(1);
26124   if (IdxInsert != 0 || IdxDupLane != 0)
26125     return SDValue();
26126 
26127   SDValue Bitcast = Insert.getOperand(1);
26128   if (Bitcast.getOpcode() != ISD::BITCAST)
26129     return SDValue();
26130 
26131   SDValue Subvec = Bitcast.getOperand(0);
26132   EVT SubvecVT = Subvec.getValueType();
26133   if (!SubvecVT.is128BitVector())
26134     return SDValue();
26135   EVT NewSubvecVT =
26136       getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
26137 
26138   SDLoc DL(N);
26139   SDValue NewInsert =
26140       DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
26141                   DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
26142   SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
26143                                       NewInsert, N->getOperand(1));
26144   return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
26145 }
26146 
26147 // Try to combine mull with uzp1.
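// When a [us]mull pair multiplies the high and low halves of the same source
// vector by two truncated values, the two truncates can be replaced by a
// single uzp1 whose halves are then re-extracted.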
26148 static SDValue tryCombineMULLWithUZP1(SDNode *N,
26149                                       TargetLowering::DAGCombinerInfo &DCI,
26150                                       SelectionDAG &DAG) {
26151   if (DCI.isBeforeLegalizeOps())
26152     return SDValue();
26153 
26154   SDValue LHS = N->getOperand(0);
26155   SDValue RHS = N->getOperand(1);
26156 
26157   SDValue ExtractHigh;
26158   SDValue ExtractLow;
26159   SDValue TruncHigh;
26160   SDValue TruncLow;
26161   SDLoc DL(N);
26162 
26163   // Check the operands are trunc and extract_high.
26164   if (isEssentiallyExtractHighSubvector(LHS) &&
26165       RHS.getOpcode() == ISD::TRUNCATE) {
26166     TruncHigh = RHS;
26167     if (LHS.getOpcode() == ISD::BITCAST)
26168       ExtractHigh = LHS.getOperand(0);
26169     else
26170       ExtractHigh = LHS;
26171   } else if (isEssentiallyExtractHighSubvector(RHS) &&
26172              LHS.getOpcode() == ISD::TRUNCATE) {
26173     TruncHigh = LHS;
26174     if (RHS.getOpcode() == ISD::BITCAST)
26175       ExtractHigh = RHS.getOperand(0);
26176     else
26177       ExtractHigh = RHS;
26178   } else
26179     return SDValue();
26180 
26181   // If the truncate's operand is a DUP or other splat, do not combine the op
26182   // with uzp1, as this causes regressions; see
26183   // test/CodeGen/AArch64/aarch64-smull.ll.
26184   SDValue TruncHighOp = TruncHigh.getOperand(0);
26185   EVT TruncHighOpVT = TruncHighOp.getValueType();
26186   if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26187       DAG.isSplatValue(TruncHighOp, false))
26188     return SDValue();
26189 
26190   // Check there is other extract_high with same source vector.
26191   // For example,
26192   //
26193   //    t18: v4i16 = extract_subvector t2, Constant:i64<0>
26194   //    t12: v4i16 = truncate t11
26195   //  t31: v4i32 = AArch64ISD::SMULL t18, t12
26196   //    t23: v4i16 = extract_subvector t2, Constant:i64<4>
26197   //    t16: v4i16 = truncate t15
26198   //  t30: v4i32 = AArch64ISD::SMULL t23, t16
26199   //
26200   // This dagcombine assumes the two extract_high uses same source vector in
26201   // order to detect the pair of the mull. If they have different source vector,
26202   // this code will not work.
26203   // TODO: Should also try to look through a bitcast.
26204   bool HasFoundMULLow = true;
26205   SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
26206   if (ExtractHighSrcVec->use_size() != 2)
26207     HasFoundMULLow = false;
26208 
26209   // Find ExtractLow.
26210   for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26211     if (User == ExtractHigh.getNode())
26212       continue;
26213 
26214     if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26215         !isNullConstant(User->getOperand(1))) {
26216       HasFoundMULLow = false;
26217       break;
26218     }
26219 
26220     ExtractLow.setNode(User);
26221   }
26222 
26223   if (!ExtractLow || !ExtractLow->hasOneUse())
26224     HasFoundMULLow = false;
26225 
26226   // Check ExtractLow's user.
26227   if (HasFoundMULLow) {
26228     SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26229     if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26230       HasFoundMULLow = false;
26231     } else {
26232       if (ExtractLowUser->getOperand(0) == ExtractLow) {
26233         if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
26234           TruncLow = ExtractLowUser->getOperand(1);
26235         else
26236           HasFoundMULLow = false;
26237       } else {
26238         if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
26239           TruncLow = ExtractLowUser->getOperand(0);
26240         else
26241           HasFoundMULLow = false;
26242       }
26243     }
26244   }
26245   // If the truncate's operand is a DUP or other splat, do not combine the op
26246   // with uzp1, as this causes regressions; see
26247   // test/CodeGen/AArch64/aarch64-smull.ll.
26248   // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
26249   EVT TruncHighVT = TruncHigh.getValueType();
26250   EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
26251   SDValue TruncLowOp =
26252       HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
26253   EVT TruncLowOpVT = TruncLowOp.getValueType();
26254   if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26255                          DAG.isSplatValue(TruncLowOp, false)))
26256     return SDValue();
26257 
26258   // Create uzp1, extract_high and extract_low.
26259   if (TruncHighOpVT != UZP1VT)
26260     TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
26261   if (TruncLowOpVT != UZP1VT)
26262     TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
26263 
26264   SDValue UZP1 =
26265       DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
26266   SDValue HighIdxCst =
26267       DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
26268   SDValue NewTruncHigh =
26269       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
26270   DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
26271 
26272   if (HasFoundMULLow) {
26273     EVT TruncLowVT = TruncLow.getValueType();
26274     SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
26275                                       UZP1, ExtractLow.getOperand(1));
26276     DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
26277   }
26278 
26279   return SDValue(N, 0);
26280 }
26281 
26282 static SDValue performMULLCombine(SDNode *N,
26283                                   TargetLowering::DAGCombinerInfo &DCI,
26284                                   SelectionDAG &DAG) {
26285   if (SDValue Val =
26286           tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
26287     return Val;
26288 
26289   if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
26290     return Val;
26291 
26292   return SDValue();
26293 }
26294 
26295 static SDValue
26296 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26297                              SelectionDAG &DAG) {
26298   // Let's do the transform below.
26299   //
26300   //         t34: v4i32 = AArch64ISD::UADDLV t2
26301   //       t35: i32 = extract_vector_elt t34, Constant:i64<0>
26302   //     t7: i64 = zero_extend t35
26303   //   t20: v1i64 = scalar_to_vector t7
26304   // ==>
26305   //      t34: v4i32 = AArch64ISD::UADDLV t2
26306   //    t39: v2i32 = extract_subvector t34, Constant:i64<0>
26307   //  t40: v1i64 = AArch64ISD::NVCAST t39
26308   if (DCI.isBeforeLegalizeOps())
26309     return SDValue();
26310 
26311   EVT VT = N->getValueType(0);
26312   if (VT != MVT::v1i64)
26313     return SDValue();
26314 
26315   SDValue ZEXT = N->getOperand(0);
26316   if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
26317     return SDValue();
26318 
26319   SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
26320   if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
26321       EXTRACT_VEC_ELT.getValueType() != MVT::i32)
26322     return SDValue();
26323 
26324   if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
26325     return SDValue();
26326 
26327   SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
26328   if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
26329       UADDLV.getValueType() != MVT::v4i32 ||
26330       UADDLV.getOperand(0).getValueType() != MVT::v8i8)
26331     return SDValue();
26332 
26333   // Let's generate a new sequence with AArch64ISD::NVCAST.
26334   SDLoc DL(N);
26335   SDValue EXTRACT_SUBVEC =
26336       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
26337                   DAG.getConstant(0, DL, MVT::i64));
26338   SDValue NVCAST =
26339       DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
26340 
26341   return NVCAST;
26342 }
26343 
26344 /// If the operand is a bitwise AND with a constant RHS, the shift amount is a
26345 /// constant, and the shift is the AND's only use, we can pull the AND out, i.e.
26346 ///
26347 ///   (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
26348 ///
26349 /// We prefer this canonical form to match existing isel patterns.
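/// For example, (shl (and x, 0xff), 4) becomes (and (shl x, 4), 0xff0).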
26350 static SDValue performSHLCombine(SDNode *N,
26351                                  TargetLowering::DAGCombinerInfo &DCI,
26352                                  SelectionDAG &DAG) {
26353   if (DCI.isBeforeLegalizeOps())
26354     return SDValue();
26355 
26356   SDValue Op0 = N->getOperand(0);
26357   if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
26358     return SDValue();
26359 
26360   SDValue C1 = Op0->getOperand(1);
26361   SDValue C2 = N->getOperand(1);
26362   if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
26363     return SDValue();
26364 
26365   // Might be folded into shifted op, do not lower.
26366   if (N->hasOneUse()) {
26367     unsigned UseOpc = N->user_begin()->getOpcode();
26368     if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
26369         UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
26370       return SDValue();
26371   }
26372 
26373   SDLoc DL(N);
26374   EVT VT = N->getValueType(0);
26375 
26376   // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
26377   // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
26378   // causing an infinite loop. The result may also be worse.
26379   SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
26380   if (!isa<ConstantSDNode>(NewRHS))
26381     return SDValue();
26382 
26383   SDValue X = Op0->getOperand(0);
26384   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
26385   return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
26386 }
26387 
26388 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
26389                                                  DAGCombinerInfo &DCI) const {
26390   SelectionDAG &DAG = DCI.DAG;
26391   switch (N->getOpcode()) {
26392   default:
26393     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
26394     break;
26395   case ISD::VECREDUCE_AND:
26396   case ISD::VECREDUCE_OR:
26397   case ISD::VECREDUCE_XOR:
26398     return performVecReduceBitwiseCombine(N, DCI, DAG);
26399   case ISD::ADD:
26400   case ISD::SUB:
26401     return performAddSubCombine(N, DCI);
26402   case ISD::BUILD_VECTOR:
26403     return performBuildVectorCombine(N, DCI, DAG);
26404   case ISD::TRUNCATE:
26405     return performTruncateCombine(N, DAG, DCI);
26406   case AArch64ISD::ANDS:
26407     return performFlagSettingCombine(N, DCI, ISD::AND);
26408   case AArch64ISD::ADC:
26409     if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26410       return R;
26411     return foldADCToCINC(N, DAG);
26412   case AArch64ISD::SBC:
26413     return foldOverflowCheck(N, DAG, /* IsAdd */ false);
26414   case AArch64ISD::ADCS:
26415     if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26416       return R;
26417     return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
26418   case AArch64ISD::SBCS:
26419     if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
26420       return R;
26421     return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
26422   case AArch64ISD::BICi: {
26423     APInt DemandedBits =
26424         APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
26425     APInt DemandedElts =
26426         APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
26427 
26428     if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
26429             SDValue(N, 0), DemandedBits, DemandedElts, DCI))
26430       return SDValue();
26431 
26432     break;
26433   }
26434   case ISD::XOR:
26435     return performXorCombine(N, DAG, DCI, Subtarget);
26436   case ISD::MUL:
26437     return performMulCombine(N, DAG, DCI, Subtarget);
26438   case ISD::SINT_TO_FP:
26439   case ISD::UINT_TO_FP:
26440     return performIntToFpCombine(N, DAG, DCI, Subtarget);
26441   case ISD::FP_TO_SINT:
26442   case ISD::FP_TO_UINT:
26443   case ISD::FP_TO_SINT_SAT:
26444   case ISD::FP_TO_UINT_SAT:
26445     return performFpToIntCombine(N, DAG, DCI, Subtarget);
26446   case ISD::OR:
26447     return performORCombine(N, DCI, Subtarget, *this);
26448   case ISD::AND:
26449     return performANDCombine(N, DCI);
26450   case ISD::FADD:
26451     return performFADDCombine(N, DCI);
26452   case ISD::INTRINSIC_WO_CHAIN:
26453     return performIntrinsicCombine(N, DCI, Subtarget);
26454   case ISD::ANY_EXTEND:
26455   case ISD::ZERO_EXTEND:
26456   case ISD::SIGN_EXTEND:
26457     return performExtendCombine(N, DCI, DAG);
26458   case ISD::SIGN_EXTEND_INREG:
26459     return performSignExtendInRegCombine(N, DCI, DAG);
26460   case ISD::CONCAT_VECTORS:
26461     return performConcatVectorsCombine(N, DCI, DAG);
26462   case ISD::EXTRACT_SUBVECTOR:
26463     return performExtractSubvectorCombine(N, DCI, DAG);
26464   case ISD::INSERT_SUBVECTOR:
26465     return performInsertSubvectorCombine(N, DCI, DAG);
26466   case ISD::SELECT:
26467     return performSelectCombine(N, DCI);
26468   case ISD::VSELECT:
26469     return performVSelectCombine(N, DCI.DAG);
26470   case ISD::SETCC:
26471     return performSETCCCombine(N, DCI, DAG);
26472   case ISD::LOAD:
26473     return performLOADCombine(N, DCI, DAG, Subtarget);
26474   case ISD::STORE:
26475     return performSTORECombine(N, DCI, DAG, Subtarget);
26476   case ISD::MSTORE:
26477     return performMSTORECombine(N, DCI, DAG, Subtarget);
26478   case ISD::MGATHER:
26479   case ISD::MSCATTER:
26480   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
26481     return performMaskedGatherScatterCombine(N, DCI, DAG);
26482   case ISD::FP_EXTEND:
26483     return performFPExtendCombine(N, DAG, DCI, Subtarget);
26484   case AArch64ISD::BRCOND:
26485     return performBRCONDCombine(N, DCI, DAG);
26486   case AArch64ISD::TBNZ:
26487   case AArch64ISD::TBZ:
26488     return performTBZCombine(N, DCI, DAG);
26489   case AArch64ISD::CSEL:
26490     return performCSELCombine(N, DCI, DAG);
26491   case AArch64ISD::DUP:
26492   case AArch64ISD::DUPLANE8:
26493   case AArch64ISD::DUPLANE16:
26494   case AArch64ISD::DUPLANE32:
26495   case AArch64ISD::DUPLANE64:
26496     return performDUPCombine(N, DCI);
26497   case AArch64ISD::DUPLANE128:
26498     return performDupLane128Combine(N, DAG);
26499   case AArch64ISD::NVCAST:
26500     return performNVCASTCombine(N, DAG);
26501   case AArch64ISD::SPLICE:
26502     return performSpliceCombine(N, DAG);
26503   case AArch64ISD::UUNPKLO:
26504   case AArch64ISD::UUNPKHI:
26505     return performUnpackCombine(N, DAG, Subtarget);
26506   case AArch64ISD::UZP1:
26507   case AArch64ISD::UZP2:
26508     return performUzpCombine(N, DAG, Subtarget);
26509   case AArch64ISD::SETCC_MERGE_ZERO:
26510     return performSetccMergeZeroCombine(N, DCI);
26511   case AArch64ISD::REINTERPRET_CAST:
26512     return performReinterpretCastCombine(N);
26513   case AArch64ISD::GLD1_MERGE_ZERO:
26514   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26515   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26516   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26517   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26518   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26519   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26520   case AArch64ISD::GLD1S_MERGE_ZERO:
26521   case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
26522   case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
26523   case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
26524   case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
26525   case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
26526   case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
26527     return performGLD1Combine(N, DAG);
26528   case AArch64ISD::VASHR:
26529   case AArch64ISD::VLSHR:
26530     return performVectorShiftCombine(N, *this, DCI);
26531   case AArch64ISD::SUNPKLO:
26532     return performSunpkloCombine(N, DAG);
26533   case AArch64ISD::BSP:
26534     return performBSPExpandForSVE(N, DAG, Subtarget);
26535   case ISD::INSERT_VECTOR_ELT:
26536     return performInsertVectorEltCombine(N, DCI);
26537   case ISD::EXTRACT_VECTOR_ELT:
26538     return performExtractVectorEltCombine(N, DCI, Subtarget);
26539   case ISD::VECREDUCE_ADD:
26540     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
26541   case AArch64ISD::UADDV:
26542     return performUADDVCombine(N, DAG);
26543   case AArch64ISD::SMULL:
26544   case AArch64ISD::UMULL:
26545   case AArch64ISD::PMULL:
26546     return performMULLCombine(N, DCI, DAG);
26547   case ISD::INTRINSIC_VOID:
26548   case ISD::INTRINSIC_W_CHAIN:
26549     switch (N->getConstantOperandVal(1)) {
26550     case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
26551       return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
26552     case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
26553       return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
26554     case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
26555       return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
26556     case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
26557       return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
26558     case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
26559     case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
26560     case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
26561     case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
26562     case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
26563     case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
26564     case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
26565     case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
26566       return legalizeSVEGatherPrefetchOffsVec(N, DAG);
26567     case Intrinsic::aarch64_neon_ld2:
26568     case Intrinsic::aarch64_neon_ld3:
26569     case Intrinsic::aarch64_neon_ld4:
26570     case Intrinsic::aarch64_neon_ld1x2:
26571     case Intrinsic::aarch64_neon_ld1x3:
26572     case Intrinsic::aarch64_neon_ld1x4:
26573     case Intrinsic::aarch64_neon_ld2lane:
26574     case Intrinsic::aarch64_neon_ld3lane:
26575     case Intrinsic::aarch64_neon_ld4lane:
26576     case Intrinsic::aarch64_neon_ld2r:
26577     case Intrinsic::aarch64_neon_ld3r:
26578     case Intrinsic::aarch64_neon_ld4r:
26579     case Intrinsic::aarch64_neon_st2:
26580     case Intrinsic::aarch64_neon_st3:
26581     case Intrinsic::aarch64_neon_st4:
26582     case Intrinsic::aarch64_neon_st1x2:
26583     case Intrinsic::aarch64_neon_st1x3:
26584     case Intrinsic::aarch64_neon_st1x4:
26585     case Intrinsic::aarch64_neon_st2lane:
26586     case Intrinsic::aarch64_neon_st3lane:
26587     case Intrinsic::aarch64_neon_st4lane:
26588       return performNEONPostLDSTCombine(N, DCI, DAG);
26589     case Intrinsic::aarch64_sve_ldnt1:
26590       return performLDNT1Combine(N, DAG);
26591     case Intrinsic::aarch64_sve_ld1rq:
26592       return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
26593     case Intrinsic::aarch64_sve_ld1ro:
26594       return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
26595     case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
26596       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
26597     case Intrinsic::aarch64_sve_ldnt1_gather:
26598       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
26599     case Intrinsic::aarch64_sve_ldnt1_gather_index:
26600       return performGatherLoadCombine(N, DAG,
26601                                       AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
26602     case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
26603       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
26604     case Intrinsic::aarch64_sve_ld1:
26605       return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
26606     case Intrinsic::aarch64_sve_ldnf1:
26607       return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
26608     case Intrinsic::aarch64_sve_ldff1:
26609       return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
26610     case Intrinsic::aarch64_sve_st1:
26611       return performST1Combine(N, DAG);
26612     case Intrinsic::aarch64_sve_stnt1:
26613       return performSTNT1Combine(N, DAG);
26614     case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
26615       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
26616     case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
26617       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
26618     case Intrinsic::aarch64_sve_stnt1_scatter:
26619       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
26620     case Intrinsic::aarch64_sve_stnt1_scatter_index:
26621       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
26622     case Intrinsic::aarch64_sve_ld1_gather:
26623       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
26624     case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
26625     case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
26626       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
26627     case Intrinsic::aarch64_sve_ld1q_gather_index:
26628       return performGatherLoadCombine(N, DAG,
26629                                       AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
26630     case Intrinsic::aarch64_sve_ld1_gather_index:
26631       return performGatherLoadCombine(N, DAG,
26632                                       AArch64ISD::GLD1_SCALED_MERGE_ZERO);
26633     case Intrinsic::aarch64_sve_ld1_gather_sxtw:
26634       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
26635                                       /*OnlyPackedOffsets=*/false);
26636     case Intrinsic::aarch64_sve_ld1_gather_uxtw:
26637       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
26638                                       /*OnlyPackedOffsets=*/false);
26639     case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
26640       return performGatherLoadCombine(N, DAG,
26641                                       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
26642                                       /*OnlyPackedOffsets=*/false);
26643     case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
26644       return performGatherLoadCombine(N, DAG,
26645                                       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
26646                                       /*OnlyPackedOffsets=*/false);
26647     case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
26648       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
26649     case Intrinsic::aarch64_sve_ldff1_gather:
26650       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
26651     case Intrinsic::aarch64_sve_ldff1_gather_index:
26652       return performGatherLoadCombine(N, DAG,
26653                                       AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
26654     case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
26655       return performGatherLoadCombine(N, DAG,
26656                                       AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
26657                                       /*OnlyPackedOffsets=*/false);
26658     case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
26659       return performGatherLoadCombine(N, DAG,
26660                                       AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
26661                                       /*OnlyPackedOffsets=*/false);
26662     case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
26663       return performGatherLoadCombine(N, DAG,
26664                                       AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
26665                                       /*OnlyPackedOffsets=*/false);
26666     case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
26667       return performGatherLoadCombine(N, DAG,
26668                                       AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
26669                                       /*OnlyPackedOffsets=*/false);
26670     case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
26671       return performGatherLoadCombine(N, DAG,
26672                                       AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
26673     case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
26674     case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
26675       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
26676     case Intrinsic::aarch64_sve_st1q_scatter_index:
26677       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
26678     case Intrinsic::aarch64_sve_st1_scatter:
26679       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
26680     case Intrinsic::aarch64_sve_st1_scatter_index:
26681       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
26682     case Intrinsic::aarch64_sve_st1_scatter_sxtw:
26683       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
26684                                         /*OnlyPackedOffsets=*/false);
26685     case Intrinsic::aarch64_sve_st1_scatter_uxtw:
26686       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
26687                                         /*OnlyPackedOffsets=*/false);
26688     case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
26689       return performScatterStoreCombine(N, DAG,
26690                                         AArch64ISD::SST1_SXTW_SCALED_PRED,
26691                                         /*OnlyPackedOffsets=*/false);
26692     case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
26693       return performScatterStoreCombine(N, DAG,
26694                                         AArch64ISD::SST1_UXTW_SCALED_PRED,
26695                                         /*OnlyPackedOffsets=*/false);
26696     case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
26697       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
26698     case Intrinsic::aarch64_rndr:
26699     case Intrinsic::aarch64_rndrrs: {
26700       unsigned IntrinsicID = N->getConstantOperandVal(1);
26701       auto Register =
26702           (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
26703                                                   : AArch64SysReg::RNDRRS);
26704       SDLoc DL(N);
26705       SDValue A = DAG.getNode(
26706           AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
26707           N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
26708       SDValue B = DAG.getNode(
26709           AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
26710           DAG.getConstant(0, DL, MVT::i32),
26711           DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
26712       return DAG.getMergeValues(
26713           {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
26714     }
26715     case Intrinsic::aarch64_sme_ldr_zt:
26716       return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
26717                          DAG.getVTList(MVT::Other), N->getOperand(0),
26718                          N->getOperand(2), N->getOperand(3));
26719     case Intrinsic::aarch64_sme_str_zt:
26720       return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
26721                          DAG.getVTList(MVT::Other), N->getOperand(0),
26722                          N->getOperand(2), N->getOperand(3));
26723     default:
26724       break;
26725     }
26726     break;
26727   case ISD::GlobalAddress:
26728     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
26729   case ISD::CTLZ:
26730     return performCTLZCombine(N, DAG, Subtarget);
26731   case ISD::SCALAR_TO_VECTOR:
26732     return performScalarToVectorCombine(N, DCI, DAG);
26733   case ISD::SHL:
26734     return performSHLCombine(N, DCI, DAG);
26735   }
26736   return SDValue();
26737 }
26738 
26739 // Check that the return value is used only as a return value, as otherwise
26740 // we can't perform a tail call. In particular, we need to check for
26741 // target ISD nodes that are returns and any other "odd" constructs
26742 // that the generic analysis code won't necessarily catch.
26743 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
26744                                                SDValue &Chain) const {
26745   if (N->getNumValues() != 1)
26746     return false;
26747   if (!N->hasNUsesOfValue(1, 0))
26748     return false;
26749 
26750   SDValue TCChain = Chain;
26751   SDNode *Copy = *N->user_begin();
26752   if (Copy->getOpcode() == ISD::CopyToReg) {
26753     // If the copy has a glue operand, we conservatively assume it isn't safe to
26754     // perform a tail call.
26755     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
26756         MVT::Glue)
26757       return false;
26758     TCChain = Copy->getOperand(0);
26759   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
26760     return false;
26761 
26762   bool HasRet = false;
26763   for (SDNode *Node : Copy->users()) {
26764     if (Node->getOpcode() != AArch64ISD::RET_GLUE)
26765       return false;
26766     HasRet = true;
26767   }
26768 
26769   if (!HasRet)
26770     return false;
26771 
26772   Chain = TCChain;
26773   return true;
26774 }
26775 
26776 // Return whether an instruction can potentially be optimized to a tail
26777 // call. This will cause the optimizers to attempt to move, or duplicate,
26778 // return instructions to help enable tail call optimizations for this
26779 // instruction.
26780 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
26781   return CI->isTailCall();
26782 }
26783 
26784 bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
26785                                             Register Offset, bool IsPre,
26786                                             MachineRegisterInfo &MRI) const {
26787   auto CstOffset = getIConstantVRegVal(Offset, MRI);
26788   if (!CstOffset || CstOffset->isZero())
26789     return false;
26790 
26791   // All of the indexed addressing mode instructions take a signed 9 bit
26792   // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
26793   // encodes the sign/indexing direction.
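  // isInt<9> accepts offsets in the range [-256, 255].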
26794   return isInt<9>(CstOffset->getSExtValue());
26795 }
26796 
26797 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
26798                                                    SDValue &Base,
26799                                                    SDValue &Offset,
26800                                                    SelectionDAG &DAG) const {
26801   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
26802     return false;
26803 
26804   // Non-null if there is exactly one user of the loaded value (ignoring chain).
26805   SDNode *ValOnlyUser = nullptr;
26806   for (SDUse &U : N->uses()) {
26807     if (U.getResNo() == 1)
26808       continue; // Ignore chain.
26809     if (ValOnlyUser == nullptr)
26810       ValOnlyUser = U.getUser();
26811     else {
26812       ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
26813       break;
26814     }
26815   }
26816 
26817   auto IsUndefOrZero = [](SDValue V) {
26818     return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
26819   };
26820 
26821   // If the only user of the value is a scalable vector splat, it is
26822   // preferable to do a replicating load (ld1r*).
26823   if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
26824       (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
26825        (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
26826         IsUndefOrZero(ValOnlyUser->getOperand(2)))))
26827     return false;
26828 
26829   Base = Op->getOperand(0);
26830   // All of the indexed addressing mode instructions take a signed
26831   // 9 bit immediate offset.
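  // e.g. pre-indexed 'ldr x0, [x1, #16]!' or post-indexed 'ldr x0, [x1], #16'.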
26832   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
26833     int64_t RHSC = RHS->getSExtValue();
26834     if (Op->getOpcode() == ISD::SUB)
26835       RHSC = -(uint64_t)RHSC;
26836     if (!isInt<9>(RHSC))
26837       return false;
26838     // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
26839     // when dealing with subtraction.
26840     Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
26841     return true;
26842   }
26843   return false;
26844 }
26845 
26846 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
26847                                                       SDValue &Offset,
26848                                                       ISD::MemIndexedMode &AM,
26849                                                       SelectionDAG &DAG) const {
26850   EVT VT;
26851   SDValue Ptr;
26852   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26853     VT = LD->getMemoryVT();
26854     Ptr = LD->getBasePtr();
26855   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26856     VT = ST->getMemoryVT();
26857     Ptr = ST->getBasePtr();
26858   } else
26859     return false;
26860 
26861   if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
26862     return false;
26863   AM = ISD::PRE_INC;
26864   return true;
26865 }
26866 
26867 bool AArch64TargetLowering::getPostIndexedAddressParts(
26868     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
26869     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
26870   EVT VT;
26871   SDValue Ptr;
26872   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26873     VT = LD->getMemoryVT();
26874     Ptr = LD->getBasePtr();
26875   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26876     VT = ST->getMemoryVT();
26877     Ptr = ST->getBasePtr();
26878   } else
26879     return false;
26880 
26881   if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
26882     return false;
26883   // Post-indexing updates the base, so it's not a valid transform
26884   // if that's not the same as the load's pointer.
26885   if (Ptr != Base)
26886     return false;
26887   AM = ISD::POST_INC;
26888   return true;
26889 }
26890 
26891 static void replaceBoolVectorBitcast(SDNode *N,
26892                                      SmallVectorImpl<SDValue> &Results,
26893                                      SelectionDAG &DAG) {
26894   SDLoc DL(N);
26895   SDValue Op = N->getOperand(0);
26896   EVT VT = N->getValueType(0);
26897   [[maybe_unused]] EVT SrcVT = Op.getValueType();
26898   assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
26899          "Must be bool vector.");
26900 
26901   // Special handling for Clang's __builtin_convertvector. For vectors with <8
26902   // elements, it adds a vector concatenation with undef(s). If we encounter
26903   // this here, we can skip the concat.
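  // e.g. v8i1 (concat_vectors (v4i1 X), (v4i1 undef)): only X carries data,
  // so the bitmask can be built from X alone.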
26904   if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
26905     bool AllUndef = true;
26906     for (unsigned I = 1; I < Op.getNumOperands(); ++I)
26907       AllUndef &= Op.getOperand(I).isUndef();
26908 
26909     if (AllUndef)
26910       Op = Op.getOperand(0);
26911   }
26912 
26913   SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
26914   if (VectorBits)
26915     Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
26916 }
26917 
26918 static void CustomNonLegalBITCASTResults(SDNode *N,
26919                                          SmallVectorImpl<SDValue> &Results,
26920                                          SelectionDAG &DAG, EVT ExtendVT,
26921                                          EVT CastVT) {
26922   SDLoc DL(N);
26923   SDValue Op = N->getOperand(0);
26924   EVT VT = N->getValueType(0);
26925 
26926   // Use SCALAR_TO_VECTOR for lane zero
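  // e.g. for v2i16 (bitcast i32): place the i32 in lane 0 of a v2i32, bitcast
  // that to v4i16, then extract the low v2i16 subvector.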
26927   SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
26928   SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
26929   SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
26930   Results.push_back(
26931       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
26932 }
26933 
26934 void AArch64TargetLowering::ReplaceBITCASTResults(
26935     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
26936   SDLoc DL(N);
26937   SDValue Op = N->getOperand(0);
26938   EVT VT = N->getValueType(0);
26939   EVT SrcVT = Op.getValueType();
26940 
26941   if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
26942     CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
26943     return;
26944   }
26945 
26946   if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
26947     CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
26948     return;
26949   }
26950 
26951   if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
26952     CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
26953     return;
26954   }
26955 
26956   if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
26957     assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
26958            "Expected fp->int bitcast!");
26959 
26960     // Bitcasting between unpacked vector types of different element counts is
26961     // not a NOP because the live elements are laid out differently.
26962     //                01234567
26963     // e.g. nxv2i32 = XX??XX??
26964     //      nxv4f16 = X?X?X?X?
26965     if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
26966       return;
26967 
26968     SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
26969     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
26970     return;
26971   }
26972 
26973   if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
26974       !VT.isVector())
26975     return replaceBoolVectorBitcast(N, Results, DAG);
26976 
26977   if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
26978     return;
26979 
26980   Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
26981                                  DAG.getUNDEF(MVT::i32), Op);
26982   Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
26983   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
26984 }
26985 
26986 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
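// Replace a 256-bit (f)add of X and (shuffle X, <1,0,3,2,...>), where each
// lane is added to its neighbour, with an ADDP of the two 128-bit halves of
// X followed by a shuffle that repeats each pairwise sum in adjacent lanes.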
26987                                SelectionDAG &DAG,
26988                                const AArch64Subtarget *Subtarget) {
26989   EVT VT = N->getValueType(0);
26990   if (!VT.is256BitVector() ||
26991       (VT.getScalarType().isFloatingPoint() &&
26992        !N->getFlags().hasAllowReassociation()) ||
26993       (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
26994       VT.getScalarType() == MVT::bf16)
26995     return;
26996 
26997   SDValue X = N->getOperand(0);
26998   auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
26999   if (!Shuf) {
27000     Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
27001     X = N->getOperand(1);
27002     if (!Shuf)
27003       return;
27004   }
27005 
27006   if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
27007     return;
27008 
27009   // Check the mask is 1,0,3,2,5,4,...
27010   ArrayRef<int> Mask = Shuf->getMask();
27011   for (int I = 0, E = Mask.size(); I < E; I++)
27012     if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27013       return;
27014 
27015   SDLoc DL(N);
27016   auto LoHi = DAG.SplitVector(X, DL);
27017   assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27018   SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
27019                              LoHi.first, LoHi.second);
27020 
27021   // Shuffle the elements back into order.
27022   SmallVector<int> NMask;
27023   for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27024     NMask.push_back(I);
27025     NMask.push_back(I);
27026   }
27027   Results.push_back(
27028       DAG.getVectorShuffle(VT, DL,
27029                            DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
27030                                        DAG.getUNDEF(LoHi.first.getValueType())),
27031                            DAG.getUNDEF(VT), NMask));
27032 }
27033 
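// Split the vector operand in half, combine the halves with InterOp (e.g.
// ISD::ADD), then reduce with the across-vector node AcrossOp (e.g. SADDV).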
27034 static void ReplaceReductionResults(SDNode *N,
27035                                     SmallVectorImpl<SDValue> &Results,
27036                                     SelectionDAG &DAG, unsigned InterOp,
27037                                     unsigned AcrossOp) {
27038   EVT LoVT, HiVT;
27039   SDValue Lo, Hi;
27040   SDLoc dl(N);
27041   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
27042   std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
27043   SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
27044   SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
27045   Results.push_back(SplitVal);
27046 }
27047 
27048 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27049     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27050   SDValue In = N->getOperand(0);
27051   EVT InVT = In.getValueType();
27052 
27053   // Common code will handle these just fine.
27054   if (!InVT.isScalableVector() || !InVT.isInteger())
27055     return;
27056 
27057   SDLoc DL(N);
27058   EVT VT = N->getValueType(0);
27059 
27060   // The following checks bail if this is not a halving operation.
27061 
27062   ElementCount ResEC = VT.getVectorElementCount();
27063 
27064   if (InVT.getVectorElementCount() != (ResEC * 2))
27065     return;
27066 
27067   auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
27068   if (!CIndex)
27069     return;
27070 
27071   unsigned Index = CIndex->getZExtValue();
27072   if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27073     return;
27074 
27075   unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
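  // Unpack the requested half into a vector with double-width elements
  // (UUNPKLO selects the low half, UUNPKHI the high half), then truncate
  // back to the original element type.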
27076   EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27077 
27078   SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
27079   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
27080 }
27081 
27082 // Create an even/odd pair of X registers holding integer value V.
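// The pair is built with REG_SEQUENCE (sube64 = even register of the pair,
// subo64 = odd register) so CASP can consume it as a single XSeqPairs operand.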
27083 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
27084   SDLoc dl(V.getNode());
27085   auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
27086   if (DAG.getDataLayout().isBigEndian())
27087     std::swap(VLo, VHi);
27088   SDValue RegClass =
27089       DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
27090   SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
27091   SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
27092   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27093   return SDValue(
27094       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
27095 }
27096 
27097 static void ReplaceCMP_SWAP_128Results(SDNode *N,
27098                                        SmallVectorImpl<SDValue> &Results,
27099                                        SelectionDAG &DAG,
27100                                        const AArch64Subtarget *Subtarget) {
27101   assert(N->getValueType(0) == MVT::i128 &&
27102          "AtomicCmpSwap on types less than 128 should be legal");
27103 
27104   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27105   if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27106     // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27107     // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27108     SDValue Ops[] = {
27109         createGPRPairNode(DAG, N->getOperand(2)), // Compare value
27110         createGPRPairNode(DAG, N->getOperand(3)), // Store value
27111         N->getOperand(1), // Ptr
27112         N->getOperand(0), // Chain in
27113     };
27114 
27115     unsigned Opcode;
27116     switch (MemOp->getMergedOrdering()) {
27117     case AtomicOrdering::Monotonic:
27118       Opcode = AArch64::CASPX;
27119       break;
27120     case AtomicOrdering::Acquire:
27121       Opcode = AArch64::CASPAX;
27122       break;
27123     case AtomicOrdering::Release:
27124       Opcode = AArch64::CASPLX;
27125       break;
27126     case AtomicOrdering::AcquireRelease:
27127     case AtomicOrdering::SequentiallyConsistent:
27128       Opcode = AArch64::CASPALX;
27129       break;
27130     default:
27131       llvm_unreachable("Unexpected ordering!");
27132     }
27133 
27134     MachineSDNode *CmpSwap = DAG.getMachineNode(
27135         Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
27136     DAG.setNodeMemRefs(CmpSwap, {MemOp});
27137 
27138     unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27139     if (DAG.getDataLayout().isBigEndian())
27140       std::swap(SubReg1, SubReg2);
27141     SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
27142                                             SDValue(CmpSwap, 0));
27143     SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
27144                                             SDValue(CmpSwap, 0));
27145     Results.push_back(
27146         DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27147     Results.push_back(SDValue(CmpSwap, 1)); // Chain out
27148     return;
27149   }
27150 
27151   unsigned Opcode;
27152   switch (MemOp->getMergedOrdering()) {
27153   case AtomicOrdering::Monotonic:
27154     Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27155     break;
27156   case AtomicOrdering::Acquire:
27157     Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27158     break;
27159   case AtomicOrdering::Release:
27160     Opcode = AArch64::CMP_SWAP_128_RELEASE;
27161     break;
27162   case AtomicOrdering::AcquireRelease:
27163   case AtomicOrdering::SequentiallyConsistent:
27164     Opcode = AArch64::CMP_SWAP_128;
27165     break;
27166   default:
27167     llvm_unreachable("Unexpected ordering!");
27168   }
27169 
27170   SDLoc DL(N);
27171   auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
27172   auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
27173   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
27174                    New.first,        New.second,    N->getOperand(0)};
27175   SDNode *CmpSwap = DAG.getMachineNode(
27176       Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
27177       Ops);
27178   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
27179 
27180   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27181                                 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
27182   Results.push_back(SDValue(CmpSwap, 3));
27183 }
27184 
27185 static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27186                                        AtomicOrdering Ordering) {
27187   // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27188   // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27189   // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27190   // ATOMIC_LOAD_CLR at any point.
27191   assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27192          "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27193   assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27194   assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27195 
27196   if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27197     // The operand will need to be XORed in a separate step.
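    // LDCLRP atomically clears the bits that are set in its operand
    // (mem &= ~val), so ATOMIC_LOAD_AND needs the value inverted first; see
    // ReplaceATOMIC_LOAD_128Results.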
27198     switch (Ordering) {
27199     case AtomicOrdering::Monotonic:
27200       return AArch64::LDCLRP;
27201       break;
27202     case AtomicOrdering::Acquire:
27203       return AArch64::LDCLRPA;
27204       break;
27205     case AtomicOrdering::Release:
27206       return AArch64::LDCLRPL;
27207       break;
27208     case AtomicOrdering::AcquireRelease:
27209     case AtomicOrdering::SequentiallyConsistent:
27210       return AArch64::LDCLRPAL;
27211       break;
27212     default:
27213       llvm_unreachable("Unexpected ordering!");
27214     }
27215   }
27216 
27217   if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
27218     switch (Ordering) {
27219     case AtomicOrdering::Monotonic:
27220       return AArch64::LDSETP;
27221       break;
27222     case AtomicOrdering::Acquire:
27223       return AArch64::LDSETPA;
27224       break;
27225     case AtomicOrdering::Release:
27226       return AArch64::LDSETPL;
27227       break;
27228     case AtomicOrdering::AcquireRelease:
27229     case AtomicOrdering::SequentiallyConsistent:
27230       return AArch64::LDSETPAL;
27231       break;
27232     default:
27233       llvm_unreachable("Unexpected ordering!");
27234     }
27235   }
27236 
27237   if (ISDOpcode == ISD::ATOMIC_SWAP) {
27238     switch (Ordering) {
27239     case AtomicOrdering::Monotonic:
27240       return AArch64::SWPP;
27241       break;
27242     case AtomicOrdering::Acquire:
27243       return AArch64::SWPPA;
27244       break;
27245     case AtomicOrdering::Release:
27246       return AArch64::SWPPL;
27247       break;
27248     case AtomicOrdering::AcquireRelease:
27249     case AtomicOrdering::SequentiallyConsistent:
27250       return AArch64::SWPPAL;
27251       break;
27252     default:
27253       llvm_unreachable("Unexpected ordering!");
27254     }
27255   }
27256 
27257   llvm_unreachable("Unexpected ISDOpcode!");
27258 }
27259 
27260 static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
27261                                           SmallVectorImpl<SDValue> &Results,
27262                                           SelectionDAG &DAG,
27263                                           const AArch64Subtarget *Subtarget) {
27264   // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
27265   // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
27266   // rather than the CASP instructions, because CASP has register classes for
27267   // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
27268   // to present them as single operands. LSE128 instructions use the GPR64
27269   // register class (because the pair does not have to be sequential), like
27270   // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
27271 
27272   assert(N->getValueType(0) == MVT::i128 &&
27273          "AtomicLoadXXX on types less than 128 should be legal");
27274 
27275   if (!Subtarget->hasLSE128())
27276     return;
27277 
27278   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27279   const SDValue &Chain = N->getOperand(0);
27280   const SDValue &Ptr = N->getOperand(1);
27281   const SDValue &Val128 = N->getOperand(2);
27282   std::pair<SDValue, SDValue> Val2x64 =
27283       DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
27284 
27285   const unsigned ISDOpcode = N->getOpcode();
27286   const unsigned MachineOpcode =
27287       getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
27288 
27289   if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
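  // LDCLRP computes mem & ~val, so invert both halves of the value to get
  // the semantics of ATOMIC_LOAD_AND (mem & val).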
27290     SDLoc dl(Val128);
27291     Val2x64.first =
27292         DAG.getNode(ISD::XOR, dl, MVT::i64,
27293                     DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
27294     Val2x64.second =
27295         DAG.getNode(ISD::XOR, dl, MVT::i64,
27296                     DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
27297   }
27298 
27299   SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
27300   if (DAG.getDataLayout().isBigEndian())
27301     std::swap(Ops[0], Ops[1]);
27302 
27303   MachineSDNode *AtomicInst =
27304       DAG.getMachineNode(MachineOpcode, SDLoc(N),
27305                          DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
27306 
27307   DAG.setNodeMemRefs(AtomicInst, {MemOp});
27308 
27309   SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
27310   if (DAG.getDataLayout().isBigEndian())
27311     std::swap(Lo, Hi);
27312 
27313   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27314   Results.push_back(SDValue(AtomicInst, 2)); // Chain out
27315 }
27316 
27317 void AArch64TargetLowering::ReplaceNodeResults(
27318     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27319   switch (N->getOpcode()) {
27320   default:
27321     llvm_unreachable("Don't know how to custom expand this");
27322   case ISD::BITCAST:
27323     ReplaceBITCASTResults(N, Results, DAG);
27324     return;
27325   case ISD::VECREDUCE_ADD:
27326   case ISD::VECREDUCE_SMAX:
27327   case ISD::VECREDUCE_SMIN:
27328   case ISD::VECREDUCE_UMAX:
27329   case ISD::VECREDUCE_UMIN:
27330     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
27331     return;
27332   case ISD::VECTOR_COMPRESS:
27333     if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
27334       Results.push_back(Res);
27335     return;
27336   case ISD::ADD:
27337   case ISD::FADD:
27338     ReplaceAddWithADDP(N, Results, DAG, Subtarget);
27339     return;
27340 
27341   case ISD::CTPOP:
27342   case ISD::PARITY:
27343     if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
27344       Results.push_back(Result);
27345     return;
27346   case AArch64ISD::SADDV:
27347     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
27348     return;
27349   case AArch64ISD::UADDV:
27350     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
27351     return;
27352   case AArch64ISD::SMINV:
27353     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
27354     return;
27355   case AArch64ISD::UMINV:
27356     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
27357     return;
27358   case AArch64ISD::SMAXV:
27359     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
27360     return;
27361   case AArch64ISD::UMAXV:
27362     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
27363     return;
27364   case ISD::MULHS:
27365     if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
27366       Results.push_back(
27367           LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
27368     return;
27369   case ISD::MULHU:
27370     if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
27371       Results.push_back(
27372           LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
27373     return;
27374   case ISD::FP_TO_UINT:
27375   case ISD::FP_TO_SINT:
27376   case ISD::STRICT_FP_TO_SINT:
27377   case ISD::STRICT_FP_TO_UINT:
27378     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
27379     // Let normal code take care of it by not adding anything to Results.
27380     return;
27381   case ISD::ATOMIC_CMP_SWAP:
27382     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
27383     return;
27384   case ISD::ATOMIC_LOAD_CLR:
27385     assert(N->getValueType(0) != MVT::i128 &&
27386            "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
27387     break;
27388   case ISD::ATOMIC_LOAD_AND:
27389   case ISD::ATOMIC_LOAD_OR:
27390   case ISD::ATOMIC_SWAP: {
27391     assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
27392            "Expected 128-bit atomicrmw.");
27393     // These need custom type legalisation so we go directly to instruction.
27394     ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
27395     return;
27396   }
27397   case ISD::ATOMIC_LOAD:
27398   case ISD::LOAD: {
27399     MemSDNode *LoadNode = cast<MemSDNode>(N);
27400     EVT MemVT = LoadNode->getMemoryVT();
27401     // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
27402     // targets.
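    // e.g. a non-temporal v8i32 load becomes an LDNP that yields two v4i32
    // halves, which are concatenated back into a single v8i32 result.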
27403     if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
27404         MemVT.getSizeInBits() == 256u &&
27405         (MemVT.getScalarSizeInBits() == 8u ||
27406          MemVT.getScalarSizeInBits() == 16u ||
27407          MemVT.getScalarSizeInBits() == 32u ||
27408          MemVT.getScalarSizeInBits() == 64u)) {
27409 
27410       SDValue Result = DAG.getMemIntrinsicNode(
27411           AArch64ISD::LDNP, SDLoc(N),
27412           DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27413                          MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27414                          MVT::Other}),
27415           {LoadNode->getChain(), LoadNode->getBasePtr()},
27416           LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27417 
27418       SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
27419                                  Result.getValue(0), Result.getValue(1));
27420       Results.append({Pair, Result.getValue(2) /* Chain */});
27421       return;
27422     }
27423 
27424     if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
27425         LoadNode->getMemoryVT() != MVT::i128) {
27426       // Loads that are neither volatile nor atomic are optimized later in
27427       // AArch64's load/store optimizer.
27428       return;
27429     }
27430 
27431     if (SDValue(N, 0).getValueType() == MVT::i128) {
27432       auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
27433       bool isLoadAcquire =
27434           AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
27435       unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
27436 
27437       if (isLoadAcquire)
27438         assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
27439 
27440       SDValue Result = DAG.getMemIntrinsicNode(
27441           Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27442           {LoadNode->getChain(), LoadNode->getBasePtr()},
27443           LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27444 
27445       unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
27446 
27447       SDValue Pair =
27448           DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
27449                       Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
27450       Results.append({Pair, Result.getValue(2) /* Chain */});
27451     }
27452     return;
27453   }
27454   case ISD::EXTRACT_SUBVECTOR:
27455     ReplaceExtractSubVectorResults(N, Results, DAG);
27456     return;
27457   case ISD::INSERT_SUBVECTOR:
27458   case ISD::CONCAT_VECTORS:
27459     // Custom lowering has been requested for INSERT_SUBVECTOR and
27460     // CONCAT_VECTORS -- but delegate to common code for result type
27461     // legalisation
27462     return;
27463   case ISD::INTRINSIC_WO_CHAIN: {
27464     EVT VT = N->getValueType(0);
27465 
27466     Intrinsic::ID IntID =
27467         static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
27468     switch (IntID) {
27469     default:
27470       return;
27471     case Intrinsic::aarch64_sve_clasta_n: {
27472       assert((VT == MVT::i8 || VT == MVT::i16) &&
27473              "custom lowering for unexpected type");
27474       SDLoc DL(N);
27475       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27476       auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
27477                            N->getOperand(1), Op2, N->getOperand(3));
27478       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27479       return;
27480     }
27481     case Intrinsic::aarch64_sve_clastb_n: {
27482       assert((VT == MVT::i8 || VT == MVT::i16) &&
27483              "custom lowering for unexpected type");
27484       SDLoc DL(N);
27485       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27486       auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
27487                            N->getOperand(1), Op2, N->getOperand(3));
27488       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27489       return;
27490     }
27491     case Intrinsic::aarch64_sve_lasta: {
27492       assert((VT == MVT::i8 || VT == MVT::i16) &&
27493              "custom lowering for unexpected type");
27494       SDLoc DL(N);
27495       auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
27496                            N->getOperand(1), N->getOperand(2));
27497       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27498       return;
27499     }
27500     case Intrinsic::aarch64_sve_lastb: {
27501       assert((VT == MVT::i8 || VT == MVT::i16) &&
27502              "custom lowering for unexpected type");
27503       SDLoc DL(N);
27504       auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
27505                            N->getOperand(1), N->getOperand(2));
27506       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27507       return;
27508     }
27509     case Intrinsic::aarch64_sme_in_streaming_mode: {
27510       SDLoc DL(N);
27511       SDValue Chain = DAG.getEntryNode();
27512       SDValue RuntimePStateSM =
27513           getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
27514       Results.push_back(
27515           DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
27516       return;
27517     }
27518     case Intrinsic::experimental_vector_match:
27519     case Intrinsic::get_active_lane_mask: {
27520       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
27521         return;
27522 
27523       // NOTE: Only trivial type promotion is supported.
27524       EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
27525       if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
27526         return;
27527 
27528       SDLoc DL(N);
27529       auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
27530       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27531       return;
27532     }
27533     }
27534   }
27535   case ISD::READ_REGISTER: {
27536     SDLoc DL(N);
27537     assert(N->getValueType(0) == MVT::i128 &&
27538            "READ_REGISTER custom lowering is only for 128-bit sysregs");
27539     SDValue Chain = N->getOperand(0);
27540     SDValue SysRegName = N->getOperand(1);
27541 
27542     SDValue Result = DAG.getNode(
27543         AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27544         Chain, SysRegName);
27545 
27546     // Sysreg reads are not affected by endianness: Result.getValue(0) always
27547     // contains the lower half of the 128-bit system register value.
27548     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27549                                Result.getValue(0), Result.getValue(1));
27550     Results.push_back(Pair);
27551     Results.push_back(Result.getValue(2)); // Chain
27552     return;
27553   }
27554   }
27555 }
27556 
27557 bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
27558   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
27559     return TargetLowering::useLoadStackGuardNode(M);
27560   return true;
27561 }
27562 
27563 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
27564   // Combine multiple FDIVs with the same divisor into multiple FMULs by the
27565   // reciprocal if there are three or more FDIVs.
27566   return 3;
27567 }
27568 
27569 TargetLoweringBase::LegalizeTypeAction
27570 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
27571   // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
27572   // v4i16, v2i32 rather than promoting them.
27573   if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
27574       VT == MVT::v1f32)
27575     return TypeWidenVector;
27576 
27577   return TargetLoweringBase::getPreferredVectorAction(VT);
27578 }
27579 
27580 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
27581 // provided the address is 16-byte aligned.
27582 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
27583   if (!Subtarget->hasLSE2())
27584     return false;
27585 
27586   if (auto LI = dyn_cast<LoadInst>(I))
27587     return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27588            LI->getAlign() >= Align(16);
27589 
27590   if (auto SI = dyn_cast<StoreInst>(I))
27591     return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27592            SI->getAlign() >= Align(16);
27593 
27594   return false;
27595 }
27596 
27597 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
27598   if (!Subtarget->hasLSE128())
27599     return false;
27600 
27601   // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
27602   // will clobber the two registers.
27603   if (const auto *SI = dyn_cast<StoreInst>(I))
27604     return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27605            SI->getAlign() >= Align(16) &&
27606            (SI->getOrdering() == AtomicOrdering::Release ||
27607             SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
27608 
27609   if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
27610     return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27611            RMW->getAlign() >= Align(16) &&
27612            (RMW->getOperation() == AtomicRMWInst::Xchg ||
27613             RMW->getOperation() == AtomicRMWInst::And ||
27614             RMW->getOperation() == AtomicRMWInst::Or);
27615 
27616   return false;
27617 }
27618 
27619 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
27620   if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
27621     return false;
27622 
27623   if (auto LI = dyn_cast<LoadInst>(I))
27624     return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27625            LI->getAlign() >= Align(16) &&
27626            LI->getOrdering() == AtomicOrdering::Acquire;
27627 
27628   if (auto SI = dyn_cast<StoreInst>(I))
27629     return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27630            SI->getAlign() >= Align(16) &&
27631            SI->getOrdering() == AtomicOrdering::Release;
27632 
27633   return false;
27634 }
27635 
27636 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
27637     const Instruction *I) const {
27638   if (isOpSuitableForRCPC3(I))
27639     return false;
27640   if (isOpSuitableForLSE128(I))
27641     return false;
27642   if (isOpSuitableForLDPSTP(I))
27643     return true;
27644   return false;
27645 }
27646 
27647 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
27648     const Instruction *I) const {
27649   // Store-Release instructions only provide seq_cst guarantees when paired with
27650   // Load-Acquire instructions. MSVC CRT does not use these instructions to
27651   // implement seq_cst loads and stores, so we need additional explicit fences
27652   // after memory writes.
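  // For example, on an MSVC target a seq_cst store such as
  //   store atomic i64 %v, ptr %p seq_cst, align 8
  // is emitted as a store-release followed by an explicit trailing fence
  // (a DMB), rather than relying on STLR alone.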
27653   if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27654     return false;
27655 
27656   switch (I->getOpcode()) {
27657   default:
27658     return false;
27659   case Instruction::AtomicCmpXchg:
27660     return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
27661            AtomicOrdering::SequentiallyConsistent;
27662   case Instruction::AtomicRMW:
27663     return cast<AtomicRMWInst>(I)->getOrdering() ==
27664            AtomicOrdering::SequentiallyConsistent;
27665   case Instruction::Store:
27666     return cast<StoreInst>(I)->getOrdering() ==
27667            AtomicOrdering::SequentiallyConsistent;
27668   }
27669 }
27670 
27671 // Loads and stores less than 128-bits are already atomic; ones above that
27672 // are doomed anyway, so defer to the default libcall and blame the OS when
27673 // things go wrong.
27674 TargetLoweringBase::AtomicExpansionKind
27675 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27676   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
27677   if (Size != 128)
27678     return AtomicExpansionKind::None;
27679   if (isOpSuitableForRCPC3(SI))
27680     return AtomicExpansionKind::None;
27681   if (isOpSuitableForLSE128(SI))
27682     return AtomicExpansionKind::Expand;
27683   if (isOpSuitableForLDPSTP(SI))
27684     return AtomicExpansionKind::None;
27685   return AtomicExpansionKind::Expand;
27686 }
27687 
27688 // Loads and stores less than 128-bits are already atomic; ones above that
27689 // are doomed anyway, so defer to the default libcall and blame the OS when
27690 // things go wrong.
27691 TargetLowering::AtomicExpansionKind
27692 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27693   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
27694 
27695   if (Size != 128)
27696     return AtomicExpansionKind::None;
27697   if (isOpSuitableForRCPC3(LI))
27698     return AtomicExpansionKind::None;
27699   // No LSE128 loads
27700   if (isOpSuitableForLDPSTP(LI))
27701     return AtomicExpansionKind::None;
27702 
27703   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27704   // implement atomicrmw without spilling. If the target address is also on the
27705   // stack and close enough to the spill slot, this can lead to a situation
27706   // where the monitor always gets cleared and the atomic operation can never
27707   // succeed. So at -O0 lower this operation to a CAS loop.
27708   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27709     return AtomicExpansionKind::CmpXChg;
27710 
27711   // Using CAS for an atomic load has a better chance of succeeding under high
27712   // contention situations. So use it if available.
27713   return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
27714                              : AtomicExpansionKind::LLSC;
27715 }
27716 
27717 // Return true if the atomic operation expansion will lower to use a library
27718 // call, and is thus ineligible to use an LLSC expansion.
27719 static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
27720                                    const AtomicRMWInst *RMW) {
27721   if (!RMW->isFloatingPointOperation())
27722     return false;
27723   switch (RMW->getType()->getScalarType()->getTypeID()) {
27724   case Type::FloatTyID:
27725   case Type::DoubleTyID:
27726   case Type::HalfTyID:
27727   case Type::BFloatTyID:
27728     // Will use soft float
27729     return !Subtarget.hasFPARMv8();
27730   default:
27731     // fp128 will emit library calls.
27732     return true;
27733   }
27734 
27735   llvm_unreachable("covered type switch");
27736 }
27737 
27738 // The "default" for integer RMW operations is to expand to an LL/SC loop.
27739 // However, with the LSE instructions (or outline-atomics mode, which provides
27740 // library routines in place of the LSE-instructions), we can directly emit many
27741 // operations instead.
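// For example (illustrative IR), with +lse available
//   %old = atomicrmw add ptr %p, i32 %v seq_cst
// is returned as AtomicExpansionKind::None below and can then be selected
// directly to an atomic add (e.g. LDADDAL) instead of an LL/SC loop.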
27742 TargetLowering::AtomicExpansionKind
27743 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
27744   Type *Ty = AI->getType();
27745   unsigned Size = Ty->getPrimitiveSizeInBits();
27746   assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
27747 
27748   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
27749                       (AI->getOperation() == AtomicRMWInst::Xchg ||
27750                        AI->getOperation() == AtomicRMWInst::Or ||
27751                        AI->getOperation() == AtomicRMWInst::And);
27752   if (CanUseLSE128)
27753     return AtomicExpansionKind::None;
27754 
27755   // Nand is not supported in LSE.
27756   // Leave 128 bits to LLSC or CmpXChg.
27757   if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
27758       !AI->isFloatingPointOperation()) {
27759     if (Subtarget->hasLSE())
27760       return AtomicExpansionKind::None;
27761     if (Subtarget->outlineAtomics()) {
27762       // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
27763       // Don't outline them unless
27764       // (1) high level <atomic> support approved:
27765       //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
27766       // (2) low level libgcc and compiler-rt support implemented by:
27767       //   min/max outline atomics helpers
27768       if (AI->getOperation() != AtomicRMWInst::Min &&
27769           AI->getOperation() != AtomicRMWInst::Max &&
27770           AI->getOperation() != AtomicRMWInst::UMin &&
27771           AI->getOperation() != AtomicRMWInst::UMax) {
27772         return AtomicExpansionKind::None;
27773       }
27774     }
27775   }
27776 
27777   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27778   // implement atomicrmw without spilling. If the target address is also on the
27779   // stack and close enough to the spill slot, this can lead to a situation
27780   // where the monitor always gets cleared and the atomic operation can never
27781   // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
27782   // we have a single CAS instruction that can replace the loop.
27783   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
27784       Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
27785     return AtomicExpansionKind::CmpXChg;
27786 
27787   return AtomicExpansionKind::LLSC;
27788 }
27789 
27790 TargetLowering::AtomicExpansionKind
27791 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
27792     AtomicCmpXchgInst *AI) const {
27793   // If subtarget has LSE, leave cmpxchg intact for codegen.
27794   if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
27795     return AtomicExpansionKind::None;
27796   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27797   // implement cmpxchg without spilling. If the address being exchanged is also
27798   // on the stack and close enough to the spill slot, this can lead to a
27799   // situation where the monitor always gets cleared and the atomic operation
27800   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
27801   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27802     return AtomicExpansionKind::None;
27803 
27804   // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
27805   // it.
27806   unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
27807   if (Size > 64)
27808     return AtomicExpansionKind::None;
27809 
27810   return AtomicExpansionKind::LLSC;
27811 }
27812 
27813 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
27814                                              Type *ValueTy, Value *Addr,
27815                                              AtomicOrdering Ord) const {
27816   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27817   bool IsAcquire = isAcquireOrStronger(Ord);
27818 
27819   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
27820   // ldaxp intrinsics must return {i64, i64} and we have to recombine them into a
27821   // single i128 here.
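  // Illustrative shape of the IR built below for the acquire case (value
  // names are only for exposition; ldxp is used when no acquire is needed):
  //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
  //   %lo   = extractvalue { i64, i64 } %lohi, 0
  //   %hi   = extractvalue { i64, i64 } %lohi, 1
  // followed by zexts to i128, a shift of %hi by 64 and an or of both halves.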
27822   if (ValueTy->getPrimitiveSizeInBits() == 128) {
27823     Intrinsic::ID Int =
27824         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
27825 
27826     Value *LoHi =
27827         Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi");
27828 
27829     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
27830     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
27831 
27832     auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
27833     Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
27834     Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
27835 
27836     Value *Or = Builder.CreateOr(
27837         Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
27838     return Builder.CreateBitCast(Or, ValueTy);
27839   }
27840 
27841   Type *Tys[] = { Addr->getType() };
27842   Intrinsic::ID Int =
27843       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
27844 
27845   const DataLayout &DL = M->getDataLayout();
27846   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
27847   CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
27848   CI->addParamAttr(0, Attribute::get(Builder.getContext(),
27849                                      Attribute::ElementType, IntEltTy));
27850   Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
27851 
27852   return Builder.CreateBitCast(Trunc, ValueTy);
27853 }
27854 
27855 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
27856     IRBuilderBase &Builder) const {
27857   Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
27858 }
27859 
27860 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
27861                                                    Value *Val, Value *Addr,
27862                                                    AtomicOrdering Ord) const {
27863   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27864   bool IsRelease = isReleaseOrStronger(Ord);
27865 
27866   // Since the intrinsics must have legal type, the i128 intrinsics take two
27867   // parameters: "i64, i64". We must marshal Val into the appropriate form
27868   // before the call.
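  // Illustrative shape of the IR built below for the release case (value
  // names are only for exposition; stxp is used when no release is needed):
  //   %lo     = trunc i128 %val to i64
  //   %shr    = lshr i128 %val, 64
  //   %hi     = trunc i128 %shr to i64
  //   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)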
27869   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
27870     Intrinsic::ID Int =
27871         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
27872     Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
27873     Type *Int64Ty = Type::getInt64Ty(M->getContext());
27874     Type *Int128Ty = Type::getInt128Ty(M->getContext());
27875 
27876     Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
27877 
27878     Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
27879     Value *Hi =
27880         Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
27881     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
27882   }
27883 
27884   Intrinsic::ID Int =
27885       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
27886   Type *Tys[] = { Addr->getType() };
27887   Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
27888 
27889   const DataLayout &DL = M->getDataLayout();
27890   IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
27891   Val = Builder.CreateBitCast(Val, IntValTy);
27892 
27893   CallInst *CI = Builder.CreateCall(
27894       Stxr, {Builder.CreateZExtOrBitCast(
27895                  Val, Stxr->getFunctionType()->getParamType(0)),
27896              Addr});
27897   CI->addParamAttr(1, Attribute::get(Builder.getContext(),
27898                                      Attribute::ElementType, Val->getType()));
27899   return CI;
27900 }
27901 
27902 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
27903     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
27904     const DataLayout &DL) const {
27905   if (!Ty->isArrayTy()) {
27906     const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
27907     return TySize.isScalable() && TySize.getKnownMinValue() > 128;
27908   }
27909 
27910   // All non-aggregate members of the type must have the same type.
27911   SmallVector<EVT> ValueVTs;
27912   ComputeValueVTs(*this, DL, Ty, ValueVTs);
27913   return all_equal(ValueVTs);
27914 }
27915 
27916 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
27917                                                             EVT) const {
27918   return false;
27919 }
27920 
27921 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
27922   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
27923   Function *ThreadPointerFunc =
27924       Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer);
27925   return IRB.CreatePointerCast(
27926       IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
27927                              Offset),
27928       IRB.getPtrTy(0));
27929 }
27930 
27931 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
27932   // Android provides a fixed TLS slot for the stack cookie. See the definition
27933   // of TLS_SLOT_STACK_GUARD in
27934   // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
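  // In other words, the cookie is read from offset 0x28 relative to the
  // thread pointer (TPIDR_EL0) rather than from a global variable.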
27935   if (Subtarget->isTargetAndroid())
27936     return UseTlsOffset(IRB, 0x28);
27937 
27938   // Fuchsia is similar.
27939   // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
27940   if (Subtarget->isTargetFuchsia())
27941     return UseTlsOffset(IRB, -0x10);
27942 
27943   return TargetLowering::getIRStackGuard(IRB);
27944 }
27945 
27946 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
27947   // The MSVC CRT provides functionality for stack protection.
27948   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
27949     // MSVC CRT has a global variable holding security cookie.
27950     M.getOrInsertGlobal("__security_cookie",
27951                         PointerType::getUnqual(M.getContext()));
27952 
27953     // MSVC CRT has a function to validate security cookie.
27954     FunctionCallee SecurityCheckCookie =
27955         M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
27956                               Type::getVoidTy(M.getContext()),
27957                               PointerType::getUnqual(M.getContext()));
27958     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
27959       F->setCallingConv(CallingConv::Win64);
27960       F->addParamAttr(0, Attribute::AttrKind::InReg);
27961     }
27962     return;
27963   }
27964   TargetLowering::insertSSPDeclarations(M);
27965 }
27966 
27967 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
27968   // MSVC CRT has a global variable holding security cookie.
27969   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27970     return M.getGlobalVariable("__security_cookie");
27971   return TargetLowering::getSDagStackGuard(M);
27972 }
27973 
27974 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
27975   // MSVC CRT has a function to validate security cookie.
27976   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27977     return M.getFunction(Subtarget->getSecurityCheckCookieName());
27978   return TargetLowering::getSSPStackGuardCheck(M);
27979 }
27980 
27981 Value *
27982 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
27983   // Android provides a fixed TLS slot for the SafeStack pointer. See the
27984   // definition of TLS_SLOT_SAFESTACK in
27985   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
27986   if (Subtarget->isTargetAndroid())
27987     return UseTlsOffset(IRB, 0x48);
27988 
27989   // Fuchsia is similar.
27990   // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
27991   if (Subtarget->isTargetFuchsia())
27992     return UseTlsOffset(IRB, -0x8);
27993 
27994   return TargetLowering::getSafeStackPointerLocation(IRB);
27995 }
27996 
27997 /// If a physical register, this returns the register that receives the
27998 /// exception address on entry to an EH pad.
27999 Register AArch64TargetLowering::getExceptionPointerRegister(
28000     const Constant *PersonalityFn) const {
28001   // FIXME: This is a guess. Has this been defined yet?
28002   return AArch64::X0;
28003 }
28004 
28005 /// If a physical register, this returns the register that receives the
28006 /// exception typeid on entry to a landing pad.
28007 Register AArch64TargetLowering::getExceptionSelectorRegister(
28008     const Constant *PersonalityFn) const {
28009   // FIXME: This is a guess. Has this been defined yet?
28010   return AArch64::X1;
28011 }
28012 
28013 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
28014     const Instruction &AndI) const {
28015   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
28016   // this likely allows the and/cmp/br to be folded into a single tbz instruction. It
28017   // may be beneficial to sink in other cases, but we would have to check that
28018   // the cmp would not get folded into the br to form a cbz for these to be
28019   // beneficial.
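  // For example (illustrative IR), "%a = and i64 %x, 4" followed by
  // "%c = icmp eq i64 %a, 0" and a conditional branch can typically be
  // matched to a single tbz/tbnz testing bit 2 of %x.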
28020   ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
28021   if (!Mask)
28022     return false;
28023   return Mask->getValue().isPowerOf2();
28024 }
28025 
28026 bool AArch64TargetLowering::
28027     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28028         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
28029         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28030         SelectionDAG &DAG) const {
28031   // Does baseline recommend not to perform the fold by default?
28032   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28033           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28034     return false;
28035   // Else, if this is a vector shift, prefer 'shl'.
28036   return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28037 }
28038 
28039 TargetLowering::ShiftLegalizationStrategy
28040 AArch64TargetLowering::preferredShiftLegalizationStrategy(
28041     SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28042   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
28043       !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28044     return ShiftLegalizationStrategy::LowerToLibcall;
28045   return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
28046                                                             ExpansionFactor);
28047 }
28048 
28049 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
28050   // Update IsSplitCSR in AArch64FunctionInfo.
28051   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28052   AFI->setIsSplitCSR(true);
28053 }
28054 
28055 void AArch64TargetLowering::insertCopiesSplitCSR(
28056     MachineBasicBlock *Entry,
28057     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28058   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28059   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28060   if (!IStart)
28061     return;
28062 
28063   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28064   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28065   MachineBasicBlock::iterator MBBI = Entry->begin();
28066   for (const MCPhysReg *I = IStart; *I; ++I) {
28067     const TargetRegisterClass *RC = nullptr;
28068     if (AArch64::GPR64RegClass.contains(*I))
28069       RC = &AArch64::GPR64RegClass;
28070     else if (AArch64::FPR64RegClass.contains(*I))
28071       RC = &AArch64::FPR64RegClass;
28072     else
28073       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28074 
28075     Register NewVR = MRI->createVirtualRegister(RC);
28076     // Create copy from CSR to a virtual register.
28077     // FIXME: this currently does not emit CFI pseudo-instructions, it works
28078     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28079     // nounwind. If we want to generalize this later, we may need to emit
28080     // CFI pseudo-instructions.
28081     assert(Entry->getParent()->getFunction().hasFnAttribute(
28082                Attribute::NoUnwind) &&
28083            "Function should be nounwind in insertCopiesSplitCSR!");
28084     Entry->addLiveIn(*I);
28085     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
28086         .addReg(*I);
28087 
28088     // Insert the copy-back instructions right before the terminator.
28089     for (auto *Exit : Exits)
28090       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
28091               TII->get(TargetOpcode::COPY), *I)
28092           .addReg(NewVR);
28093   }
28094 }
28095 
28096 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
28097   // Integer division on AArch64 is expensive. However, when aggressively
28098   // optimizing for code size, we prefer to use a div instruction, as it is
28099   // usually smaller than the alternative sequence.
28100   // The exception to this is vector division. Since AArch64 doesn't have vector
28101   // integer division, leaving the division as-is is a loss even in terms of
28102   // size, because it will have to be scalarized, while the alternative code
28103   // sequence can be performed in vector form.
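  // For example, at minsize a scalar "sdiv i32 %a, 7" is kept as a single
  // SDIV instruction instead of being rewritten into the longer
  // multiply-by-magic-constant sequence.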
28104   bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
28105   return OptSize && !VT.isVector();
28106 }
28107 
28108 bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
28109                                              const MachineFunction &MF) const {
28110   // Avoid merging stores into fixed-length vectors when Neon is unavailable.
28111   // In future, we could allow this when SVE is available, but currently,
28112   // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
28113   // the general lowering may introduce stack spills/reloads).
28114   if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28115     return false;
28116 
28117   // Do not merge to float value size (128 bits) if no implicit float attribute
28118   // is set.
28119   bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
28120   return !NoFloat || MemVT.getSizeInBits() <= 64;
28121 }
28122 
28123 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
28124   // We want inc-of-add for scalars and sub-of-not for vectors.
28125   return VT.isScalarInteger();
28126 }
28127 
28128 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
28129                                                  EVT VT) const {
28130   // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
28131   // legalize.
28132   if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28133     return false;
28134   if (FPVT == MVT::v8bf16)
28135     return false;
28136   return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28137 }
28138 
28139 bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
28140   // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28141   // avoid vselect becoming bsl / unrolling.
28142   return !VT.isFixedLengthVector();
28143 }
28144 
28145 MachineInstr *
28146 AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
28147                                      MachineBasicBlock::instr_iterator &MBBI,
28148                                      const TargetInstrInfo *TII) const {
28149   assert(MBBI->isCall() && MBBI->getCFIType() &&
28150          "Invalid call instruction for a KCFI check");
28151 
28152   switch (MBBI->getOpcode()) {
28153   case AArch64::BLR:
28154   case AArch64::BLRNoIP:
28155   case AArch64::TCRETURNri:
28156   case AArch64::TCRETURNrix16x17:
28157   case AArch64::TCRETURNrix17:
28158   case AArch64::TCRETURNrinotx16:
28159     break;
28160   default:
28161     llvm_unreachable("Unexpected CFI call opcode");
28162   }
28163 
28164   MachineOperand &Target = MBBI->getOperand(0);
28165   assert(Target.isReg() && "Invalid target operand for an indirect call");
28166   Target.setIsRenamable(false);
28167 
28168   return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
28169       .addReg(Target.getReg())
28170       .addImm(MBBI->getCFIType())
28171       .getInstr();
28172 }
28173 
28174 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
28175   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28176 }
28177 
28178 unsigned
28179 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
28180   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28181     return getPointerTy(DL).getSizeInBits();
28182 
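  // The AAPCS64 va_list is a struct of three pointers and two 32-bit offsets,
  //   { ptr __stack, ptr __gr_top, ptr __vr_top, i32 __gr_offs, i32 __vr_offs }
  // which is where the size below comes from.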
28183   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28184 }
28185 
28186 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28187   MachineFrameInfo &MFI = MF.getFrameInfo();
28188   // If we have any vulnerable SVE stack objects then the stack protector
28189   // needs to be placed at the top of the SVE stack area, as the SVE locals
28190   // are placed above the other locals, so we allocate it as if it were a
28191   // scalable vector.
28192   // FIXME: It may be worthwhile having a specific interface for this rather
28193   // than doing it here in finalizeLowering.
28194   if (MFI.hasStackProtectorIndex()) {
28195     for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28196       if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
28197           MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
28198         MFI.setStackID(MFI.getStackProtectorIndex(),
28199                        TargetStackID::ScalableVector);
28200         MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
28201         break;
28202       }
28203     }
28204   }
28205   MFI.computeMaxCallFrameSize(MF);
28206   TargetLoweringBase::finalizeLowering(MF);
28207 }
28208 
28209 // Unlike X86, we let frame lowering assign offsets to all catch objects.
28210 bool AArch64TargetLowering::needsFixedCatchObjects() const {
28211   return false;
28212 }
28213 
28214 bool AArch64TargetLowering::shouldLocalize(
28215     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28216   auto &MF = *MI.getMF();
28217   auto &MRI = MF.getRegInfo();
28218   auto maxUses = [](unsigned RematCost) {
28219     // A cost of 1 means remats are basically free.
28220     if (RematCost == 1)
28221       return std::numeric_limits<unsigned>::max();
28222     if (RematCost == 2)
28223       return 2U;
28224 
28225     // Remat is too expensive, only sink if there's one user.
28226     if (RematCost > 2)
28227       return 1U;
28228     llvm_unreachable("Unexpected remat cost");
28229   };
28230 
28231   unsigned Opc = MI.getOpcode();
28232   switch (Opc) {
28233   case TargetOpcode::G_GLOBAL_VALUE: {
28234     // On Darwin, TLS global vars get selected into function calls, which
28235   // we don't want localized, as they can get moved into the middle of
28236   // another call sequence.
28237     const GlobalValue &GV = *MI.getOperand(1).getGlobal();
28238     if (GV.isThreadLocal() && Subtarget->isTargetMachO())
28239       return false;
28240     return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
28241   }
28242   case TargetOpcode::G_FCONSTANT:
28243   case TargetOpcode::G_CONSTANT: {
28244     const ConstantInt *CI;
28245     unsigned AdditionalCost = 0;
28246 
28247     if (Opc == TargetOpcode::G_CONSTANT)
28248       CI = MI.getOperand(1).getCImm();
28249     else {
28250       LLT Ty = MRI.getType(MI.getOperand(0).getReg());
28251       // We try to estimate cost of 32/64b fpimms, as they'll likely be
28252       // materialized as integers.
28253       if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
28254         break;
28255       auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
28256       bool OptForSize =
28257           MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
28258       if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
28259                        OptForSize))
28260         return true; // Constant should be cheap.
28261       CI =
28262           ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
28263       // FP materialization also costs an extra move, from gpr to fpr.
28264       AdditionalCost = 1;
28265     }
28266     APInt Imm = CI->getValue();
28267     InstructionCost Cost = TTI->getIntImmCost(
28268         Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
28269     assert(Cost.isValid() && "Expected a valid imm cost");
28270 
28271     unsigned RematCost = *Cost.getValue();
28272     RematCost += AdditionalCost;
28273     Register Reg = MI.getOperand(0).getReg();
28274     unsigned MaxUses = maxUses(RematCost);
28275     // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
28276     if (MaxUses == std::numeric_limits<unsigned>::max())
28277       --MaxUses;
28278     return MRI.hasAtMostUserInstrs(Reg, MaxUses);
28279   }
28280   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
28281   // localizable.
28282   case AArch64::ADRP:
28283   case AArch64::G_ADD_LOW:
28284   // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
28285   case TargetOpcode::G_PTR_ADD:
28286     return true;
28287   default:
28288     break;
28289   }
28290   return TargetLoweringBase::shouldLocalize(MI, TTI);
28291 }
28292 
28293 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
28294   // Fallback for scalable vectors.
28295   // Note that if EnableSVEGISel is true, we allow scalable vector types for
28296   // all instructions, regardless of whether they are actually supported.
28297   if (!EnableSVEGISel) {
28298     if (Inst.getType()->isScalableTy()) {
28299       return true;
28300     }
28301 
28302     for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
28303       if (Inst.getOperand(i)->getType()->isScalableTy())
28304         return true;
28305 
28306     if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
28307       if (AI->getAllocatedType()->isScalableTy())
28308         return true;
28309     }
28310   }
28311 
28312   // Checks to allow the use of SME instructions
28313   if (auto *Base = dyn_cast<CallBase>(&Inst)) {
28314     auto CallerAttrs = SMEAttrs(*Inst.getFunction());
28315     auto CalleeAttrs = SMEAttrs(*Base);
28316     if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
28317         CallerAttrs.requiresLazySave(CalleeAttrs) ||
28318         CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
28319         CallerAttrs.requiresPreservingAllZAState(CalleeAttrs))
28320       return true;
28321   }
28322   return false;
28323 }
28324 
28325 // Return the largest legal scalable vector type that matches VT's element type.
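// For example, any legal fixed-length i32 vector (v2i32, v4i32, v8i32, ...)
// maps to nxv4i32, and any legal f16 vector maps to nxv8f16.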
28326 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
28327   assert(VT.isFixedLengthVector() &&
28328          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28329          "Expected legal fixed length vector!");
28330   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28331   default:
28332     llvm_unreachable("unexpected element type for SVE container");
28333   case MVT::i8:
28334     return EVT(MVT::nxv16i8);
28335   case MVT::i16:
28336     return EVT(MVT::nxv8i16);
28337   case MVT::i32:
28338     return EVT(MVT::nxv4i32);
28339   case MVT::i64:
28340     return EVT(MVT::nxv2i64);
28341   case MVT::bf16:
28342     return EVT(MVT::nxv8bf16);
28343   case MVT::f16:
28344     return EVT(MVT::nxv8f16);
28345   case MVT::f32:
28346     return EVT(MVT::nxv4f32);
28347   case MVT::f64:
28348     return EVT(MVT::nxv2f64);
28349   }
28350 }
28351 
28352 // Return a PTRUE with active lanes corresponding to the extent of VT.
28353 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
28354                                                 EVT VT) {
28355   assert(VT.isFixedLengthVector() &&
28356          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28357          "Expected legal fixed length vector!");
28358 
28359   std::optional<unsigned> PgPattern =
28360       getSVEPredPatternFromNumElements(VT.getVectorNumElements());
28361   assert(PgPattern && "Unexpected element count for SVE predicate");
28362 
28363   // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
28364   // AArch64SVEPredPattern::all, which can enable the use of unpredicated
28365   // variants of instructions when available.
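  // For example (assuming -msve-vector-bits=256, so Min == Max == 256 below),
  // a 256-bit v8i32 operand gets the "all" pattern, whereas without that exact
  // size information the same vector would use "ptrue p<n>.s, vl8".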
28366   const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28367   unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28368   unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28369   if (MaxSVESize && MinSVESize == MaxSVESize &&
28370       MaxSVESize == VT.getSizeInBits())
28371     PgPattern = AArch64SVEPredPattern::all;
28372 
28373   MVT MaskVT;
28374   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28375   default:
28376     llvm_unreachable("unexpected element type for SVE predicate");
28377   case MVT::i8:
28378     MaskVT = MVT::nxv16i1;
28379     break;
28380   case MVT::i16:
28381   case MVT::f16:
28382   case MVT::bf16:
28383     MaskVT = MVT::nxv8i1;
28384     break;
28385   case MVT::i32:
28386   case MVT::f32:
28387     MaskVT = MVT::nxv4i1;
28388     break;
28389   case MVT::i64:
28390   case MVT::f64:
28391     MaskVT = MVT::nxv2i1;
28392     break;
28393   }
28394 
28395   return getPTrue(DAG, DL, MaskVT, *PgPattern);
28396 }
28397 
28398 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
28399                                              EVT VT) {
28400   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28401          "Expected legal scalable vector!");
28402   auto PredTy = VT.changeVectorElementType(MVT::i1);
28403   return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
28404 }
28405 
28406 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
28407   if (VT.isFixedLengthVector())
28408     return getPredicateForFixedLengthVector(DAG, DL, VT);
28409 
28410   return getPredicateForScalableVector(DAG, DL, VT);
28411 }
28412 
28413 // Grow V to consume an entire SVE register.
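// For example, a v4i32 value ends up occupying the low 128 bits of an nxv4i32
// register via an INSERT_SUBVECTOR at index 0; the remaining lanes are undef.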
28414 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28415   assert(VT.isScalableVector() &&
28416          "Expected to convert into a scalable vector!");
28417   assert(V.getValueType().isFixedLengthVector() &&
28418          "Expected a fixed length vector operand!");
28419   SDLoc DL(V);
28420   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28421   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
28422 }
28423 
28424 // Shrink V so it's just big enough to maintain a VT's worth of data.
28425 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28426   assert(VT.isFixedLengthVector() &&
28427          "Expected to convert into a fixed length vector!");
28428   assert(V.getValueType().isScalableVector() &&
28429          "Expected a scalable vector operand!");
28430   SDLoc DL(V);
28431   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28432   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
28433 }
28434 
28435 // Convert all fixed length vector loads larger than NEON to masked_loads.
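// For example (illustrative, assuming 256-bit SVE registers are being used for
// fixed-length lowering), a plain "load <8 x i32>, ptr %p" becomes a masked
// load of the nxv4i32 container whose governing predicate activates exactly
// the first eight .s lanes, and the result is extracted back to <8 x i32>.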
28436 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
28437     SDValue Op, SelectionDAG &DAG) const {
28438   auto Load = cast<LoadSDNode>(Op);
28439 
28440   SDLoc DL(Op);
28441   EVT VT = Op.getValueType();
28442   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28443   EVT LoadVT = ContainerVT;
28444   EVT MemVT = Load->getMemoryVT();
28445 
28446   auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28447 
28448   if (VT.isFloatingPoint()) {
28449     LoadVT = ContainerVT.changeTypeToInteger();
28450     MemVT = MemVT.changeTypeToInteger();
28451   }
28452 
28453   SDValue NewLoad = DAG.getMaskedLoad(
28454       LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
28455       DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
28456       Load->getAddressingMode(), Load->getExtensionType());
28457 
28458   SDValue Result = NewLoad;
28459   if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
28460     EVT ExtendVT = ContainerVT.changeVectorElementType(
28461         Load->getMemoryVT().getVectorElementType());
28462 
28463     Result = getSVESafeBitCast(ExtendVT, Result, DAG);
28464     Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
28465                          Pg, Result, DAG.getUNDEF(ContainerVT));
28466   } else if (VT.isFloatingPoint()) {
28467     Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
28468   }
28469 
28470   Result = convertFromScalableVector(DAG, VT, Result);
28471   SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28472   return DAG.getMergeValues(MergedValues, DL);
28473 }
28474 
28475 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
28476                                                 SelectionDAG &DAG) {
28477   SDLoc DL(Mask);
28478   EVT InVT = Mask.getValueType();
28479   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28480 
28481   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
28482 
28483   if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28484     return Pg;
28485 
28486   auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
28487   auto Op2 = DAG.getConstant(0, DL, ContainerVT);
28488 
28489   return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
28490                      {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
28491 }
28492 
28493 // Convert fixed length vector masked loads larger than NEON to SVE masked loads.
28494 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
28495     SDValue Op, SelectionDAG &DAG) const {
28496   auto Load = cast<MaskedLoadSDNode>(Op);
28497 
28498   SDLoc DL(Op);
28499   EVT VT = Op.getValueType();
28500   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28501 
28502   SDValue Mask = Load->getMask();
28503   // If this is an extending load and the mask type is not the same as
28504   // the load's type, then we have to extend the mask type.
28505   if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
28506     assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
28507            "Incorrect mask type");
28508     Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
28509   }
28510   Mask = convertFixedMaskToScalableVector(Mask, DAG);
28511 
28512   SDValue PassThru;
28513   bool IsPassThruZeroOrUndef = false;
28514 
28515   if (Load->getPassThru()->isUndef()) {
28516     PassThru = DAG.getUNDEF(ContainerVT);
28517     IsPassThruZeroOrUndef = true;
28518   } else {
28519     if (ContainerVT.isInteger())
28520       PassThru = DAG.getConstant(0, DL, ContainerVT);
28521     else
28522       PassThru = DAG.getConstantFP(0, DL, ContainerVT);
28523     if (isZerosVector(Load->getPassThru().getNode()))
28524       IsPassThruZeroOrUndef = true;
28525   }
28526 
28527   SDValue NewLoad = DAG.getMaskedLoad(
28528       ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
28529       Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
28530       Load->getAddressingMode(), Load->getExtensionType());
28531 
28532   SDValue Result = NewLoad;
28533   if (!IsPassThruZeroOrUndef) {
28534     SDValue OldPassThru =
28535         convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
28536     Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
28537   }
28538 
28539   Result = convertFromScalableVector(DAG, VT, Result);
28540   SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28541   return DAG.getMergeValues(MergedValues, DL);
28542 }
28543 
28544 // Convert all fixed length vector stores larger than NEON to masked_stores.
28545 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
28546     SDValue Op, SelectionDAG &DAG) const {
28547   auto Store = cast<StoreSDNode>(Op);
28548 
28549   SDLoc DL(Op);
28550   EVT VT = Store->getValue().getValueType();
28551   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28552   EVT MemVT = Store->getMemoryVT();
28553 
28554   auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28555   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28556 
28557   if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
28558     EVT TruncVT = ContainerVT.changeVectorElementType(
28559         Store->getMemoryVT().getVectorElementType());
28560     MemVT = MemVT.changeTypeToInteger();
28561     NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
28562                            NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
28563                            DAG.getUNDEF(TruncVT));
28564     NewValue =
28565         getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28566   } else if (VT.isFloatingPoint()) {
28567     MemVT = MemVT.changeTypeToInteger();
28568     NewValue =
28569         getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28570   }
28571 
28572   return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
28573                             Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
28574                             Store->getMemOperand(), Store->getAddressingMode(),
28575                             Store->isTruncatingStore());
28576 }
28577 
28578 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
28579     SDValue Op, SelectionDAG &DAG) const {
28580   auto *Store = cast<MaskedStoreSDNode>(Op);
28581 
28582   SDLoc DL(Op);
28583   EVT VT = Store->getValue().getValueType();
28584   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28585 
28586   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28587   SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
28588 
28589   return DAG.getMaskedStore(
28590       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
28591       Mask, Store->getMemoryVT(), Store->getMemOperand(),
28592       Store->getAddressingMode(), Store->isTruncatingStore());
28593 }
28594 
28595 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
28596     SDValue Op, SelectionDAG &DAG) const {
28597   SDLoc dl(Op);
28598   EVT VT = Op.getValueType();
28599   EVT EltVT = VT.getVectorElementType();
28600 
28601   bool Signed = Op.getOpcode() == ISD::SDIV;
28602   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
28603 
28604   bool Negated;
28605   uint64_t SplatVal;
28606   if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
28607     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28608     SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
28609     SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
28610 
28611     SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
28612     SDValue Res =
28613         DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
28614     if (Negated)
28615       Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
28616                         DAG.getConstant(0, dl, ContainerVT), Res);
28617 
28618     return convertFromScalableVector(DAG, VT, Res);
28619   }
28620 
28621   // Scalable vector i32/i64 DIV is supported.
28622   if (EltVT == MVT::i32 || EltVT == MVT::i64)
28623     return LowerToPredicatedOp(Op, DAG, PredOpcode);
28624 
28625   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
28626   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
28627   EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
28628   unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28629 
28630   // If the wider type is legal: extend, op, and truncate.
28631   EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28632   if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
28633     SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
28634     SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
28635     SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
28636     return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
28637   }
28638 
28639   auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
28640                                &ExtendOpcode](SDValue Op) {
28641     SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
28642     SDValue IdxHalf =
28643         DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
28644     SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
28645     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
28646     return std::pair<SDValue, SDValue>(
28647         {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
28648          DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
28649   };
28650 
28651   // If wider type is not legal: split, extend, op, trunc and concat.
28652   auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
28653   auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
28654   SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
28655   SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
28656   SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
28657   SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
28658   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
28659 }
28660 
28661 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
28662     SDValue Op, SelectionDAG &DAG) const {
28663   EVT VT = Op.getValueType();
28664   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28665 
28666   SDLoc DL(Op);
28667   SDValue Val = Op.getOperand(0);
28668   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28669   Val = convertToScalableVector(DAG, ContainerVT, Val);
28670 
28671   bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
28672   unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
28673 
28674   // Repeatedly unpack Val until the result is of the desired element type.
28675   switch (ContainerVT.getSimpleVT().SimpleTy) {
28676   default:
28677     llvm_unreachable("unimplemented container type");
28678   case MVT::nxv16i8:
28679     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
28680     if (VT.getVectorElementType() == MVT::i16)
28681       break;
28682     [[fallthrough]];
28683   case MVT::nxv8i16:
28684     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
28685     if (VT.getVectorElementType() == MVT::i32)
28686       break;
28687     [[fallthrough]];
28688   case MVT::nxv4i32:
28689     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
28690     assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
28691     break;
28692   }
28693 
28694   return convertFromScalableVector(DAG, VT, Val);
28695 }
28696 
28697 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
28698     SDValue Op, SelectionDAG &DAG) const {
28699   EVT VT = Op.getValueType();
28700   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28701 
28702   SDLoc DL(Op);
28703   SDValue Val = Op.getOperand(0);
28704   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28705   Val = convertToScalableVector(DAG, ContainerVT, Val);
28706 
28707   // Repeatedly truncate Val until the result is of the desired element type.
28708   switch (ContainerVT.getSimpleVT().SimpleTy) {
28709   default:
28710     llvm_unreachable("unimplemented container type");
28711   case MVT::nxv2i64:
28712     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
28713     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
28714     if (VT.getVectorElementType() == MVT::i32)
28715       break;
28716     [[fallthrough]];
28717   case MVT::nxv4i32:
28718     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
28719     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
28720     if (VT.getVectorElementType() == MVT::i16)
28721       break;
28722     [[fallthrough]];
28723   case MVT::nxv8i16:
28724     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
28725     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
28726     assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
28727     break;
28728   }
28729 
28730   return convertFromScalableVector(DAG, VT, Val);
28731 }
28732 
28733 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
28734     SDValue Op, SelectionDAG &DAG) const {
28735   EVT VT = Op.getValueType();
28736   EVT InVT = Op.getOperand(0).getValueType();
28737   assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
28738 
28739   SDLoc DL(Op);
28740   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28741   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28742 
28743   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
28744 }
28745 
28746 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
28747     SDValue Op, SelectionDAG &DAG) const {
28748   EVT VT = Op.getValueType();
28749   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28750 
28751   SDLoc DL(Op);
28752   EVT InVT = Op.getOperand(0).getValueType();
28753   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28754   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28755 
28756   auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
28757                                  Op.getOperand(1), Op.getOperand(2));
28758 
28759   return convertFromScalableVector(DAG, VT, ScalableRes);
28760 }
28761 
28762 // Convert vector operation 'Op' to an equivalent predicated operation whereby
28763 // the original operation's type is used to construct a suitable predicate.
28764 // NOTE: The results for inactive lanes are undefined.
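// For example, the integer divide lowering above reaches here with
// AArch64ISD::SDIV_PRED / UDIV_PRED: an ISD::SDIV on nxv4i32 becomes SDIV_PRED
// with an all-active nxv4i1 predicate prepended to the operand list, and
// fixed-length operands are first widened to their SVE container type.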
28765 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
28766                                                    SelectionDAG &DAG,
28767                                                    unsigned NewOp) const {
28768   EVT VT = Op.getValueType();
28769   SDLoc DL(Op);
28770   auto Pg = getPredicateForVector(DAG, DL, VT);
28771 
28772   if (VT.isFixedLengthVector()) {
28773     assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
28774     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28775 
28776     // Create list of operands by converting existing ones to scalable types.
28777     SmallVector<SDValue, 4> Operands = {Pg};
28778     for (const SDValue &V : Op->op_values()) {
28779       if (isa<CondCodeSDNode>(V)) {
28780         Operands.push_back(V);
28781         continue;
28782       }
28783 
28784       if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
28785         EVT VTArg = VTNode->getVT().getVectorElementType();
28786         EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
28787         Operands.push_back(DAG.getValueType(NewVTArg));
28788         continue;
28789       }
28790 
28791       assert(isTypeLegal(V.getValueType()) &&
28792              "Expected only legal fixed-width types");
28793       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
28794     }
28795 
28796     if (isMergePassthruOpcode(NewOp))
28797       Operands.push_back(DAG.getUNDEF(ContainerVT));
28798 
28799     auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
28800     return convertFromScalableVector(DAG, VT, ScalableRes);
28801   }
28802 
28803   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
28804 
28805   SmallVector<SDValue, 4> Operands = {Pg};
28806   for (const SDValue &V : Op->op_values()) {
28807     assert((!V.getValueType().isVector() ||
28808             V.getValueType().isScalableVector()) &&
28809            "Only scalable vectors are supported!");
28810     Operands.push_back(V);
28811   }
28812 
28813   if (isMergePassthruOpcode(NewOp))
28814     Operands.push_back(DAG.getUNDEF(VT));
28815 
28816   return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
28817 }
28818 
28819 // If a fixed length vector operation has no side effects when applied to
28820 // undefined elements, we can safely use scalable vectors to perform the same
28821 // operation without needing to worry about predication.
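// For example, a bitwise AND of two v2i64 values can simply be performed as an
// unpredicated AND on nxv2i64 containers: whatever is computed in the lanes
// beyond the original fixed width is never observed once the result has been
// extracted back down to v2i64.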
28822 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
28823                                                  SelectionDAG &DAG) const {
28824   EVT VT = Op.getValueType();
28825   assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
28826          "Only expected to lower fixed length vector operation!");
28827   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28828 
28829   // Create list of operands by converting existing ones to scalable types.
28830   SmallVector<SDValue, 4> Ops;
28831   for (const SDValue &V : Op->op_values()) {
28832     assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
28833 
28834     // Pass through non-vector operands.
28835     if (!V.getValueType().isVector()) {
28836       Ops.push_back(V);
28837       continue;
28838     }
28839 
28840     // "cast" fixed length vector to a scalable vector.
28841     assert(V.getValueType().isFixedLengthVector() &&
28842            isTypeLegal(V.getValueType()) &&
28843            "Only fixed length vectors are supported!");
28844     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
28845   }
28846 
28847   auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
28848   return convertFromScalableVector(DAG, VT, ScalableRes);
28849 }
28850 
28851 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
28852     SelectionDAG &DAG) const {
28853   SDLoc DL(ScalarOp);
28854   SDValue AccOp = ScalarOp.getOperand(0);
28855   SDValue VecOp = ScalarOp.getOperand(1);
28856   EVT SrcVT = VecOp.getValueType();
28857   EVT ResVT = SrcVT.getVectorElementType();
28858 
28859   EVT ContainerVT = SrcVT;
28860   if (SrcVT.isFixedLengthVector()) {
28861     ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
28862     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
28863   }
28864 
28865   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
28866   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28867 
28868   // Convert operands to Scalable.
28869   AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
28870                       DAG.getUNDEF(ContainerVT), AccOp, Zero);
28871 
28872   // Perform reduction.
28873   SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
28874                             Pg, AccOp, VecOp);
28875 
28876   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
28877 }
28878 
28879 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
28880                                                        SelectionDAG &DAG) const {
28881   SDLoc DL(ReduceOp);
28882   SDValue Op = ReduceOp.getOperand(0);
28883   EVT OpVT = Op.getValueType();
28884   EVT VT = ReduceOp.getValueType();
28885 
28886   if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
28887     return SDValue();
28888 
28889   SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
28890 
28891   switch (ReduceOp.getOpcode()) {
28892   default:
28893     return SDValue();
28894   case ISD::VECREDUCE_OR:
28895     if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
28896       // The predicate can be 'Op' because
28897       // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
28898       return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
28899     else
28900       return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
28901   case ISD::VECREDUCE_AND: {
28902     Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
28903     return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
28904   }
28905   case ISD::VECREDUCE_XOR: {
28906     SDValue ID =
28907         DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
28908     if (OpVT == MVT::nxv1i1) {
28909       // Emulate a CNTP on .Q using .D and a different governing predicate.
28910       Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
28911       Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
28912     }
28913     SDValue Cntp =
28914         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
28915     return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
28916   }
28917   }
28918 
28919   return SDValue();
28920 }
28921 
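// Lower a vector reduction to a predicated SVE reduction node. Fixed-length
// inputs are first widened to their scalable container type. VECREDUCE_ADD of
// a zero-extended i1 vector is instead lowered to a CNTP count of active
// lanes, and UADDV_PRED always produces an i64 result which is truncated back
// to the expected element type when necessary.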
28922 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
28923                                                    SDValue ScalarOp,
28924                                                    SelectionDAG &DAG) const {
28925   SDLoc DL(ScalarOp);
28926   SDValue VecOp = ScalarOp.getOperand(0);
28927   EVT SrcVT = VecOp.getValueType();
28928 
28929   if (useSVEForFixedLengthVectorVT(
28930           SrcVT,
28931           /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
28932     EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
28933     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
28934   }
28935 
28936   // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
28937   if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
28938       VecOp.getOpcode() == ISD::ZERO_EXTEND) {
28939     SDValue BoolVec = VecOp.getOperand(0);
28940     if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
28941       // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
28942       SDValue CntpOp = DAG.getNode(
28943           ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
28944           DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
28945           BoolVec, BoolVec);
28946       return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
28947     }
28948   }
28949 
28950   // UADDV always returns an i64 result.
28951   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
28952                                                    SrcVT.getVectorElementType();
28953   EVT RdxVT = SrcVT;
28954   if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
28955     RdxVT = getPackedSVEVectorVT(ResVT);
28956 
28957   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
28958   SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
28959   SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
28960                             Rdx, DAG.getConstant(0, DL, MVT::i64));
28961 
28962   // The VECREDUCE nodes expect an element-sized result.
28963   if (ResVT != ScalarOp.getValueType())
28964     Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
28965 
28966   return Res;
28967 }
28968 
28969 SDValue
28970 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
28971     SelectionDAG &DAG) const {
28972   EVT VT = Op.getValueType();
28973   SDLoc DL(Op);
28974 
28975   EVT InVT = Op.getOperand(1).getValueType();
28976   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28977   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
28978   SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
28979 
28980   // Convert the mask to a predicate (NOTE: We don't need to worry about
28981   // inactive lanes since VSELECT is safe when given undefined elements).
28982   EVT MaskVT = Op.getOperand(0).getValueType();
28983   EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
28984   auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
28985   Mask = DAG.getNode(ISD::TRUNCATE, DL,
28986                      MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
28987 
28988   auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
28989                                 Mask, Op1, Op2);
28990 
28991   return convertFromScalableVector(DAG, VT, ScalableRes);
28992 }
28993 
28994 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
28995     SDValue Op, SelectionDAG &DAG) const {
28996   SDLoc DL(Op);
28997   EVT InVT = Op.getOperand(0).getValueType();
28998   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28999 
29000   assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29001          "Only expected to lower fixed length vector operation!");
29002   assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29003          "Expected integer result of the same bit length as the inputs!");
29004 
29005   auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29006   auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
29007   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29008 
29009   EVT CmpVT = Pg.getValueType();
29010   auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
29011                          {Pg, Op1, Op2, Op.getOperand(2)});
29012 
29013   EVT PromoteVT = ContainerVT.changeTypeToInteger();
29014   auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
29015   return convertFromScalableVector(DAG, Op.getValueType(), Promote);
29016 }
29017 
29018 SDValue
29019 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29020                                                     SelectionDAG &DAG) const {
29021   SDLoc DL(Op);
29022   auto SrcOp = Op.getOperand(0);
29023   EVT VT = Op.getValueType();
29024   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29025   EVT ContainerSrcVT =
29026       getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
29027 
29028   SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
29029   Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
29030   return convertFromScalableVector(DAG, VT, Op);
29031 }
29032 
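// Lower a fixed-length CONCAT_VECTORS to SVE SPLICE: under a predicate
// covering the fixed-length part, SPLICE appends the second operand to the
// active elements of the first. Concats of more than two operands are first
// reduced pairwise, e.g. a 4-way v4i32 concat becomes two v8i32 concats
// followed by a final v16i32 concat.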
29033 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29034     SDValue Op, SelectionDAG &DAG) const {
29035   SDLoc DL(Op);
29036   unsigned NumOperands = Op->getNumOperands();
29037 
29038   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29039          "Unexpected number of operands in CONCAT_VECTORS");
29040 
29041   auto SrcOp1 = Op.getOperand(0);
29042   auto SrcOp2 = Op.getOperand(1);
29043   EVT VT = Op.getValueType();
29044   EVT SrcVT = SrcOp1.getValueType();
29045 
29046   if (NumOperands > 2) {
29047     SmallVector<SDValue, 4> Ops;
29048     EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
29049     for (unsigned I = 0; I < NumOperands; I += 2)
29050       Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
29051                                 Op->getOperand(I), Op->getOperand(I + 1)));
29052 
29053     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
29054   }
29055 
29056   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29057 
29058   SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
29059   SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
29060   SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
29061 
29062   Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
29063 
29064   return convertFromScalableVector(DAG, VT, Op);
29065 }
29066 
29067 SDValue
29068 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29069                                                      SelectionDAG &DAG) const {
29070   EVT VT = Op.getValueType();
29071   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29072 
29073   SDLoc DL(Op);
29074   SDValue Val = Op.getOperand(0);
29075   SDValue Pg = getPredicateForVector(DAG, DL, VT);
29076   EVT SrcVT = Val.getValueType();
29077   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29078   EVT ExtendVT = ContainerVT.changeVectorElementType(
29079       SrcVT.getVectorElementType());
29080 
29081   Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29082   Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
29083 
29084   Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
29085   Val = getSVESafeBitCast(ExtendVT, Val, DAG);
29086   Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29087                     Pg, Val, DAG.getUNDEF(ContainerVT));
29088 
29089   return convertFromScalableVector(DAG, VT, Val);
29090 }
29091 
29092 SDValue
29093 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29094                                                     SelectionDAG &DAG) const {
29095   EVT VT = Op.getValueType();
29096   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29097 
29098   SDLoc DL(Op);
29099   SDValue Val = Op.getOperand(0);
29100   EVT SrcVT = Val.getValueType();
29101   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29102   EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29103       VT.getVectorElementType());
29104   SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
29105 
29106   Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29107   Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
29108                     Op.getOperand(1), DAG.getUNDEF(RoundVT));
29109   Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
29110   Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29111 
29112   Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29113   return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29114 }
29115 
29116 SDValue
29117 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29118                                                     SelectionDAG &DAG) const {
29119   EVT VT = Op.getValueType();
29120   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29121 
29122   bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29123   unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29124                              : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
29125 
29126   SDLoc DL(Op);
29127   SDValue Val = Op.getOperand(0);
29128   EVT SrcVT = Val.getValueType();
29129   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29130   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29131 
29132   if (VT.bitsGE(SrcVT)) {
29133     SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29134 
29135     Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29136                       VT.changeTypeToInteger(), Val);
29137 
29138     // It is safe to use a larger-than-specified operand because promoting the
29139     // value changes nothing from an arithmetic point of view.
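    // For example, for a v4i16 -> v4f32 conversion the input is first
    // sign/zero extended to v4i32 and the conversion is then performed as
    // nxv4i32 -> nxv4f32 on the container type.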
29140     Val =
29141         convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
29142     Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29143                       DAG.getUNDEF(ContainerDstVT));
29144     return convertFromScalableVector(DAG, VT, Val);
29145   } else {
29146     EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29147         ContainerDstVT.getVectorElementType());
29148     SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
29149 
29150     Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29151     Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29152     Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
29153     Val = convertFromScalableVector(DAG, SrcVT, Val);
29154 
29155     Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29156     return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29157   }
29158 }
29159 
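// Lower VECTOR_DEINTERLEAVE of two scalable vectors using UZP1/UZP2: UZP1
// extracts the even-indexed elements and UZP2 the odd-indexed elements of the
// concatenation of the two operands, e.g. deinterleaving {a0,b0,a1,b1} and
// {a2,b2,a3,b3} yields {a0,a1,a2,a3} and {b0,b1,b2,b3}.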
29160 SDValue
29161 AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29162                                                 SelectionDAG &DAG) const {
29163   SDLoc DL(Op);
29164   EVT OpVT = Op.getValueType();
29165   assert(OpVT.isScalableVector() &&
29166          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29167   SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
29168                              Op.getOperand(1));
29169   SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
29170                             Op.getOperand(1));
29171   return DAG.getMergeValues({Even, Odd}, DL);
29172 }
29173 
29174 SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29175                                                       SelectionDAG &DAG) const {
29176   SDLoc DL(Op);
29177   EVT OpVT = Op.getValueType();
29178   assert(OpVT.isScalableVector() &&
29179          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
29180 
29181   SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
29182                            Op.getOperand(1));
29183   SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
29184                            Op.getOperand(1));
29185   return DAG.getMergeValues({Lo, Hi}, DL);
29186 }
29187 
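// Lower an experimental.vector.histogram.add operation: gather the current
// bucket values, use the SVE2 HISTCNT intrinsic to count how many lanes share
// each index value, multiply those counts by the increment, add the products
// to the gathered values, and scatter the results back to memory.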
29188 SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
29189                                                      SelectionDAG &DAG) const {
29190   // FIXME: Maybe share some code with LowerMGather/Scatter?
29191   MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
29192   SDLoc DL(HG);
29193   SDValue Chain = HG->getChain();
29194   SDValue Inc = HG->getInc();
29195   SDValue Mask = HG->getMask();
29196   SDValue Ptr = HG->getBasePtr();
29197   SDValue Index = HG->getIndex();
29198   SDValue Scale = HG->getScale();
29199   SDValue IntID = HG->getIntID();
29200 
29201   // The Intrinsic ID determines the type of update operation.
29202   [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
29203   // Right now, we only support 'add' as an update.
29204   assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
29205          "Unexpected histogram update operation");
29206 
29207   EVT IndexVT = Index.getValueType();
29208   LLVMContext &Ctx = *DAG.getContext();
29209   ElementCount EC = IndexVT.getVectorElementCount();
29210   EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
29211   EVT IncExtVT =
29212       EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
29213   EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
29214   bool ExtTrunc = IncSplatVT != MemVT;
29215 
29216   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29217   SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
29218   SDValue IncSplat = DAG.getSplatVector(
29219       IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
29220   SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
29221 
29222   MachineMemOperand *MMO = HG->getMemOperand();
29223   // Create an MMO for the gather, without load|store flags.
29224   MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
29225       MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
29226       MMO->getAlign(), MMO->getAAInfo());
29227   ISD::MemIndexType IndexType = HG->getIndexType();
29228   SDValue Gather = DAG.getMaskedGather(
29229       DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
29230       ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
29231 
29232   SDValue GChain = Gather.getValue(1);
29233 
29234   // Perform the histcnt, multiply by inc, add to bucket data.
29235   SDValue ID =
29236       DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
29237   SDValue HistCnt =
29238       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
29239   SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
29240   SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
29241 
29242   // Create an MMO for the scatter, without load|store flags.
29243   MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
29244       MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
29245       MMO->getAlign(), MMO->getAAInfo());
29246 
29247   SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
29248   SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
29249                                          ScatterOps, SMMO, IndexType, ExtTrunc);
29250   return Scatter;
29251 }
29252 
29253 SDValue
29254 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
29255                                                     SelectionDAG &DAG) const {
29256   EVT VT = Op.getValueType();
29257   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29258 
29259   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
29260   unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
29261                              : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
29262 
29263   SDLoc DL(Op);
29264   SDValue Val = Op.getOperand(0);
29265   EVT SrcVT = Val.getValueType();
29266   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29267   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29268 
29269   if (VT.bitsGT(SrcVT)) {
29270     EVT CvtVT = ContainerDstVT.changeVectorElementType(
29271       ContainerSrcVT.getVectorElementType());
29272     SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29273 
29274     Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29275     Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
29276 
29277     Val = convertToScalableVector(DAG, ContainerDstVT, Val);
29278     Val = getSVESafeBitCast(CvtVT, Val, DAG);
29279     Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29280                       DAG.getUNDEF(ContainerDstVT));
29281     return convertFromScalableVector(DAG, VT, Val);
29282   } else {
29283     EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
29284     SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
29285 
29286     // Safe to use a larger than specified result since an fp_to_int where the
29287     // result doesn't fit into the destination is undefined.
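    // For example, a v4f32 -> v4i16 conversion is performed as nxv4f32 ->
    // nxv4i32 on the container type and the result is then truncated to
    // v4i16.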
29288     Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29289     Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29290     Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29291 
29292     return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
29293   }
29294 }
29295 
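// Lower a fixed-length VECTOR_SHUFFLE by materialising the shuffle mask as an
// index vector and using SVE TBL (single source) or SVE2 TBL2 (two sources).
// When the exact register size is not known at compile time, indices into the
// second operand are tracked separately and a multiple of the runtime vector
// length (in elements) is added to them before the TBL2.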
29296 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
29297                                          ArrayRef<int> ShuffleMask, EVT VT,
29298                                          EVT ContainerVT, SelectionDAG &DAG) {
29299   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29300   SDLoc DL(Op);
29301   unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29302   unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29303   bool IsSingleOp =
29304       ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
29305 
29306   if (!Subtarget.isNeonAvailable() && !MinSVESize)
29307     MinSVESize = 128;
29308 
29309   // Bail out on the two-operand case if SVE2 is unavailable or not all
29310   // index values can be represented.
29311   if (!IsSingleOp && !Subtarget.hasSVE2())
29312     return SDValue();
29313 
29314   EVT VTOp1 = Op.getOperand(0).getValueType();
29315   unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
29316   unsigned IndexLen = MinSVESize / BitsPerElt;
29317   unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
29318   uint64_t MaxOffset = maxUIntN(BitsPerElt);
29319   EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
29320   EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
29321   bool MinMaxEqual = (MinSVESize == MaxSVESize);
29322   assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
29323          "Incorrectly legalised shuffle operation");
29324 
29325   SmallVector<SDValue, 8> TBLMask;
29326   // If MinSVESize is not equal to MaxSVESize then we need to know which
29327   // TBL mask element needs adjustment.
29328   SmallVector<SDValue, 8> AddRuntimeVLMask;
29329 
29330   // Bail out for 8-bit element types, because with a 2048-bit SVE register
29331   // size, 8 bits are only sufficient to index into the first source vector.
29332   if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
29333     return SDValue();
29334 
29335   for (int Index : ShuffleMask) {
29336     // Handle poison index values.
29337     if (Index < 0)
29338       Index = 0;
29339     // If the mask refers to elements in the second operand, then we have to
29340     // offset the index by the number of elements in a vector. If this number
29341     // is not known at compile-time, we need to maintain a mask with 'VL' values
29342     // to add at runtime.
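    // For example, with v4i32 operands and a known 256-bit register
    // (IndexLen == 8), shuffle index 5 refers to element 1 of the second
    // operand and is rewritten to TBL2 index 5 + (8 - 4) == 9, i.e. element 1
    // of the second full-width register.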
29343     if ((unsigned)Index >= ElementsPerVectorReg) {
29344       if (MinMaxEqual) {
29345         Index += IndexLen - ElementsPerVectorReg;
29346       } else {
29347         Index = Index - ElementsPerVectorReg;
29348         AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
29349       }
29350     } else if (!MinMaxEqual)
29351       AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29352     // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
29353     // 255, this might point to the last element of the second operand
29354     // of the shufflevector, so we reject this transform.
29355     if ((unsigned)Index >= MaxOffset)
29356       return SDValue();
29357     TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
29358   }
29359 
29360   // Pad the remaining lanes with an out-of-range index, which causes those
29361   // lanes to be zeroed, rather than with index zero, which would duplicate
29362   // the first lane. Note that for i8 elements an out-of-range index could
29363   // still be a valid index when the vector register size is 2048 bits.
29364   for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
29365     TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
29366     if (!MinMaxEqual)
29367       AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29368   }
29369 
29370   EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
29371   SDValue VecMask =
29372       DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29373   SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
29374 
29375   SDValue Shuffle;
29376   if (IsSingleOp)
29377     Shuffle =
29378         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29379                     DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
29380                     Op1, SVEMask);
29381   else if (Subtarget.hasSVE2()) {
29382     if (!MinMaxEqual) {
29383       unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
29384       SDValue VScale = (BitsPerElt == 64)
29385                            ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
29386                            : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
29387       SDValue VecMask =
29388           DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29389       SDValue MulByMask = DAG.getNode(
29390           ISD::MUL, DL, MaskType,
29391           DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
29392           DAG.getBuildVector(MaskType, DL,
29393                              ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
29394       SDValue UpdatedVecMask =
29395           DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
29396       SVEMask = convertToScalableVector(
29397           DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
29398     }
29399     Shuffle =
29400         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29401                     DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
29402                     Op1, Op2, SVEMask);
29403   }
29404   Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
29405   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
29406 }
29407 
29408 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
29409     SDValue Op, SelectionDAG &DAG) const {
29410   EVT VT = Op.getValueType();
29411   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29412 
29413   auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
29414   auto ShuffleMask = SVN->getMask();
29415 
29416   SDLoc DL(Op);
29417   SDValue Op1 = Op.getOperand(0);
29418   SDValue Op2 = Op.getOperand(1);
29419 
29420   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29421   Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
29422   Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
29423 
29424   auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
29425     if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
29426       return MVT::i32;
29427     return ScalarTy;
29428   };
29429 
29430   if (SVN->isSplat()) {
29431     unsigned Lane = std::max(0, SVN->getSplatIndex());
29432     EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29433     SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29434                                   DAG.getConstant(Lane, DL, MVT::i64));
29435     Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
29436     return convertFromScalableVector(DAG, VT, Op);
29437   }
29438 
29439   bool ReverseEXT = false;
29440   unsigned Imm;
29441   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
29442       Imm == VT.getVectorNumElements() - 1) {
29443     if (ReverseEXT)
29444       std::swap(Op1, Op2);
29445     EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29446     SDValue Scalar = DAG.getNode(
29447         ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29448         DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
29449     Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
29450     return convertFromScalableVector(DAG, VT, Op);
29451   }
29452 
29453   unsigned EltSize = VT.getScalarSizeInBits();
29454   for (unsigned LaneSize : {64U, 32U, 16U}) {
29455     if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
29456       EVT NewVT =
29457           getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
29458       unsigned RevOp;
29459       if (EltSize == 8)
29460         RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
29461       else if (EltSize == 16)
29462         RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
29463       else
29464         RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
29465 
29466       Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29467       Op = LowerToPredicatedOp(Op, DAG, RevOp);
29468       Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29469       return convertFromScalableVector(DAG, VT, Op);
29470     }
29471   }
29472 
29473   if (Subtarget->hasSVE2p1() && EltSize == 64 &&
29474       isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
29475     if (!VT.isFloatingPoint())
29476       return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29477 
29478     EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
29479     Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29480     Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29481     Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29482     return convertFromScalableVector(DAG, VT, Op);
29483   }
29484 
29485   unsigned WhichResult;
29486   if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29487       WhichResult == 0)
29488     return convertFromScalableVector(
29489         DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
29490 
29491   if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29492     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29493     return convertFromScalableVector(
29494         DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29495   }
29496 
29497   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
29498     return convertFromScalableVector(
29499         DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
29500 
29501   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29502     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29503     return convertFromScalableVector(
29504         DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29505   }
29506 
29507   // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
29508   // represents the same logical operation as performed by a ZIP instruction. In
29509   // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
29510   // equivalent to an AArch64 instruction. There's the extra component of
29511   // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
29512   // only operated on 64/128-bit vector types that have a direct mapping to a
29513   // target register and so an exact mapping is implied.
29514   // However, when using SVE for fixed length vectors, most legal vector types
29515   // are actually sub-vectors of a larger SVE register. When mapping
29516   // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
29517   // how the mask's indices translate. Specifically, when the mapping requires
29518   // an exact meaning for a specific vector index (e.g. Index X is the last
29519   // vector element in the register) then such mappings are often only safe when
29520   // the exact SVE register size is known. The main exception to this is when
29521   // indices are logically relative to the first element of either
29522   // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
29523   // when converting from fixed-length to scalable vector types (i.e. the start
29524   // of a fixed length vector is always the start of a scalable vector).
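  // For example, the v4i32 ZIP2 mask <2,6,3,7> expects to read the top half of
  // each operand; when v4i32 occupies only part of a larger SVE register,
  // AArch64ISD::ZIP2 on the container would instead read the top half of the
  // whole register, so that mapping is only used below when the register size
  // is known to match VT exactly. ZIP1-style masks such as <0,4,1,5> only use
  // indices relative to the start of each operand and are handled above
  // without this restriction.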
29525   unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
29526   unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
29527   if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
29528     if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
29529         Op2.isUndef()) {
29530       Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
29531       return convertFromScalableVector(DAG, VT, Op);
29532     }
29533 
29534     if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29535         WhichResult != 0)
29536       return convertFromScalableVector(
29537           DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
29538 
29539     if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29540       unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29541       return convertFromScalableVector(
29542           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29543     }
29544 
29545     if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
29546       return convertFromScalableVector(
29547           DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
29548 
29549     if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29550       unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29551       return convertFromScalableVector(
29552           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29553     }
29554   }
29555 
29556   // Try to widen the shuffle before generating a possibly expensive SVE TBL.
29557   // This may allow the shuffle to be matched as something cheaper like ZIP1.
29558   if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
29559     return WideOp;
29560 
29561   // Avoid producing a TBL instruction if we don't know the minimum SVE register
29562   // size, unless NEON is not available, in which case we can assume the minimum
29563   // SVE register size is 128 bits.
29564   if (MinSVESize || !Subtarget->isNeonAvailable())
29565     return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
29566                                      DAG);
29567 
29568   return SDValue();
29569 }
29570 
29571 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
29572                                                  SelectionDAG &DAG) const {
29573   SDLoc DL(Op);
29574   EVT InVT = Op.getValueType();
29575 
29576   assert(VT.isScalableVector() && isTypeLegal(VT) &&
29577          InVT.isScalableVector() && isTypeLegal(InVT) &&
29578          "Only expect to cast between legal scalable vector types!");
29579   assert(VT.getVectorElementType() != MVT::i1 &&
29580          InVT.getVectorElementType() != MVT::i1 &&
29581          "For predicate bitcasts, use getSVEPredicateBitCast");
29582 
29583   if (InVT == VT)
29584     return Op;
29585 
29586   EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
29587   EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
29588 
29589   // Safe bitcasting between unpacked vector types of different element counts
29590   // is currently unsupported because the following is missing the necessary
29591   // work to ensure the result's elements live where they're supposed to within
29592   // an SVE register.
29593   //                01234567
29594   // e.g. nxv2i32 = XX??XX??
29595   //      nxv4f16 = X?X?X?X?
29596   assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
29597           VT == PackedVT || InVT == PackedInVT) &&
29598          "Unexpected bitcast!");
29599 
29600   // Pack input if required.
29601   if (InVT != PackedInVT)
29602     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
29603 
29604   if (Subtarget->isLittleEndian() ||
29605       PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
29606     Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29607   else {
29608     EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
29609     EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
29610 
29611     // Simulate the effect of casting through memory.
29612     Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
29613     if (PackedInVTAsInt.getScalarSizeInBits() != 8)
29614       Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
29615     Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
29616     if (PackedVTAsInt.getScalarSizeInBits() != 8)
29617       Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
29618     Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29619   }
29620 
29621   // Unpack result if required.
29622   if (VT != PackedVT)
29623     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
29624 
29625   return Op;
29626 }
29627 
29628 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
29629                                                  SDValue N) const {
29630   return ::isAllActivePredicate(DAG, N);
29631 }
29632 
29633 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
29634   return ::getPromotedVTForPredicate(VT);
29635 }
29636 
29637 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
29638     SDValue Op, const APInt &OriginalDemandedBits,
29639     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
29640     unsigned Depth) const {
29641 
29642   unsigned Opc = Op.getOpcode();
29643   switch (Opc) {
29644   case AArch64ISD::VSHL: {
29645     // Match (VSHL (VLSHR Val X) X)
29646     SDValue ShiftL = Op;
29647     SDValue ShiftR = Op->getOperand(0);
29648     if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
29649       return false;
29650 
29651     if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
29652       return false;
29653 
29654     unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
29655     unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
29656 
29657     // Other cases can be handled as well, but this is not
29658     // implemented.
29659     if (ShiftRBits != ShiftLBits)
29660       return false;
29661 
29662     unsigned ScalarSize = Op.getScalarValueSizeInBits();
29663     assert(ScalarSize > ShiftLBits && "Invalid shift imm");
29664 
29665     APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
29666     APInt UnusedBits = ~OriginalDemandedBits;
29667 
29668     if ((ZeroBits & UnusedBits) != ZeroBits)
29669       return false;
29670 
29671     // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
29672     // used - simplify to just Val.
29673     return TLO.CombineTo(Op, ShiftR->getOperand(0));
29674   }
29675   case AArch64ISD::BICi: {
29676     // Fold BICi if all destination bits are already known to be zeroed.
29677     SDValue Op0 = Op.getOperand(0);
29678     KnownBits KnownOp0 =
29679         TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
29680     // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
29681     APInt BitsToClear =
29682         (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
29683             .trunc(KnownOp0.getBitWidth());
29684     APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
29685     if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
29686       return TLO.CombineTo(Op, Op0);
29687 
29688     Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
29689     return false;
29690   }
29691   case ISD::INTRINSIC_WO_CHAIN: {
29692     if (auto ElementSize = IsSVECntIntrinsic(Op)) {
29693       unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
29694       if (!MaxSVEVectorSizeInBits)
29695         MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
29696       unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
29697       // The SVE count intrinsics don't support the multiplier immediate so we
29698       // don't have to account for that here. The value returned may be slightly
29699       // over the true required bits, as this is based on the "ALL" pattern. The
29700       // other patterns are also exposed by these intrinsics, but they all
29701       // return a value that's strictly less than "ALL".
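      // For example, with the architectural maximum of 2048-bit vectors, CNTB
      // (ElementSize == 8) can return at most 256, so only the low
      // bit_width(256) == 9 bits of the result can be non-zero.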
29702       unsigned RequiredBits = llvm::bit_width(MaxElements);
29703       unsigned BitWidth = Known.Zero.getBitWidth();
29704       if (RequiredBits < BitWidth)
29705         Known.Zero.setHighBits(BitWidth - RequiredBits);
29706       return false;
29707     }
29708   }
29709   }
29710 
29711   return TargetLowering::SimplifyDemandedBitsForTargetNode(
29712       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
29713 }
29714 
29715 bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
29716   return Op.getOpcode() == AArch64ISD::DUP ||
29717          Op.getOpcode() == AArch64ISD::MOVI ||
29718          (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
29719           Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
29720          TargetLowering::isTargetCanonicalConstantNode(Op);
29721 }
29722 
29723 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
29724   return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
29725          Subtarget->hasComplxNum();
29726 }
29727 
29728 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
29729     ComplexDeinterleavingOperation Operation, Type *Ty) const {
29730   auto *VTy = dyn_cast<VectorType>(Ty);
29731   if (!VTy)
29732     return false;
29733 
29734   // If the vector is scalable, SVE is enabled, implying support for complex
29735   // numbers. Otherwise, we need to ensure complex number support is available.
29736   if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
29737     return false;
29738 
29739   auto *ScalarTy = VTy->getScalarType();
29740   unsigned NumElements = VTy->getElementCount().getKnownMinValue();
29741 
29742   // We can only process vectors that have a bit size of 128 or higher (or 64
29743   // bits when Neon is available). Additionally, these vectors must have a
29744   // power-of-2 size, as we later split them into the smallest supported size
29745   // and merge them back together after applying the complex operation.
29746   unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
29747   if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
29748       !llvm::isPowerOf2_32(VTyWidth))
29749     return false;
29750 
29751   if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
29752     unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
29753 
29754     if (Operation == ComplexDeinterleavingOperation::CDot)
29755       return ScalarWidth == 32 || ScalarWidth == 64;
29756     return 8 <= ScalarWidth && ScalarWidth <= 64;
29757   }
29758 
29759   // CDot is not supported outside of scalable/SVE scopes.
29760   if (Operation == ComplexDeinterleavingOperation::CDot)
29761     return false;
29762 
29763   return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
29764          ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
29765 }
29766 
29767 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
29768     IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
29769     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
29770     Value *Accumulator) const {
29771   VectorType *Ty = cast<VectorType>(InputA->getType());
29772   if (Accumulator == nullptr)
29773     Accumulator = Constant::getNullValue(Ty);
29774   bool IsScalable = Ty->isScalableTy();
29775   bool IsInt = Ty->getElementType()->isIntegerTy();
29776 
29777   unsigned TyWidth =
29778       Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
29779 
29780   assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
29781          "Vector type must be either 64 or a power of 2 that is at least 128");
29782 
29783   if (TyWidth > 128) {
29784     int Stride = Ty->getElementCount().getKnownMinValue() / 2;
29785     int AccStride = cast<VectorType>(Accumulator->getType())
29786                         ->getElementCount()
29787                         .getKnownMinValue() /
29788                     2;
29789     auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
29790     auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
29791     auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
29792     auto *UpperSplitA =
29793         B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
29794     auto *UpperSplitB =
29795         B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
29796     Value *LowerSplitAcc = nullptr;
29797     Value *UpperSplitAcc = nullptr;
29798     Type *FullTy = Ty;
29799     FullTy = Accumulator->getType();
29800     auto *HalfAccTy = VectorType::getHalfElementsVectorType(
29801         cast<VectorType>(Accumulator->getType()));
29802     LowerSplitAcc =
29803         B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
29804     UpperSplitAcc =
29805         B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
29806     auto *LowerSplitInt = createComplexDeinterleavingIR(
29807         B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
29808     auto *UpperSplitInt = createComplexDeinterleavingIR(
29809         B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
29810 
29811     auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
29812                                         LowerSplitInt, B.getInt64(0));
29813     return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
29814                                 B.getInt64(AccStride));
29815   }
29816 
29817   if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
29818     if (IsScalable) {
29819       if (IsInt)
29820         return B.CreateIntrinsic(
29821             Intrinsic::aarch64_sve_cmla_x, Ty,
29822             {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29823 
29824       auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29825       return B.CreateIntrinsic(
29826           Intrinsic::aarch64_sve_fcmla, Ty,
29827           {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29828     }
29829 
29830     Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
29831                               Intrinsic::aarch64_neon_vcmla_rot90,
29832                               Intrinsic::aarch64_neon_vcmla_rot180,
29833                               Intrinsic::aarch64_neon_vcmla_rot270};
29834 
29835 
29836     return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
29837                              {Accumulator, InputA, InputB});
29838   }
29839 
29840   if (OperationType == ComplexDeinterleavingOperation::CAdd) {
29841     if (IsScalable) {
29842       if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
29843           Rotation == ComplexDeinterleavingRotation::Rotation_270) {
29844         if (IsInt)
29845           return B.CreateIntrinsic(
29846               Intrinsic::aarch64_sve_cadd_x, Ty,
29847               {InputA, InputB, B.getInt32((int)Rotation * 90)});
29848 
29849         auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29850         return B.CreateIntrinsic(
29851             Intrinsic::aarch64_sve_fcadd, Ty,
29852             {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
29853       }
29854       return nullptr;
29855     }
29856 
29857     Intrinsic::ID IntId = Intrinsic::not_intrinsic;
29858     if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
29859       IntId = Intrinsic::aarch64_neon_vcadd_rot90;
29860     else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
29861       IntId = Intrinsic::aarch64_neon_vcadd_rot270;
29862 
29863     if (IntId == Intrinsic::not_intrinsic)
29864       return nullptr;
29865 
29866     return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
29867   }
29868 
29869   if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
29870       IsScalable) {
29871     return B.CreateIntrinsic(
29872         Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
29873         {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29874   }
29875 
29876   return nullptr;
29877 }
29878 
29879 bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
29880   unsigned Opc = N->getOpcode();
29881   if (ISD::isExtOpcode(Opc)) {
29882     if (any_of(N->users(),
29883                [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
29884       return false;
29885   }
29886   return true;
29887 }
29888 
29889 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
29890   return Subtarget->getMinimumJumpTableEntries();
29891 }
29892 
29893 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
29894                                                          CallingConv::ID CC,
29895                                                          EVT VT) const {
29896   bool NonUnitFixedLengthVector =
29897       VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
29898   if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29899     return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
29900 
29901   EVT VT1;
29902   MVT RegisterVT;
29903   unsigned NumIntermediates;
29904   getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
29905                                        RegisterVT);
29906   return RegisterVT;
29907 }
29908 
29909 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
29910     LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
29911   bool NonUnitFixedLengthVector =
29912       VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
29913   if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29914     return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
29915 
29916   EVT VT1;
29917   MVT VT2;
29918   unsigned NumIntermediates;
29919   return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
29920                                               NumIntermediates, VT2);
29921 }
29922 
29923 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
29924     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
29925     unsigned &NumIntermediates, MVT &RegisterVT) const {
29926   int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
29927       Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
29928   if (!RegisterVT.isFixedLengthVector() ||
29929       RegisterVT.getFixedSizeInBits() <= 128)
29930     return NumRegs;
29931 
29932   assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
29933   assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
29934   assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
29935 
29936   // A size mismatch here implies either type promotion or widening and would
29937   // have resulted in scalarisation if larger vectors had not been available.
29938   if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
29939     EVT EltTy = VT.getVectorElementType();
29940     EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
29941     if (!isTypeLegal(NewVT))
29942       NewVT = EltTy;
29943 
29944     IntermediateVT = NewVT;
29945     NumIntermediates = VT.getVectorNumElements();
29946     RegisterVT = getRegisterType(Context, NewVT);
29947     return NumIntermediates;
29948   }
29949 
29950   // SVE VLS support does not introduce a new ABI so we should use NEON sized
29951   // types for vector arguments and returns.
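  // For example, when 256-bit fixed-length SVE vectors are enabled, a v8i32
  // argument that would otherwise occupy a single 256-bit register is instead
  // passed as two NEON-sized v4i32 values (illustrative example).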
29952 
29953   unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
29954   NumIntermediates *= NumSubRegs;
29955   NumRegs *= NumSubRegs;
29956 
29957   switch (RegisterVT.getVectorElementType().SimpleTy) {
29958   default:
29959     llvm_unreachable("unexpected element type for vector");
29960   case MVT::i8:
29961     IntermediateVT = RegisterVT = MVT::v16i8;
29962     break;
29963   case MVT::i16:
29964     IntermediateVT = RegisterVT = MVT::v8i16;
29965     break;
29966   case MVT::i32:
29967     IntermediateVT = RegisterVT = MVT::v4i32;
29968     break;
29969   case MVT::i64:
29970     IntermediateVT = RegisterVT = MVT::v2i64;
29971     break;
29972   case MVT::f16:
29973     IntermediateVT = RegisterVT = MVT::v8f16;
29974     break;
29975   case MVT::f32:
29976     IntermediateVT = RegisterVT = MVT::v4f32;
29977     break;
29978   case MVT::f64:
29979     IntermediateVT = RegisterVT = MVT::v2f64;
29980     break;
29981   case MVT::bf16:
29982     IntermediateVT = RegisterVT = MVT::v8bf16;
29983     break;
29984   }
29985 
29986   return NumRegs;
29987 }
29988 
29989 bool AArch64TargetLowering::hasInlineStackProbe(
29990     const MachineFunction &MF) const {
29991   return !Subtarget->isTargetWindows() &&
29992          MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
29993 }
29994 
29995 bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
29996   switch (Opc) {
29997   case ISD::TRUNCATE_SSAT_S:
29998   case ISD::TRUNCATE_SSAT_U:
29999   case ISD::TRUNCATE_USAT_U:
30000     if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
30001       return true;
30002   }
30003 
30004   return TargetLowering::isTypeDesirableForOp(Opc, VT);
30005 }
30006 
30007 #ifndef NDEBUG
30008 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
30009   switch (N->getOpcode()) {
30010   default:
30011     break;
30012   case AArch64ISD::SADDWT:
30013   case AArch64ISD::SADDWB:
30014   case AArch64ISD::UADDWT:
30015   case AArch64ISD::UADDWB: {
30016     assert(N->getNumValues() == 1 && "Expected one result!");
30017     assert(N->getNumOperands() == 2 && "Expected two operands!");
30018     EVT VT = N->getValueType(0);
30019     EVT Op0VT = N->getOperand(0).getValueType();
30020     EVT Op1VT = N->getOperand(1).getValueType();
30021     assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
30022            VT.isInteger() && Op0VT.isInteger() && Op1VT.isInteger() &&
30023            "Expected integer vectors!");
30024     assert(VT == Op0VT &&
30025            "Expected result and first input to have the same type!");
30026     assert(Op0VT.getSizeInBits() == Op1VT.getSizeInBits() &&
30027            "Expected vectors of equal size!");
30028     assert(Op0VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount() &&
30029            "Expected result vector and first input vector to have half the "
30030            "lanes of the second input vector!");
30031     break;
30032   }
30033   case AArch64ISD::SUNPKLO:
30034   case AArch64ISD::SUNPKHI:
30035   case AArch64ISD::UUNPKLO:
30036   case AArch64ISD::UUNPKHI: {
30037     assert(N->getNumValues() == 1 && "Expected one result!");
30038     assert(N->getNumOperands() == 1 && "Expected one operand!");
30039     EVT VT = N->getValueType(0);
30040     EVT OpVT = N->getOperand(0).getValueType();
30041     assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
30042            VT.isInteger() && "Expected integer vectors!");
30043     assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
30044            "Expected vectors of equal size!");
30045     assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 &&
30046            "Expected result vector with half the lanes of its input!");
30047     break;
30048   }
30049   case AArch64ISD::TRN1:
30050   case AArch64ISD::TRN2:
30051   case AArch64ISD::UZP1:
30052   case AArch64ISD::UZP2:
30053   case AArch64ISD::ZIP1:
30054   case AArch64ISD::ZIP2: {
30055     assert(N->getNumValues() == 1 && "Expected one result!");
30056     assert(N->getNumOperands() == 2 && "Expected two operands!");
30057     EVT VT = N->getValueType(0);
30058     EVT Op0VT = N->getOperand(0).getValueType();
30059     EVT Op1VT = N->getOperand(1).getValueType();
30060     assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
30061            "Expected vectors!");
30062     assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
30063     break;
30064   }
30065   case AArch64ISD::RSHRNB_I: {
30066     assert(N->getNumValues() == 1 && "Expected one result!");
30067     assert(N->getNumOperands() == 2 && "Expected two operands!");
30068     EVT VT = N->getValueType(0);
30069     EVT Op0VT = N->getOperand(0).getValueType();
30070     EVT Op1VT = N->getOperand(1).getValueType();
30071     assert(VT.isVector() && VT.isInteger() &&
30072            "Expected integer vector result type!");
30073     assert(Op0VT.isVector() && Op0VT.isInteger() &&
30074            "Expected first operand to be an integer vector!");
30075     assert(VT.getSizeInBits() == Op0VT.getSizeInBits() &&
30076            "Expected vectors of equal size!");
30077     assert(VT.getVectorElementCount() == Op0VT.getVectorElementCount() * 2 &&
30078            "Expected input vector with half the lanes of its result!");
30079     assert(Op1VT == MVT::i32 && isa<ConstantSDNode>(N->getOperand(1)) &&
30080            "Expected second operand to be a constant i32!");
30081     break;
30082   }
30083   }
30084 }
30085 #endif
30086