xref: /llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision 2d53eaff4aee73605170ce9910cde68fa7a300b2)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/GlobalISel/Utils.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32 
33 #define DEBUG_TYPE "aarch64-legalinfo"
34 
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40 
41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42     : ST(&ST) {
43   using namespace TargetOpcode;
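  // Shorthand LLTs used below: sN is an N-bit scalar, vMsN a fixed vector of
  // M x sN elements, nxvMsN a scalable vector, and p0 a 64-bit pointer in
  // address space 0.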
44   const LLT p0 = LLT::pointer(0, 64);
45   const LLT s8 = LLT::scalar(8);
46   const LLT s16 = LLT::scalar(16);
47   const LLT s32 = LLT::scalar(32);
48   const LLT s64 = LLT::scalar(64);
49   const LLT s128 = LLT::scalar(128);
50   const LLT v16s8 = LLT::fixed_vector(16, 8);
51   const LLT v8s8 = LLT::fixed_vector(8, 8);
52   const LLT v4s8 = LLT::fixed_vector(4, 8);
53   const LLT v2s8 = LLT::fixed_vector(2, 8);
54   const LLT v8s16 = LLT::fixed_vector(8, 16);
55   const LLT v4s16 = LLT::fixed_vector(4, 16);
56   const LLT v2s16 = LLT::fixed_vector(2, 16);
57   const LLT v2s32 = LLT::fixed_vector(2, 32);
58   const LLT v4s32 = LLT::fixed_vector(4, 32);
59   const LLT v2s64 = LLT::fixed_vector(2, 64);
60   const LLT v2p0 = LLT::fixed_vector(2, p0);
61 
62   const LLT nxv16s8 = LLT::scalable_vector(16, s8);
63   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
64   const LLT nxv4s32 = LLT::scalable_vector(4, s32);
65   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
66 
67   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
68                                                         v16s8, v8s16, v4s32,
69                                                         v2s64, v2p0,
70                                                         /* End 128bit types */
71                                                         /* Begin 64bit types */
72                                                         v8s8, v4s16, v2s32};
73   std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
74   SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
75   SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
76 
77   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
78 
79   // FIXME: support subtargets which have neon/fp-armv8 disabled.
80   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
81     getLegacyLegalizerInfo().computeTables();
82     return;
83   }
84 
85   // Some instructions only support s16 if the subtarget has full 16-bit FP
86   // support.
87   const bool HasFP16 = ST.hasFullFP16();
88   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
89 
90   const bool HasCSSC = ST.hasCSSC();
91   const bool HasRCPC3 = ST.hasRCPC3();
92   const bool HasSVE = ST.hasSVE();
93 
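  // Most rule sets below follow the same pattern: declare the types the target
  // handles natively as legal, then widen, clamp and scalarize everything else
  // into that set. For example, in the G_ADD rules below an s8 add is widened
  // to s32, and a v3s32 add is padded out to v4s32.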
94   getActionDefinitionsBuilder(
95       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
96       .legalFor({p0, s8, s16, s32, s64})
97       .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8,
98                  v2s16, v2s8})
99       .widenScalarToNextPow2(0)
100       .clampScalar(0, s8, s64)
101       .moreElementsToNextPow2(0)
102       .widenVectorEltsToVectorMinSize(0, 64)
103       .clampNumElements(0, v8s8, v16s8)
104       .clampNumElements(0, v4s16, v8s16)
105       .clampNumElements(0, v2s32, v4s32)
106       .clampMaxNumElements(0, s64, 2)
107       .clampMaxNumElements(0, p0, 2)
108       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
109 
110   getActionDefinitionsBuilder(G_PHI)
111       .legalFor({p0, s16, s32, s64})
112       .legalFor(PackedVectorAllTypeList)
113       .widenScalarToNextPow2(0)
114       .moreElementsToNextPow2(0)
115       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
116       .clampScalar(0, s16, s64)
117       .clampNumElements(0, v8s8, v16s8)
118       .clampNumElements(0, v4s16, v8s16)
119       .clampNumElements(0, v2s32, v4s32)
120       .clampMaxNumElements(0, s64, 2)
121       .clampMaxNumElements(0, p0, 2);
122 
123   getActionDefinitionsBuilder(G_BSWAP)
124       .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
125       .widenScalarOrEltToNextPow2(0, 16)
126       .clampScalar(0, s32, s64)
127       .clampNumElements(0, v4s16, v8s16)
128       .clampNumElements(0, v2s32, v4s32)
129       .clampNumElements(0, v2s64, v2s64)
130       .moreElementsToNextPow2(0);
131 
132   getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
133       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
134       .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
135       .widenScalarToNextPow2(0)
136       .clampScalar(0, s32, s64)
137       .clampMaxNumElements(0, s8, 16)
138       .clampMaxNumElements(0, s16, 8)
139       .clampNumElements(0, v2s32, v4s32)
140       .clampNumElements(0, v2s64, v2s64)
141       .minScalarOrEltIf(
142           [=](const LegalityQuery &Query) {
143             return Query.Types[0].getNumElements() <= 2;
144           },
145           0, s32)
146       .minScalarOrEltIf(
147           [=](const LegalityQuery &Query) {
148             return Query.Types[0].getNumElements() <= 4;
149           },
150           0, s16)
151       .minScalarOrEltIf(
152           [=](const LegalityQuery &Query) {
153             return Query.Types[0].getNumElements() <= 16;
154           },
155           0, s8)
156       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
157       .moreElementsToNextPow2(0);
158 
159   getActionDefinitionsBuilder(G_MUL)
160       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
161       .widenScalarToNextPow2(0)
162       .clampScalar(0, s32, s64)
163       .clampMaxNumElements(0, s8, 16)
164       .clampMaxNumElements(0, s16, 8)
165       .clampNumElements(0, v2s32, v4s32)
166       .clampNumElements(0, v2s64, v2s64)
167       .minScalarOrEltIf(
168           [=](const LegalityQuery &Query) {
169             return Query.Types[0].getNumElements() <= 2;
170           },
171           0, s32)
172       .minScalarOrEltIf(
173           [=](const LegalityQuery &Query) {
174             return Query.Types[0].getNumElements() <= 4;
175           },
176           0, s16)
177       .minScalarOrEltIf(
178           [=](const LegalityQuery &Query) {
179             return Query.Types[0].getNumElements() <= 16;
180           },
181           0, s8)
182       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
183       .moreElementsToNextPow2(0);
184 
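  // 32-bit scalar shifts whose amount type is also 32 bits take the custom
  // path (legalizeShlAshrLshr), which promotes constant shift amounts to
  // 64 bits so the imported selection patterns can match them.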
185   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
186       .customIf([=](const LegalityQuery &Query) {
187         const auto &SrcTy = Query.Types[0];
188         const auto &AmtTy = Query.Types[1];
189         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
190                AmtTy.getSizeInBits() == 32;
191       })
192       .legalFor({
193           {s32, s32},
194           {s32, s64},
195           {s64, s64},
196           {v8s8, v8s8},
197           {v16s8, v16s8},
198           {v4s16, v4s16},
199           {v8s16, v8s16},
200           {v2s32, v2s32},
201           {v4s32, v4s32},
202           {v2s64, v2s64},
203       })
204       .widenScalarToNextPow2(0)
205       .clampScalar(1, s32, s64)
206       .clampScalar(0, s32, s64)
207       .clampNumElements(0, v8s8, v16s8)
208       .clampNumElements(0, v4s16, v8s16)
209       .clampNumElements(0, v2s32, v4s32)
210       .clampNumElements(0, v2s64, v2s64)
211       .moreElementsToNextPow2(0)
212       .minScalarSameAs(1, 0)
213       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
214 
215   getActionDefinitionsBuilder(G_PTR_ADD)
216       .legalFor({{p0, s64}, {v2p0, v2s64}})
217       .clampScalarOrElt(1, s64, s64)
218       .clampNumElements(0, v2p0, v2p0);
219 
220   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
221 
222   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
223       .legalFor({s32, s64})
224       .libcallFor({s128})
225       .clampScalar(0, s32, s64)
226       .widenScalarToNextPow2(0)
227       .scalarize(0);
228 
229   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
230       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
231       .libcallFor({s128})
232       .widenScalarOrEltToNextPow2(0)
233       .minScalarOrElt(0, s32)
234       .clampNumElements(0, v2s32, v4s32)
235       .clampNumElements(0, v2s64, v2s64)
236       .scalarize(0);
237 
238   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
239       .widenScalarToNextPow2(0, /*Min = */ 32)
240       .clampScalar(0, s32, s64)
241       .lower();
242 
243   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
244       .legalFor({s64, v8s16, v16s8, v4s32})
245       .lower();
246 
247   getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
248       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
249       .legalFor(HasCSSC, {s32, s64})
250       .minScalar(HasCSSC, 0, s32)
251       .clampNumElements(0, v8s8, v16s8)
252       .clampNumElements(0, v4s16, v8s16)
253       .clampNumElements(0, v2s32, v4s32)
254       // FIXME: This shouldn't be needed as v2s64 types are going to
255       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
256       .clampNumElements(0, v2s64, v2s64)
257       .lower();
258 
259   getActionDefinitionsBuilder(
260       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
261       .legalFor({{s32, s32}, {s64, s32}})
262       .clampScalar(0, s32, s64)
263       .clampScalar(1, s32, s64)
264       .widenScalarToNextPow2(0);
265 
266   getActionDefinitionsBuilder(
267       {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
268        G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
269        G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
270       .legalFor({s32, s64, v2s32, v4s32, v2s64})
271       .legalFor(HasFP16, {s16, v4s16, v8s16})
272       .libcallFor({s128})
273       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
274       .minScalarOrElt(0, MinFPScalar)
275       .clampNumElements(0, v4s16, v8s16)
276       .clampNumElements(0, v2s32, v4s32)
277       .clampNumElements(0, v2s64, v2s64)
278       .moreElementsToNextPow2(0);
279 
280   getActionDefinitionsBuilder({G_FABS, G_FNEG})
281       .legalFor({s32, s64, v2s32, v4s32, v2s64})
282       .legalFor(HasFP16, {s16, v4s16, v8s16})
283       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
284       .lowerIf(scalarOrEltWiderThan(0, 64))
285       .clampNumElements(0, v4s16, v8s16)
286       .clampNumElements(0, v2s32, v4s32)
287       .clampNumElements(0, v2s64, v2s64)
288       .moreElementsToNextPow2(0)
289       .lowerFor({s16, v4s16, v8s16});
290 
291   getActionDefinitionsBuilder(G_FREM)
292       .libcallFor({s32, s64, s128})
293       .minScalar(0, s32)
294       .scalarize(0);
295 
296   getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
297       .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
298       .libcallFor({{s64, s128}})
299       .minScalarOrElt(1, MinFPScalar);
300 
301   getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
302                                G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
303                                G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
304                                G_FSINH, G_FTANH})
305       // We need a call for these, so we always need to scalarize.
306       .scalarize(0)
307       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
308       .minScalar(0, s32)
309       .libcallFor({s32, s64, s128});
310   getActionDefinitionsBuilder(G_FPOWI)
311       .scalarize(0)
312       .minScalar(0, s32)
313       .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});
314 
315   getActionDefinitionsBuilder(G_INSERT)
316       .legalIf(all(typeInSet(0, {s32, s64, p0}),
317                    typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
318       .widenScalarToNextPow2(0)
319       .clampScalar(0, s32, s64)
320       .widenScalarToNextPow2(1)
321       .minScalar(1, s8)
322       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
323       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
324 
325   getActionDefinitionsBuilder(G_EXTRACT)
326       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
327                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
328       .widenScalarToNextPow2(1)
329       .clampScalar(1, s32, s128)
330       .widenScalarToNextPow2(0)
331       .minScalar(0, s16)
332       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
333       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
334       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
335 
336 
337   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
338     auto &Actions = getActionDefinitionsBuilder(Op);
339 
340     if (Op == G_SEXTLOAD)
341       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
342 
343     // Atomics have zero extending behavior.
344     Actions
345       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
346                                  {s32, p0, s16, 8},
347                                  {s32, p0, s32, 8},
348                                  {s64, p0, s8, 2},
349                                  {s64, p0, s16, 2},
350                                  {s64, p0, s32, 4},
351                                  {s64, p0, s64, 8},
352                                  {p0, p0, s64, 8},
353                                  {v2s32, p0, s64, 8}})
354       .widenScalarToNextPow2(0)
355       .clampScalar(0, s32, s64)
356       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
357       //       how to do that yet.
358       .unsupportedIfMemSizeNotPow2()
359       // Lower anything left over into G_*EXT and G_LOAD
360       .lower();
361   }
362 
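  // Loads and stores of pointer vectors in address space 0 take the custom
  // path; legalizeLoadStore handles them, together with the 128-bit atomic
  // cases matched by the customIf predicates below.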
363   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
364     const LLT &ValTy = Query.Types[0];
365     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
366   };
367 
368   getActionDefinitionsBuilder(G_LOAD)
369       .customIf([=](const LegalityQuery &Query) {
370         return HasRCPC3 && Query.Types[0] == s128 &&
371                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
372       })
373       .customIf([=](const LegalityQuery &Query) {
374         return Query.Types[0] == s128 &&
375                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
376       })
377       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
378                                  {s16, p0, s16, 8},
379                                  {s32, p0, s32, 8},
380                                  {s64, p0, s64, 8},
381                                  {p0, p0, s64, 8},
382                                  {s128, p0, s128, 8},
383                                  {v8s8, p0, s64, 8},
384                                  {v16s8, p0, s128, 8},
385                                  {v4s16, p0, s64, 8},
386                                  {v8s16, p0, s128, 8},
387                                  {v2s32, p0, s64, 8},
388                                  {v4s32, p0, s128, 8},
389                                  {v2s64, p0, s128, 8}})
390       // These extending loads are also legal
391       .legalForTypesWithMemDesc(
392           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
393       .legalForTypesWithMemDesc({
394           // SVE vscale x 128 bit base sizes
395           {nxv16s8, p0, nxv16s8, 8},
396           {nxv8s16, p0, nxv8s16, 8},
397           {nxv4s32, p0, nxv4s32, 8},
398           {nxv2s64, p0, nxv2s64, 8},
399       })
400       .widenScalarToNextPow2(0, /* MinSize = */ 8)
401       .clampMaxNumElements(0, s8, 16)
402       .clampMaxNumElements(0, s16, 8)
403       .clampMaxNumElements(0, s32, 4)
404       .clampMaxNumElements(0, s64, 2)
405       .clampMaxNumElements(0, p0, 2)
406       .lowerIfMemSizeNotByteSizePow2()
407       .clampScalar(0, s8, s64)
408       .narrowScalarIf(
409           [=](const LegalityQuery &Query) {
410             // Clamp extending load results to 32-bits.
411             return Query.Types[0].isScalar() &&
412                    Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
413                    Query.Types[0].getSizeInBits() > 32;
414           },
415           changeTo(0, s32))
416       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
417       .bitcastIf(typeInSet(0, {v4s8}),
418                  [=](const LegalityQuery &Query) {
419                    const LLT VecTy = Query.Types[0];
420                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
421                  })
422       .customIf(IsPtrVecPred)
423       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
424       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
425 
426   getActionDefinitionsBuilder(G_STORE)
427       .customIf([=](const LegalityQuery &Query) {
428         return HasRCPC3 && Query.Types[0] == s128 &&
429                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
430       })
431       .customIf([=](const LegalityQuery &Query) {
432         return Query.Types[0] == s128 &&
433                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
434       })
435       .legalForTypesWithMemDesc(
436           {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
437            {s32, p0, s8, 8},                       // truncstorei8 from s32
438            {s64, p0, s8, 8},                       // truncstorei8 from s64
439            {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
440            {s64, p0, s16, 8},                      // truncstorei16 from s64
441            {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
442            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
443            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
444            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
445            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
446       .legalForTypesWithMemDesc({
447           // SVE vscale x 128 bit base sizes
448           // TODO: Add nxv2p0. Consider bitcastIf.
449           //       See #92130
450           // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
451           {nxv16s8, p0, nxv16s8, 8},
452           {nxv8s16, p0, nxv8s16, 8},
453           {nxv4s32, p0, nxv4s32, 8},
454           {nxv2s64, p0, nxv2s64, 8},
455       })
456       .clampScalar(0, s8, s64)
457       .minScalarOrElt(0, s8)
458       .lowerIf([=](const LegalityQuery &Query) {
459         return Query.Types[0].isScalar() &&
460                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
461       })
462       // Maximum: sN * k = 128
463       .clampMaxNumElements(0, s8, 16)
464       .clampMaxNumElements(0, s16, 8)
465       .clampMaxNumElements(0, s32, 4)
466       .clampMaxNumElements(0, s64, 2)
467       .clampMaxNumElements(0, p0, 2)
468       .lowerIfMemSizeNotPow2()
469       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
470       .bitcastIf(all(typeInSet(0, {v4s8}),
471                      LegalityPredicate([=](const LegalityQuery &Query) {
472                        return Query.Types[0].getSizeInBits() ==
473                               Query.MMODescrs[0].MemoryTy.getSizeInBits();
474                      })),
475                  [=](const LegalityQuery &Query) {
476                    const LLT VecTy = Query.Types[0];
477                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
478                  })
479       .customIf(IsPtrVecPred)
480       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
481       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
482       .lower();
483 
484   getActionDefinitionsBuilder(G_INDEXED_STORE)
485       // Idx 0 == Ptr, Idx 1 == Val
486       // TODO: We could implement legalization for these, but for now they
487       //       are only generated in a very specific way.
488       .legalForTypesWithMemDesc({
489           {p0, s8, s8, 8},
490           {p0, s16, s16, 8},
491           {p0, s32, s8, 8},
492           {p0, s32, s16, 8},
493           {p0, s32, s32, 8},
494           {p0, s64, s64, 8},
495           {p0, p0, p0, 8},
496           {p0, v8s8, v8s8, 8},
497           {p0, v16s8, v16s8, 8},
498           {p0, v4s16, v4s16, 8},
499           {p0, v8s16, v8s16, 8},
500           {p0, v2s32, v2s32, 8},
501           {p0, v4s32, v4s32, 8},
502           {p0, v2s64, v2s64, 8},
503           {p0, v2p0, v2p0, 8},
504           {p0, s128, s128, 8},
505       })
506       .unsupported();
507 
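  // Pre/post-indexed loads are only legal for non-atomic accesses of the
  // scalar, pointer, packed vector and s128 types with a p0 base; anything
  // else is unsupported rather than legalized.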
508   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
509     LLT LdTy = Query.Types[0];
510     LLT PtrTy = Query.Types[1];
511     if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
512         !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
513       return false;
514     if (PtrTy != p0)
515       return false;
516     return true;
517   };
518   getActionDefinitionsBuilder(G_INDEXED_LOAD)
519       .unsupportedIf(
520           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
521       .legalIf(IndexedLoadBasicPred)
522       .unsupported();
523   getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
524       .unsupportedIf(
525           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
526       .legalIf(all(typeInSet(0, {s16, s32, s64}),
527                    LegalityPredicate([=](const LegalityQuery &Q) {
528                      LLT LdTy = Q.Types[0];
529                      LLT PtrTy = Q.Types[1];
530                      LLT MemTy = Q.MMODescrs[0].MemoryTy;
531                      if (PtrTy != p0)
532                        return false;
533                      if (LdTy == s16)
534                        return MemTy == s8;
535                      if (LdTy == s32)
536                        return MemTy == s8 || MemTy == s16;
537                      if (LdTy == s64)
538                        return MemTy == s8 || MemTy == s16 || MemTy == s32;
539                      return false;
540                    })))
541       .unsupported();
542 
543   // Constants
544   getActionDefinitionsBuilder(G_CONSTANT)
545       .legalFor({p0, s8, s16, s32, s64})
546       .widenScalarToNextPow2(0)
547       .clampScalar(0, s8, s64);
548   getActionDefinitionsBuilder(G_FCONSTANT)
549       .legalFor({s32, s64, s128})
550       .legalFor(HasFP16, {s16})
551       .clampScalar(0, MinFPScalar, s128);
552 
553   // FIXME: fix moreElementsToNextPow2
554   getActionDefinitionsBuilder(G_ICMP)
555       .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
556       .widenScalarOrEltToNextPow2(1)
557       .clampScalar(1, s32, s64)
558       .clampScalar(0, s32, s32)
559       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
560       .minScalarEltSameAsIf(
561           [=](const LegalityQuery &Query) {
562             const LLT &Ty = Query.Types[0];
563             const LLT &SrcTy = Query.Types[1];
564             return Ty.isVector() && !SrcTy.isPointerVector() &&
565                    Ty.getElementType() != SrcTy.getElementType();
566           },
567           0, 1)
568       .minScalarOrEltIf(
569           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
570           1, s32)
571       .minScalarOrEltIf(
572           [=](const LegalityQuery &Query) {
573             return Query.Types[1].isPointerVector();
574           },
575           0, s64)
576       .moreElementsToNextPow2(1)
577       .clampNumElements(1, v8s8, v16s8)
578       .clampNumElements(1, v4s16, v8s16)
579       .clampNumElements(1, v2s32, v4s32)
580       .clampNumElements(1, v2s64, v2s64)
581       .clampNumElements(1, v2p0, v2p0)
582       .customIf(isVector(0));
583 
584   getActionDefinitionsBuilder(G_FCMP)
585       .legalFor({{s32, s32},
586                  {s32, s64},
587                  {v4s32, v4s32},
588                  {v2s32, v2s32},
589                  {v2s64, v2s64}})
590       .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
591       .widenScalarOrEltToNextPow2(1)
592       .clampScalar(0, s32, s32)
593       .minScalarOrElt(1, MinFPScalar)
594       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
595       .minScalarEltSameAsIf(
596           [=](const LegalityQuery &Query) {
597             const LLT &Ty = Query.Types[0];
598             const LLT &SrcTy = Query.Types[1];
599             return Ty.isVector() && !SrcTy.isPointerVector() &&
600                    Ty.getElementType() != SrcTy.getElementType();
601           },
602           0, 1)
603       .clampNumElements(1, v4s16, v8s16)
604       .clampNumElements(1, v2s32, v4s32)
605       .clampMaxNumElements(1, s64, 2)
606       .moreElementsToNextPow2(1)
607       .libcallFor({{s32, s128}});
608 
609   // Extensions
610   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
611     unsigned DstSize = Query.Types[0].getSizeInBits();
612 
613     // Handle legal vectors using legalFor
614     if (Query.Types[0].isVector())
615       return false;
616 
617     if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
618       return false; // Extending to a scalar s128 needs narrowing.
619 
620     const LLT &SrcTy = Query.Types[1];
621 
622     // Make sure we fit in a register otherwise. Don't bother checking that
623     // the source type is below 128 bits. We shouldn't be allowing anything
624     // through which is wider than the destination in the first place.
625     unsigned SrcSize = SrcTy.getSizeInBits();
626     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
627       return false;
628 
629     return true;
630   };
631   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
632       .legalIf(ExtLegalFunc)
633       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
634       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
635       .moreElementsToNextPow2(0)
636       .clampMaxNumElements(1, s8, 8)
637       .clampMaxNumElements(1, s16, 4)
638       .clampMaxNumElements(1, s32, 2)
639       // Tries to convert a large EXTEND into two smaller EXTENDs
640       .lowerIf([=](const LegalityQuery &Query) {
641         return (Query.Types[0].getScalarSizeInBits() >
642                 Query.Types[1].getScalarSizeInBits() * 2) &&
643                Query.Types[0].isVector() &&
644                (Query.Types[1].getScalarSizeInBits() == 8 ||
645                 Query.Types[1].getScalarSizeInBits() == 16);
646       })
647       .clampMinNumElements(1, s8, 8)
648       .clampMinNumElements(1, s16, 4);
649 
650   getActionDefinitionsBuilder(G_TRUNC)
651       .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
652       .moreElementsToNextPow2(0)
653       .clampMaxNumElements(0, s8, 8)
654       .clampMaxNumElements(0, s16, 4)
655       .clampMaxNumElements(0, s32, 2)
656       .minScalarOrEltIf(
657           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
658           0, s8)
659       .lowerIf([=](const LegalityQuery &Query) {
660         LLT DstTy = Query.Types[0];
661         LLT SrcTy = Query.Types[1];
662         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
663                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
664       })
665       .clampMinNumElements(0, s8, 8)
666       .clampMinNumElements(0, s16, 4)
667       .alwaysLegal();
668 
669   getActionDefinitionsBuilder(G_SEXT_INREG)
670       .legalFor({s32, s64})
671       .legalFor(PackedVectorAllTypeList)
672       .maxScalar(0, s64)
673       .clampNumElements(0, v8s8, v16s8)
674       .clampNumElements(0, v4s16, v8s16)
675       .clampNumElements(0, v2s32, v4s32)
676       .clampMaxNumElements(0, s64, 2)
677       .lower();
678 
679   // FP conversions
680   getActionDefinitionsBuilder(G_FPTRUNC)
681       .legalFor(
682           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
683       .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
684       .clampNumElements(0, v4s16, v4s16)
685       .clampNumElements(0, v2s32, v2s32)
686       .scalarize(0);
687 
688   getActionDefinitionsBuilder(G_FPEXT)
689       .legalFor(
690           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
691       .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
692       .clampNumElements(0, v4s32, v4s32)
693       .clampNumElements(0, v2s64, v2s64)
694       .scalarize(0);
695 
696   // Conversions
697   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
698       .legalFor({{s32, s32},
699                  {s64, s32},
700                  {s32, s64},
701                  {s64, s64},
702                  {v2s64, v2s64},
703                  {v4s32, v4s32},
704                  {v2s32, v2s32}})
705       .legalFor(HasFP16,
706                 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
707       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
708       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
709       // The range of an fp16 value fits into an i17, so we can narrow the
710       // result width to 64 bits.
711       .narrowScalarIf(
712           [=](const LegalityQuery &Query) {
713             return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
714           },
715           changeTo(0, s64))
716       .moreElementsToNextPow2(0)
717       .widenScalarOrEltToNextPow2OrMinSize(0)
718       .minScalar(0, s32)
719       .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
720       .widenScalarIf(
721           [=](const LegalityQuery &Query) {
722             return Query.Types[0].getScalarSizeInBits() <= 64 &&
723                    Query.Types[0].getScalarSizeInBits() >
724                        Query.Types[1].getScalarSizeInBits();
725           },
726           LegalizeMutations::changeElementSizeTo(1, 0))
727       .widenScalarIf(
728           [=](const LegalityQuery &Query) {
729             return Query.Types[1].getScalarSizeInBits() <= 64 &&
730                    Query.Types[0].getScalarSizeInBits() <
731                        Query.Types[1].getScalarSizeInBits();
732           },
733           LegalizeMutations::changeElementSizeTo(0, 1))
734       .clampNumElements(0, v4s16, v8s16)
735       .clampNumElements(0, v2s32, v4s32)
736       .clampMaxNumElements(0, s64, 2)
737       .libcallFor(
738           {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
739 
740   getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
741       .legalFor({{s32, s32},
742                  {s64, s32},
743                  {s32, s64},
744                  {s64, s64},
745                  {v2s64, v2s64},
746                  {v4s32, v4s32},
747                  {v2s32, v2s32}})
748       .legalFor(HasFP16,
749                 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
750       // Handle types larger than i64 by scalarizing/lowering.
751       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
752       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
753       // The range of an fp16 value fits into an i17, so we can narrow the
754       // result width to 64 bits.
755       .narrowScalarIf(
756           [=](const LegalityQuery &Query) {
757             return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
758           },
759           changeTo(0, s64))
760       .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
761       .moreElementsToNextPow2(0)
762       .widenScalarToNextPow2(0, /*MinSize=*/32)
763       .minScalar(0, s32)
764       .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
765       .widenScalarIf(
766           [=](const LegalityQuery &Query) {
767             unsigned ITySize = Query.Types[0].getScalarSizeInBits();
768             return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
769                    ITySize > Query.Types[1].getScalarSizeInBits();
770           },
771           LegalizeMutations::changeElementSizeTo(1, 0))
772       .widenScalarIf(
773           [=](const LegalityQuery &Query) {
774             unsigned FTySize = Query.Types[1].getScalarSizeInBits();
775             return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
776                    Query.Types[0].getScalarSizeInBits() < FTySize;
777           },
778           LegalizeMutations::changeElementSizeTo(0, 1))
779       .widenScalarOrEltToNextPow2(0)
780       .clampNumElements(0, v4s16, v8s16)
781       .clampNumElements(0, v2s32, v4s32)
782       .clampMaxNumElements(0, s64, 2);
783 
784   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
785       .legalFor({{s32, s32},
786                  {s64, s32},
787                  {s32, s64},
788                  {s64, s64},
789                  {v2s64, v2s64},
790                  {v4s32, v4s32},
791                  {v2s32, v2s32}})
792       .legalFor(HasFP16,
793                 {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
794       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
795       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
796       .moreElementsToNextPow2(1)
797       .widenScalarOrEltToNextPow2OrMinSize(1)
798       .minScalar(1, s32)
799       .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
800       .widenScalarIf(
801           [=](const LegalityQuery &Query) {
802             return Query.Types[1].getScalarSizeInBits() <= 64 &&
803                    Query.Types[0].getScalarSizeInBits() <
804                        Query.Types[1].getScalarSizeInBits();
805           },
806           LegalizeMutations::changeElementSizeTo(0, 1))
807       .widenScalarIf(
808           [=](const LegalityQuery &Query) {
809             return Query.Types[0].getScalarSizeInBits() <= 64 &&
810                    Query.Types[0].getScalarSizeInBits() >
811                        Query.Types[1].getScalarSizeInBits();
812           },
813           LegalizeMutations::changeElementSizeTo(1, 0))
814       .clampNumElements(0, v4s16, v8s16)
815       .clampNumElements(0, v2s32, v4s32)
816       .clampMaxNumElements(0, s64, 2)
817       .libcallFor({{s16, s128},
818                    {s32, s128},
819                    {s64, s128},
820                    {s128, s128},
821                    {s128, s32},
822                    {s128, s64}});
823 
824   // Control-flow
825   getActionDefinitionsBuilder(G_BRCOND)
826     .legalFor({s32})
827     .clampScalar(0, s32, s32);
828   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
829 
830   getActionDefinitionsBuilder(G_SELECT)
831       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
832       .widenScalarToNextPow2(0)
833       .clampScalar(0, s32, s64)
834       .clampScalar(1, s32, s32)
835       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
836       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
837       .lowerIf(isVector(0));
838 
839   // Pointer-handling
840   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
841 
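  // Under the small code model G_GLOBAL_VALUE is custom-legalized so that
  // legalizeSmallCMGlobalValue can materialize the address with an ADRP-based
  // sequence; other code models keep it legal.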
842   if (TM.getCodeModel() == CodeModel::Small)
843     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
844   else
845     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
846 
847   getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
848       .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
849 
850   getActionDefinitionsBuilder(G_PTRTOINT)
851       .legalFor({{s64, p0}, {v2s64, v2p0}})
852       .widenScalarToNextPow2(0, 64)
853       .clampScalar(0, s64, s64)
854       .clampMaxNumElements(0, s64, 2);
855 
856   getActionDefinitionsBuilder(G_INTTOPTR)
857       .unsupportedIf([&](const LegalityQuery &Query) {
858         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
859       })
860       .legalFor({{p0, s64}, {v2p0, v2s64}})
861       .clampMaxNumElements(1, s64, 2);
862 
863   // Bitcasts between types of 32- or 64-bit width are just copies.
864   // The same goes for 128-bit width types, except those live on the FPR bank.
865   getActionDefinitionsBuilder(G_BITCAST)
866       // Keeping 32-bit instructions legal to prevent regression in some tests
867       .legalForCartesianProduct({s32, v2s16, v4s8})
868       .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
869       .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
870       .customIf([=](const LegalityQuery &Query) {
871         // Handle casts from i1 vectors to scalars.
872         LLT DstTy = Query.Types[0];
873         LLT SrcTy = Query.Types[1];
874         return DstTy.isScalar() && SrcTy.isVector() &&
875                SrcTy.getScalarSizeInBits() == 1;
876       })
877       .lowerIf([=](const LegalityQuery &Query) {
878         return Query.Types[0].isVector() != Query.Types[1].isVector();
879       })
880       .moreElementsToNextPow2(0)
881       .clampNumElements(0, v8s8, v16s8)
882       .clampNumElements(0, v4s16, v8s16)
883       .clampNumElements(0, v2s32, v4s32)
884       .lower();
885 
886   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
887 
888   // va_list must be a pointer, but most sized types are pretty easy to handle
889   // as the destination.
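  // All supported cases are custom; the expansion is done in legalizeVaArg.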
890   getActionDefinitionsBuilder(G_VAARG)
891       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
892       .clampScalar(0, s8, s64)
893       .widenScalarToNextPow2(0, /*Min*/ 8);
894 
895   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
896       .lowerIf(
897           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
898 
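  // When the subtarget prefers outlined atomics and lacks LSE, cmpxchg and the
  // basic read-modify-write operations become libcalls; otherwise they stay
  // legal, with a custom expansion for 128-bit cmpxchg
  // (legalizeAtomicCmpxchg128).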
899   bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
900 
901   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
902       .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
903       .customFor(!UseOutlineAtomics, {{s128, p0}})
904       .libcallFor(UseOutlineAtomics,
905                   {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
906       .clampScalar(0, s32, s64);
907 
908   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
909                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
910                                G_ATOMICRMW_XOR})
911       .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
912       .libcallFor(UseOutlineAtomics,
913                   {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
914       .clampScalar(0, s32, s64);
915 
916   // Do not outline these atomic operations, as per the comment in
917   // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
918   getActionDefinitionsBuilder(
919       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
920       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
921       .clampScalar(0, s32, s64);
922 
923   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
924 
925   // Merge/Unmerge
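  // For G_MERGE_VALUES the wide type is the result (type index 0); for
  // G_UNMERGE_VALUES it is the source (type index 1).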
926   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
927     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
928     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
929     getActionDefinitionsBuilder(Op)
930         .widenScalarToNextPow2(LitTyIdx, 8)
931         .widenScalarToNextPow2(BigTyIdx, 32)
932         .clampScalar(LitTyIdx, s8, s64)
933         .clampScalar(BigTyIdx, s32, s128)
934         .legalIf([=](const LegalityQuery &Q) {
935           switch (Q.Types[BigTyIdx].getSizeInBits()) {
936           case 32:
937           case 64:
938           case 128:
939             break;
940           default:
941             return false;
942           }
943           switch (Q.Types[LitTyIdx].getSizeInBits()) {
944           case 8:
945           case 16:
946           case 32:
947           case 64:
948             return true;
949           default:
950             return false;
951           }
952         });
953   }
954 
955   // TODO: nxv4s16, nxv2s16, nxv2s32
956   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
957       .legalFor(HasSVE, {{s16, nxv16s8, s64},
958                          {s16, nxv8s16, s64},
959                          {s32, nxv4s32, s64},
960                          {s64, nxv2s64, s64}})
961       .unsupportedIf([=](const LegalityQuery &Query) {
962         const LLT &EltTy = Query.Types[1].getElementType();
963         if (Query.Types[1].isScalableVector())
964           return false;
965         return Query.Types[0] != EltTy;
966       })
967       .minScalar(2, s64)
968       .customIf([=](const LegalityQuery &Query) {
969         const LLT &VecTy = Query.Types[1];
970         return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
971                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
972                VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
973       })
974       .minScalarOrEltIf(
975           [=](const LegalityQuery &Query) {
976             // We want to promote <M x s1> to <M x s64> if that wouldn't
977             // cause the total vector size to be > 128b.
978             return Query.Types[1].isFixedVector() &&
979                    Query.Types[1].getNumElements() <= 2;
980           },
981           0, s64)
982       .minScalarOrEltIf(
983           [=](const LegalityQuery &Query) {
984             return Query.Types[1].isFixedVector() &&
985                    Query.Types[1].getNumElements() <= 4;
986           },
987           0, s32)
988       .minScalarOrEltIf(
989           [=](const LegalityQuery &Query) {
990             return Query.Types[1].isFixedVector() &&
991                    Query.Types[1].getNumElements() <= 8;
992           },
993           0, s16)
994       .minScalarOrEltIf(
995           [=](const LegalityQuery &Query) {
996             return Query.Types[1].isFixedVector() &&
997                    Query.Types[1].getNumElements() <= 16;
998           },
999           0, s8)
1000       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
1001       .moreElementsToNextPow2(1)
1002       .clampMaxNumElements(1, s64, 2)
1003       .clampMaxNumElements(1, s32, 4)
1004       .clampMaxNumElements(1, s16, 8)
1005       .clampMaxNumElements(1, s8, 16)
1006       .clampMaxNumElements(1, p0, 2);
1007 
1008   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
1009       .legalIf(
1010           typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
1011       .legalFor(HasSVE, {{nxv16s8, s32, s64},
1012                          {nxv8s16, s32, s64},
1013                          {nxv4s32, s32, s64},
1014                          {nxv2s64, s64, s64}})
1015       .moreElementsToNextPow2(0)
1016       .widenVectorEltsToVectorMinSize(0, 64)
1017       .clampNumElements(0, v8s8, v16s8)
1018       .clampNumElements(0, v4s16, v8s16)
1019       .clampNumElements(0, v2s32, v4s32)
1020       .clampMaxNumElements(0, s64, 2)
1021       .clampMaxNumElements(0, p0, 2);
1022 
1023   getActionDefinitionsBuilder(G_BUILD_VECTOR)
1024       .legalFor({{v8s8, s8},
1025                  {v16s8, s8},
1026                  {v4s16, s16},
1027                  {v8s16, s16},
1028                  {v2s32, s32},
1029                  {v4s32, s32},
1030                  {v2p0, p0},
1031                  {v2s64, s64}})
1032       .clampNumElements(0, v4s32, v4s32)
1033       .clampNumElements(0, v2s64, v2s64)
1034       .minScalarOrElt(0, s8)
1035       .widenVectorEltsToVectorMinSize(0, 64)
1036       .widenScalarOrEltToNextPow2(0)
1037       .minScalarSameAs(1, 0);
1038 
1039   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
1040 
1041   getActionDefinitionsBuilder(G_CTLZ)
1042       .legalForCartesianProduct(
1043           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
1044       .scalarize(1)
1045       .widenScalarToNextPow2(1, /*Min=*/32)
1046       .clampScalar(1, s32, s64)
1047       .scalarSameSizeAs(0, 1);
1048   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
1049 
1050   // TODO: Custom lowering for v2s32, v4s32, v2s64.
1051   getActionDefinitionsBuilder(G_BITREVERSE)
1052       .legalFor({s32, s64, v8s8, v16s8})
1053       .widenScalarToNextPow2(0, /*Min = */ 32)
1054       .clampScalar(0, s32, s64)
1055       .lower();
1056 
1057   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
1058 
1059   getActionDefinitionsBuilder(G_CTTZ)
1060       .lowerIf(isVector(0))
1061       .widenScalarToNextPow2(1, /*Min=*/32)
1062       .clampScalar(1, s32, s64)
1063       .scalarSameSizeAs(0, 1)
1064       .legalFor(HasCSSC, {s32, s64})
1065       .customFor(!HasCSSC, {s32, s64});
1066 
1067   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1068       .legalIf([=](const LegalityQuery &Query) {
1069         const LLT &DstTy = Query.Types[0];
1070         const LLT &SrcTy = Query.Types[1];
1071         // For now just support the TBL2 variant which needs the source vectors
1072         // to be the same size as the dest.
1073         if (DstTy != SrcTy)
1074           return false;
1075         return llvm::is_contained(
1076             {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
1077       })
1078       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
1079       // destinations; we just want those lowered into G_BUILD_VECTOR or
1080       // G_EXTRACT_ELEMENT.
1081       .lowerIf([=](const LegalityQuery &Query) {
1082         return !Query.Types[0].isVector() || !Query.Types[1].isVector();
1083       })
1084       .moreElementsIf(
1085           [](const LegalityQuery &Query) {
1086             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1087                    Query.Types[0].getNumElements() >
1088                        Query.Types[1].getNumElements();
1089           },
1090           changeTo(1, 0))
1091       .moreElementsToNextPow2(0)
1092       .moreElementsIf(
1093           [](const LegalityQuery &Query) {
1094             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1095                    Query.Types[0].getNumElements() <
1096                        Query.Types[1].getNumElements();
1097           },
1098           changeTo(0, 1))
1099       .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1100       .clampNumElements(0, v8s8, v16s8)
1101       .clampNumElements(0, v4s16, v8s16)
1102       .clampNumElements(0, v4s32, v4s32)
1103       .clampNumElements(0, v2s64, v2s64)
1104       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
1105       .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) {
1106         // Bitcast pointer vectors to vectors of i64.
1107         const LLT DstTy = Query.Types[0];
1108         return std::pair(0, LLT::vector(DstTy.getElementCount(), 64));
1109       });
1110 
1111   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1112       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
1113       .bitcastIf(
1114           [=](const LegalityQuery &Query) {
1115             return Query.Types[0].getSizeInBits() <= 128 &&
1116                    Query.Types[1].getSizeInBits() <= 64;
1117           },
1118           [=](const LegalityQuery &Query) {
1119             const LLT DstTy = Query.Types[0];
1120             const LLT SrcTy = Query.Types[1];
1121             return std::pair(
1122                 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1123                        .changeElementCount(
1124                            DstTy.getElementCount().divideCoefficientBy(
1125                                SrcTy.getNumElements())));
1126           });
1127 
1128   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1129 
1130   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1131 
1132   getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1133 
1134   getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1135 
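  // With MOPS (the Armv8.8 memory-operation instructions) the memory
  // intrinsics can be handled directly; without it they become libcalls.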
1136   if (ST.hasMOPS()) {
1137     // G_BZERO is not supported. Currently it is only emitted by
1138     // PreLegalizerCombiner for G_MEMSET with a zero constant.
1139     getActionDefinitionsBuilder(G_BZERO).unsupported();
1140 
1141     getActionDefinitionsBuilder(G_MEMSET)
1142         .legalForCartesianProduct({p0}, {s64}, {s64})
1143         .customForCartesianProduct({p0}, {s8}, {s64})
1144         .immIdx(0); // Inform verifier imm idx 0 is handled.
1145 
1146     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1147         .legalForCartesianProduct({p0}, {p0}, {s64})
1148         .immIdx(0); // Inform verifier imm idx 0 is handled.
1149 
1150     // G_MEMCPY_INLINE does not have a tailcall immediate
1151     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1152         .legalForCartesianProduct({p0}, {p0}, {s64});
1153 
1154   } else {
1155     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1156         .libcall();
1157   }
1158 
1159   // FIXME: These vector types are only legal with NEON.
1160   getActionDefinitionsBuilder(G_ABS)
1161       .legalFor(HasCSSC, {s32, s64})
1162       .legalFor(PackedVectorAllTypeList)
1163       .customIf([=](const LegalityQuery &Q) {
1164         // TODO: Fix suboptimal codegen for 128+ bit types.
1165         LLT SrcTy = Q.Types[0];
1166         return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
1167       })
1168       .widenScalarIf(
1169           [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
1170           [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
1171       .widenScalarIf(
1172           [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
1173           [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
1174       .clampNumElements(0, v8s8, v16s8)
1175       .clampNumElements(0, v4s16, v8s16)
1176       .clampNumElements(0, v2s32, v4s32)
1177       .clampNumElements(0, v2s64, v2s64)
1178       .moreElementsToNextPow2(0)
1179       .lower();
1180 
1181   // For fadd reductions we have pairwise operations available. We treat the
1182   // usual legal types as legal and handle the lowering to pairwise instructions
1183   // later.
1184   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1185       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1186       .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1187       .minScalarOrElt(0, MinFPScalar)
1188       .clampMaxNumElements(1, s64, 2)
1189       .clampMaxNumElements(1, s32, 4)
1190       .clampMaxNumElements(1, s16, 8)
1191       .lower();
1192 
1193   // For fmul reductions we need to split them up into individual operations.
1194   // We clamp to 128-bit vectors and then to 64-bit vectors to produce a cascade of
1195   // smaller types, followed by scalarizing what remains.
1196   getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1197       .minScalarOrElt(0, MinFPScalar)
1198       .clampMaxNumElements(1, s64, 2)
1199       .clampMaxNumElements(1, s32, 4)
1200       .clampMaxNumElements(1, s16, 8)
1201       .clampMaxNumElements(1, s32, 2)
1202       .clampMaxNumElements(1, s16, 4)
1203       .scalarize(1)
1204       .lower();
1205 
1206   getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1207       .scalarize(2)
1208       .lower();
1209 
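  // Integer add reductions are legal for the common NEON types; wider vectors
  // are clamped to 128 bits first and the remaining cases are lowered.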
1210   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1211       .legalFor({{s8, v16s8},
1212                  {s8, v8s8},
1213                  {s16, v8s16},
1214                  {s16, v4s16},
1215                  {s32, v4s32},
1216                  {s32, v2s32},
1217                  {s64, v2s64}})
1218       .clampMaxNumElements(1, s64, 2)
1219       .clampMaxNumElements(1, s32, 4)
1220       .clampMaxNumElements(1, s16, 8)
1221       .clampMaxNumElements(1, s8, 16)
1222       .lower();
1223 
1224   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1225                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1226       .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1227       .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1228       .minScalarOrElt(0, MinFPScalar)
1229       .clampMaxNumElements(1, s64, 2)
1230       .clampMaxNumElements(1, s32, 4)
1231       .clampMaxNumElements(1, s16, 8)
1232       .lower();
1233 
1234   getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1235       .clampMaxNumElements(1, s32, 2)
1236       .clampMaxNumElements(1, s16, 4)
1237       .clampMaxNumElements(1, s8, 8)
1238       .scalarize(1)
1239       .lower();
1240 
1241   getActionDefinitionsBuilder(
1242       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1243       .legalFor({{s8, v8s8},
1244                  {s8, v16s8},
1245                  {s16, v4s16},
1246                  {s16, v8s16},
1247                  {s32, v2s32},
1248                  {s32, v4s32}})
1249       .moreElementsIf(
1250           [=](const LegalityQuery &Query) {
1251             return Query.Types[1].isVector() &&
1252                    Query.Types[1].getElementType() != s8 &&
1253                    Query.Types[1].getNumElements() & 1;
1254           },
1255           LegalizeMutations::moreElementsToNextPow2(1))
1256       .clampMaxNumElements(1, s64, 2)
1257       .clampMaxNumElements(1, s32, 4)
1258       .clampMaxNumElements(1, s16, 8)
1259       .clampMaxNumElements(1, s8, 16)
1260       .scalarize(1)
1261       .lower();
1262 
1263   getActionDefinitionsBuilder(
1264       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1265       // Try to break down into smaller vectors as long as they're at least 64
1266       // bits. This lets us use vector operations for some parts of the
1267       // reduction.
1268       .fewerElementsIf(
1269           [=](const LegalityQuery &Q) {
1270             LLT SrcTy = Q.Types[1];
1271             if (SrcTy.isScalar())
1272               return false;
1273             if (!isPowerOf2_32(SrcTy.getNumElements()))
1274               return false;
1275             // We can usually perform 64b vector operations.
1276             return SrcTy.getSizeInBits() > 64;
1277           },
1278           [=](const LegalityQuery &Q) {
1279             LLT SrcTy = Q.Types[1];
1280             return std::make_pair(1, SrcTy.divide(2));
1281           })
1282       .scalarize(1)
1283       .lower();
1284 
1285   // TODO: Update this to correct handling when adding AArch64/SVE support.
1286   getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1287 
1288   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1289       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1290       .lower();
1291 
1292   getActionDefinitionsBuilder(G_ROTR)
1293       .legalFor({{s32, s64}, {s64, s64}})
1294       .customIf([=](const LegalityQuery &Q) {
1295         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1296       })
1297       .lower();
1298   getActionDefinitionsBuilder(G_ROTL).lower();
1299 
1300   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1301       .customFor({{s32, s32}, {s64, s64}});
1302 
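  // Scalar CTPOP is only a native instruction with CSSC. Without it, scalar
  // and the remaining vector cases take the custom path (legalizeCTPOP), which
  // counts bits using the NEON byte-wise CNT sequence.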
1303   auto always = [=](const LegalityQuery &Q) { return true; };
1304   getActionDefinitionsBuilder(G_CTPOP)
1305       .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
1306       .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
1307       .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
1308       .customFor({{s128, s128},
1309                   {v2s64, v2s64},
1310                   {v2s32, v2s32},
1311                   {v4s32, v4s32},
1312                   {v4s16, v4s16},
1313                   {v8s16, v8s16}})
1314       .clampScalar(0, s32, s128)
1315       .widenScalarToNextPow2(0)
1316       .minScalarEltSameAsIf(always, 1, 0)
1317       .maxScalarEltSameAsIf(always, 1, 0);
1318 
1319   getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
1320       .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
1321       .legalFor(HasSVE, {nxv2s64, nxv4s32, nxv8s16, nxv16s8})
1322       .clampNumElements(0, v8s8, v16s8)
1323       .clampNumElements(0, v4s16, v8s16)
1324       .clampNumElements(0, v2s32, v4s32)
1325       .clampMaxNumElements(0, s64, 2)
1326       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
1327       .moreElementsToNextPow2(0)
1328       .lower();
1329 
1330   // TODO: Libcall support for s128.
1331   // TODO: s16 should be legal with full FP16 support.
1332   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1333       .legalFor({{s64, s32}, {s64, s64}});
1334 
1335   // TODO: Custom legalization for mismatched types.
1336   getActionDefinitionsBuilder(G_FCOPYSIGN)
1337       .moreElementsIf(
1338           [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
1339           [=](const LegalityQuery &Query) {
1340             const LLT Ty = Query.Types[0];
1341             return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
1342           })
1343       .lower();
1344 
1345   getActionDefinitionsBuilder(G_FMAD).lower();
1346 
1347   // Access to floating-point environment.
1348   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1349                                G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1350       .libcall();
1351 
1352   getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1353 
1354   getActionDefinitionsBuilder(G_PREFETCH).custom();
1355 
1356   getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1357 
1358   getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
1359       .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
1360       .widenScalarOrEltToNextPow2(0)
1361       .immIdx(0); // Inform verifier imm idx 0 is handled.
1362 
1363   // TODO: {nxv16s8, s8}, {nxv8s16, s16}
1364   getActionDefinitionsBuilder(G_SPLAT_VECTOR)
1365       .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}});
1366 
1367   getLegacyLegalizerInfo().computeTables();
1368   verify(*ST.getInstrInfo());
1369 }
1370 
1371 bool AArch64LegalizerInfo::legalizeCustom(
1372     LegalizerHelper &Helper, MachineInstr &MI,
1373     LostDebugLocObserver &LocObserver) const {
1374   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1375   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1376   GISelChangeObserver &Observer = Helper.Observer;
1377   switch (MI.getOpcode()) {
1378   default:
1379     // No idea what to do.
1380     return false;
1381   case TargetOpcode::G_VAARG:
1382     return legalizeVaArg(MI, MRI, MIRBuilder);
1383   case TargetOpcode::G_LOAD:
1384   case TargetOpcode::G_STORE:
1385     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1386   case TargetOpcode::G_SHL:
1387   case TargetOpcode::G_ASHR:
1388   case TargetOpcode::G_LSHR:
1389     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1390   case TargetOpcode::G_GLOBAL_VALUE:
1391     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1392   case TargetOpcode::G_SBFX:
1393   case TargetOpcode::G_UBFX:
1394     return legalizeBitfieldExtract(MI, MRI, Helper);
1395   case TargetOpcode::G_FSHL:
1396   case TargetOpcode::G_FSHR:
1397     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1398   case TargetOpcode::G_ROTR:
1399     return legalizeRotate(MI, MRI, Helper);
1400   case TargetOpcode::G_CTPOP:
1401     return legalizeCTPOP(MI, MRI, Helper);
1402   case TargetOpcode::G_ATOMIC_CMPXCHG:
1403     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1404   case TargetOpcode::G_CTTZ:
1405     return legalizeCTTZ(MI, Helper);
1406   case TargetOpcode::G_BZERO:
1407   case TargetOpcode::G_MEMCPY:
1408   case TargetOpcode::G_MEMMOVE:
1409   case TargetOpcode::G_MEMSET:
1410     return legalizeMemOps(MI, Helper);
1411   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1412     return legalizeExtractVectorElt(MI, MRI, Helper);
1413   case TargetOpcode::G_DYN_STACKALLOC:
1414     return legalizeDynStackAlloc(MI, Helper);
1415   case TargetOpcode::G_PREFETCH:
1416     return legalizePrefetch(MI, Helper);
1417   case TargetOpcode::G_ABS:
1418     return Helper.lowerAbsToCNeg(MI);
1419   case TargetOpcode::G_ICMP:
1420     return legalizeICMP(MI, MRI, MIRBuilder);
1421   case TargetOpcode::G_BITCAST:
1422     return legalizeBitcast(MI, Helper);
1423   }
1424 
1425   llvm_unreachable("expected switch to return");
1426 }
1427 
1428 bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI,
1429                                            LegalizerHelper &Helper) const {
1430   assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode");
1431   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
1432   // Handle casts from i1 vectors to scalars by storing the vector to the stack
1433   // and reloading it as a scalar.
1434   if (!DstTy.isScalar() || !SrcTy.isVector() ||
1435       SrcTy.getElementType() != LLT::scalar(1))
1436     return false;
1437 
1438   Helper.createStackStoreLoad(DstReg, SrcReg);
1439   MI.eraseFromParent();
1440   return true;
1441 }
1442 
1443 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1444                                                MachineRegisterInfo &MRI,
1445                                                MachineIRBuilder &MIRBuilder,
1446                                                GISelChangeObserver &Observer,
1447                                                LegalizerHelper &Helper) const {
1448   assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1449          MI.getOpcode() == TargetOpcode::G_FSHR);
1450 
1451   // Keep as G_FSHR if the shift amount is a G_CONSTANT; otherwise fall back to
1452   // the generic lowering.
1453   Register ShiftNo = MI.getOperand(3).getReg();
1454   LLT ShiftTy = MRI.getType(ShiftNo);
1455   auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1456 
1457   // Adjust the shift amount according to the opcode: a G_FSHL by N is
1458   // converted to a G_FSHR by BitWidth - N.
1459   LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1460   APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1461 
1462   // Lower non-constant shifts and leave zero shifts to the optimizer.
1463   if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1464     return (Helper.lowerFunnelShiftAsShifts(MI) ==
1465             LegalizerHelper::LegalizeResult::Legalized);
1466 
1467   APInt Amount = VRegAndVal->Value.urem(BitWidth);
1468 
1469   Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
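       // E.g. on s32, G_FSHL %x, %y, 3 and G_FSHR %x, %y, 29 extract the same
       // bits from the concatenation %x:%y.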
1470 
1471   // A G_FSHR whose shift amount is a 64-bit G_CONSTANT in the range
1472   // [0, BitWidth) is already legal.
1473   if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1474       VRegAndVal->Value.ult(BitWidth))
1475     return true;
1476 
1477   // Materialize the adjusted shift amount as a 64-bit constant.
1478   auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1479 
1480   if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1481     Observer.changingInstr(MI);
1482     MI.getOperand(3).setReg(Cast64.getReg(0));
1483     Observer.changedInstr(MI);
1484   }
1485   // If the opcode is G_FSHL, erase the G_FSHL and build an equivalent G_FSHR
1486   // instruction instead.
1487   else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1488     MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1489                           {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1490                            Cast64.getReg(0)});
1491     MI.eraseFromParent();
1492   }
1493   return true;
1494 }
1495 
1496 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1497                                         MachineRegisterInfo &MRI,
1498                                         MachineIRBuilder &MIRBuilder) const {
1499   Register DstReg = MI.getOperand(0).getReg();
1500   Register SrcReg1 = MI.getOperand(2).getReg();
1501   Register SrcReg2 = MI.getOperand(3).getReg();
1502   LLT DstTy = MRI.getType(DstReg);
1503   LLT SrcTy = MRI.getType(SrcReg1);
1504 
1505   // Check the vector types are legal
1506   if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1507       DstTy.getNumElements() != SrcTy.getNumElements() ||
1508       (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1509     return false;
1510 
1511   // Lower G_ICMP NE => G_ICMP EQ so that subsequent passes can pattern-match
1512   // the compare more easily.
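       // I.e. (a != b) is rewritten as NOT (a == b); buildNot emits a G_XOR
       // against an all-ones constant.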
1513   CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1514   if (Pred != CmpInst::ICMP_NE)
1515     return true;
1516   Register CmpReg =
1517       MIRBuilder
1518           .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1519           .getReg(0);
1520   MIRBuilder.buildNot(DstReg, CmpReg);
1521 
1522   MI.eraseFromParent();
1523   return true;
1524 }
1525 
1526 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1527                                           MachineRegisterInfo &MRI,
1528                                           LegalizerHelper &Helper) const {
1529   // To allow for imported patterns to match, we ensure that the rotate amount
1530   // is 64b with an extension.
1531   Register AmtReg = MI.getOperand(2).getReg();
1532   LLT AmtTy = MRI.getType(AmtReg);
1533   (void)AmtTy;
1534   assert(AmtTy.isScalar() && "Expected a scalar rotate");
1535   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1536   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1537   Helper.Observer.changingInstr(MI);
1538   MI.getOperand(2).setReg(NewAmt.getReg(0));
1539   Helper.Observer.changedInstr(MI);
1540   return true;
1541 }
1542 
1543 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1544     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1545     GISelChangeObserver &Observer) const {
1546   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1547   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1548   // G_ADD_LOW instructions.
1549   // By splitting this here, we can optimize accesses in the small code model by
1550   // folding the G_ADD_LOW into the load/store offset.
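       // E.g. the small-code-model address of `sym` is materialized as
       //   adrp x0, sym             ; page address
       //   add  x0, x0, :lo12:sym   ; low 12 bits
       // and the :lo12: add can often be folded into a load/store offset instead.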
1551   auto &GlobalOp = MI.getOperand(1);
1552   // Don't modify an intrinsic call.
1553   if (GlobalOp.isSymbol())
1554     return true;
1555   const auto *GV = GlobalOp.getGlobal();
1556   if (GV->isThreadLocal())
1557     return true; // Don't want to modify TLS vars.
1558 
1559   auto &TM = ST->getTargetLowering()->getTargetMachine();
1560   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1561 
1562   if (OpFlags & AArch64II::MO_GOT)
1563     return true;
1564 
1565   auto Offset = GlobalOp.getOffset();
1566   Register DstReg = MI.getOperand(0).getReg();
1567   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1568                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1569   // Set the regclass on the dest reg too.
1570   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1571 
1572   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1573   // by creating a MOVK that sets bits 48-63 of the register to (global address
1574   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1575   // prevent an incorrect tag being generated during relocation when the
1576   // global appears before the code section. Without the offset, a global at
1577   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1578   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1579   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1580   // instead of `0xf`.
1581   // This assumes that we're in the small code model so we can assume a binary
1582   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1583   // binary must also be loaded into address range [0, 2^48). Both of these
1584   // properties need to be ensured at runtime when using tagged addresses.
1585   if (OpFlags & AArch64II::MO_TAGGED) {
1586     assert(!Offset &&
1587            "Should not have folded in an offset for a tagged global!");
1588     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1589                .addGlobalAddress(GV, 0x100000000,
1590                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1591                .addImm(48);
1592     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1593   }
1594 
1595   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1596       .addGlobalAddress(GV, Offset,
1597                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1598   MI.eraseFromParent();
1599   return true;
1600 }
1601 
1602 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1603                                              MachineInstr &MI) const {
1604   auto LowerBinOp = [&MI](unsigned Opcode) {
1605     MachineIRBuilder MIB(MI);
1606     MIB.buildInstr(Opcode, {MI.getOperand(0)},
1607                    {MI.getOperand(2), MI.getOperand(3)});
1608     MI.eraseFromParent();
1609     return true;
1610   };
1611 
1612   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1613   switch (IntrinsicID) {
1614   case Intrinsic::vacopy: {
1615     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1616     unsigned VaListSize =
1617       (ST->isTargetDarwin() || ST->isTargetWindows())
1618           ? PtrSize
1619           : ST->isTargetILP32() ? 20 : 32;
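         // On Darwin and Windows va_list is a single pointer, whereas the AAPCS64
         // va_list is a 32-byte struct (20 bytes under ILP32), so copy the whole
         // object by value.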
1620 
1621     MachineFunction &MF = *MI.getMF();
1622     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1623         LLT::scalar(VaListSize * 8));
1624     MachineIRBuilder MIB(MI);
1625     MIB.buildLoad(Val, MI.getOperand(2),
1626                   *MF.getMachineMemOperand(MachinePointerInfo(),
1627                                            MachineMemOperand::MOLoad,
1628                                            VaListSize, Align(PtrSize)));
1629     MIB.buildStore(Val, MI.getOperand(1),
1630                    *MF.getMachineMemOperand(MachinePointerInfo(),
1631                                             MachineMemOperand::MOStore,
1632                                             VaListSize, Align(PtrSize)));
1633     MI.eraseFromParent();
1634     return true;
1635   }
1636   case Intrinsic::get_dynamic_area_offset: {
1637     MachineIRBuilder &MIB = Helper.MIRBuilder;
1638     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1639     MI.eraseFromParent();
1640     return true;
1641   }
1642   case Intrinsic::aarch64_mops_memset_tag: {
1643     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1644     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1645     // the instruction).
1646     MachineIRBuilder MIB(MI);
1647     auto &Value = MI.getOperand(3);
1648     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1649     Value.setReg(ExtValueReg);
1650     return true;
1651   }
1652   case Intrinsic::aarch64_prefetch: {
1653     MachineIRBuilder MIB(MI);
1654     auto &AddrVal = MI.getOperand(1);
1655 
1656     int64_t IsWrite = MI.getOperand(2).getImm();
1657     int64_t Target = MI.getOperand(3).getImm();
1658     int64_t IsStream = MI.getOperand(4).getImm();
1659     int64_t IsData = MI.getOperand(5).getImm();
1660 
1661     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1662                      (!IsData << 3) |    // IsDataCache bit
1663                      (Target << 1) |     // Cache level bits
1664                      (unsigned)IsStream; // Stream bit
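         // E.g. IsWrite=1, IsData=1, Target=0, IsStream=0 gives PrfOp 0b10000,
         // which corresponds to PSTL1KEEP.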
1665 
1666     MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1667     MI.eraseFromParent();
1668     return true;
1669   }
1670   case Intrinsic::aarch64_neon_uaddv:
1671   case Intrinsic::aarch64_neon_saddv:
1672   case Intrinsic::aarch64_neon_umaxv:
1673   case Intrinsic::aarch64_neon_smaxv:
1674   case Intrinsic::aarch64_neon_uminv:
1675   case Intrinsic::aarch64_neon_sminv: {
1676     MachineIRBuilder MIB(MI);
1677     MachineRegisterInfo &MRI = *MIB.getMRI();
1678     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1679                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1680                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
1681 
1682     auto OldDst = MI.getOperand(0).getReg();
1683     auto OldDstTy = MRI.getType(OldDst);
1684     LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1685     if (OldDstTy == NewDstTy)
1686       return true;
1687 
1688     auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1689 
1690     Helper.Observer.changingInstr(MI);
1691     MI.getOperand(0).setReg(NewDst);
1692     Helper.Observer.changedInstr(MI);
1693 
1694     MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1695     MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1696                         OldDst, NewDst);
1697 
1698     return true;
1699   }
1700   case Intrinsic::aarch64_neon_uaddlp:
1701   case Intrinsic::aarch64_neon_saddlp: {
1702     MachineIRBuilder MIB(MI);
1703 
1704     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1705                        ? AArch64::G_UADDLP
1706                        : AArch64::G_SADDLP;
1707     MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1708     MI.eraseFromParent();
1709 
1710     return true;
1711   }
1712   case Intrinsic::aarch64_neon_uaddlv:
1713   case Intrinsic::aarch64_neon_saddlv: {
1714     MachineIRBuilder MIB(MI);
1715     MachineRegisterInfo &MRI = *MIB.getMRI();
1716 
1717     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1718                        ? AArch64::G_UADDLV
1719                        : AArch64::G_SADDLV;
1720     Register DstReg = MI.getOperand(0).getReg();
1721     Register SrcReg = MI.getOperand(2).getReg();
1722     LLT DstTy = MRI.getType(DstReg);
1723 
1724     LLT MidTy, ExtTy;
1725     if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1726       MidTy = LLT::fixed_vector(4, 32);
1727       ExtTy = LLT::scalar(32);
1728     } else {
1729       MidTy = LLT::fixed_vector(2, 64);
1730       ExtTy = LLT::scalar(64);
1731     }
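         // The (S|U)ADDLV result is produced in lane 0 of a vector register, so
         // extract lane 0 and then truncate or copy it to the destination type.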
1732 
1733     Register MidReg =
1734         MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1735     Register ZeroReg =
1736         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1737     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1738                                      {MidReg, ZeroReg})
1739                           .getReg(0);
1740 
1741     if (DstTy.getScalarSizeInBits() < 32)
1742       MIB.buildTrunc(DstReg, ExtReg);
1743     else
1744       MIB.buildCopy(DstReg, ExtReg);
1745 
1746     MI.eraseFromParent();
1747 
1748     return true;
1749   }
1750   case Intrinsic::aarch64_neon_smax:
1751     return LowerBinOp(TargetOpcode::G_SMAX);
1752   case Intrinsic::aarch64_neon_smin:
1753     return LowerBinOp(TargetOpcode::G_SMIN);
1754   case Intrinsic::aarch64_neon_umax:
1755     return LowerBinOp(TargetOpcode::G_UMAX);
1756   case Intrinsic::aarch64_neon_umin:
1757     return LowerBinOp(TargetOpcode::G_UMIN);
1758   case Intrinsic::aarch64_neon_fmax:
1759     return LowerBinOp(TargetOpcode::G_FMAXIMUM);
1760   case Intrinsic::aarch64_neon_fmin:
1761     return LowerBinOp(TargetOpcode::G_FMINIMUM);
1762   case Intrinsic::aarch64_neon_fmaxnm:
1763     return LowerBinOp(TargetOpcode::G_FMAXNUM);
1764   case Intrinsic::aarch64_neon_fminnm:
1765     return LowerBinOp(TargetOpcode::G_FMINNUM);
1766   case Intrinsic::aarch64_neon_smull:
1767     return LowerBinOp(AArch64::G_SMULL);
1768   case Intrinsic::aarch64_neon_umull:
1769     return LowerBinOp(AArch64::G_UMULL);
1770   case Intrinsic::aarch64_neon_abs: {
1771     // Lower the intrinsic to G_ABS.
1772     MachineIRBuilder MIB(MI);
1773     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
1774     MI.eraseFromParent();
1775     return true;
1776   }
1777 
1778   case Intrinsic::vector_reverse:
1779     // TODO: Add support for vector_reverse
1780     return false;
1781   }
1782 
1783   return true;
1784 }
1785 
1786 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1787     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1788     GISelChangeObserver &Observer) const {
1789   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1790          MI.getOpcode() == TargetOpcode::G_LSHR ||
1791          MI.getOpcode() == TargetOpcode::G_SHL);
1792   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1793   // imported patterns can select it later. Either way, it will be legal.
1794   Register AmtReg = MI.getOperand(2).getReg();
1795   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1796   if (!VRegAndVal)
1797     return true;
1798   // Check the shift amount is in range for an immediate form.
1799   int64_t Amount = VRegAndVal->Value.getSExtValue();
1800   if (Amount > 31)
1801     return true; // This will have to remain a register variant.
1802   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1803   Observer.changingInstr(MI);
1804   MI.getOperand(2).setReg(ExtCst.getReg(0));
1805   Observer.changedInstr(MI);
1806   return true;
1807 }
1808 
1809 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1810                                 MachineRegisterInfo &MRI) {
1811   Base = Root;
1812   Offset = 0;
1813 
1814   Register NewBase;
1815   int64_t NewOffset;
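       // LDP/STP of 64-bit registers take a signed 7-bit immediate scaled by 8,
       // i.e. a multiple of 8 in the range [-512, 504].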
1816   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1817       isShiftedInt<7, 3>(NewOffset)) {
1818     Base = NewBase;
1819     Offset = NewOffset;
1820   }
1821 }
1822 
1823 // FIXME: This should be removed and replaced with the generic bitcast legalize
1824 // action.
1825 bool AArch64LegalizerInfo::legalizeLoadStore(
1826     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1827     GISelChangeObserver &Observer) const {
1828   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1829          MI.getOpcode() == TargetOpcode::G_LOAD);
1830   // Here we just try to handle vector loads/stores where our value type might
1831   // have pointer elements, which the SelectionDAG importer can't handle. To
1832   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1833   // the value to use s64 types.
1834 
1835   // Custom legalization requires that the instruction, if not deleted, be
1836   // fully legal after this returns. To allow further legalization, we create
1837   // a new instruction and erase the existing one.
1838 
1839   Register ValReg = MI.getOperand(0).getReg();
1840   const LLT ValTy = MRI.getType(ValReg);
1841 
1842   if (ValTy == LLT::scalar(128)) {
1843 
1844     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1845     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1846     bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1847     bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1848     bool IsRcpC3 =
1849         ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1850 
1851     LLT s64 = LLT::scalar(64);
1852 
1853     unsigned Opcode;
1854     if (IsRcpC3) {
1855       Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1856     } else {
1857       // For LSE2, loads/stores should have been converted to monotonic and had
1858       // a fence inserted after them.
1859       assert(Ordering == AtomicOrdering::Monotonic ||
1860              Ordering == AtomicOrdering::Unordered);
1861       assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1862 
1863       Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1864     }
1865 
1866     MachineInstrBuilder NewI;
1867     if (IsLoad) {
1868       NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1869       MIRBuilder.buildMergeLikeInstr(
1870           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1871     } else {
1872       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1873       NewI = MIRBuilder.buildInstr(
1874           Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1875     }
1876 
1877     if (IsRcpC3) {
1878       NewI.addUse(MI.getOperand(1).getReg());
1879     } else {
1880       Register Base;
1881       int Offset;
1882       matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1883       NewI.addUse(Base);
1884       NewI.addImm(Offset / 8);
1885     }
1886 
1887     NewI.cloneMemRefs(MI);
1888     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1889                                      *MRI.getTargetRegisterInfo(),
1890                                      *ST->getRegBankInfo());
1891     MI.eraseFromParent();
1892     return true;
1893   }
1894 
1895   if (!ValTy.isPointerVector() ||
1896       ValTy.getElementType().getAddressSpace() != 0) {
1897     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store\n");
1898     return false;
1899   }
1900 
1901   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1902   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1903   auto &MMO = **MI.memoperands_begin();
1904   MMO.setType(NewTy);
1905 
1906   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1907     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1908     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1909   } else {
1910     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1911     MIRBuilder.buildBitcast(ValReg, NewLoad);
1912   }
1913   MI.eraseFromParent();
1914   return true;
1915 }
1916 
1917 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1918                                          MachineRegisterInfo &MRI,
1919                                          MachineIRBuilder &MIRBuilder) const {
1920   MachineFunction &MF = MIRBuilder.getMF();
1921   Align Alignment(MI.getOperand(2).getImm());
1922   Register Dst = MI.getOperand(0).getReg();
1923   Register ListPtr = MI.getOperand(1).getReg();
1924 
1925   LLT PtrTy = MRI.getType(ListPtr);
1926   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1927 
1928   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1929   const Align PtrAlign = Align(PtrSize);
1930   auto List = MIRBuilder.buildLoad(
1931       PtrTy, ListPtr,
1932       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1933                                PtrTy, PtrAlign));
1934 
1935   MachineInstrBuilder DstPtr;
1936   if (Alignment > PtrAlign) {
1937     // Realign the list to the actual required alignment.
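         // Adding Alignment - 1 and then clearing the low Log2(Alignment) bits
         // rounds the pointer up to the next Alignment boundary.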
1938     auto AlignMinus1 =
1939         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1940     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1941     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1942   } else
1943     DstPtr = List;
1944 
1945   LLT ValTy = MRI.getType(Dst);
1946   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1947   MIRBuilder.buildLoad(
1948       Dst, DstPtr,
1949       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1950                                ValTy, std::max(Alignment, PtrAlign)));
1951 
1952   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1953 
1954   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1955 
1956   MIRBuilder.buildStore(NewList, ListPtr,
1957                         *MF.getMachineMemOperand(MachinePointerInfo(),
1958                                                  MachineMemOperand::MOStore,
1959                                                  PtrTy, PtrAlign));
1960 
1961   MI.eraseFromParent();
1962   return true;
1963 }
1964 
1965 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1966     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1967   // Only legal if we can select immediate forms.
1968   // TODO: Lower this otherwise.
1969   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1970          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1971 }
1972 
1973 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1974                                          MachineRegisterInfo &MRI,
1975                                          LegalizerHelper &Helper) const {
1976   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1977   // it can be more efficiently lowered to the following sequence that uses
1978   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1979   // registers are cheap.
1980   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1981   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1982   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1983   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1984   //
1985   // For 128 bit vector popcounts, we lower to the following sequence:
1986   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1987   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1988   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1989   //  uaddlp.2d v0, v0  //               v2s64
1990   //
1991   // For 64 bit vector popcounts, we lower to the following sequence:
1992   //  cnt.8b    v0, v0  // v4s16, v2s32
1993   //  uaddlp.4h v0, v0  // v4s16, v2s32
1994   //  uaddlp.2s v0, v0  //        v2s32
1995 
1996   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1997   Register Dst = MI.getOperand(0).getReg();
1998   Register Val = MI.getOperand(1).getReg();
1999   LLT Ty = MRI.getType(Val);
2000   unsigned Size = Ty.getSizeInBits();
2001 
2002   assert(Ty == MRI.getType(Dst) &&
2003          "Expected src and dst to have the same type!");
2004 
2005   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
2006     LLT s64 = LLT::scalar(64);
2007 
2008     auto Split = MIRBuilder.buildUnmerge(s64, Val);
2009     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
2010     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
2011     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
2012 
2013     MIRBuilder.buildZExt(Dst, Add);
2014     MI.eraseFromParent();
2015     return true;
2016   }
2017 
2018   if (!ST->hasNEON() ||
2019       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
2020     // Use generic lowering when custom lowering is not possible.
2021     return Ty.isScalar() && (Size == 32 || Size == 64) &&
2022            Helper.lowerBitCount(MI) ==
2023                LegalizerHelper::LegalizeResult::Legalized;
2024   }
2025 
2026   // Pre-conditioning: widen Val up to the nearest vector type.
2027   // s32, s64, v4s16, v2s32 -> v8s8
2028   // v8s16, v4s32, v2s64 -> v16s8
2029   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
2030   if (Ty.isScalar()) {
2031     assert((Size == 32 || Size == 64 || Size == 128) &&
                "Expected only 32, 64, or 128 bit scalars!");
2032     if (Size == 32) {
2033       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
2034     }
2035   }
2036   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
2037 
2038   // Count bits in each byte-sized lane.
2039   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
2040 
2041   // Sum across lanes.
2042 
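       // With +dotprod, a UDOT of the byte counts against an all-ones vector
       // sums each group of four adjacent bytes directly into a 32-bit lane,
       // replacing a chain of UADDLP instructions.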
2043   if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
2044       Ty.getScalarSizeInBits() != 16) {
2045     LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
2046     auto Zeros = MIRBuilder.buildConstant(Dt, 0);
2047     auto Ones = MIRBuilder.buildConstant(VTy, 1);
2048     MachineInstrBuilder Sum;
2049 
2050     if (Ty == LLT::fixed_vector(2, 64)) {
2051       auto UDOT =
2052           MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2053       Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
2054     } else if (Ty == LLT::fixed_vector(4, 32) ||
2055                Ty == LLT::fixed_vector(2, 32)) {
2057       Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2058     } else {
2059       llvm_unreachable("unexpected vector shape");
2060     }
2061 
2062     Sum->getOperand(0).setReg(Dst);
2063     MI.eraseFromParent();
2064     return true;
2065   }
2066 
2067   Register HSum = CTPOP.getReg(0);
2068   unsigned Opc;
2069   SmallVector<LLT> HAddTys;
2070   if (Ty.isScalar()) {
2071     Opc = Intrinsic::aarch64_neon_uaddlv;
2072     HAddTys.push_back(LLT::scalar(32));
2073   } else if (Ty == LLT::fixed_vector(8, 16)) {
2074     Opc = Intrinsic::aarch64_neon_uaddlp;
2075     HAddTys.push_back(LLT::fixed_vector(8, 16));
2076   } else if (Ty == LLT::fixed_vector(4, 32)) {
2077     Opc = Intrinsic::aarch64_neon_uaddlp;
2078     HAddTys.push_back(LLT::fixed_vector(8, 16));
2079     HAddTys.push_back(LLT::fixed_vector(4, 32));
2080   } else if (Ty == LLT::fixed_vector(2, 64)) {
2081     Opc = Intrinsic::aarch64_neon_uaddlp;
2082     HAddTys.push_back(LLT::fixed_vector(8, 16));
2083     HAddTys.push_back(LLT::fixed_vector(4, 32));
2084     HAddTys.push_back(LLT::fixed_vector(2, 64));
2085   } else if (Ty == LLT::fixed_vector(4, 16)) {
2086     Opc = Intrinsic::aarch64_neon_uaddlp;
2087     HAddTys.push_back(LLT::fixed_vector(4, 16));
2088   } else if (Ty == LLT::fixed_vector(2, 32)) {
2089     Opc = Intrinsic::aarch64_neon_uaddlp;
2090     HAddTys.push_back(LLT::fixed_vector(4, 16));
2091     HAddTys.push_back(LLT::fixed_vector(2, 32));
2092   } else
2093     llvm_unreachable("unexpected vector shape");
2094   MachineInstrBuilder UADD;
2095   for (LLT HTy : HAddTys) {
2096     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
2097     HSum = UADD.getReg(0);
2098   }
2099 
2100   // Post-conditioning.
2101   if (Ty.isScalar() && (Size == 64 || Size == 128))
2102     MIRBuilder.buildZExt(Dst, UADD);
2103   else
2104     UADD->getOperand(0).setReg(Dst);
2105   MI.eraseFromParent();
2106   return true;
2107 }
2108 
2109 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2110     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2111   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2112   LLT s64 = LLT::scalar(64);
2113   auto Addr = MI.getOperand(1).getReg();
2114   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
2115   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
2116   auto DstLo = MRI.createGenericVirtualRegister(s64);
2117   auto DstHi = MRI.createGenericVirtualRegister(s64);
2118 
2119   MachineInstrBuilder CAS;
2120   if (ST->hasLSE()) {
2121     // We have 128-bit CASP instructions taking XSeqPair registers, which are
2122     // s128. We need the merge/unmerge to bracket the expansion and pair up with
2123     // the rest of the MIR so we must reassemble the extracted registers into a
2124     // 128-bit known-regclass one with code like this:
2125     //
2126     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
2127     //     %out = CASP %in1, ...
2128     //     %OldLo = G_EXTRACT %out, 0
2129     //     %OldHi = G_EXTRACT %out, 64
2130     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2131     unsigned Opcode;
2132     switch (Ordering) {
2133     case AtomicOrdering::Acquire:
2134       Opcode = AArch64::CASPAX;
2135       break;
2136     case AtomicOrdering::Release:
2137       Opcode = AArch64::CASPLX;
2138       break;
2139     case AtomicOrdering::AcquireRelease:
2140     case AtomicOrdering::SequentiallyConsistent:
2141       Opcode = AArch64::CASPALX;
2142       break;
2143     default:
2144       Opcode = AArch64::CASPX;
2145       break;
2146     }
2147 
2148     LLT s128 = LLT::scalar(128);
2149     auto CASDst = MRI.createGenericVirtualRegister(s128);
2150     auto CASDesired = MRI.createGenericVirtualRegister(s128);
2151     auto CASNew = MRI.createGenericVirtualRegister(s128);
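         // sube64/subo64 are the even/odd 64-bit sub-registers of the sequential
         // pair; REG_SEQUENCE reassembles the unmerged halves into s128 values
         // that the CASP instruction can use.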
2152     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
2153         .addUse(DesiredI->getOperand(0).getReg())
2154         .addImm(AArch64::sube64)
2155         .addUse(DesiredI->getOperand(1).getReg())
2156         .addImm(AArch64::subo64);
2157     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
2158         .addUse(NewI->getOperand(0).getReg())
2159         .addImm(AArch64::sube64)
2160         .addUse(NewI->getOperand(1).getReg())
2161         .addImm(AArch64::subo64);
2162 
2163     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
2164 
2165     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
2166     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
2167   } else {
2168     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
2169     // can take arbitrary registers, so it just has the normal GPR64 operands
2170     // that the rest of AArch64 expects.
2171     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2172     unsigned Opcode;
2173     switch (Ordering) {
2174     case AtomicOrdering::Acquire:
2175       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2176       break;
2177     case AtomicOrdering::Release:
2178       Opcode = AArch64::CMP_SWAP_128_RELEASE;
2179       break;
2180     case AtomicOrdering::AcquireRelease:
2181     case AtomicOrdering::SequentiallyConsistent:
2182       Opcode = AArch64::CMP_SWAP_128;
2183       break;
2184     default:
2185       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2186       break;
2187     }
2188 
2189     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2190     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
2191                                 {Addr, DesiredI->getOperand(0),
2192                                  DesiredI->getOperand(1), NewI->getOperand(0),
2193                                  NewI->getOperand(1)});
2194   }
2195 
2196   CAS.cloneMemRefs(MI);
2197   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
2198                                    *MRI.getTargetRegisterInfo(),
2199                                    *ST->getRegBankInfo());
2200 
2201   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
2202   MI.eraseFromParent();
2203   return true;
2204 }
2205 
2206 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2207                                         LegalizerHelper &Helper) const {
2208   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2209   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2210   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
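       // AArch64 has no count-trailing-zeros instruction, but cttz(x) ==
       // ctlz(bitreverse(x)), which maps onto RBIT + CLZ.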
2211   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2212   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2213   MI.eraseFromParent();
2214   return true;
2215 }
2216 
2217 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2218                                           LegalizerHelper &Helper) const {
2219   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2220 
2221   // The tagged version, MOPSMemorySetTagged, is legalized in legalizeIntrinsic.
2222   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2223     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2224     // the instruction).
2225     auto &Value = MI.getOperand(1);
2226     Register ExtValueReg =
2227         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2228     Value.setReg(ExtValueReg);
2229     return true;
2230   }
2231 
2232   return false;
2233 }
2234 
2235 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2236     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2237   const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
2238   auto VRegAndVal =
2239       getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
2240   if (VRegAndVal)
2241     return true;
2242   LLT VecTy = MRI.getType(Element->getVectorReg());
2243   if (VecTy.isScalableVector())
2244     return true;
2245   return Helper.lowerExtractInsertVectorElt(MI) !=
2246          LegalizerHelper::LegalizeResult::UnableToLegalize;
2247 }
2248 
2249 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2250     MachineInstr &MI, LegalizerHelper &Helper) const {
2251   MachineFunction &MF = *MI.getParent()->getParent();
2252   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2253   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2254 
2255   // If stack probing is not enabled for this function, use the default
2256   // lowering.
2257   if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2258       MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2259           "inline-asm") {
2260     Helper.lowerDynStackAlloc(MI);
2261     return true;
2262   }
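       // Otherwise, compute the adjusted SP and emit PROBED_STACKALLOC_DYN so the
       // newly allocated stack is probed page by page before it is used.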
2263 
2264   Register Dst = MI.getOperand(0).getReg();
2265   Register AllocSize = MI.getOperand(1).getReg();
2266   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2267 
2268   assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2269          "Unexpected type for dynamic alloca");
2270   assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2271          "Unexpected type for dynamic alloca");
2272 
2273   LLT PtrTy = MRI.getType(Dst);
2274   Register SPReg =
2275       Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2276   Register SPTmp =
2277       Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2278   auto NewMI =
2279       MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2280   MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2281   MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2282   MIRBuilder.buildCopy(Dst, SPTmp);
2283 
2284   MI.eraseFromParent();
2285   return true;
2286 }
2287 
2288 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2289                                             LegalizerHelper &Helper) const {
2290   MachineIRBuilder &MIB = Helper.MIRBuilder;
2291   auto &AddrVal = MI.getOperand(0);
2292 
2293   int64_t IsWrite = MI.getOperand(1).getImm();
2294   int64_t Locality = MI.getOperand(2).getImm();
2295   int64_t IsData = MI.getOperand(3).getImm();
2296 
2297   bool IsStream = Locality == 0;
2298   if (Locality != 0) {
2299     assert(Locality <= 3 && "Prefetch locality out-of-range");
2300     // The locality degree is the inverse of the cache level: higher locality
2301     // means a closer (lower-numbered) cache. The target encoding starts at 0
2302     // for level 1, so invert the value.
2303     Locality = 3 - Locality;
2304   }
2305 
2306   unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
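       // E.g. a read data prefetch with locality 3 becomes PrfOp 0 (PLDL1KEEP).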
2307 
2308   MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2309   MI.eraseFromParent();
2310   return true;
2311 }
2312