xref: /freebsd-src/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision b3edf4467982447620505a28fc82e38a414c07dc)
1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/MachineConstantPool.h"
26 #include "llvm/CodeGen/MachineFrameInfo.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/RuntimeLibcalls.h"
29 #include "llvm/CodeGen/TargetFrameLowering.h"
30 #include "llvm/CodeGen/TargetInstrInfo.h"
31 #include "llvm/CodeGen/TargetLowering.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/CodeGen/TargetSubtargetInfo.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/Support/Debug.h"
36 #include "llvm/Support/MathExtras.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Target/TargetMachine.h"
39 #include <numeric>
40 #include <optional>
41 
42 #define DEBUG_TYPE "legalizer"
43 
44 using namespace llvm;
45 using namespace LegalizeActions;
46 using namespace MIPatternMatch;
47 
48 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
49 ///
50 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
51 /// with any leftover piece as type \p LeftoverTy
52 ///
53 /// Returns -1 in the first element of the pair if the breakdown is not
54 /// satisfiable.
55 static std::pair<int, int>
56 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
57   assert(!LeftoverTy.isValid() && "this is an out argument");
58 
59   unsigned Size = OrigTy.getSizeInBits();
60   unsigned NarrowSize = NarrowTy.getSizeInBits();
61   unsigned NumParts = Size / NarrowSize;
62   unsigned LeftoverSize = Size - NumParts * NarrowSize;
63   assert(Size > NarrowSize);
64 
65   if (LeftoverSize == 0)
66     return {NumParts, 0};
67 
68   if (NarrowTy.isVector()) {
69     unsigned EltSize = OrigTy.getScalarSizeInBits();
70     if (LeftoverSize % EltSize != 0)
71       return {-1, -1};
72     LeftoverTy = LLT::scalarOrVector(
73         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
74   } else {
75     LeftoverTy = LLT::scalar(LeftoverSize);
76   }
77 
78   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
79   return std::make_pair(NumParts, NumLeftover);
80 }
81 
82 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
83 
84   if (!Ty.isScalar())
85     return nullptr;
86 
87   switch (Ty.getSizeInBits()) {
88   case 16:
89     return Type::getHalfTy(Ctx);
90   case 32:
91     return Type::getFloatTy(Ctx);
92   case 64:
93     return Type::getDoubleTy(Ctx);
94   case 80:
95     return Type::getX86_FP80Ty(Ctx);
96   case 128:
97     return Type::getFP128Ty(Ctx);
98   default:
99     return nullptr;
100   }
101 }
102 
// Construct a LegalizerHelper that pulls the LegalizerInfo and TargetLowering
// from \p MF's subtarget. No known-bits analysis is available through this
// constructor (KB is null).
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}
109 
// Construct a LegalizerHelper with an explicitly supplied LegalizerInfo and
// an optional known-bits analysis \p KB.
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelKnownBits *KB)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}
115 
// Apply one legalization step to \p MI: query the target's rule table for the
// next action (narrow, widen, lower, libcall, ...) and dispatch to the
// corresponding helper. Returns AlreadyLegal, Legalized, or UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics bypass the LegalizerInfo action table; the target decides how
  // (or whether) to legalize them.
  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    // Covers NotFound / Unsupported and any action with no handler here.
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
160 
/// Assemble \p DstReg (of type \p ResultTy) from \p PartRegs pieces of type
/// \p PartTy, plus optional \p LeftoverRegs pieces of type \p LeftoverTy that
/// cover the remainder when \p PartTy does not evenly divide \p ResultTy.
/// \p LeftoverTy being invalid signals "no leftover pieces".
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    // Pieces evenly cover the result: one merge-like instruction suffices.
    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    // Vector result: concat sub-vectors, or build from scalar elements.
    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  // Scalar result with a leftover piece: break everything down to a common
  // GCD type, merge up to the LCM type, then extract the result bits.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
197 
198 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
199                                        Register Reg) {
200   LLT Ty = MRI.getType(Reg);
201   SmallVector<Register, 8> RegElts;
202   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
203                MIRBuilder, MRI);
204   Elts.append(RegElts);
205 }
206 
207 /// Merge \p PartRegs with different types into \p DstReg.
208 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
209                                            ArrayRef<Register> PartRegs) {
210   SmallVector<Register, 8> AllElts;
211   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
212     appendVectorElts(AllElts, PartRegs[i]);
213 
214   Register Leftover = PartRegs[PartRegs.size() - 1];
215   if (MRI.getType(Leftover).isScalar())
216     AllElts.push_back(Leftover);
217   else
218     appendVectorElts(AllElts, Leftover);
219 
220   MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
221 }
222 
223 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
224 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
225                               const MachineInstr &MI) {
226   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
227 
228   const int StartIdx = Regs.size();
229   const int NumResults = MI.getNumOperands() - 1;
230   Regs.resize(Regs.size() + NumResults);
231   for (int I = 0; I != NumResults; ++I)
232     Regs[StartIdx + I] = MI.getOperand(I).getReg();
233 }
234 
235 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
236                                      LLT GCDTy, Register SrcReg) {
237   LLT SrcTy = MRI.getType(SrcReg);
238   if (SrcTy == GCDTy) {
239     // If the source already evenly divides the result type, we don't need to do
240     // anything.
241     Parts.push_back(SrcReg);
242   } else {
243     // Need to split into common type sized pieces.
244     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
245     getUnmergeResults(Parts, *Unmerge);
246   }
247 }
248 
249 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
250                                     LLT NarrowTy, Register SrcReg) {
251   LLT SrcTy = MRI.getType(SrcReg);
252   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
253   extractGCDType(Parts, GCDTy, SrcReg);
254   return GCDTy;
255 }
256 
/// Merge the \p GCDTy-typed pieces in \p VRegs up to NarrowTy-sized pieces
/// covering the least common multiple type of \p DstTy and \p NarrowTy,
/// padding with extra pieces if the sources don't cover the LCM type.
/// \p PadStrategy (G_ANYEXT, G_ZEXT or G_SEXT) selects how padding bits are
/// materialized (undef, zero, or replicated sign bit of the last source).
/// On return \p VRegs holds the NarrowTy-sized merged pieces; returns the
/// LCM type they cover.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        // Ran out of source pieces; fill with the pad value.
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
347 
348 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
349                                                ArrayRef<Register> RemergeRegs) {
350   LLT DstTy = MRI.getType(DstReg);
351 
352   // Create the merge to the widened source, and extract the relevant bits into
353   // the result.
354 
355   if (DstTy == LCMTy) {
356     MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
357     return;
358   }
359 
360   auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
361   if (DstTy.isScalar() && LCMTy.isScalar()) {
362     MIRBuilder.buildTrunc(DstReg, Remerge);
363     return;
364   }
365 
366   if (LCMTy.isVector()) {
367     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
368     SmallVector<Register, 8> UnmergeDefs(NumDefs);
369     UnmergeDefs[0] = DstReg;
370     for (unsigned I = 1; I != NumDefs; ++I)
371       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
372 
373     MIRBuilder.buildUnmerge(UnmergeDefs,
374                             MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
375     return;
376   }
377 
378   llvm_unreachable("unhandled case");
379 }
380 
/// Return the runtime-library call corresponding to generic opcode \p Opcode
/// for a scalar of \p Size bits. Integer opcodes support 32/64/128-bit sizes;
/// floating-point opcodes additionally support 80 bits (x87 long double).
/// Aborts on a size with no corresponding libcall or an unknown opcode.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
478 
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    // bzero does not return its destination, so a copy of the result cannot
    // be folded into a `thisreturn`-style tail call.
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be
    // the returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    // The return must consume exactly the physical register the COPY defined.
    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}
544 
/// Emit a call to external symbol \p Name returning \p Result with arguments
/// \p Args, using calling convention \p CC. If \p MI is non-null, is in tail
/// position, and the result type is compatible with the caller's return type,
/// the call is lowered as a tail call and the trailing return is deleted.
LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  // Only consider tail-calling when the libcall's result type is void or
  // matches what the caller itself returns.
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}
591 
592 LegalizerHelper::LegalizeResult
593 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
594                     const CallLowering::ArgInfo &Result,
595                     ArrayRef<CallLowering::ArgInfo> Args,
596                     LostDebugLocObserver &LocObserver, MachineInstr *MI) {
597   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
598   const char *Name = TLI.getLibcallName(Libcall);
599   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
600   return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
601 }
602 
603 // Useful for libcalls where all operands have the same type.
604 static LegalizerHelper::LegalizeResult
605 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
606               Type *OpType, LostDebugLocObserver &LocObserver) {
607   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
608 
609   // FIXME: What does the original arg index mean here?
610   SmallVector<CallLowering::ArgInfo, 3> Args;
611   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
612     Args.push_back({MO.getReg(), OpType, 0});
613   return createLibcall(MIRBuilder, Libcall,
614                        {MI.getOperand(0).getReg(), OpType, 0}, Args,
615                        LocObserver, &MI);
616 }
617 
/// Lower a G_BZERO / G_MEMCPY / G_MEMMOVE / G_MEMSET instruction \p MI to a
/// call to the corresponding C library routine, tail-calling when the
/// instruction's trailing 'tail' immediate is set and the position allows it.
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    // memcpy returns its destination argument.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  // The trailing immediate operand records whether the source requested a
  // tail call; honor it only when MI is actually in tail position.
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
706 
/// Select the outline-atomics runtime helper for atomic instruction \p MI,
/// keyed on the merged memory ordering and the access size in bytes (1, 2, 4,
/// 8, or 16). Returns UNKNOWN_LIBCALL for vector memory types and opcodes
/// with no outline-atomic helper. Note AND/SUB map onto the LDCLR/LDADD
/// helpers; the caller is expected to adjust the operand accordingly.
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

// LCALLS expands to the four ordering variants of one sized helper;
// LCALL5 expands to all five supported sizes of one helper family.
#define LCALLS(A, B)                                                           \
  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
754 
// Build a call to one of the "outline atomic" runtime helpers implementing the
// atomic operation \p MI (the opcode -> libcall mapping is done by
// getOutlineAtomicLibcall). Returns UnableToLegalize if the target has no name
// for the required libcall. The original instruction is not erased here; the
// caller removes it after a Legalized result.
static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  // Return type/registers of the libcall and its argument list, filled in
  // per-opcode below.
  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    // Plain cmpxchg operand order: (OldValOut, Ptr, Cmp, New).
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      // The WITH_SUCCESS form has an extra success-flag def between the
      // old-value def and the pointer, so re-read all operands and return a
      // {old value, success} struct from the call instead.
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    // getOutlineAtomicLibcall maps AND -> LDCLR (and-not) and SUB -> LDADD,
    // so pre-adjust the operand: complement it for AND, negate it for SUB.
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}
832 
833 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
834                                        Type *FromType) {
835   auto ToMVT = MVT::getVT(ToType);
836   auto FromMVT = MVT::getVT(FromType);
837 
838   switch (Opcode) {
839   case TargetOpcode::G_FPEXT:
840     return RTLIB::getFPEXT(FromMVT, ToMVT);
841   case TargetOpcode::G_FPTRUNC:
842     return RTLIB::getFPROUND(FromMVT, ToMVT);
843   case TargetOpcode::G_FPTOSI:
844     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
845   case TargetOpcode::G_FPTOUI:
846     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
847   case TargetOpcode::G_SITOFP:
848     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
849   case TargetOpcode::G_UITOFP:
850     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
851   }
852   llvm_unreachable("Unsupported libcall function");
853 }
854 
855 static LegalizerHelper::LegalizeResult
856 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
857                   Type *FromType, LostDebugLocObserver &LocObserver) {
858   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
859   return createLibcall(
860       MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0},
861       {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI);
862 }
863 
864 static RTLIB::Libcall
865 getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
866   RTLIB::Libcall RTLibcall;
867   switch (MI.getOpcode()) {
868   case TargetOpcode::G_GET_FPENV:
869     RTLibcall = RTLIB::FEGETENV;
870     break;
871   case TargetOpcode::G_SET_FPENV:
872   case TargetOpcode::G_RESET_FPENV:
873     RTLibcall = RTLIB::FESETENV;
874     break;
875   case TargetOpcode::G_GET_FPMODE:
876     RTLibcall = RTLIB::FEGETMODE;
877     break;
878   case TargetOpcode::G_SET_FPMODE:
879   case TargetOpcode::G_RESET_FPMODE:
880     RTLibcall = RTLIB::FESETMODE;
881     break;
882   default:
883     llvm_unreachable("Unexpected opcode");
884   }
885   return RTLibcall;
886 }
887 
// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as integer value. To implement these
// intrinsics via the library functions, we need to use temporary variable,
// for example:
//
//     %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     BL &fegetmode
//     %0:_(s32) = G_LOAD % 1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will put the read state.
  // The temporary is sized/aligned from the instruction's result type.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to library function, with the temporary as an argument.
  // The libcall returns void ({0} register, void type); the state comes back
  // through the pointed-to temporary.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}
938 
// Similar to `createGetStateLibcall` the function calls a library function
// using transient space in stack. In this case the library function reads
// content of memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will get the new state.
  // Size and alignment come from the type of the value being installed.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary. The store must precede the call so
  // the library function observes the value.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to library function, with the temporary as an argument.
  // The libcall returns void ({0} register, void type).
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}
973 
// The function is used to legalize operations that set default environment
// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
// On most targets supported in glibc FE_DFL_MODE is defined as
// `((const femode_t *) -1)`. Such assumption is used here. If for some target
// it is not true, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  // Materialize the all-ones pointer (the assumed FE_DFL_MODE/FE_DFL_ENV
  // encoding, see the comment above) as an integer constant converted with
  // G_INTTOPTR.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  // The libcall returns void ({0} register, void type); the argument is the
  // sentinel pointer built above.
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}
1002 
// Legalize \p MI by replacing it with an equivalent runtime-library call.
// On success the original instruction is erased and Legalized returned;
// on failure MI is left in place and UnableToLegalize returned.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  // Integer arithmetic: lower via simpleLibcall with an integer IR type of
  // the result's bit width.
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Floating-point operations with libm-style helpers; only 32/64/80/128-bit
  // scalar FP types are supported.
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // powi takes a mixed (FP base, integer exponent) argument list, so it
  // cannot go through simpleLibcall.
  case TargetOpcode::G_FPOWI: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    std::initializer_list<CallLowering::ArgInfo> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  // FP <-> FP conversions: both ends must map to a known float IR type.
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // FP -> integer conversions, restricted to 32/64-bit on both sides.
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Integer -> FP conversions, restricted to 32/64-bit on both sides.
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Atomics: delegate to createAtomicLibcall (outline-atomic helpers).
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Memory intrinsics erase MI here and return directly, instead of breaking
  // to the shared eraseFromParent below (which would erase it a second time).
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  // FP-environment/mode reads: stack temporary + libcall + load.
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  // FP-environment/mode writes: store + stack temporary + libcall.
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  // FP-environment/mode resets: libcall with the FE_DFL_* sentinel pointer.
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  // All `break` paths funnel here: the libcall fully replaced MI.
  MI.eraseFromParent();
  return Legalized;
}
1176 
1177 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1178                                                               unsigned TypeIdx,
1179                                                               LLT NarrowTy) {
1180   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1181   uint64_t NarrowSize = NarrowTy.getSizeInBits();
1182 
1183   switch (MI.getOpcode()) {
1184   default:
1185     return UnableToLegalize;
1186   case TargetOpcode::G_IMPLICIT_DEF: {
1187     Register DstReg = MI.getOperand(0).getReg();
1188     LLT DstTy = MRI.getType(DstReg);
1189 
1190     // If SizeOp0 is not an exact multiple of NarrowSize, emit
1191     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1192     // FIXME: Although this would also be legal for the general case, it causes
1193     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
1194     //  combines not being hit). This seems to be a problem related to the
1195     //  artifact combiner.
1196     if (SizeOp0 % NarrowSize != 0) {
1197       LLT ImplicitTy = NarrowTy;
1198       if (DstTy.isVector())
1199         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
1200 
1201       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
1202       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
1203 
1204       MI.eraseFromParent();
1205       return Legalized;
1206     }
1207 
1208     int NumParts = SizeOp0 / NarrowSize;
1209 
1210     SmallVector<Register, 2> DstRegs;
1211     for (int i = 0; i < NumParts; ++i)
1212       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
1213 
1214     if (DstTy.isVector())
1215       MIRBuilder.buildBuildVector(DstReg, DstRegs);
1216     else
1217       MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1218     MI.eraseFromParent();
1219     return Legalized;
1220   }
1221   case TargetOpcode::G_CONSTANT: {
1222     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1223     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
1224     unsigned TotalSize = Ty.getSizeInBits();
1225     unsigned NarrowSize = NarrowTy.getSizeInBits();
1226     int NumParts = TotalSize / NarrowSize;
1227 
1228     SmallVector<Register, 4> PartRegs;
1229     for (int I = 0; I != NumParts; ++I) {
1230       unsigned Offset = I * NarrowSize;
1231       auto K = MIRBuilder.buildConstant(NarrowTy,
1232                                         Val.lshr(Offset).trunc(NarrowSize));
1233       PartRegs.push_back(K.getReg(0));
1234     }
1235 
1236     LLT LeftoverTy;
1237     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1238     SmallVector<Register, 1> LeftoverRegs;
1239     if (LeftoverBits != 0) {
1240       LeftoverTy = LLT::scalar(LeftoverBits);
1241       auto K = MIRBuilder.buildConstant(
1242         LeftoverTy,
1243         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
1244       LeftoverRegs.push_back(K.getReg(0));
1245     }
1246 
1247     insertParts(MI.getOperand(0).getReg(),
1248                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1249 
1250     MI.eraseFromParent();
1251     return Legalized;
1252   }
1253   case TargetOpcode::G_SEXT:
1254   case TargetOpcode::G_ZEXT:
1255   case TargetOpcode::G_ANYEXT:
1256     return narrowScalarExt(MI, TypeIdx, NarrowTy);
1257   case TargetOpcode::G_TRUNC: {
1258     if (TypeIdx != 1)
1259       return UnableToLegalize;
1260 
1261     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1262     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1263       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1264       return UnableToLegalize;
1265     }
1266 
1267     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1268     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1269     MI.eraseFromParent();
1270     return Legalized;
1271   }
1272 
1273   case TargetOpcode::G_FREEZE: {
1274     if (TypeIdx != 0)
1275       return UnableToLegalize;
1276 
1277     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1278     // Should widen scalar first
1279     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1280       return UnableToLegalize;
1281 
1282     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1283     SmallVector<Register, 8> Parts;
1284     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1285       Parts.push_back(
1286           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1287     }
1288 
1289     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1290     MI.eraseFromParent();
1291     return Legalized;
1292   }
1293   case TargetOpcode::G_ADD:
1294   case TargetOpcode::G_SUB:
1295   case TargetOpcode::G_SADDO:
1296   case TargetOpcode::G_SSUBO:
1297   case TargetOpcode::G_SADDE:
1298   case TargetOpcode::G_SSUBE:
1299   case TargetOpcode::G_UADDO:
1300   case TargetOpcode::G_USUBO:
1301   case TargetOpcode::G_UADDE:
1302   case TargetOpcode::G_USUBE:
1303     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1304   case TargetOpcode::G_MUL:
1305   case TargetOpcode::G_UMULH:
1306     return narrowScalarMul(MI, NarrowTy);
1307   case TargetOpcode::G_EXTRACT:
1308     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1309   case TargetOpcode::G_INSERT:
1310     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1311   case TargetOpcode::G_LOAD: {
1312     auto &LoadMI = cast<GLoad>(MI);
1313     Register DstReg = LoadMI.getDstReg();
1314     LLT DstTy = MRI.getType(DstReg);
1315     if (DstTy.isVector())
1316       return UnableToLegalize;
1317 
1318     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1319       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1320       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1321       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1322       LoadMI.eraseFromParent();
1323       return Legalized;
1324     }
1325 
1326     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1327   }
1328   case TargetOpcode::G_ZEXTLOAD:
1329   case TargetOpcode::G_SEXTLOAD: {
1330     auto &LoadMI = cast<GExtLoad>(MI);
1331     Register DstReg = LoadMI.getDstReg();
1332     Register PtrReg = LoadMI.getPointerReg();
1333 
1334     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1335     auto &MMO = LoadMI.getMMO();
1336     unsigned MemSize = MMO.getSizeInBits();
1337 
1338     if (MemSize == NarrowSize) {
1339       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1340     } else if (MemSize < NarrowSize) {
1341       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1342     } else if (MemSize > NarrowSize) {
1343       // FIXME: Need to split the load.
1344       return UnableToLegalize;
1345     }
1346 
1347     if (isa<GZExtLoad>(LoadMI))
1348       MIRBuilder.buildZExt(DstReg, TmpReg);
1349     else
1350       MIRBuilder.buildSExt(DstReg, TmpReg);
1351 
1352     LoadMI.eraseFromParent();
1353     return Legalized;
1354   }
1355   case TargetOpcode::G_STORE: {
1356     auto &StoreMI = cast<GStore>(MI);
1357 
1358     Register SrcReg = StoreMI.getValueReg();
1359     LLT SrcTy = MRI.getType(SrcReg);
1360     if (SrcTy.isVector())
1361       return UnableToLegalize;
1362 
1363     int NumParts = SizeOp0 / NarrowSize;
1364     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1365     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1366     if (SrcTy.isVector() && LeftoverBits != 0)
1367       return UnableToLegalize;
1368 
1369     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1370       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1371       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1372       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1373       StoreMI.eraseFromParent();
1374       return Legalized;
1375     }
1376 
1377     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1378   }
1379   case TargetOpcode::G_SELECT:
1380     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1381   case TargetOpcode::G_AND:
1382   case TargetOpcode::G_OR:
1383   case TargetOpcode::G_XOR: {
1384     // Legalize bitwise operation:
1385     // A = BinOp<Ty> B, C
1386     // into:
1387     // B1, ..., BN = G_UNMERGE_VALUES B
1388     // C1, ..., CN = G_UNMERGE_VALUES C
1389     // A1 = BinOp<Ty/N> B1, C2
1390     // ...
1391     // AN = BinOp<Ty/N> BN, CN
1392     // A = G_MERGE_VALUES A1, ..., AN
1393     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1394   }
1395   case TargetOpcode::G_SHL:
1396   case TargetOpcode::G_LSHR:
1397   case TargetOpcode::G_ASHR:
1398     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1399   case TargetOpcode::G_CTLZ:
1400   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1401   case TargetOpcode::G_CTTZ:
1402   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1403   case TargetOpcode::G_CTPOP:
1404     if (TypeIdx == 1)
1405       switch (MI.getOpcode()) {
1406       case TargetOpcode::G_CTLZ:
1407       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1408         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1409       case TargetOpcode::G_CTTZ:
1410       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1411         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1412       case TargetOpcode::G_CTPOP:
1413         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1414       default:
1415         return UnableToLegalize;
1416       }
1417 
1418     Observer.changingInstr(MI);
1419     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1420     Observer.changedInstr(MI);
1421     return Legalized;
1422   case TargetOpcode::G_INTTOPTR:
1423     if (TypeIdx != 1)
1424       return UnableToLegalize;
1425 
1426     Observer.changingInstr(MI);
1427     narrowScalarSrc(MI, NarrowTy, 1);
1428     Observer.changedInstr(MI);
1429     return Legalized;
1430   case TargetOpcode::G_PTRTOINT:
1431     if (TypeIdx != 0)
1432       return UnableToLegalize;
1433 
1434     Observer.changingInstr(MI);
1435     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1436     Observer.changedInstr(MI);
1437     return Legalized;
1438   case TargetOpcode::G_PHI: {
1439     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1440     // NarrowSize.
1441     if (SizeOp0 % NarrowSize != 0)
1442       return UnableToLegalize;
1443 
1444     unsigned NumParts = SizeOp0 / NarrowSize;
1445     SmallVector<Register, 2> DstRegs(NumParts);
1446     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1447     Observer.changingInstr(MI);
1448     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1449       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1450       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1451       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1452                    SrcRegs[i / 2], MIRBuilder, MRI);
1453     }
1454     MachineBasicBlock &MBB = *MI.getParent();
1455     MIRBuilder.setInsertPt(MBB, MI);
1456     for (unsigned i = 0; i < NumParts; ++i) {
1457       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1458       MachineInstrBuilder MIB =
1459           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1460       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1461         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1462     }
1463     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1464     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1465     Observer.changedInstr(MI);
1466     MI.eraseFromParent();
1467     return Legalized;
1468   }
1469   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1470   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1471     if (TypeIdx != 2)
1472       return UnableToLegalize;
1473 
1474     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1475     Observer.changingInstr(MI);
1476     narrowScalarSrc(MI, NarrowTy, OpIdx);
1477     Observer.changedInstr(MI);
1478     return Legalized;
1479   }
1480   case TargetOpcode::G_ICMP: {
1481     Register LHS = MI.getOperand(2).getReg();
1482     LLT SrcTy = MRI.getType(LHS);
1483     uint64_t SrcSize = SrcTy.getSizeInBits();
1484     CmpInst::Predicate Pred =
1485         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1486 
1487     // TODO: Handle the non-equality case for weird sizes.
1488     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1489       return UnableToLegalize;
1490 
1491     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1492     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1493     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1494                       LHSLeftoverRegs, MIRBuilder, MRI))
1495       return UnableToLegalize;
1496 
1497     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1498     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1499     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1500                       RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
1501       return UnableToLegalize;
1502 
1503     // We now have the LHS and RHS of the compare split into narrow-type
1504     // registers, plus potentially some leftover type.
1505     Register Dst = MI.getOperand(0).getReg();
1506     LLT ResTy = MRI.getType(Dst);
1507     if (ICmpInst::isEquality(Pred)) {
1508       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1509       // them together. For each equal part, the result should be all 0s. For
1510       // each non-equal part, we'll get at least one 1.
1511       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1512       SmallVector<Register, 4> Xors;
1513       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1514         auto LHS = std::get<0>(LHSAndRHS);
1515         auto RHS = std::get<1>(LHSAndRHS);
1516         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1517         Xors.push_back(Xor);
1518       }
1519 
1520       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1521       // to the desired narrow type so that we can OR them together later.
1522       SmallVector<Register, 4> WidenedXors;
1523       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1524         auto LHS = std::get<0>(LHSAndRHS);
1525         auto RHS = std::get<1>(LHSAndRHS);
1526         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1527         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1528         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1529                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1530         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1531       }
1532 
1533       // Now, for each part we broke up, we know if they are equal/not equal
1534       // based off the G_XOR. We can OR these all together and compare against
1535       // 0 to get the result.
1536       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1537       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1538       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1539         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1540       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1541     } else {
1542       // TODO: Handle non-power-of-two types.
1543       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1544       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1545       Register LHSL = LHSPartRegs[0];
1546       Register LHSH = LHSPartRegs[1];
1547       Register RHSL = RHSPartRegs[0];
1548       Register RHSH = RHSPartRegs[1];
1549       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1550       MachineInstrBuilder CmpHEQ =
1551           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1552       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1553           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1554       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1555     }
1556     MI.eraseFromParent();
1557     return Legalized;
1558   }
1559   case TargetOpcode::G_SEXT_INREG: {
1560     if (TypeIdx != 0)
1561       return UnableToLegalize;
1562 
1563     int64_t SizeInBits = MI.getOperand(2).getImm();
1564 
1565     // So long as the new type has more bits than the bits we're extending we
1566     // don't need to break it apart.
1567     if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1568       Observer.changingInstr(MI);
1569       // We don't lose any non-extension bits by truncating the src and
1570       // sign-extending the dst.
1571       MachineOperand &MO1 = MI.getOperand(1);
1572       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1573       MO1.setReg(TruncMIB.getReg(0));
1574 
1575       MachineOperand &MO2 = MI.getOperand(0);
1576       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1577       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1578       MIRBuilder.buildSExt(MO2, DstExt);
1579       MO2.setReg(DstExt);
1580       Observer.changedInstr(MI);
1581       return Legalized;
1582     }
1583 
1584     // Break it apart. Components below the extension point are unmodified. The
1585     // component containing the extension point becomes a narrower SEXT_INREG.
1586     // Components above it are ashr'd from the component containing the
1587     // extension point.
1588     if (SizeOp0 % NarrowSize != 0)
1589       return UnableToLegalize;
1590     int NumParts = SizeOp0 / NarrowSize;
1591 
1592     // List the registers where the destination will be scattered.
1593     SmallVector<Register, 2> DstRegs;
1594     // List the registers where the source will be split.
1595     SmallVector<Register, 2> SrcRegs;
1596 
1597     // Create all the temporary registers.
1598     for (int i = 0; i < NumParts; ++i) {
1599       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1600 
1601       SrcRegs.push_back(SrcReg);
1602     }
1603 
1604     // Explode the big arguments into smaller chunks.
1605     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1606 
1607     Register AshrCstReg =
1608         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1609             .getReg(0);
1610     Register FullExtensionReg;
1611     Register PartialExtensionReg;
1612 
1613     // Do the operation on each small part.
1614     for (int i = 0; i < NumParts; ++i) {
1615       if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1616         DstRegs.push_back(SrcRegs[i]);
1617         PartialExtensionReg = DstRegs.back();
1618       } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1619         assert(PartialExtensionReg &&
1620                "Expected to visit partial extension before full");
1621         if (FullExtensionReg) {
1622           DstRegs.push_back(FullExtensionReg);
1623           continue;
1624         }
1625         DstRegs.push_back(
1626             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1627                 .getReg(0));
1628         FullExtensionReg = DstRegs.back();
1629       } else {
1630         DstRegs.push_back(
1631             MIRBuilder
1632                 .buildInstr(
1633                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1634                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1635                 .getReg(0));
1636         PartialExtensionReg = DstRegs.back();
1637       }
1638     }
1639 
1640     // Gather the destination registers into the final destination.
1641     Register DstReg = MI.getOperand(0).getReg();
1642     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1643     MI.eraseFromParent();
1644     return Legalized;
1645   }
1646   case TargetOpcode::G_BSWAP:
1647   case TargetOpcode::G_BITREVERSE: {
1648     if (SizeOp0 % NarrowSize != 0)
1649       return UnableToLegalize;
1650 
1651     Observer.changingInstr(MI);
1652     SmallVector<Register, 2> SrcRegs, DstRegs;
1653     unsigned NumParts = SizeOp0 / NarrowSize;
1654     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
1655                  MIRBuilder, MRI);
1656 
1657     for (unsigned i = 0; i < NumParts; ++i) {
1658       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1659                                            {SrcRegs[NumParts - 1 - i]});
1660       DstRegs.push_back(DstPart.getReg(0));
1661     }
1662 
1663     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1664 
1665     Observer.changedInstr(MI);
1666     MI.eraseFromParent();
1667     return Legalized;
1668   }
1669   case TargetOpcode::G_PTR_ADD:
1670   case TargetOpcode::G_PTRMASK: {
1671     if (TypeIdx != 1)
1672       return UnableToLegalize;
1673     Observer.changingInstr(MI);
1674     narrowScalarSrc(MI, NarrowTy, 2);
1675     Observer.changedInstr(MI);
1676     return Legalized;
1677   }
1678   case TargetOpcode::G_FPTOUI:
1679   case TargetOpcode::G_FPTOSI:
1680     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1681   case TargetOpcode::G_FPEXT:
1682     if (TypeIdx != 0)
1683       return UnableToLegalize;
1684     Observer.changingInstr(MI);
1685     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1686     Observer.changedInstr(MI);
1687     return Legalized;
1688   case TargetOpcode::G_FLDEXP:
1689   case TargetOpcode::G_STRICT_FLDEXP:
1690     return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
1691   }
1692 }
1693 
1694 Register LegalizerHelper::coerceToScalar(Register Val) {
1695   LLT Ty = MRI.getType(Val);
1696   if (Ty.isScalar())
1697     return Val;
1698 
1699   const DataLayout &DL = MIRBuilder.getDataLayout();
1700   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1701   if (Ty.isPointer()) {
1702     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1703       return Register();
1704     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1705   }
1706 
1707   Register NewVal = Val;
1708 
1709   assert(Ty.isVector());
1710   LLT EltTy = Ty.getElementType();
1711   if (EltTy.isPointer())
1712     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1713   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1714 }
1715 
1716 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1717                                      unsigned OpIdx, unsigned ExtOpcode) {
1718   MachineOperand &MO = MI.getOperand(OpIdx);
1719   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1720   MO.setReg(ExtB.getReg(0));
1721 }
1722 
1723 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1724                                       unsigned OpIdx) {
1725   MachineOperand &MO = MI.getOperand(OpIdx);
1726   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1727   MO.setReg(ExtB.getReg(0));
1728 }
1729 
1730 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1731                                      unsigned OpIdx, unsigned TruncOpcode) {
1732   MachineOperand &MO = MI.getOperand(OpIdx);
1733   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1734   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1735   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1736   MO.setReg(DstExt);
1737 }
1738 
1739 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1740                                       unsigned OpIdx, unsigned ExtOpcode) {
1741   MachineOperand &MO = MI.getOperand(OpIdx);
1742   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1743   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1744   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1745   MO.setReg(DstTrunc);
1746 }
1747 
1748 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1749                                             unsigned OpIdx) {
1750   MachineOperand &MO = MI.getOperand(OpIdx);
1751   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1752   Register Dst = MO.getReg();
1753   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1754   MO.setReg(DstExt);
1755   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1756 }
1757 
1758 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1759                                             unsigned OpIdx) {
1760   MachineOperand &MO = MI.getOperand(OpIdx);
1761   SmallVector<Register, 8> Regs;
1762   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1763 }
1764 
1765 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1766   MachineOperand &Op = MI.getOperand(OpIdx);
1767   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1768 }
1769 
1770 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1771   MachineOperand &MO = MI.getOperand(OpIdx);
1772   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1773   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1774   MIRBuilder.buildBitcast(MO, CastDst);
1775   MO.setReg(CastDst);
1776 }
1777 
1778 LegalizerHelper::LegalizeResult
1779 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1780                                         LLT WideTy) {
1781   if (TypeIdx != 1)
1782     return UnableToLegalize;
1783 
1784   auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
1785   if (DstTy.isVector())
1786     return UnableToLegalize;
1787 
1788   LLT SrcTy = MRI.getType(Src1Reg);
1789   const int DstSize = DstTy.getSizeInBits();
1790   const int SrcSize = SrcTy.getSizeInBits();
1791   const int WideSize = WideTy.getSizeInBits();
1792   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1793 
1794   unsigned NumOps = MI.getNumOperands();
1795   unsigned NumSrc = MI.getNumOperands() - 1;
1796   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1797 
1798   if (WideSize >= DstSize) {
1799     // Directly pack the bits in the target type.
1800     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
1801 
1802     for (unsigned I = 2; I != NumOps; ++I) {
1803       const unsigned Offset = (I - 1) * PartSize;
1804 
1805       Register SrcReg = MI.getOperand(I).getReg();
1806       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1807 
1808       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1809 
1810       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1811         MRI.createGenericVirtualRegister(WideTy);
1812 
1813       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1814       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1815       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1816       ResultReg = NextResult;
1817     }
1818 
1819     if (WideSize > DstSize)
1820       MIRBuilder.buildTrunc(DstReg, ResultReg);
1821     else if (DstTy.isPointer())
1822       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1823 
1824     MI.eraseFromParent();
1825     return Legalized;
1826   }
1827 
1828   // Unmerge the original values to the GCD type, and recombine to the next
1829   // multiple greater than the original type.
1830   //
1831   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1832   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1833   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1834   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1835   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1836   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1837   // %12:_(s12) = G_MERGE_VALUES %10, %11
1838   //
1839   // Padding with undef if necessary:
1840   //
1841   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1842   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1843   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1844   // %7:_(s2) = G_IMPLICIT_DEF
1845   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1846   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1847   // %10:_(s12) = G_MERGE_VALUES %8, %9
1848 
1849   const int GCD = std::gcd(SrcSize, WideSize);
1850   LLT GCDTy = LLT::scalar(GCD);
1851 
1852   SmallVector<Register, 8> Parts;
1853   SmallVector<Register, 8> NewMergeRegs;
1854   SmallVector<Register, 8> Unmerges;
1855   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1856 
1857   // Decompose the original operands if they don't evenly divide.
1858   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1859     Register SrcReg = MO.getReg();
1860     if (GCD == SrcSize) {
1861       Unmerges.push_back(SrcReg);
1862     } else {
1863       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1864       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1865         Unmerges.push_back(Unmerge.getReg(J));
1866     }
1867   }
1868 
1869   // Pad with undef to the next size that is a multiple of the requested size.
1870   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1871     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1872     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1873       Unmerges.push_back(UndefReg);
1874   }
1875 
1876   const int PartsPerGCD = WideSize / GCD;
1877 
1878   // Build merges of each piece.
1879   ArrayRef<Register> Slicer(Unmerges);
1880   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1881     auto Merge =
1882         MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
1883     NewMergeRegs.push_back(Merge.getReg(0));
1884   }
1885 
1886   // A truncate may be necessary if the requested type doesn't evenly divide the
1887   // original result type.
1888   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1889     MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
1890   } else {
1891     auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
1892     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1893   }
1894 
1895   MI.eraseFromParent();
1896   return Legalized;
1897 }
1898 
// Widen the result type (TypeIdx 0) of a G_UNMERGE_VALUES with scalar
// results. If the wide type covers the whole source, the results are simply
// extracted with shifts and truncs; otherwise the source is any-extended to
// the LCM type, unmerged at WideTy, and the pieces are remerged (via a GCD
// type when needed) into the original destinations, padding with dead defs.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  // The source register is the last operand; all preceding operands are defs.
  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    // Pointer sources must first be cast to an integer of the same width.
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    // Result I is bits [I*DstSize, (I+1)*DstSize) of the (extended) source.
    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    // Unpack each wide piece down to the GCD type, then remerge runs of
    // PartsPerRemerge GCD pieces into each original destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
2027 
// Widen one type index of a G_EXTRACT. For TypeIdx 0 (the result), the
// extract is rewritten as a shift-right by the bit offset followed by a trunc,
// performed in the (possibly any-extended) source type. For TypeIdx 1 (the
// source), the source is any-extended in place; vector sources are supported
// only for whole-element extracts at element-aligned offsets.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    // A scalar source can always be any-extended in place.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // Vector source: only whole-element extracts are supported.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  // The bit offset must fall on an element boundary.
  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  // Rescale the bit offset for the wider element size.
  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
2101 
2102 LegalizerHelper::LegalizeResult
2103 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2104                                    LLT WideTy) {
2105   if (TypeIdx != 0 || WideTy.isVector())
2106     return UnableToLegalize;
2107   Observer.changingInstr(MI);
2108   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2109   widenScalarDst(MI, WideTy);
2110   Observer.changedInstr(MI);
2111   return Legalized;
2112 }
2113 
// Widen an overflowing add/sub (G_[SU]ADDO, G_[SU]SUBO, G_[SU]ADDE,
// G_[SU]SUBE). For TypeIdx 1 (the carry), the carry operands are simply
// widened. For TypeIdx 0, the arithmetic is done in WideTy on sign- or
// zero-extended inputs (per signedness), and overflow is detected by checking
// whether re-extending the truncated result reproduces the wide result.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  unsigned Opcode;
  unsigned ExtOpcode;
  // Set for the carry-consuming *E opcodes; absent for the plain *O forms.
  std::optional<Register> CarryIn;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  // Note: the signed carry forms are mapped to the unsigned carry opcodes;
  // signedness is captured by the choice of extension instead.
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  if (TypeIdx == 1) {
    // Widening the boolean carry type: widen carry-in (if any) and carry-out.
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);

    Observer.changingInstr(MI);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, 4, BoolExtOp);
    widenScalarDst(MI, WideTy, 1);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
2196 
2197 LegalizerHelper::LegalizeResult
2198 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2199                                          LLT WideTy) {
2200   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2201                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2202                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2203   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2204                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
2205   // We can convert this to:
2206   //   1. Any extend iN to iM
2207   //   2. SHL by M-N
2208   //   3. [US][ADD|SUB|SHL]SAT
2209   //   4. L/ASHR by M-N
2210   //
2211   // It may be more efficient to lower this to a min and a max operation in
2212   // the higher precision arithmetic if the promoted operation isn't legal,
2213   // but this decision is up to the target's lowering request.
2214   Register DstReg = MI.getOperand(0).getReg();
2215 
2216   unsigned NewBits = WideTy.getScalarSizeInBits();
2217   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
2218 
2219   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2220   // must not left shift the RHS to preserve the shift amount.
2221   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
2222   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
2223                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
2224   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
2225   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
2226   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
2227 
2228   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
2229                                         {ShiftL, ShiftR}, MI.getFlags());
2230 
2231   // Use a shift that will preserve the number of sign bits when the trunc is
2232   // folded away.
2233   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
2234                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
2235 
2236   MIRBuilder.buildTrunc(DstReg, Result);
2237   MI.eraseFromParent();
2238   return Legalized;
2239 }
2240 
// Widen a G_SMULO/G_UMULO. For TypeIdx 1 the overflow flag register is simply
// widened. For TypeIdx 0 the multiply is done in WideTy on sign-/zero-extended
// inputs; overflow is detected by checking that the high part of the wide
// product properly extends the low part, OR-ed with the wide multiply's own
// overflow flag when WideTy is not at least twice the original width.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1) {
    // Only the overflow-flag type is being widened.
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;

  unsigned MulOpc =
      WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;

  MachineInstrBuilder Mulo;
  if (WideMulCanOverflow)
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
                                 {LeftOperand, RightOperand});
  else
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});

  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part.  Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  if (WideMulCanOverflow) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
2307 
// Legalize \p MI by promoting the scalar type selected by \p TypeIdx to the
// wider type \p WideTy.  Each opcode chooses the extension that preserves its
// semantics (G_ANYEXT when high bits are don't-care, G_SEXT/G_ZEXT when they
// are observed, G_FPEXT for FP operands) and narrows results back with
// G_TRUNC/G_FPTRUNC where required.  Returns UnableToLegalize for
// opcode/TypeIdx combinations this helper does not handle.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
    assert(TypeIdx == 0 && "atomicrmw with second scalar type");
    // Widen the value operand (2) and the result (0); the pointer operand is
    // left untouched.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
    // Widen the compare value (2), the new value (3), and the loaded result.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
    if (TypeIdx == 0) {
      // Widen the compare value (3), the new value (4), and the loaded result.
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }
    assert(TypeIdx == 1 &&
           "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
    // TypeIdx == 1: only the boolean success result is widened.
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_EXTRACT:
    return widenScalarExtract(MI, TypeIdx, WideTy);
  case TargetOpcode::G_INSERT:
    return widenScalarInsert(MI, TypeIdx, WideTy);
  case TargetOpcode::G_MERGE_VALUES:
    return widenScalarMergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UNMERGE_VALUES:
    return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UMULO:
  case TargetOpcode::G_SMULO:
    return widenScalarMulo(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SSHLSAT:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_USHLSAT:
    return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP: {
    if (TypeIdx == 0) {
      // Only the count result is widened; the source keeps its type.
      Observer.changingInstr(MI);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    Register SrcReg = MI.getOperand(1).getReg();

    // First extend the input.  CTTZ only inspects the low bits (the padding
    // is neutralized below), so any-extend suffices; CTLZ/CTPOP observe the
    // high bits and need them zeroed.
    unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
                              MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
                          ? TargetOpcode::G_ANYEXT
                          : TargetOpcode::G_ZEXT;
    auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
    LLT CurTy = MRI.getType(SrcReg);
    unsigned NewOpc = MI.getOpcode();
    if (NewOpc == TargetOpcode::G_CTTZ) {
      // The count is the same in the larger type except if the original
      // value was zero.  This can be handled by setting the bit just off
      // the top of the original type.
      auto TopBit =
          APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
      MIBSrc = MIRBuilder.buildOr(
        WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
      // Now we know the operand is non-zero, use the more relaxed opcode.
      NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
    }

    // Perform the operation at the larger size.
    auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
    // This is already the correct result for CTPOP and CTTZs
    if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
        MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // The correct result is NewOp - (Difference in widety and current ty).
      unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
      MIBNewOp = MIRBuilder.buildSub(
          WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
    }

    MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP: {
    Observer.changingInstr(MI);
    Register DstReg = MI.getOperand(0).getReg();

    Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

    MI.getOperand(0).setReg(DstExt);

    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // After byte-swapping the widened value, the original bytes sit in the
    // high end; shift them back down by the width difference and truncate.
    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
    MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
    MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);

    MIRBuilder.buildTrunc(DstReg, ShrReg);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITREVERSE: {
    Observer.changingInstr(MI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();

    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    MI.getOperand(0).setReg(DstExt);
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // As with G_BSWAP: the reversed original bits land at the top of the
    // wide result, so shift down and truncate.
    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
    auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
    MIRBuilder.buildTrunc(DstReg, Shift);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FREEZE:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ABS:
    // Sign-extend so the widened absolute value matches the narrow one.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ADD:
  case TargetOpcode::G_AND:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high bits
    // don't affect the result) and then truncate the result back to the
    // original type.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      // Offset (2) and width (3) are unsigned amounts; zero-extend them.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SHL:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ROTR:
  case TargetOpcode::G_ROTL:
    // Only the rotate amount (TypeIdx 1) can be widened here.
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    // Signed operations observe the high bits: sign-extend both inputs.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIVREM:
    // Two results (quotient 0, remainder 1); both are widened.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      // The shifted-in bits must match the narrow semantics: sign bits for
      // G_ASHR, zeros for G_LSHR.
      unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
        TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;

      widenScalarSrc(MI, WideTy, 1, CvtOp);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    // Unsigned operations need zero-filled high bits.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_UDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SELECT:
    Observer.changingInstr(MI);
    if (TypeIdx == 0) {
      // Perform operation at larger width (any extension is fine here, high
      // bits don't affect the result) and then truncate the result back to the
      // original type.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
      // Explicit extension is required here since high bits affect the result.
      widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_IS_FPCLASS:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    // Only the destination register type changes; the memory type in the
    // MMO stays the same.
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (!Ty.isScalar())
      return UnableToLegalize;

    Observer.changingInstr(MI);

    // i1 stores are zero-extended; anything else can be any-extended.
    unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
      TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
    widenScalarSrc(MI, WideTy, 0, ExtType);

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    // The target picks how the immediate is extended (sign/zero/any).
    unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
        MRI.getType(MI.getOperand(0).getReg()));
    assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
            ExtOpc == TargetOpcode::G_ANYEXT) &&
           "Illegal Extend");
    const APInt &SrcVal = SrcMO.getCImm()->getValue();
    const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
                           ? SrcVal.sext(WideTy.getSizeInBits())
                           : SrcVal.zext(WideTy.getSizeInBits());
    Observer.changingInstr(MI);
    SrcMO.setCImm(ConstantInt::get(Ctx, Val));

    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FCONSTANT: {
    // To avoid changing the bits of the constant due to extension to a larger
    // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
    MachineOperand &SrcMO = MI.getOperand(1);
    APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
    MIRBuilder.setInstrAndDebugLoc(MI);
    auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
    widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_IMPLICIT_DEF: {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BRCOND:
    // The condition is a boolean; use the target's preferred bool extension.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FCMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ICMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      // The extension must match the signedness of the predicate so the
      // compare result is unchanged.
      unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
                               MI.getOperand(1).getPredicate()))
                               ? TargetOpcode::G_SEXT
                               : TargetOpcode::G_ZEXT;
      widenScalarSrc(MI, WideTy, 2, ExtOpcode);
      widenScalarSrc(MI, WideTy, 3, ExtOpcode);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PTR_ADD:
    assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
    // The offset is a signed value; sign-extend it.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PHI: {
    assert(TypeIdx == 0 && "Expecting only Idx 0");

    Observer.changingInstr(MI);
    // Each incoming value must be extended in its predecessor block, before
    // that block's terminators.
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
    }

    // The truncated result is built just after the PHIs in this block.
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      // Widen the vector's element type along with the scalar result.
      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      Observer.changingInstr(MI);

      widenScalarSrc(
          MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
          TargetOpcode::G_ANYEXT);

      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx != 2)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    // TODO: Probably should be zext
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      const LLT WideEltTy = WideTy.getElementType();

      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 1) {
      Observer.changingInstr(MI);

      // Widen the inserted scalar and, correspondingly, the vector's
      // element type.
      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);

      widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideVecTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 2) {
      Observer.changingInstr(MI);
      // TODO: Probably should be zext
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    assert(TypeIdx == 0);
    // FP ops with a single FP type: fpext all inputs, fptrunc the result.
    Observer.changingInstr(MI);

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);

    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_STRICT_FLDEXP: {
    if (TypeIdx == 0) {
      if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
        return UnableToLegalize;

      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 1) {
      // For some reason SelectionDAG tries to promote to a libcall without
      // actually changing the integer type for promotion.
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FFREXP: {
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      // Widen the FP fraction result (0) and the FP source (2).
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    } else {
      // Widen the integer exponent result (1).
      widenScalarDst(MI, WideTy, 1);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_BUILD_VECTOR: {
    Observer.changingInstr(MI);

    const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
    for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);

    // Avoid changing the result vector type if the source element type was
    // requested.
    if (TypeIdx == 1) {
      MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
    } else {
      widenScalarDst(MI, WideTy, 0);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_FADD:
  case TargetOpcode::G_VECREDUCE_FMUL:
  case TargetOpcode::G_VECREDUCE_FMIN:
  case TargetOpcode::G_VECREDUCE_FMAX:
  case TargetOpcode::G_VECREDUCE_FMINIMUM:
  case TargetOpcode::G_VECREDUCE_FMAXIMUM:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    Register VecReg = MI.getOperand(1).getReg();
    LLT VecTy = MRI.getType(VecReg);
    // The source may already have been scalarized; only wrap in a vector
    // type if it is still a vector.
    LLT WideVecTy = VecTy.isVector()
                        ? LLT::vector(VecTy.getElementCount(), WideTy)
                        : WideTy;
    widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
}
2973 
2974 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2975                              MachineIRBuilder &B, Register Src, LLT Ty) {
2976   auto Unmerge = B.buildUnmerge(Ty, Src);
2977   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2978     Pieces.push_back(Unmerge.getReg(I));
2979 }
2980 
2981 LegalizerHelper::LegalizeResult
2982 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
2983   Register Dst = MI.getOperand(0).getReg();
2984 
2985   MachineFunction &MF = MIRBuilder.getMF();
2986   const DataLayout &DL = MIRBuilder.getDataLayout();
2987 
2988   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
2989   LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
2990   Align Alignment = Align(DL.getABITypeAlign(
2991       getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));
2992 
2993   auto Addr = MIRBuilder.buildConstantPool(
2994       AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
2995                      MI.getOperand(1).getFPImm(), Alignment));
2996 
2997   MachineMemOperand *MMO = MF.getMachineMemOperand(
2998       MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
2999       MRI.getType(Dst), Alignment);
3000 
3001   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
3002   MI.eraseFromParent();
3003 
3004   return Legalized;
3005 }
3006 
3007 LegalizerHelper::LegalizeResult
3008 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3009   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3010   if (SrcTy.isVector()) {
3011     LLT SrcEltTy = SrcTy.getElementType();
3012     SmallVector<Register, 8> SrcRegs;
3013 
3014     if (DstTy.isVector()) {
3015       int NumDstElt = DstTy.getNumElements();
3016       int NumSrcElt = SrcTy.getNumElements();
3017 
3018       LLT DstEltTy = DstTy.getElementType();
3019       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3020       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3021 
3022       // If there's an element size mismatch, insert intermediate casts to match
3023       // the result element type.
3024       if (NumSrcElt < NumDstElt) { // Source element type is larger.
3025         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3026         //
3027         // =>
3028         //
3029         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3030         // %3:_(<2 x s8>) = G_BITCAST %2
3031         // %4:_(<2 x s8>) = G_BITCAST %3
3032         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
3033         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
3034         SrcPartTy = SrcEltTy;
3035       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3036         //
3037         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3038         //
3039         // =>
3040         //
3041         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3042         // %3:_(s16) = G_BITCAST %2
3043         // %4:_(s16) = G_BITCAST %3
3044         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
3045         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
3046         DstCastTy = DstEltTy;
3047       }
3048 
3049       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
3050       for (Register &SrcReg : SrcRegs)
3051         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
3052     } else
3053       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
3054 
3055     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3056     MI.eraseFromParent();
3057     return Legalized;
3058   }
3059 
3060   if (DstTy.isVector()) {
3061     SmallVector<Register, 8> SrcRegs;
3062     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
3063     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3064     MI.eraseFromParent();
3065     return Legalized;
3066   }
3067 
3068   return UnableToLegalize;
3069 }
3070 
3071 /// Figure out the bit offset into a register when coercing a vector index for
3072 /// the wide element type. This is only for the case when promoting vector to
3073 /// one with larger elements.
3074 //
3075 ///
3076 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3077 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3078 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3079                                                    Register Idx,
3080                                                    unsigned NewEltSize,
3081                                                    unsigned OldEltSize) {
3082   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3083   LLT IdxTy = B.getMRI()->getType(Idx);
3084 
3085   // Now figure out the amount we need to shift to get the target bits.
3086   auto OffsetMask = B.buildConstant(
3087       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3088   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3089   return B.buildShl(IdxTy, OffsetIdx,
3090                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3091 }
3092 
3093 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3094 /// is casting to a vector with a smaller element size, perform multiple element
3095 /// extracts and merge the results. If this is coercing to a vector with larger
3096 /// elements, index the bitcasted vector and extract the target element with bit
3097 /// operations. This is intended to force the indexing in the native register
3098 /// size for architectures that can dynamically index the register file.
3099 LegalizerHelper::LegalizeResult
3100 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3101                                          LLT CastTy) {
3102   if (TypeIdx != 1)
3103     return UnableToLegalize;
3104 
3105   auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3106 
3107   LLT SrcEltTy = SrcVecTy.getElementType();
3108   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3109   unsigned OldNumElts = SrcVecTy.getNumElements();
3110 
3111   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3112   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3113 
3114   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3115   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3116   if (NewNumElts > OldNumElts) {
3117     // Decreasing the vector element size
3118     //
3119     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3120     //  =>
3121     //  v4i32:castx = bitcast x:v2i64
3122     //
3123     // i64 = bitcast
3124     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3125     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
3126     //
3127     if (NewNumElts % OldNumElts != 0)
3128       return UnableToLegalize;
3129 
3130     // Type of the intermediate result vector.
3131     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3132     LLT MidTy =
3133         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
3134 
3135     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
3136 
3137     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3138     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
3139 
3140     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3141       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
3142       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
3143       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
3144       NewOps[I] = Elt.getReg(0);
3145     }
3146 
3147     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
3148     MIRBuilder.buildBitcast(Dst, NewVec);
3149     MI.eraseFromParent();
3150     return Legalized;
3151   }
3152 
3153   if (NewNumElts < OldNumElts) {
3154     if (NewEltSize % OldEltSize != 0)
3155       return UnableToLegalize;
3156 
3157     // This only depends on powers of 2 because we use bit tricks to figure out
3158     // the bit offset we need to shift to get the target element. A general
3159     // expansion could emit division/multiply.
3160     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3161       return UnableToLegalize;
3162 
3163     // Increasing the vector element size.
3164     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3165     //
3166     //   =>
3167     //
3168     // %cast = G_BITCAST %vec
3169     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3170     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3171     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3172     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3173     // %elt_bits = G_LSHR %wide_elt, %offset_bits
3174     // %elt = G_TRUNC %elt_bits
3175 
3176     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3177     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3178 
3179     // Divide to get the index in the wider element type.
3180     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3181 
3182     Register WideElt = CastVec;
3183     if (CastTy.isVector()) {
3184       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3185                                                      ScaledIdx).getReg(0);
3186     }
3187 
3188     // Compute the bit offset into the register of the target element.
3189     Register OffsetBits = getBitcastWiderVectorElementOffset(
3190       MIRBuilder, Idx, NewEltSize, OldEltSize);
3191 
3192     // Shift the wide element to get the target element.
3193     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
3194     MIRBuilder.buildTrunc(Dst, ExtractedBits);
3195     MI.eraseFromParent();
3196     return Legalized;
3197   }
3198 
3199   return UnableToLegalize;
3200 }
3201 
3202 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
3203 /// TargetReg, while preserving other bits in \p TargetReg.
3204 ///
3205 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
3206 static Register buildBitFieldInsert(MachineIRBuilder &B,
3207                                     Register TargetReg, Register InsertReg,
3208                                     Register OffsetBits) {
3209   LLT TargetTy = B.getMRI()->getType(TargetReg);
3210   LLT InsertTy = B.getMRI()->getType(InsertReg);
3211   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3212   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3213 
3214   // Produce a bitmask of the value to insert
3215   auto EltMask = B.buildConstant(
3216     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3217                                    InsertTy.getSizeInBits()));
3218   // Shift it into position
3219   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3220   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3221 
3222   // Clear out the bits in the wide element
3223   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3224 
3225   // The value to insert has all zeros already, so stick it into the masked
3226   // wide element.
3227   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3228 }
3229 
3230 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3231 /// is increasing the element size, perform the indexing in the target element
3232 /// type, and use bit operations to insert at the element position. This is
3233 /// intended for architectures that can dynamically index the register file and
3234 /// want to force indexing in the native register size.
3235 LegalizerHelper::LegalizeResult
3236 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3237                                         LLT CastTy) {
3238   if (TypeIdx != 0)
3239     return UnableToLegalize;
3240 
3241   auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3242       MI.getFirst4RegLLTs();
3243   LLT VecTy = DstTy;
3244 
3245   LLT VecEltTy = VecTy.getElementType();
3246   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3247   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3248   const unsigned OldEltSize = VecEltTy.getSizeInBits();
3249 
3250   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3251   unsigned OldNumElts = VecTy.getNumElements();
3252 
3253   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3254   if (NewNumElts < OldNumElts) {
3255     if (NewEltSize % OldEltSize != 0)
3256       return UnableToLegalize;
3257 
3258     // This only depends on powers of 2 because we use bit tricks to figure out
3259     // the bit offset we need to shift to get the target element. A general
3260     // expansion could emit division/multiply.
3261     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3262       return UnableToLegalize;
3263 
3264     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3265     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3266 
3267     // Divide to get the index in the wider element type.
3268     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3269 
3270     Register ExtractedElt = CastVec;
3271     if (CastTy.isVector()) {
3272       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3273                                                           ScaledIdx).getReg(0);
3274     }
3275 
3276     // Compute the bit offset into the register of the target element.
3277     Register OffsetBits = getBitcastWiderVectorElementOffset(
3278       MIRBuilder, Idx, NewEltSize, OldEltSize);
3279 
3280     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
3281                                                Val, OffsetBits);
3282     if (CastTy.isVector()) {
3283       InsertedElt = MIRBuilder.buildInsertVectorElement(
3284         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
3285     }
3286 
3287     MIRBuilder.buildBitcast(Dst, InsertedElt);
3288     MI.eraseFromParent();
3289     return Legalized;
3290   }
3291 
3292   return UnableToLegalize;
3293 }
3294 
/// Lower a G_LOAD/G_SEXTLOAD/G_ZEXTLOAD the target cannot handle directly.
/// Non-byte-sized loads are widened to a whole number of bytes; otherwise the
/// load is split into two smaller power-of-2 loads that are recombined with
/// shift/or. Returns UnableToLegalize for the cases this expansion does not
/// cover (vector extloads, big endian, legal-but-queried accesses).
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  // Memory size rounded up to a whole number of bytes.
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: the memory type is not byte-sized (e.g. i20).
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      // Anyext load: the high bits are don't-care.
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  // Derive memory operands for the low (large) and high (small) halves.
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  // Low half must zero-extend so the later OR doesn't corrupt the high bits.
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  // High half keeps the original extension kind (load/sextload/zextload).
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  // Position the high half above the low half and combine.
  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    // Same size but different type: only a pointer destination is expected.
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we need still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
3431 
/// Lower a G_STORE the target cannot handle directly. Non-byte-sized stores
/// are widened (with zeroed upper bits) to a whole number of bytes; otherwise
/// the store is split into two smaller power-of-2 truncating stores.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  // Memory size rounded up to a whole number of bytes.
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: the memory type is not byte-sized (e.g. i1).
  if (StoreWidth != StoreSizeInBits) {
    if (SrcTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    // Zero the bits above the original store width.
    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector trunc stores
    if (MemTy != SrcTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // Non-pow-2 store: split off the largest power-of-2 low part.
    LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Pow-2 store: presumably an unaligned access the target rejects; split it
    // in half.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    // Reinterpret the pointer value as an integer so it can be shifted.
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
    LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
    MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
    MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
    MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
3530 
3531 LegalizerHelper::LegalizeResult
3532 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3533   switch (MI.getOpcode()) {
3534   case TargetOpcode::G_LOAD: {
3535     if (TypeIdx != 0)
3536       return UnableToLegalize;
3537     MachineMemOperand &MMO = **MI.memoperands_begin();
3538 
3539     // Not sure how to interpret a bitcast of an extending load.
3540     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3541       return UnableToLegalize;
3542 
3543     Observer.changingInstr(MI);
3544     bitcastDst(MI, CastTy, 0);
3545     MMO.setType(CastTy);
3546     Observer.changedInstr(MI);
3547     return Legalized;
3548   }
3549   case TargetOpcode::G_STORE: {
3550     if (TypeIdx != 0)
3551       return UnableToLegalize;
3552 
3553     MachineMemOperand &MMO = **MI.memoperands_begin();
3554 
3555     // Not sure how to interpret a bitcast of a truncating store.
3556     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3557       return UnableToLegalize;
3558 
3559     Observer.changingInstr(MI);
3560     bitcastSrc(MI, CastTy, 0);
3561     MMO.setType(CastTy);
3562     Observer.changedInstr(MI);
3563     return Legalized;
3564   }
3565   case TargetOpcode::G_SELECT: {
3566     if (TypeIdx != 0)
3567       return UnableToLegalize;
3568 
3569     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3570       LLVM_DEBUG(
3571           dbgs() << "bitcast action not implemented for vector select\n");
3572       return UnableToLegalize;
3573     }
3574 
3575     Observer.changingInstr(MI);
3576     bitcastSrc(MI, CastTy, 2);
3577     bitcastSrc(MI, CastTy, 3);
3578     bitcastDst(MI, CastTy, 0);
3579     Observer.changedInstr(MI);
3580     return Legalized;
3581   }
3582   case TargetOpcode::G_AND:
3583   case TargetOpcode::G_OR:
3584   case TargetOpcode::G_XOR: {
3585     Observer.changingInstr(MI);
3586     bitcastSrc(MI, CastTy, 1);
3587     bitcastSrc(MI, CastTy, 2);
3588     bitcastDst(MI, CastTy, 0);
3589     Observer.changedInstr(MI);
3590     return Legalized;
3591   }
3592   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3593     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3594   case TargetOpcode::G_INSERT_VECTOR_ELT:
3595     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3596   default:
3597     return UnableToLegalize;
3598   }
3599 }
3600 
3601 // Legalize an instruction by changing the opcode in place.
3602 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3603     Observer.changingInstr(MI);
3604     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3605     Observer.changedInstr(MI);
3606 }
3607 
/// Expand \p MI into simpler (ideally already-legal) operations. Dispatches to
/// a per-opcode lower* helper, or performs a short inline expansion for simple
/// cases. Returns UnableToLegalize for opcodes with no lowering here.
/// \p TypeIdx and \p LowerHintTy are accepted for interface uniformity; the
/// inline expansions below do not consult them.
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_FCONSTANT:
    return lowerFConstant(MI);
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
    // x % y == x - (x / y) * y, using the matching signed/unsigned division.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
                              {MI.getOperand(1), MI.getOperand(2)});

    auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
    MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
    // result.
    auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
    LLT Ty = MRI.getType(Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    // Reuse MI itself as the G_MUL by dropping the overflow def.
    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(TargetOpcode::G_MUL));
    MI.removeOperand(1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Ty, 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
    } else {
      // Unsigned: overflow iff any bit landed in the high half.
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    // fneg == flip the sign bit.
    auto [Res, SubByReg] = MI.getFirst2Regs();
    LLT Ty = MRI.getType(Res);

    // TODO: Handle vector types once we are able to
    // represent them.
    if (Ty.isVector())
      return UnableToLegalize;
    auto SignMask =
        MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
    MIRBuilder.buildXor(Res, SubByReg, SignMask);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_STRICT_FSUB: {
    auto [Res, LHS, RHS] = MI.getFirst3Regs();
    LLT Ty = MRI.getType(Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    auto Neg = MIRBuilder.buildFNeg(Ty, RHS);

    if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
      MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
    else
      MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_FRINT: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    // Expand to a plain cmpxchg plus an explicit success comparison.
    auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
    MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
                                  **MI.memoperands_begin());
    MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(cast<GAnyLoad>(MI));
  case TargetOpcode::G_STORE:
    return lowerStore(cast<GStore>(MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
    return lowerBitCount(MI);
  case G_UADDO: {
    auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();

    // Unsigned add overflows iff the wrapped result is less than an operand.
    MIRBuilder.buildAdd(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(CarryOut);
    const LLT Ty = MRI.getType(Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);

    // Second check for carry. We can only carry if the initial sum is all 1s
    // and the carry is set, resulting in a new sum of 0.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
    auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
    MIRBuilder.buildOr(CarryOut, Carry, Carry2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();

    // Unsigned subtract borrows iff LHS < RHS.
    MIRBuilder.buildSub(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(BorrowOut);
    const LLT Ty = MRI.getType(Res);

    // Initial subtract of the two operands.
    auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);

    // Initial check for borrow.
    auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);

    // Subtract the borrow from the first subtract.
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
    MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);

    // Second check for borrow. We can only borrow if the initial difference is
    // 0 and the borrow is set, resulting in a new difference of all 1s.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto TmpResEqZero =
        MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
    auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
    MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return lowerFPOWI(MI);
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
    return lowerMinMax(MI);
  case G_FCOPYSIGN:
    return lowerFCopySign(MI);
  case G_FMINNUM:
  case G_FMAXNUM:
    return lowerFMinNumMaxNum(MI);
  case G_MERGE_VALUES:
    return lowerMergeValues(MI);
  case G_UNMERGE_VALUES:
    return lowerUnmergeValues(MI);
  case TargetOpcode::G_SEXT_INREG: {
    // Sign-extend in register via shl + ashr by (width - SizeInBits).
    assert(MI.getOperand(2).isImm() && "Expected immediate");
    int64_t SizeInBits = MI.getOperand(2).getImm();

    auto [DstReg, SrcReg] = MI.getFirst2Regs();
    LLT DstTy = MRI.getType(DstReg);
    Register TmpRes = MRI.createGenericVirtualRegister(DstTy);

    auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
    MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
    MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
    MI.eraseFromParent();
    return Legalized;
  }
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return lowerExtractInsertVectorElt(MI);
  case G_SHUFFLE_VECTOR:
    return lowerShuffleVector(MI);
  case G_DYN_STACKALLOC:
    return lowerDynStackAlloc(MI);
  case G_STACKSAVE:
    return lowerStackSave(MI);
  case G_STACKRESTORE:
    return lowerStackRestore(MI);
  case G_EXTRACT:
    return lowerExtract(MI);
  case G_INSERT:
    return lowerInsert(MI);
  case G_BSWAP:
    return lowerBswap(MI);
  case G_BITREVERSE:
    return lowerBitreverse(MI);
  case G_READ_REGISTER:
  case G_WRITE_REGISTER:
    return lowerReadWriteRegister(MI);
  case G_UADDSAT:
  case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this with custom lowering and calling the
    // implementation functions.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (LI.isLegalOrCustom({G_UMIN, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SADDSAT:
  case G_SSUBSAT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());

    // FIXME: It would probably make more sense to see if G_SADDO is preferred,
    // since it's a shorter expansion. However, we would need to figure out the
    // preferred boolean type for the carry out for the query.
    if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SSHLSAT:
  case G_USHLSAT:
    return lowerShlSat(MI);
  case G_ABS:
    return lowerAbsToAddXor(MI);
  case G_SELECT:
    return lowerSelect(MI);
  case G_IS_FPCLASS:
    return lowerISFPCLASS(MI);
  case G_SDIVREM:
  case G_UDIVREM:
    return lowerDIVREM(MI);
  case G_FSHL:
  case G_FSHR:
    return lowerFunnelShift(MI);
  case G_ROTL:
  case G_ROTR:
    return lowerRotate(MI);
  case G_MEMSET:
  case G_MEMCPY:
  case G_MEMMOVE:
    return lowerMemCpyFamily(MI);
  case G_MEMCPY_INLINE:
    return lowerMemcpyInline(MI);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
    return lowerEXT(MI);
  case G_TRUNC:
    return lowerTRUNC(MI);
  GISEL_VECREDUCE_CASES_NONSEQ
    return lowerVectorReduction(MI);
  case G_VAARG:
    return lowerVAArg(MI);
  }
}
3921 
3922 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3923                                                   Align MinAlign) const {
3924   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3925   // datalayout for the preferred alignment. Also there should be a target hook
3926   // for this to allow targets to reduce the alignment and ignore the
3927   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3928   // the type.
3929   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3930 }
3931 
3932 MachineInstrBuilder
3933 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3934                                       MachinePointerInfo &PtrInfo) {
3935   MachineFunction &MF = MIRBuilder.getMF();
3936   const DataLayout &DL = MIRBuilder.getDataLayout();
3937   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3938 
3939   unsigned AddrSpace = DL.getAllocaAddrSpace();
3940   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3941 
3942   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3943   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3944 }
3945 
3946 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3947                                         LLT VecTy) {
3948   int64_t IdxVal;
3949   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3950     return IdxReg;
3951 
3952   LLT IdxTy = B.getMRI()->getType(IdxReg);
3953   unsigned NElts = VecTy.getNumElements();
3954   if (isPowerOf2_32(NElts)) {
3955     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3956     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3957   }
3958 
3959   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3960       .getReg(0);
3961 }
3962 
3963 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3964                                                   Register Index) {
3965   LLT EltTy = VecTy.getElementType();
3966 
3967   // Calculate the element offset and add it to the pointer.
3968   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3969   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3970          "Converting bits to bytes lost precision");
3971 
3972   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3973 
3974   LLT IdxTy = MRI.getType(Index);
3975   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3976                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3977 
3978   LLT PtrTy = MRI.getType(VecPtr);
3979   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3980 }
3981 
3982 #ifndef NDEBUG
3983 /// Check that all vector operands have same number of elements. Other operands
3984 /// should be listed in NonVecOp.
3985 static bool hasSameNumEltsOnAllVectorOperands(
3986     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3987     std::initializer_list<unsigned> NonVecOpIndices) {
3988   if (MI.getNumMemOperands() != 0)
3989     return false;
3990 
3991   LLT VecTy = MRI.getType(MI.getReg(0));
3992   if (!VecTy.isVector())
3993     return false;
3994   unsigned NumElts = VecTy.getNumElements();
3995 
3996   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3997     MachineOperand &Op = MI.getOperand(OpIdx);
3998     if (!Op.isReg()) {
3999       if (!is_contained(NonVecOpIndices, OpIdx))
4000         return false;
4001       continue;
4002     }
4003 
4004     LLT Ty = MRI.getType(Op.getReg());
4005     if (!Ty.isVector()) {
4006       if (!is_contained(NonVecOpIndices, OpIdx))
4007         return false;
4008       continue;
4009     }
4010 
4011     if (Ty.getNumElements() != NumElts)
4012       return false;
4013   }
4014 
4015   return true;
4016 }
4017 #endif
4018 
4019 /// Fill \p DstOps with DstOps that have same number of elements combined as
4020 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
4021 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
4022 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
4023 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4024                        unsigned NumElts) {
4025   LLT LeftoverTy;
4026   assert(Ty.isVector() && "Expected vector type");
4027   LLT EltTy = Ty.getElementType();
4028   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4029   int NumParts, NumLeftover;
4030   std::tie(NumParts, NumLeftover) =
4031       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4032 
4033   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4034   for (int i = 0; i < NumParts; ++i) {
4035     DstOps.push_back(NarrowTy);
4036   }
4037 
4038   if (LeftoverTy.isValid()) {
4039     assert(NumLeftover == 1 && "expected exactly one leftover");
4040     DstOps.push_back(LeftoverTy);
4041   }
4042 }
4043 
4044 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4045 /// made from \p Op depending on operand type.
4046 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4047                            MachineOperand &Op) {
4048   for (unsigned i = 0; i < N; ++i) {
4049     if (Op.isReg())
4050       Ops.push_back(Op.getReg());
4051     else if (Op.isImm())
4052       Ops.push_back(Op.getImm());
4053     else if (Op.isPredicate())
4054       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4055     else
4056       llvm_unreachable("Unsupported type");
4057   }
4058 }
4059 
4060 // Handle splitting vector operations which need to have the same number of
4061 // elements in each type index, but each type index may have a different element
4062 // type.
4063 //
4064 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4065 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4066 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4067 //
4068 // Also handles some irregular breakdown cases, e.g.
4069 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4070 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4071 //             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  // The first def fixes the original element count; all vector operands match
  // it (checked by the assert above).
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps to use instruction found by CSE directly.
  // CSE copies found instruction into given vreg when building with vreg dest.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
  SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       ++UseIdx, ++UseNo) {
    if (is_contained(NonVecOpIndices, UseIdx)) {
      // Non-vector operand: replicate it once per sub-instruction.
      broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
                     MI.getOperand(UseIdx));
    } else {
      SmallVector<Register, 8> SplitPieces;
      extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
                         MRI);
      for (auto Reg : SplitPieces)
        InputOpsPieces[UseNo].push_back(Reg);
    }
  }

  // One extra (smaller) piece exists when NumElts does not evenly divide the
  // original element count.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;

  // Take i-th piece of each input operand split and build sub-vector/scalar
  // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    SmallVector<DstOp, 2> Defs;
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      Defs.push_back(OutputOpsPieces[DstNo][i]);

    SmallVector<SrcOp, 3> Uses;
    for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
      Uses.push_back(InputOpsPieces[InputNo][i]);

    // Rebuild the same opcode on the narrow pieces, preserving MI flags.
    auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      OutputRegs[DstNo].push_back(I.getReg(DstNo));
  }

  // Merge small outputs into MI's output for each def operand.
  if (NumLeftovers) {
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}
4142 
/// Split a vector G_PHI into multiple G_PHIs over narrower pieces, then merge
/// the narrow results back into the original def.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  // PHI uses come in (register, basic block) pairs after the single def.
  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);

  // Instructions that perform register split will be inserted in basic block
  // where register is defined (basic block is in the next operand).
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
    // Split before the terminator of the predecessor block so the pieces
    // dominate the edge into the PHI's block.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  MIRBuilder.setInsertPt(*MI.getParent(), MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
    Phi.addDef(
        MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Phi.getReg(0));

    // Each narrow PHI takes the i-th piece of every incoming value, paired
    // with the original incoming basic block operand.
    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(InputOpsPieces[j][i]);
      Phi.add(MI.getOperand(1 + j * 2 + 1));
    }
  }

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(MI.getReg(0), OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
4191 
/// Narrow a G_UNMERGE_VALUES by first unmerging the source into NarrowTy
/// (register-sized) pieces, then unmerging each piece into the final DstTy
/// destinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  // All operands except the last are defs; the last is the unmerged source.
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  LLT SrcTy = MRI.getType(SrcReg);

  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by
  // merge-like instruction that would get artifact combined. Most likely
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  // NarrowTy must evenly divide the source and be evenly divided by DstTy so
  // the two-level unmerge below lines up exactly.
  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size) and since unmerge was not combined it will be
  // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
  // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  // Second-level unmerges: each NarrowTy piece feeds PartsPerUnmerge of the
  // original destination registers, in order.
  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
    MIB.addUse(Unmerge.getReg(I));
  }

  MI.eraseFromParent();
  return Legalized;
}
4240 
/// Narrow a merge-like instruction (G_MERGE_VALUES / G_BUILD_VECTOR /
/// G_CONCAT_VECTORS) by merging sources into NarrowTy pieces first, then
/// merging those pieces into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise user of DstReg did not perform unmerge
  // that should have been artifact combined. Most likely instruction that uses
  // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
  // is for old mir tests. Since the changes to more/fewer elements it should no
  // longer be possible to generate MIR like this when starting from llvm-ir
  // because LCMTy approach was replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    // Fully scalarize every source vector into Elts, in operand order.
    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Unmerge.getReg(j));
    }

    // Re-group the scalars into NarrowTy-sized vectors.
    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(
          MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
    }

    MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size) and since merge was not combined it will be
  // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
  // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    // Consume NumElts consecutive source operands per NarrowTy piece.
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(
        MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}
4326 
/// Narrow G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT by splitting the vector
/// into NarrowVecTy pieces and operating on the piece the (constant) index
/// selects. Variable indices fall back to full lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  auto [DstReg, SrcVec] = MI.getFirst2Regs();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  // The index is always the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    // Translate the original index into (piece number, index within piece).
    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
    } else {
      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
4400 
/// Split a wide (non-atomic, non-extending) load or store into a sequence of
/// NarrowTy-sized memory operations, plus one leftover-typed operation when
/// the value type does not divide evenly.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(LdStMI);
  Register ValReg = LdStMI.getReg(0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(ValReg);

  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    // For a load we only need the breakdown counts; the piece registers are
    // created inside splitTypePieces below.
    std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    // For a store, split the value to be stored into narrow registers now.
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs, MIRBuilder, MRI)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  // NumParts stays -1 when the breakdown/extraction failed.
  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each elements
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      // Derive a narrowed memory operand at the new byte offset.
      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
      // Big-endian walks offsets downward so piece 0 holds the high bits.
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  if (IsLoad) {
    // Reassemble the loaded pieces into the original wide value register.
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
4495 
/// Top-level dispatch for the FewerElements legalize action: route \p MI to
/// the appropriate element-count-reducing helper based on its opcode.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
  // A scalar NarrowTy means full scalarization (one element per piece).
  unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FEXP10:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FLDEXP:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
  case G_UADDO:
  case G_USUBO:
  case G_UADDE:
  case G_USUBE:
  case G_SADDO:
  case G_SSUBO:
  case G_SADDE:
  case G_SSUBE:
  case G_STRICT_FADD:
  case G_STRICT_FSUB:
  case G_STRICT_FMUL:
  case G_STRICT_FMA:
  case G_STRICT_FLDEXP:
  case G_FFREXP:
    // All-vector-operand opcodes: every register operand is split the same way.
    return fewerElementsVectorMultiEltType(GMI, NumElts);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
  case G_IS_FPCLASS:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
  case G_SELECT:
    // A vector condition is split alongside the operands; a scalar i1
    // condition is broadcast to all pieces instead.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(GMI, NumElts);
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_VECREDUCE_SEQ_FADD:
  case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
    return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  case G_FPOWI:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
  default:
    return UnableToLegalize;
  }
}
4652 
// Narrow a canonicalized G_SHUFFLE_VECTOR (destination and both sources share
// one vector type) by splitting each source in half and building the low and
// high halves of the result separately: as a half-width shuffle when at most
// two of the four input halves are referenced, otherwise as a G_BUILD_VECTOR
// of individually extracted elements.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  // Only the result/source vector type (type index 0) can be narrowed here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
      MI.getFirst3RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  // Splitting exactly in half below requires an even element count.
  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.getNumElements();

  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
  // The four half-width inputs in mask-index order: lo/hi of Src1, then lo/hi
  // of Src2.
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs.  Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element.  This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.  A negative (undef)
      // mask index wraps to a huge unsigned value here and fails the bounds
      // check below.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= std::size(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= std::size(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element.  This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= std::size(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(MIRBuilder
                            .buildExtractVectorElement(
                                EltTy, Inputs[Input],
                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
                            .getReg(0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
    }

    // Reuse the mask buffer for the second (Hi) half.
    Ops.clear();
  }

  // Reassemble the full-width result from the two half-width values.
  MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
4790 
4791 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4792     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4793   auto &RdxMI = cast<GVecReduce>(MI);
4794 
4795   if (TypeIdx != 1)
4796     return UnableToLegalize;
4797 
4798   // The semantics of the normal non-sequential reductions allow us to freely
4799   // re-associate the operation.
4800   auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
4801 
4802   if (NarrowTy.isVector() &&
4803       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4804     return UnableToLegalize;
4805 
4806   unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
4807   SmallVector<Register> SplitSrcs;
4808   // If NarrowTy is a scalar then we're being asked to scalarize.
4809   const unsigned NumParts =
4810       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4811                           : SrcTy.getNumElements();
4812 
4813   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
4814   if (NarrowTy.isScalar()) {
4815     if (DstTy != NarrowTy)
4816       return UnableToLegalize; // FIXME: handle implicit extensions.
4817 
4818     if (isPowerOf2_32(NumParts)) {
4819       // Generate a tree of scalar operations to reduce the critical path.
4820       SmallVector<Register> PartialResults;
4821       unsigned NumPartsLeft = NumParts;
4822       while (NumPartsLeft > 1) {
4823         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4824           PartialResults.emplace_back(
4825               MIRBuilder
4826                   .buildInstr(ScalarOpc, {NarrowTy},
4827                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4828                   .getReg(0));
4829         }
4830         SplitSrcs = PartialResults;
4831         PartialResults.clear();
4832         NumPartsLeft = SplitSrcs.size();
4833       }
4834       assert(SplitSrcs.size() == 1);
4835       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4836       MI.eraseFromParent();
4837       return Legalized;
4838     }
4839     // If we can't generate a tree, then just do sequential operations.
4840     Register Acc = SplitSrcs[0];
4841     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4842       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4843                 .getReg(0);
4844     MIRBuilder.buildCopy(DstReg, Acc);
4845     MI.eraseFromParent();
4846     return Legalized;
4847   }
4848   SmallVector<Register> PartialReductions;
4849   for (unsigned Part = 0; Part < NumParts; ++Part) {
4850     PartialReductions.push_back(
4851         MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
4852             .getReg(0));
4853   }
4854 
4855   // If the types involved are powers of 2, we can generate intermediate vector
4856   // ops, before generating a final reduction operation.
4857   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4858       isPowerOf2_32(NarrowTy.getNumElements())) {
4859     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4860   }
4861 
4862   Register Acc = PartialReductions[0];
4863   for (unsigned Part = 1; Part < NumParts; ++Part) {
4864     if (Part == NumParts - 1) {
4865       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4866                             {Acc, PartialReductions[Part]});
4867     } else {
4868       Acc = MIRBuilder
4869                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4870                 .getReg(0);
4871     }
4872   }
4873   MI.eraseFromParent();
4874   return Legalized;
4875 }
4876 
4877 LegalizerHelper::LegalizeResult
4878 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
4879                                                   unsigned int TypeIdx,
4880                                                   LLT NarrowTy) {
4881   auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
4882       MI.getFirst3RegLLTs();
4883   if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
4884       DstTy != NarrowTy)
4885     return UnableToLegalize;
4886 
4887   assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
4888           MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
4889          "Unexpected vecreduce opcode");
4890   unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
4891                            ? TargetOpcode::G_FADD
4892                            : TargetOpcode::G_FMUL;
4893 
4894   SmallVector<Register> SplitSrcs;
4895   unsigned NumParts = SrcTy.getNumElements();
4896   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
4897   Register Acc = ScalarReg;
4898   for (unsigned i = 0; i < NumParts; i++)
4899     Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
4900               .getReg(0);
4901 
4902   MIRBuilder.buildCopy(DstReg, Acc);
4903   MI.eraseFromParent();
4904   return Legalized;
4905 }
4906 
4907 LegalizerHelper::LegalizeResult
4908 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4909                                         LLT SrcTy, LLT NarrowTy,
4910                                         unsigned ScalarOpc) {
4911   SmallVector<Register> SplitSrcs;
4912   // Split the sources into NarrowTy size pieces.
4913   extractParts(SrcReg, NarrowTy,
4914                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
4915                MIRBuilder, MRI);
4916   // We're going to do a tree reduction using vector operations until we have
4917   // one NarrowTy size value left.
4918   while (SplitSrcs.size() > 1) {
4919     SmallVector<Register> PartialRdxs;
4920     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4921       Register LHS = SplitSrcs[Idx];
4922       Register RHS = SplitSrcs[Idx + 1];
4923       // Create the intermediate vector op.
4924       Register Res =
4925           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4926       PartialRdxs.push_back(Res);
4927     }
4928     SplitSrcs = std::move(PartialRdxs);
4929   }
4930   // Finally generate the requested NarrowTy based reduction.
4931   Observer.changingInstr(MI);
4932   MI.getOperand(1).setReg(SplitSrcs[0]);
4933   Observer.changedInstr(MI);
4934   return Legalized;
4935 }
4936 
// Expand a wide G_SHL/G_LSHR/G_ASHR whose shift amount is a known constant
// into operations on two half-width values (Lo, Hi), merged back at the end.
// The constant amount lets each case pick the minimal half-width sequence.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  // Split the wide source into its low and high halves.
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // Shift by zero: just re-merge the unchanged halves.
  if (Amt.isZero()) {
    MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits; // Width of the original (unsplit) value.

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      // Amount exceeds the full width: everything is shifted out.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // More than a half-width shift: Lo is zero; Hi holds bits of InL.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: InL moves up to Hi, Lo becomes zero.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      // Sub-half-width shift: Hi = (InH << Amt) | (InL >> (NVTBits - Amt)).
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    if (Amt.ugt(VTBits)) {
      // Amount exceeds the full width: everything is shifted out.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // More than a half-width shift: Lo holds bits of InH; Hi is zero.
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: InH moves down to Lo, Hi becomes zero.
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      // Sub-half-width shift: Lo = (InL >> Amt) | (InH << (NVTBits - Amt)).
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR but vacated high bits are filled with copies of the
    // sign bit, obtained via an arithmetic shift of InH by NVTBits - 1.
    if (Amt.ugt(VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      // More than a half-width shift: Lo holds bits of InH; Hi is all sign.
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: InH moves down to Lo; Hi is all sign.
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      // Sub-half-width shift: Lo combines both halves; Hi is InH >>s Amt.
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  // Reassemble the wide result from the two halves.
  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
5024 
// TODO: Optimize if constant shift amount.
// Narrow a scalar G_SHL/G_LSHR/G_ASHR to half-width operations. A constant
// shift amount is delegated to narrowScalarShiftByConstant; otherwise a fully
// general expansion with runtime selects on the amount is emitted.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  // Narrowing the shift-amount operand (type index 1) is a plain source
  // truncation.
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  // Known-constant amounts get a much cheaper, select-free expansion.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  // Split the wide source into low and high halves.
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // AmtExcess = Amt - NewBitSize (used when the shift crosses the halves);
  // AmtLack = NewBitSize - Amt (used to capture the bits that cross over).
  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    // The IsZero select guards the out-of-range (NewBitSize - 0) sub-shift in
    // HiS by passing the half through untouched when Amt == 0.
    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});     // Lo from Hi part.

    // As above, the IsZero select passes InL through when Amt == 0.
    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  // Reassemble the wide result from the two halves.
  MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
5134 
// Widen the vector type of a G_PHI to MoreTy. Widening code for each incoming
// value must execute in the corresponding predecessor block, and the result
// fix-up must stay with the PHIs in MI's own block, so the builder insert
// point is moved around accordingly.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  // PHI operands come in (value, predecessor-block) pairs starting at index 1.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    // Emit the widening of this incoming value at the end of its predecessor,
    // just before the terminator.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  // Place the result fix-up alongside the PHIs at the top of MI's block.
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}
5153 
// Legalize MI by widening the vector type at TypeIdx to MoreTy (padding with
// undef elements). Dispatches on opcode to widen the appropriate combination
// of sources and destinations; returns UnableToLegalize for unsupported
// opcode/type-index combinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    // Result-only widening.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    // Value-operand-only widening.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_STRICT_FADD:
  case TargetOpcode::G_STRICT_FSUB:
  case TargetOpcode::G_STRICT_FMUL:
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    // Binary ops: widen both sources and the destination to the same type.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_STRICT_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    // Ternary ops: widen all three sources and the destination.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_EXTRACT:
    // Only the (vector) source type can be widened here.
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
    // Unary ops (plus inserts): widen the first source and the destination.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
    if (TypeIdx == 1) {
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(1).setReg(ShufSplat.getReg(0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Vector-condition selects are not handled for TypeIdx 0.
    if (CondTy.isVector())
      return UnableToLegalize;

    // Widen the two value operands and the destination; the scalar condition
    // is left untouched.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    // Collect the existing source elements.
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Op.getReg());
    }

    // Pad with undef elements up to MoreTy's element count.
    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
    }

    // Build the widened vector, then trim the padding back off so the
    // original destination register keeps its type.
    MIRBuilder.buildDeleteTrailingVectorElements(
        MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_FPTRUNC:
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // Conversions: source and destination element types differ, so the source
    // is widened to a vector of MoreTy's length but with its own element type.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    LLT SrcTy = LLT::fixed_vector(
        MoreTy.getNumElements(),
        MRI.getType(MI.getOperand(1).getReg()).getElementType());
    moreElementsVectorSrc(MI, SrcTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    // TODO: the symmetric MoreTy works for targets like, e.g. NEON.
    // For targets, like e.g. MVE, the result is a predicated vector (i1).
    // This will need some refactoring.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  default:
    return UnableToLegalize;
  }
}
5335 
// Make a G_SHUFFLE_VECTOR's mask length match its source vector length.
// If the mask is shorter than the sources, the destination is widened and the
// mask padded with undef. If the mask is longer, the sources are padded (by
// concatenation with undef vectors) up to the mask length rounded to a
// multiple of the source length, the mask indices are remapped, and any
// overshoot is trimmed by extracting the requested elements.
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  // Already equal: nothing to do.
  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(Mask);
    for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
      NewMask.push_back(-1);

    moreElementsVectorDst(MI, SrcTy, 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                  MI.getOperand(1).getReg(),
                                  MI.getOperand(2).getReg(), NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  // Mask is longer than the sources: pad the sources up to a multiple of
  // their own length that covers the mask.
  unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
  MOps1[0] = MI.getOperand(1).getReg();
  MOps2[0] = MI.getOperand(2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);

  // Readjust mask for new input vector length: indices into the second source
  // shift up by the amount of padding inserted after the first source.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);

    // Pull out just the first MaskNumElts elements of the padded shuffle.
    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
              .getReg(0);
    }
    MIRBuilder.buildBuildVector(DstReg, Elts);
  } else {
    MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
5407 
// Widen a G_SHUFFLE_VECTOR to MoreTy. Both sources and the destination are
// widened, and mask indices that referred to the second source are shifted to
// account for the undef padding appended to the first source.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  // Mismatched dst/src lengths are handled by a separate path first.
  if (DstTy.isVector() && Src1Ty.isVector() &&
      DstTy.getNumElements() != Src1Ty.getNumElements()) {
    return equalizeVectorShuffleLengths(MI);
  }

  if (TypeIdx != 0)
    return UnableToLegalize;

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length: indices into the second
  // source move up past the widened first source; new trailing lanes are
  // undef.
  SmallVector<int, 16> NewMask;
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask.push_back(Idx);
    else
      NewMask.push_back(Idx - NumElts + WidenNumElts);
  }
  for (unsigned I = NumElts; I != WidenNumElts; ++I)
    NewMask.push_back(-1);
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}
5450 
/// Emit code that multiplies the multi-limb values \p Src1Regs x \p Src2Regs
/// (limbs of type \p NarrowTy, least-significant limb first) using schoolbook
/// long multiplication, writing the low DstRegs.size() limbs of the product
/// into \p DstRegs. The visible caller (narrowScalarMul) passes either the
/// same limb count as the sources (wrapping G_MUL) or twice as many limbs
/// (full product for G_UMULH).
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  // Carry accumulated while summing the partial products of the previous
  // destination limb; it is folded into the next limb's factors. (The
  // Register is stored through its raw unsigned id.)
  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    // CarrySum is left unset on the final iteration, but the copy made here
    // is never read again after the loop ends.
    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
5513 
/// Narrow scalar G_ADD/G_SUB and their overflow/carry variants
/// (G_[SU]ADD[OE], G_[SU]SUB[OE]) by splitting the operands into NarrowTy
/// pieces and chaining carries between piecewise add/sub-with-carry ops.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // Pick the piecewise opcodes:
  //   OpO - lowest piece: produces the first carry-out, takes no carry-in.
  //   OpE - middle pieces: consume and produce a carry.
  //   OpF - final piece: the signed-overflow variants use the signed flavor
  //         here so the top piece computes signed overflow.
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  // These stay invalid (null) when MI has no carry-out def / carry-in use.
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  // Split both operands into NarrowTy pieces plus, when NarrowTy does not
  // divide the full width evenly, leftover pieces of a smaller LeftoverTy.
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
               MIRBuilder, MRI);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
               MRI);

  // Append the leftover pieces so the main loop can treat all pieces
  // uniformly; NarrowParts remembers where the leftovers begin.
  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    // Chain this piece's carry-out into the next piece.
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              ArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
5609 
5610 LegalizerHelper::LegalizeResult
5611 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5612   auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
5613 
5614   LLT Ty = MRI.getType(DstReg);
5615   if (Ty.isVector())
5616     return UnableToLegalize;
5617 
5618   unsigned Size = Ty.getSizeInBits();
5619   unsigned NarrowSize = NarrowTy.getSizeInBits();
5620   if (Size % NarrowSize != 0)
5621     return UnableToLegalize;
5622 
5623   unsigned NumParts = Size / NarrowSize;
5624   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5625   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5626 
5627   SmallVector<Register, 2> Src1Parts, Src2Parts;
5628   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5629   extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
5630   extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
5631   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5632 
5633   // Take only high half of registers if this is high mul.
5634   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5635   MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5636   MI.eraseFromParent();
5637   return Legalized;
5638 }
5639 
5640 LegalizerHelper::LegalizeResult
5641 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5642                                    LLT NarrowTy) {
5643   if (TypeIdx != 0)
5644     return UnableToLegalize;
5645 
5646   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5647 
5648   Register Src = MI.getOperand(1).getReg();
5649   LLT SrcTy = MRI.getType(Src);
5650 
5651   // If all finite floats fit into the narrowed integer type, we can just swap
5652   // out the result type. This is practically only useful for conversions from
5653   // half to at least 16-bits, so just handle the one case.
5654   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5655       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5656     return UnableToLegalize;
5657 
5658   Observer.changingInstr(MI);
5659   narrowScalarDst(MI, NarrowTy, 0,
5660                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5661   Observer.changedInstr(MI);
5662   return Legalized;
5663 }
5664 
/// Narrow G_EXTRACT along its source operand (TypeIdx 1): split the source
/// into NarrowTy pieces, pull out of each piece only the bit range that
/// overlaps the requested window [OpStart, OpStart + OpSize), and reassemble
/// the segments into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
               MIRBuilder, MRI);

  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    // Bit offset of this piece within the wide source value.
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      // This piece starts inside the extraction window: take its prefix.
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      // The window starts inside this piece: skip to the window start.
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  // Reassemble the collected segments into the destination register.
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
5732 
/// Narrow G_INSERT along its result/big-operand type (TypeIdx 0): split the
/// big value into NarrowTy pieces (plus a possible smaller leftover piece),
/// splice the inserted value into whichever pieces it overlaps, and remerge
/// everything into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs, MIRBuilder, MRI);

  // Append leftover pieces so the loop below can treat all pieces uniformly.
  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    // Bit offset of this piece within the wide destination value.
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      // The inserted value started in an earlier piece: continue with its
      // tail at the bottom of this piece.
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      // The inserted value begins inside this piece.
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
        std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  // Any leftover piece was widened to NarrowTy above, so the merge of all
  // pieces can be wider than the original register; truncate back if so.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
5816 
5817 LegalizerHelper::LegalizeResult
5818 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5819                                    LLT NarrowTy) {
5820   Register DstReg = MI.getOperand(0).getReg();
5821   LLT DstTy = MRI.getType(DstReg);
5822 
5823   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5824 
5825   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5826   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5827   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5828   LLT LeftoverTy;
5829   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5830                     Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
5831     return UnableToLegalize;
5832 
5833   LLT Unused;
5834   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5835                     Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
5836     llvm_unreachable("inconsistent extractParts result");
5837 
5838   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5839     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5840                                         {Src0Regs[I], Src1Regs[I]});
5841     DstRegs.push_back(Inst.getReg(0));
5842   }
5843 
5844   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5845     auto Inst = MIRBuilder.buildInstr(
5846       MI.getOpcode(),
5847       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5848     DstLeftoverRegs.push_back(Inst.getReg(0));
5849   }
5850 
5851   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5852               LeftoverTy, DstLeftoverRegs);
5853 
5854   MI.eraseFromParent();
5855   return Legalized;
5856 }
5857 
5858 LegalizerHelper::LegalizeResult
5859 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5860                                  LLT NarrowTy) {
5861   if (TypeIdx != 0)
5862     return UnableToLegalize;
5863 
5864   auto [DstReg, SrcReg] = MI.getFirst2Regs();
5865 
5866   LLT DstTy = MRI.getType(DstReg);
5867   if (DstTy.isVector())
5868     return UnableToLegalize;
5869 
5870   SmallVector<Register, 8> Parts;
5871   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5872   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5873   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5874 
5875   MI.eraseFromParent();
5876   return Legalized;
5877 }
5878 
/// Narrow a G_SELECT (TypeIdx 0, scalar condition only): split both selected
/// values into NarrowTy pieces plus leftovers, emit one select per piece
/// reusing the shared condition, and remerge the results.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  // Both selected values share DstTy, so splitting the second must succeed
  // in exactly the same way.
  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  // Select each narrow piece pair with the shared condition.
  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  // And each leftover pair, at the leftover type.
  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
      LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
5924 
/// Narrow G_CTLZ/G_CTLZ_ZERO_UNDEF over the source type (TypeIdx 1). Only the
/// case where the scalar source is exactly twice the narrow width is handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    // The Lo count keeps the original opcode's zero semantics, since Lo may
    // be zero even when the full value is not.
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    // The Hi count is only selected when Hi != 0, so the zero-undef form is
    // always safe for it.
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
5957 
/// Narrow G_CTTZ/G_CTTZ_ZERO_UNDEF over the source type (TypeIdx 1). Only the
/// case where the scalar source is exactly twice the narrow width is handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    // The Hi count keeps the original opcode's zero semantics, since Hi may
    // be zero even when the full value is not.
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    // The Lo count is only selected when Lo != 0, so the zero-undef form is
    // always safe for it.
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
5990 
5991 LegalizerHelper::LegalizeResult
5992 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5993                                    LLT NarrowTy) {
5994   if (TypeIdx != 1)
5995     return UnableToLegalize;
5996 
5997   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5998   unsigned NarrowSize = NarrowTy.getSizeInBits();
5999 
6000   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6001     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
6002 
6003     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
6004     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
6005     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
6006 
6007     MI.eraseFromParent();
6008     return Legalized;
6009   }
6010 
6011   return UnableToLegalize;
6012 }
6013 
6014 LegalizerHelper::LegalizeResult
6015 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
6016                                     LLT NarrowTy) {
6017   if (TypeIdx != 1)
6018     return UnableToLegalize;
6019 
6020   MachineIRBuilder &B = MIRBuilder;
6021   Register ExpReg = MI.getOperand(2).getReg();
6022   LLT ExpTy = MRI.getType(ExpReg);
6023 
6024   unsigned ClampSize = NarrowTy.getScalarSizeInBits();
6025 
6026   // Clamp the exponent to the range of the target type.
6027   auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
6028   auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
6029   auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
6030   auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
6031 
6032   auto Trunc = B.buildTrunc(NarrowTy, Clamp);
6033   Observer.changingInstr(MI);
6034   MI.getOperand(2).setReg(Trunc.getReg(0));
6035   Observer.changedInstr(MI);
6036   return Legalized;
6037 }
6038 
/// Expand the bit-counting operations G_CTLZ, G_CTTZ, G_CTPOP and their
/// ZERO_UNDEF variants into operations the target supports, using the
/// constructions from "Hacker's Delight" where no direct support exists.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // A lowering in terms of another bit-count op is only worthwhile when that
  // op is itself legal, custom-legalized, or handled as a libcall.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    // Smear the highest set bit into every bit below it, so the popcount of
    // the result is the number of non-leading-zero bits.
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      // NOTE(review): the G_CTLZ path above derives the compare type from
      // SrcTy; here it comes from DstTy — confirm the asymmetry is intended.
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Reuse MI itself as the CTPOP: same destination, new source operand.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}
6197 
6198 // Check that (every element of) Reg is undef or not an exact multiple of BW.
6199 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
6200                                         Register Reg, unsigned BW) {
6201   return matchUnaryPredicate(
6202       MRI, Reg,
6203       [=](const Constant *C) {
6204         // Null constant here means an undef.
6205         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
6206         return !CI || CI->getValue().urem(BW) != 0;
6207       },
6208       /*AllowUndefs*/ true);
6209 }
6210 
6211 LegalizerHelper::LegalizeResult
6212 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
6213   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6214   LLT Ty = MRI.getType(Dst);
6215   LLT ShTy = MRI.getType(Z);
6216 
6217   unsigned BW = Ty.getScalarSizeInBits();
6218 
6219   if (!isPowerOf2_32(BW))
6220     return UnableToLegalize;
6221 
6222   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6223   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6224 
6225   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
6226     // fshl X, Y, Z -> fshr X, Y, -Z
6227     // fshr X, Y, Z -> fshl X, Y, -Z
6228     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
6229     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
6230   } else {
6231     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
6232     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
6233     auto One = MIRBuilder.buildConstant(ShTy, 1);
6234     if (IsFSHL) {
6235       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6236       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
6237     } else {
6238       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6239       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
6240     }
6241 
6242     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
6243   }
6244 
6245   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
6246   MI.eraseFromParent();
6247   return Legalized;
6248 }
6249 
// Lower G_FSHL/G_FSHR into two plain shifts combined with an OR, arranging
// the shift amounts so that no individual shift can reach the full bit width
// (which would be undefined).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  // Pieces of the expansion: Dst = ShX | ShY.
  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    // Since C != 0, both C and BW - C are in [1, BW-1], so neither shift is
    // undefined.
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // C may be zero, so pre-shift by 1 and use BW - 1 - C as the inverse
    // amount, keeping every shift strictly below BW:
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      // Non-power-of-2 width: fall back to an explicit urem.
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}
6304 
6305 LegalizerHelper::LegalizeResult
6306 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6307   // These operations approximately do the following (while avoiding undefined
6308   // shifts by BW):
6309   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6310   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6311   Register Dst = MI.getOperand(0).getReg();
6312   LLT Ty = MRI.getType(Dst);
6313   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6314 
6315   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6316   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6317 
6318   // TODO: Use smarter heuristic that accounts for vector legalization.
6319   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6320     return lowerFunnelShiftAsShifts(MI);
6321 
6322   // This only works for powers of 2, fallback to shifts if it fails.
6323   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6324   if (Result == UnableToLegalize)
6325     return lowerFunnelShiftAsShifts(MI);
6326   return Result;
6327 }
6328 
// Lower a vector extend whose per-element step is more than a doubling by
// introducing an intermediate extend of twice the source element size, then
// unmerging, extending the halves, and re-merging:
//   ext x -> merge(ext(unmerge(ext_to_mid x)))
// Returns UnableToLegalize for any case this doesn't handle (non-power-of-2
// sizes, or steps that are already a single doubling or less).
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  // The halving/doubling arithmetic below only works for power-of-2 sizes.
  if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
      !isPowerOf2_32(SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    // Note: MI.getOpcode() is reused, so this handles zext/sext/anyext alike.
    auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
    // Unmerge the vector
    LLT EltTy = MidTy.changeElementCount(
        MidTy.getElementCount().divideCoefficientBy(2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);

    // ZExt the vectors
    LLT ZExtResTy = DstTy.changeElementCount(
        DstTy.getElementCount().divideCoefficientBy(2));
    auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(1)});

    // Merge the ending vectors
    MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
6370 
// Lower a vector G_TRUNC by splitting the source, truncating the halves to an
// intermediate element width, concatenating, and (if needed) truncating again.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectiondDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);

  // Only handle power-of-2 element counts and sizes so the halving below is
  // exact.
  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
        SrcTy.getElementCount().divideCoefficientBy(2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements.
    // If the total step is more than one halving, stop at twice the final
    // element size so the last step is a single G_TRUNC.
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
    for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
      SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
    }

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);

    // Truncate the new vector to the final result type
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
    else
      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));

    MI.eraseFromParent();

    return Legalized;
  }
  return UnableToLegalize;
}
6427 
6428 LegalizerHelper::LegalizeResult
6429 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6430   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6431   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6432   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6433   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6434   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6435   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6436   MI.eraseFromParent();
6437   return Legalized;
6438 }
6439 
// Lower G_ROTL/G_ROTR, preferring (in order): the reverse rotate, a funnel
// shift, and finally a pair of plain shifts combined with an OR.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  // (The -Amt identity requires a power-of-2 element size.)
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  // A rotate is a funnel shift with both data operands equal to Src.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      // Only the reverse funnel shift is available: negate the amount.
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  // Fall back to two shifts combined with an OR.
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // Non-power-of-2 width: compute c % w explicitly, and pre-shift by 1 so
    // the (w - 1 - c%w) shift can never reach the full bit width.
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}
6506 
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation, with round-to-nearest-even tie handling.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // The expansion mirrors this reference implementation:
  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  // lz = clz(u); Src != 0 is handled by the select below, so ZERO_UNDEF is
  // fine here.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  // Biased exponent: 127 (f32 bias) + 63 (MSB position) - lz, or 0 if u == 0.
  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  // Normalize: shift the leading 1 to bit 63, then drop it (it is implicit in
  // the IEEE encoding).
  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  // t: the 40 bits below the 23-bit mantissa, used to decide rounding.
  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  // v = (e << 23) | top 23 mantissa bits.
  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  // Round to nearest, ties to even: r = 1 if t > half, v & 1 if t == half,
  // else 0.
  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
6564 
6565 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6566   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6567 
6568   if (SrcTy == LLT::scalar(1)) {
6569     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6570     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6571     MIRBuilder.buildSelect(Dst, Src, True, False);
6572     MI.eraseFromParent();
6573     return Legalized;
6574   }
6575 
6576   if (SrcTy != LLT::scalar(64))
6577     return UnableToLegalize;
6578 
6579   if (DstTy == LLT::scalar(32)) {
6580     // TODO: SelectionDAG has several alternative expansions to port which may
6581     // be more reasonble depending on the available instructions. If a target
6582     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6583     // intermediate type, this is probably worse.
6584     return lowerU64ToF32BitOps(MI);
6585   }
6586 
6587   return UnableToLegalize;
6588 }
6589 
6590 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6591   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6592 
6593   const LLT S64 = LLT::scalar(64);
6594   const LLT S32 = LLT::scalar(32);
6595   const LLT S1 = LLT::scalar(1);
6596 
6597   if (SrcTy == S1) {
6598     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6599     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6600     MIRBuilder.buildSelect(Dst, Src, True, False);
6601     MI.eraseFromParent();
6602     return Legalized;
6603   }
6604 
6605   if (SrcTy != S64)
6606     return UnableToLegalize;
6607 
6608   if (DstTy == S32) {
6609     // signed cl2f(long l) {
6610     //   long s = l >> 63;
6611     //   float r = cul2f((l + s) ^ s);
6612     //   return s ? -r : r;
6613     // }
6614     Register L = Src;
6615     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6616     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6617 
6618     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6619     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6620     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6621 
6622     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6623     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6624                                             MIRBuilder.buildConstant(S64, 0));
6625     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6626     MI.eraseFromParent();
6627     return Legalized;
6628   }
6629 
6630   return UnableToLegalize;
6631 }
6632 
// Lower G_FPTOUI in terms of G_FPTOSI for s32/s64 scalar source and
// destination types.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  // 2^Exp as an integer (the destination sign bit) and as a float constant of
  // the source type.
  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  // Result for Src < 2^Exp: a plain signed conversion is correct.
  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  // ULT (not OLT) so that NaN takes the FPTOSI path rather than the adjusted
  // one.
  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}
6672 
// Lower G_FPTOSI (currently only f32 -> i64) by decoding the IEEE fields
// manually and shifting the mantissa into place.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent from bits [30:23].
  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  // Sign: arithmetic-shift the sign bit down to produce 0 or -1, widened to
  // the destination type.
  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  // Mantissa with the implicit leading 1 (bit 23) restored.
  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  // Unbias the exponent (f32 bias is 127), then shift the mantissa left or
  // right depending on whether the exponent exceeds the mantissa width.
  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is all-ones.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  // A negative (unbiased) exponent means |Src| < 1, so the result is 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
6736 
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  // With unsafe FP math, double-rounding through f32 is acceptable.
  if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
    MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  // Split the f64 bits into low (U) and high (UH) 32-bit halves.
  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  // E: the 11-bit f64 exponent, from bits [62:52] (i.e. [30:20] of UH).
  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
    S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  // M: the top mantissa bits, positioned for the f16 mantissa plus round bit.
  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  // Sticky bit: OR of all mantissa bits dropped below the round position.
  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // I: result pattern for Inf/NaN inputs.
  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N: normal-case packed value.
  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // Denormal handling: shift the significand right by B, keeping a sticky bit
  // for anything shifted out.
  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  // If shifting back doesn't reproduce the value, bits were lost: set sticky.
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                             D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  // Choose denormal (E < 1) or normal encoding.
  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  // Round to nearest, ties to even, using the low 3 bits (round + sticky).
  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  // Overflow to infinity: biased exponent beyond the f16 range.
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  // E == 1039 corresponds to an f64 exponent of all-ones (Inf/NaN input).
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
6852 
6853 LegalizerHelper::LegalizeResult
6854 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6855   auto [DstTy, SrcTy] = MI.getFirst2LLTs();
6856   const LLT S64 = LLT::scalar(64);
6857   const LLT S16 = LLT::scalar(16);
6858 
6859   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6860     return lowerFPTRUNC_F64_TO_F16(MI);
6861 
6862   return UnableToLegalize;
6863 }
6864 
6865 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6866 // multiplication tree.
6867 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6868   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6869   LLT Ty = MRI.getType(Dst);
6870 
6871   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6872   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6873   MI.eraseFromParent();
6874   return Legalized;
6875 }
6876 
6877 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6878   switch (Opc) {
6879   case TargetOpcode::G_SMIN:
6880     return CmpInst::ICMP_SLT;
6881   case TargetOpcode::G_SMAX:
6882     return CmpInst::ICMP_SGT;
6883   case TargetOpcode::G_UMIN:
6884     return CmpInst::ICMP_ULT;
6885   case TargetOpcode::G_UMAX:
6886     return CmpInst::ICMP_UGT;
6887   default:
6888     llvm_unreachable("not in integer min/max");
6889   }
6890 }
6891 
6892 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6893   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6894 
6895   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6896   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6897 
6898   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6899   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6900 
6901   MI.eraseFromParent();
6902   return Legalized;
6903 }
6904 
// Lower G_FCOPYSIGN with integer bit operations: clear Src0's sign bit, then
// OR in Src1's sign bit, aligning bit positions first when the types differ
// in width.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  // Mask selecting only the sign bit of Src0's type.
  auto SignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getSignMask(Src0Size));

  // Mask selecting everything except the sign bit.
  auto NotSignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    // Widen Src1 and shift its sign bit up into Src0's sign position.
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    // Shift Src1's sign bit down into Src0's sign position, then narrow.
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();
  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
6942 
// Lower G_FMINNUM/G_FMAXNUM to the IEEE variants, inserting canonicalizations
// when needed to get correct signaling-NaN quieting behavior.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
    TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;

  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicate quiet-snan instruction as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}
6971 
6972 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6973   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6974   Register DstReg = MI.getOperand(0).getReg();
6975   LLT Ty = MRI.getType(DstReg);
6976   unsigned Flags = MI.getFlags();
6977 
6978   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6979                                   Flags);
6980   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6981   MI.eraseFromParent();
6982   return Legalized;
6983 }
6984 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  // Lower round-half-away-from-zero using trunc/fabs/fcmp/select/copysign.
  auto [DstReg, X] = MI.getFirst2Regs();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  // Condition type: same shape as Ty but 1-bit elements.
  const LLT CondTy = Ty.changeElementSize(1);

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
  //  return t + o;

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  // Magnitude of the fractional part: |x - trunc(x)|.
  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);

  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  auto Cmp =
      MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);

  // Could emit G_UITOFP instead
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
  // Give the 1.0/0.0 adjustment the sign of x so rounding is away from zero.
  auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);

  MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);

  MI.eraseFromParent();
  return Legalized;
}
7018 
7019 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
7020   auto [DstReg, SrcReg] = MI.getFirst2Regs();
7021   unsigned Flags = MI.getFlags();
7022   LLT Ty = MRI.getType(DstReg);
7023   const LLT CondTy = Ty.changeElementSize(1);
7024 
7025   // result = trunc(src);
7026   // if (src < 0.0 && src != result)
7027   //   result += -1.0.
7028 
7029   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
7030   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
7031 
7032   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
7033                                   SrcReg, Zero, Flags);
7034   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
7035                                       SrcReg, Trunc, Flags);
7036   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
7037   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
7038 
7039   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
7040   MI.eraseFromParent();
7041   return Legalized;
7042 }
7043 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  // Lower G_MERGE_VALUES by zero-extending each part into a wide scalar,
  // shifting it to its bit position, and OR-ing the parts together.
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  // Part I (operand I+1) occupies bits [I * PartSize, (I + 1) * PartSize).
  unsigned PartSize = Src0Ty.getSizeInBits();

  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  // Part 0 needs no shift; seed the accumulator with its zero-extension.
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    // Let the final OR define DstReg directly when the types match;
    // otherwise accumulate into a fresh wide vreg.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // The result was accumulated as an integer; cast back to the pointer,
    // unless the address space forbids integer <-> pointer casts.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
7081 
7082 LegalizerHelper::LegalizeResult
7083 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
7084   const unsigned NumDst = MI.getNumOperands() - 1;
7085   Register SrcReg = MI.getOperand(NumDst).getReg();
7086   Register Dst0Reg = MI.getOperand(0).getReg();
7087   LLT DstTy = MRI.getType(Dst0Reg);
7088   if (DstTy.isPointer())
7089     return UnableToLegalize; // TODO
7090 
7091   SrcReg = coerceToScalar(SrcReg);
7092   if (!SrcReg)
7093     return UnableToLegalize;
7094 
7095   // Expand scalarizing unmerge as bitcast to integer and shift.
7096   LLT IntTy = MRI.getType(SrcReg);
7097 
7098   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
7099 
7100   const unsigned DstSize = DstTy.getSizeInBits();
7101   unsigned Offset = DstSize;
7102   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
7103     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
7104     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
7105     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
7106   }
7107 
7108   MI.eraseFromParent();
7109   return Legalized;
7110 }
7111 
7112 /// Lower a vector extract or insert by writing the vector to a stack temporary
7113 /// and reloading the element or vector.
7114 ///
7115 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
7116 ///  =>
7117 ///  %stack_temp = G_FRAME_INDEX
7118 ///  G_STORE %vec, %stack_temp
7119 ///  %idx = clamp(%idx, %vec.getNumElements())
7120 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
7121 ///  %dst = G_LOAD %element_ptr
7122 LegalizerHelper::LegalizeResult
7123 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
7124   Register DstReg = MI.getOperand(0).getReg();
7125   Register SrcVec = MI.getOperand(1).getReg();
7126   Register InsertVal;
7127   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
7128     InsertVal = MI.getOperand(2).getReg();
7129 
7130   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
7131 
7132   LLT VecTy = MRI.getType(SrcVec);
7133   LLT EltTy = VecTy.getElementType();
7134   unsigned NumElts = VecTy.getNumElements();
7135 
7136   int64_t IdxVal;
7137   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
7138     SmallVector<Register, 8> SrcRegs;
7139     extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);
7140 
7141     if (InsertVal) {
7142       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
7143       MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
7144     } else {
7145       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
7146     }
7147 
7148     MI.eraseFromParent();
7149     return Legalized;
7150   }
7151 
7152   if (!EltTy.isByteSized()) { // Not implemented.
7153     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
7154     return UnableToLegalize;
7155   }
7156 
7157   unsigned EltBytes = EltTy.getSizeInBytes();
7158   Align VecAlign = getStackTemporaryAlignment(VecTy);
7159   Align EltAlign;
7160 
7161   MachinePointerInfo PtrInfo;
7162   auto StackTemp = createStackTemporary(
7163       TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
7164   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
7165 
7166   // Get the pointer to the element, and be sure not to hit undefined behavior
7167   // if the index is out of bounds.
7168   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
7169 
7170   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
7171     int64_t Offset = IdxVal * EltBytes;
7172     PtrInfo = PtrInfo.getWithOffset(Offset);
7173     EltAlign = commonAlignment(VecAlign, Offset);
7174   } else {
7175     // We lose information with a variable offset.
7176     EltAlign = getStackTemporaryAlignment(EltTy);
7177     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
7178   }
7179 
7180   if (InsertVal) {
7181     // Write the inserted element
7182     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
7183 
7184     // Reload the whole vector.
7185     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
7186   } else {
7187     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
7188   }
7189 
7190   MI.eraseFromParent();
7191   return Legalized;
7192 }
7193 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  // Lower G_SHUFFLE_VECTOR by extracting each selected element individually
  // and reassembling the result with G_BUILD_VECTOR.
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  // Lazily-created undef element, shared by every undef mask slot.
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      // Negative mask entries mean "don't care" — emit an undef element.
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      // Scalar sources: mask index 0 selects the first source, any other
      // value selects the second.
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      // Mask indices >= NumElts address the second source vector.
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  if (DstTy.isScalar())
    MIRBuilder.buildCopy(DstReg, BuildVec[0]);
  else
    MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
7232 
/// Compute the new stack pointer value after allocating AllocSize bytes with
/// the given Alignment for a downward-growing stack. Returns the aligned
/// result as a PtrTy value; the caller writes it back to the SP register.
Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
                                                    Register AllocSize,
                                                    Align Alignment,
                                                    LLT PtrTy) {
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    // Round down to the alignment by masking off the low bits:
    // Alloc &= -(intptr_t)Alignment.
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
}
7255 
7256 LegalizerHelper::LegalizeResult
7257 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
7258   const auto &MF = *MI.getMF();
7259   const auto &TFI = *MF.getSubtarget().getFrameLowering();
7260   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
7261     return UnableToLegalize;
7262 
7263   Register Dst = MI.getOperand(0).getReg();
7264   Register AllocSize = MI.getOperand(1).getReg();
7265   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
7266 
7267   LLT PtrTy = MRI.getType(Dst);
7268   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
7269   Register SPTmp =
7270       getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
7271 
7272   MIRBuilder.buildCopy(SPReg, SPTmp);
7273   MIRBuilder.buildCopy(Dst, SPTmp);
7274 
7275   MI.eraseFromParent();
7276   return Legalized;
7277 }
7278 
7279 LegalizerHelper::LegalizeResult
7280 LegalizerHelper::lowerStackSave(MachineInstr &MI) {
7281   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7282   if (!StackPtr)
7283     return UnableToLegalize;
7284 
7285   MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
7286   MI.eraseFromParent();
7287   return Legalized;
7288 }
7289 
7290 LegalizerHelper::LegalizeResult
7291 LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
7292   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7293   if (!StackPtr)
7294     return UnableToLegalize;
7295 
7296   MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
7297   MI.eraseFromParent();
7298   return Legalized;
7299 }
7300 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  // Lower G_EXTRACT either by unmerging whole vector elements (when the
  // extract is element-aligned) or by shifting/truncating an integer view.
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    // Only when the extract covers whole elements on an element boundary
    // and stays within the source vector.
    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  // Scalar destination from a scalar source (or from a vector whose element
  // type equals the destination): treat the source as one wide integer.
  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
    }

    // Shift the wanted bits down to bit 0, then truncate to the result type.
    if (Offset == 0)
      MIRBuilder.buildTrunc(DstReg, SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(DstReg, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Unsupported shape (e.g. vector destination from a misaligned offset).
  return UnableToLegalize;
}
7355 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  // Lower G_INSERT either by rebuilding the vector from unmerged elements
  // (element-aligned case) or with integer mask-and-or arithmetic.
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Insert sub-vector or one element
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    // Only when the insert covers whole elements on an element boundary and
    // stays inside the destination vector.
    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        // Multi-element insert source: unmerge it and splice its elements in.
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(UnmergeInsertSrc.getReg(i));
        }
      } else {
        // Single-element insert: use the register directly.
        DstElts.push_back(InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  // The integer path below only supports scalar-like inserts.
  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Pointers in non-integral address spaces cannot round-trip through
  // integers, so the mask-and-or approach is unavailable.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  // Work in an integer of the destination's full width.
  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  // Position the inserted bits: zero-extend, then shift up to Offset.
  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  // Mask that KEEPS the bits of Src outside the inserted range (set bits run
  // from the end of the insert, wrapping around to its start).
  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
7444 
7445 LegalizerHelper::LegalizeResult
7446 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7447   auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
7448       MI.getFirst4RegLLTs();
7449   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7450 
7451   LLT Ty = Dst0Ty;
7452   LLT BoolTy = Dst1Ty;
7453 
7454   if (IsAdd)
7455     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7456   else
7457     MIRBuilder.buildSub(Dst0, LHS, RHS);
7458 
7459   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7460 
7461   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7462 
7463   // For an addition, the result should be less than one of the operands (LHS)
7464   // if and only if the other operand (RHS) is negative, otherwise there will
7465   // be overflow.
7466   // For a subtraction, the result should be less than one of the operands
7467   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7468   // otherwise there will be overflow.
7469   auto ResultLowerThanLHS =
7470       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7471   auto ConditionRHS = MIRBuilder.buildICmp(
7472       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7473 
7474   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7475   MI.eraseFromParent();
7476   return Legalized;
7477 }
7478 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  // Lower [US]{ADD,SUB}SAT by clamping an operand with min/max so the
  // subsequent plain add/sub can never wrap.
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp; // the wrapping G_ADD/G_SUB emitted after clamping
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    // Clamp RHS into [Lo, Hi] so the base op stays within the signed range.
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
7553 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  // Lower [US]{ADD,SUB}SAT via the matching overflow opcode: compute the
  // wrapping result plus an overflow bit, then select the saturation value
  // when overflow occurred.
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  // Overflow flag type: same shape as Ty with 1-bit elements.
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // I.e. the sign of the wrapped result selects INT_MIN or INT_MAX.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
7619 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  // Lower G_SSHLSAT/G_USHLSAT: perform the shift, then detect overflow by
  // shifting back and comparing against the original LHS.
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  // Undo the shift; if significant bits were lost, this differs from LHS.
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    // Signed saturation value: INT_MIN for negative LHS, INT_MAX otherwise.
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    // Unsigned saturation value: UINT_MAX.
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  // Overflow occurred iff round-tripping the shift changed the value.
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}
7651 
7652 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
7653   auto [Dst, Src] = MI.getFirst2Regs();
7654   const LLT Ty = MRI.getType(Src);
7655   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7656   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7657 
7658   // Swap most and least significant byte, set remaining bytes in Res to zero.
7659   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7660   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7661   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7662   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7663 
7664   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7665   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7666     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7667     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7668     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7669     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7670     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7671     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7672     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7673     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7674     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7675     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7676     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7677     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7678   }
7679   Res.getInstr()->getOperand(0).setReg(Dst);
7680 
7681   MI.eraseFromParent();
7682   return Legalized;
7683 }
7684 
7685 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7686 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7687                                  MachineInstrBuilder Src, APInt Mask) {
7688   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7689   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7690   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7691   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7692   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7693   return B.buildOr(Dst, LHS, RHS);
7694 }
7695 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  // Lower G_BITREVERSE as a byte swap followed by successive in-byte swaps:
  // nibbles, then bit pairs, then single bits.
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getSizeInBits();

  MachineInstrBuilder BSWAP =
      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
  MachineInstrBuilder Swap4 =
      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
  MachineInstrBuilder Swap2 =
      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
  // The final swap writes straight into Dst.
  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));

  MI.eraseFromParent();
  return Legalized;
}
7725 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  // Lower G_READ_REGISTER/G_WRITE_REGISTER to a COPY from/to the named
  // physical register, resolved through the target's hook.
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  // Operand layout differs: read is (value-def, name); write is (name, value).
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  // The register name travels as metadata: first operand of the MDNode.
  const MDString *RegStr = cast<MDString>(
    cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  // The target may not recognize the name for this type; bail out if so.
  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg.isValid())
    return UnableToLegalize;

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}
7751 
7752 LegalizerHelper::LegalizeResult
7753 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7754   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7755   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7756   Register Result = MI.getOperand(0).getReg();
7757   LLT OrigTy = MRI.getType(Result);
7758   auto SizeInBits = OrigTy.getScalarSizeInBits();
7759   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7760 
7761   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7762   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7763   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7764   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7765 
7766   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7767   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7768   MIRBuilder.buildTrunc(Result, Shifted);
7769 
7770   MI.eraseFromParent();
7771   return Legalized;
7772 }
7773 
// Lower G_IS_FPCLASS: reinterpret (via COPY) the FP value as an integer and
// test the requested class mask with integer compares against the sign,
// exponent and mantissa bit patterns of the source format. The individual
// class tests are OR'd together into the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());

  // Trivial masks: no class requested is constant false, all classes is
  // constant true.
  if (Mask == fcNone) {
    MIRBuilder.buildConstant(DstReg, 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(DstReg, 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());

  // Integer type with the same layout as the source, so the FP bit pattern
  // can be examined directly.
  LLT IntTy = LLT::scalar(BitSize);
  if (SrcTy.isVector())
    IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
  auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
  auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);

  auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
  // Sign is true iff clearing the sign bit changed the value, i.e. iff the
  // sign bit was set.
  auto Sign =
      MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);

  // Accumulator for the OR of all partial class tests; starts at false.
  auto Res = MIRBuilder.buildConstant(DstTy, 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                     ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
                                     ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                    ExpMaskC);
    auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       ExpBits, ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(IntTy, 1);
    auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
                             MIRBuilder.buildConstant(IntTy, AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
                                       InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
                                            Abs, InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                             MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
    else if (PartialCheck == fcPosNormal) {
      // Positive case: require the sign test to be false (XOR with all-ones
      // inverts the boolean).
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(DstReg, Res);
  MI.eraseFromParent();
  return Legalized;
}
7947 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement vector G_SELECT in terms of XOR, AND, OR:
  //   result = (mask & op1) | (~mask & op2)
  // which requires the mask to be an all-ones/all-zeros per-element vector.
  auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
      MI.getFirst4RegLLTs();
  if (!DstTy.isVector())
    return UnableToLegalize;

  // Bitwise ops aren't available on pointer elements; operate on the
  // equivalent integer vectors and convert back at the end.
  bool IsEltPtr = DstTy.getElementType().isPointer();
  if (IsEltPtr) {
    LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
    LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
    Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
    Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
    DstTy = NewTy;
  }

  if (MaskTy.isScalar()) {
    // Turn the scalar condition into a vector condition mask.

    Register MaskElt = MaskReg;

    // The condition was potentially zero extended before, but we want a sign
    // extended boolean.
    if (MaskTy != LLT::scalar(1))
      MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);

    // Continue the sign extension (or truncate) to match the data type.
    MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
                                          MaskElt).getReg(0);

    // Generate a vector splat idiom.
    auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
    MaskReg = ShufSplat.getReg(0);
    MaskTy = DstTy;
  }

  // The bitwise expansion needs the mask and data to be the same width.
  if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  if (IsEltPtr) {
    auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
    MIRBuilder.buildIntToPtr(DstReg, Or);
  } else {
    MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  }
  MI.eraseFromParent();
  return Legalized;
}
8000 
8001 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
8002   // Split DIVREM into individual instructions.
8003   unsigned Opcode = MI.getOpcode();
8004 
8005   MIRBuilder.buildInstr(
8006       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
8007                                         : TargetOpcode::G_UDIV,
8008       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8009   MIRBuilder.buildInstr(
8010       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
8011                                         : TargetOpcode::G_UREM,
8012       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8013   MI.eraseFromParent();
8014   return Legalized;
8015 }
8016 
8017 LegalizerHelper::LegalizeResult
8018 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
8019   // Expand %res = G_ABS %a into:
8020   // %v1 = G_ASHR %a, scalar_size-1
8021   // %v2 = G_ADD %a, %v1
8022   // %res = G_XOR %v2, %v1
8023   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
8024   Register OpReg = MI.getOperand(1).getReg();
8025   auto ShiftAmt =
8026       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
8027   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
8028   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
8029   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
8030   MI.eraseFromParent();
8031   return Legalized;
8032 }
8033 
8034 LegalizerHelper::LegalizeResult
8035 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
8036   // Expand %res = G_ABS %a into:
8037   // %v1 = G_CONSTANT 0
8038   // %v2 = G_SUB %v1, %a
8039   // %res = G_SMAX %a, %v2
8040   Register SrcReg = MI.getOperand(1).getReg();
8041   LLT Ty = MRI.getType(SrcReg);
8042   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
8043   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
8044   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
8045   MI.eraseFromParent();
8046   return Legalized;
8047 }
8048 
8049 LegalizerHelper::LegalizeResult
8050 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
8051   Register SrcReg = MI.getOperand(1).getReg();
8052   LLT SrcTy = MRI.getType(SrcReg);
8053   LLT DstTy = MRI.getType(SrcReg);
8054 
8055   // The source could be a scalar if the IR type was <1 x sN>.
8056   if (SrcTy.isScalar()) {
8057     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
8058       return UnableToLegalize; // FIXME: handle extension.
8059     // This can be just a plain copy.
8060     Observer.changingInstr(MI);
8061     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
8062     Observer.changedInstr(MI);
8063     return Legalized;
8064   }
8065   return UnableToLegalize;
8066 }
8067 
8068 static Type *getTypeForLLT(LLT Ty, LLVMContext &C);
8069 
// Lower G_VAARG: load the current pointer from the va_list, align it up if
// the requested alignment exceeds the minimum stack argument alignment,
// store the bumped pointer back, then load the argument value itself.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  Register ListPtr = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(ListPtr);

  // LstPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);

  // Operand 2 carries the required alignment as an immediate. If it exceeds
  // the minimum stack argument alignment, round VAList up: add (align - 1)
  // and mask off the low bits.
  const Align A(MI.getOperand(2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
    auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
    VAList = AndDst.getReg(0);
  }

  // Increment the pointer, VAList, to the next vaarg
  // The list should be bumped by the size of element in the current head of
  // list.
  Register Dst = MI.getOperand(0).getReg();
  LLT LLTTy = MRI.getType(Dst);
  Type *Ty = getTypeForLLT(LLTTy, Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);

  // Store the increment VAList to the legalized pointer
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
  MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
  // Load the actual argument out of the pointer VAList
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
  MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}
8117 
8118 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
8119   // On Darwin, -Os means optimize for size without hurting performance, so
8120   // only really optimize for size when -Oz (MinSize) is used.
8121   if (MF.getTarget().getTargetTriple().isOSDarwin())
8122     return MF.getFunction().hasMinSize();
8123   return MF.getFunction().hasOptSize();
8124 }
8125 
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
//
// Returns false if no sequence of at most `Limit` ops can cover the request
// (or the alignment combination is unsupported); on success MemOps holds one
// LLT per load/store to emit, in order.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  // A fixed destination alignment smaller than the source alignment is not
  // handled here.
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  // Let the target pick a preferred type first.
  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  // Greedily cover the remaining size, shrinking the op type whenever it no
  // longer fits in what's left.
  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      // Shrink to the largest power-of-two size strictly below the current
      // width.
      NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}
8192 
8193 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
8194   if (Ty.isVector())
8195     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
8196                                 Ty.getNumElements());
8197   return IntegerType::get(C, Ty.getSizeInBits());
8198 }
8199 
// Get a vectorized representation of the memset value operand, GISel edition.
// The memset value is an s8; this replicates its bit pattern across every
// byte of Ty (and splats across vector elements). Returns an invalid
// Register only if no value could be produced.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  // Constant scalar case: compute the splatted constant directly.
  if (!Ty.isVector() && ValVRegAndVal) {
    APInt Scalar = ValVRegAndVal->Value.trunc(8);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Ty, 0).getReg(0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  // For vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    Val = MIB.buildSplatVector(Ty, Val).getReg(0);

  return Val;
}
8232 
// Lower a memset with constant length KnownLen into a sequence of stores.
// Dst is the destination pointer and Val the s8 value to store. Returns
// UnableToLegalize if no acceptable sequence of store types can be found
// within the target's store-count limit.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  // If the destination is a non-fixed stack object we may raise its
  // alignment below to allow wider stores.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    // Narrow the MMO to this store's slice of the destination.
    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
8345 
8346 LegalizerHelper::LegalizeResult
8347 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
8348   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8349 
8350   auto [Dst, Src, Len] = MI.getFirst3Regs();
8351 
8352   const auto *MMOIt = MI.memoperands_begin();
8353   const MachineMemOperand *MemOp = *MMOIt;
8354   bool IsVolatile = MemOp->isVolatile();
8355 
8356   // See if this is a constant length copy
8357   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8358   // FIXME: support dynamically sized G_MEMCPY_INLINE
8359   assert(LenVRegAndVal &&
8360          "inline memcpy with dynamic size is not yet supported");
8361   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8362   if (KnownLen == 0) {
8363     MI.eraseFromParent();
8364     return Legalized;
8365   }
8366 
8367   const auto &DstMMO = **MI.memoperands_begin();
8368   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8369   Align DstAlign = DstMMO.getBaseAlign();
8370   Align SrcAlign = SrcMMO.getBaseAlign();
8371 
8372   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8373                            IsVolatile);
8374 }
8375 
8376 LegalizerHelper::LegalizeResult
8377 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
8378                                    uint64_t KnownLen, Align DstAlign,
8379                                    Align SrcAlign, bool IsVolatile) {
8380   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8381   return lowerMemcpy(MI, Dst, Src, KnownLen,
8382                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
8383                      IsVolatile);
8384 }
8385 
// Lower a memcpy with constant length KnownLen into load/store pairs, using
// at most Limit memory ops (pass UINT64_MAX for inline expansion). Returns
// UnableToLegalize if no acceptable sequence of types can be found.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  // If the destination is a non-fixed stack object we may raise its
  // alignment below to allow wider accesses.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign.previous();

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store. The Offset constant is reused for both pointers.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
8492 
8493 LegalizerHelper::LegalizeResult
8494 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
8495                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
8496                               bool IsVolatile) {
8497   auto &MF = *MI.getParent()->getParent();
8498   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8499   auto &DL = MF.getDataLayout();
8500   LLVMContext &C = MF.getFunction().getContext();
8501 
8502   assert(KnownLen != 0 && "Have a zero length memmove length!");
8503 
8504   bool DstAlignCanChange = false;
8505   MachineFrameInfo &MFI = MF.getFrameInfo();
8506   bool OptSize = shouldLowerMemFuncForSize(MF);
8507   Align Alignment = std::min(DstAlign, SrcAlign);
8508 
8509   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8510   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8511     DstAlignCanChange = true;
8512 
8513   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
8514   std::vector<LLT> MemOps;
8515 
8516   const auto &DstMMO = **MI.memoperands_begin();
8517   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8518   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8519   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8520 
8521   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
8522   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
8523   // same thing here.
8524   if (!findGISelOptimalMemOpLowering(
8525           MemOps, Limit,
8526           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
8527                       /*IsVolatile*/ true),
8528           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
8529           MF.getFunction().getAttributes(), TLI))
8530     return UnableToLegalize;
8531 
8532   if (DstAlignCanChange) {
8533     // Get an estimate of the type from the LLT.
8534     Type *IRTy = getTypeForLLT(MemOps[0], C);
8535     Align NewAlign = DL.getABITypeAlign(IRTy);
8536 
8537     // Don't promote to an alignment that would require dynamic stack
8538     // realignment.
8539     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8540     if (!TRI->hasStackRealignment(MF))
8541       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8542         NewAlign = NewAlign.previous();
8543 
8544     if (NewAlign > Alignment) {
8545       Alignment = NewAlign;
8546       unsigned FI = FIDef->getOperand(1).getIndex();
8547       // Give the stack frame object a larger alignment if needed.
8548       if (MFI.getObjectAlign(FI) < Alignment)
8549         MFI.setObjectAlignment(FI, Alignment);
8550     }
8551   }
8552 
8553   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
8554 
8555   MachineIRBuilder MIB(MI);
8556   // Memmove requires that we perform the loads first before issuing the stores.
8557   // Apart from that, this loop is pretty much doing the same thing as the
8558   // memcpy codegen function.
8559   unsigned CurrOffset = 0;
8560   SmallVector<Register, 16> LoadVals;
8561   for (auto CopyTy : MemOps) {
8562     // Construct MMO for the load.
8563     auto *LoadMMO =
8564         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8565 
8566     // Create the load.
8567     Register LoadPtr = Src;
8568     if (CurrOffset != 0) {
8569       LLT SrcTy = MRI.getType(Src);
8570       auto Offset =
8571           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
8572       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8573     }
8574     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
8575     CurrOffset += CopyTy.getSizeInBytes();
8576   }
8577 
8578   CurrOffset = 0;
8579   for (unsigned I = 0; I < MemOps.size(); ++I) {
8580     LLT CopyTy = MemOps[I];
8581     // Now store the values loaded.
8582     auto *StoreMMO =
8583         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8584 
8585     Register StorePtr = Dst;
8586     if (CurrOffset != 0) {
8587       LLT DstTy = MRI.getType(Dst);
8588       auto Offset =
8589           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
8590       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8591     }
8592     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
8593     CurrOffset += CopyTy.getSizeInBytes();
8594   }
8595   MI.eraseFromParent();
8596   return Legalized;
8597 }
8598 
8599 LegalizerHelper::LegalizeResult
8600 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
8601   const unsigned Opc = MI.getOpcode();
8602   // This combine is fairly complex so it's not written with a separate
8603   // matcher function.
8604   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
8605           Opc == TargetOpcode::G_MEMSET) &&
8606          "Expected memcpy like instruction");
8607 
8608   auto MMOIt = MI.memoperands_begin();
8609   const MachineMemOperand *MemOp = *MMOIt;
8610 
8611   Align DstAlign = MemOp->getBaseAlign();
8612   Align SrcAlign;
8613   auto [Dst, Src, Len] = MI.getFirst3Regs();
8614 
8615   if (Opc != TargetOpcode::G_MEMSET) {
8616     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
8617     MemOp = *(++MMOIt);
8618     SrcAlign = MemOp->getBaseAlign();
8619   }
8620 
8621   // See if this is a constant length copy
8622   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8623   if (!LenVRegAndVal)
8624     return UnableToLegalize;
8625   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8626 
8627   if (KnownLen == 0) {
8628     MI.eraseFromParent();
8629     return Legalized;
8630   }
8631 
8632   bool IsVolatile = MemOp->isVolatile();
8633   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
8634     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8635                              IsVolatile);
8636 
8637   // Don't try to optimize volatile.
8638   if (IsVolatile)
8639     return UnableToLegalize;
8640 
8641   if (MaxLen && KnownLen > MaxLen)
8642     return UnableToLegalize;
8643 
8644   if (Opc == TargetOpcode::G_MEMCPY) {
8645     auto &MF = *MI.getParent()->getParent();
8646     const auto &TLI = *MF.getSubtarget().getTargetLowering();
8647     bool OptSize = shouldLowerMemFuncForSize(MF);
8648     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
8649     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
8650                        IsVolatile);
8651   }
8652   if (Opc == TargetOpcode::G_MEMMOVE)
8653     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
8654   if (Opc == TargetOpcode::G_MEMSET)
8655     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
8656   return UnableToLegalize;
8657 }
8658