//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}
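
// Example: getNarrowTypeBreakDown(s88, s64, LeftoverTy) computes
// NumParts = 88 / 64 = 1 with LeftoverSize = 24, sets LeftoverTy to s24,
// and returns {1, 1}. With a vector narrow type, e.g. v5s32 broken into
// v2s32 pieces, NumParts = 2 and the 32 leftover bits still divide evenly
// by the 32-bit element, giving an s32 leftover and a result of {2, 1}.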

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}
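
// Example: getFloatTypeForLLT(Ctx, LLT::scalar(64)) returns the IR double
// type, while vector types and oddly sized scalars (e.g. s20) return
// nullptr so callers can bail out of the libcall path.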

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
  : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
    TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  if (MainTy.isVector()) {
    unsigned EltSize = MainTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return false;
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}
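
// Example: extracting s64 parts from an s88 register emits one G_EXTRACT
// of s64 at bit offset 0 into VRegs plus an s24 G_EXTRACT at offset 64
// into LeftoverRegs. Had the size divided evenly (say s128 into s64
// halves), a single G_UNMERGE_VALUES would have been used instead.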

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
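
// Example of the leftover path above: reassembling an s88 result from one
// s64 part and one s24 leftover computes GCDTy = s8 (the GCD of 88, 24
// and 64 bits), unmerges both inputs into s8 pieces, remerges them toward
// the LCM type, and lets buildWidenedRemergeToDst truncate the widened
// value back down to the s88 destination.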

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to do
    // anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
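
// Example: with DstTy = s64, NarrowTy = GCDTy = s32 and a single s32
// source, LCMTy = s64 needs NumParts = 2 pieces of one sub-part each. The
// missing second piece is padded per PadStrategy: a zero constant for
// G_ZEXT, a G_IMPLICIT_DEF for G_ANYEXT, or an ashr of the top source
// piece by 31 for G_SEXT. VRegs then holds the two s32 registers to merge.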

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}
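
// Continuing the s88 example from insertParts: the s24 remerge pieces are
// merged into an s264 value and a G_TRUNC produces the s88 destination.
// When DstTy already equals LCMTy the merge alone suffices, and a vector
// LCM type is instead split back out with G_UNMERGE_VALUES, with the first
// result feeding the destination.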

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
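
// Example: getRTLibDesc(TargetOpcode::G_FSIN, 32) resolves to
// RTLIB::SIN_F32 and getRTLibDesc(TargetOpcode::G_SDIV, 128) to
// RTLIB::SDIV_I128; any size not listed in the macros hits the
// "unexpected size" unreachable.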

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (unsigned i = 1; i < MI.getNumOperands(); i++)
    Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}
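
// Example: a 64-bit G_FREM reaches here with an IR double type, maps to
// RTLIB::REM_F64 (typically the fmod() C library function), and produces
// a call whose two inputs and result are all described as double.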

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last, which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // We need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
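
// Example: a G_MEMCPY whose trailing 'tail' immediate is set becomes a
// call to the target's memcpy symbol with (dest, src, size) arguments and
// the dest marked as returned; if the call was actually lowered as a tail
// call, the now-redundant COPY/return instructions after it are deleted.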

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}
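
// Example: a G_FPEXT from s32 to s64 maps to RTLIB::getFPEXT(f32, f64),
// which typically names the compiler-rt/libgcc routine __extendsfdf2.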

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending, we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}

Register LegalizerHelper::coerceToScalar(Register Val) {
  LLT Ty = MRI.getType(Val);
  if (Ty.isScalar())
    return Val;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLT NewTy = LLT::scalar(Ty.getSizeInBits());
  if (Ty.isPointer()) {
    if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
      return Register();
    return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
  }

  Register NewVal = Val;

  assert(Ty.isVector());
  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
  return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}
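
// Example: coerceToScalar turns a p0 value in an integral 64-bit address
// space into a G_PTRTOINT to s64 and a v2s32 value into a G_BITCAST to
// s64; a pointer in a non-integral address space yields an invalid
// Register to signal failure.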

void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
  MO.setReg(ExtB.getReg(0));
}
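
// Example: widenScalarSrc(MI, s32, 1, TargetOpcode::G_ANYEXT) on
//   %2:_(s1) = G_TRUNC %1:_(s16)
// rewrites it to
//   %3:_(s32) = G_ANYEXT %1:_(s16)
//   %2:_(s1) = G_TRUNC %3:_(s32)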

void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned TruncOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
  MO.setReg(DstExt);
}
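
// Example: widenScalarDst(MI, s32, 0, TargetOpcode::G_TRUNC) on an s16
// G_ADD redefines the instruction to produce a fresh s32 register and
// inserts a G_TRUNC from it back to the original s16 destination; the
// insert point is advanced past MI so the truncate reads the widened
// result.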

void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
  MO.setReg(DstTrunc);
}

void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
}

void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);

  LLT OldTy = MRI.getType(MO.getReg());
  unsigned OldElts = OldTy.getNumElements();
  unsigned NewElts = MoreTy.getNumElements();

  unsigned NumParts = NewElts / OldElts;

  // Use concat_vectors if the result is a multiple of the number of elements.
  if (NumParts * OldElts == NewElts) {
    SmallVector<Register, 8> Parts;
    Parts.push_back(MO.getReg());

    Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
    for (unsigned I = 1; I != NumParts; ++I)
      Parts.push_back(ImpDef);

    auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
    MO.setReg(Concat.getReg(0));
    return;
  }

  Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
  Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
  MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
  MO.setReg(MoreReg);
}
1407 
1408 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1409   MachineOperand &Op = MI.getOperand(OpIdx);
1410   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1411 }
1412 
1413 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1414   MachineOperand &MO = MI.getOperand(OpIdx);
1415   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1416   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1417   MIRBuilder.buildBitcast(MO, CastDst);
1418   MO.setReg(CastDst);
1419 }
1420 
1421 LegalizerHelper::LegalizeResult
1422 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1423                                         LLT WideTy) {
1424   if (TypeIdx != 1)
1425     return UnableToLegalize;
1426 
1427   Register DstReg = MI.getOperand(0).getReg();
1428   LLT DstTy = MRI.getType(DstReg);
1429   if (DstTy.isVector())
1430     return UnableToLegalize;
1431 
1432   Register Src1 = MI.getOperand(1).getReg();
1433   LLT SrcTy = MRI.getType(Src1);
1434   const int DstSize = DstTy.getSizeInBits();
1435   const int SrcSize = SrcTy.getSizeInBits();
1436   const int WideSize = WideTy.getSizeInBits();
1437   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1438 
1439   unsigned NumOps = MI.getNumOperands();
1440   unsigned NumSrc = MI.getNumOperands() - 1;
1441   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1442 
1443   if (WideSize >= DstSize) {
1444     // Directly pack the bits in the target type.
1445     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1446 
1447     for (unsigned I = 2; I != NumOps; ++I) {
1448       const unsigned Offset = (I - 1) * PartSize;
1449 
1450       Register SrcReg = MI.getOperand(I).getReg();
1451       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1452 
1453       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1454 
1455       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1456         MRI.createGenericVirtualRegister(WideTy);
1457 
1458       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1459       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1460       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1461       ResultReg = NextResult;
1462     }
1463 
1464     if (WideSize > DstSize)
1465       MIRBuilder.buildTrunc(DstReg, ResultReg);
1466     else if (DstTy.isPointer())
1467       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1468 
1469     MI.eraseFromParent();
1470     return Legalized;
1471   }
1472 
1473   // Unmerge the original values to the GCD type, and recombine to the next
1474   // multiple greater than the original type.
1475   //
1476   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1477   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1478   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1479   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1480   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1481   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1482   // %12:_(s12) = G_MERGE_VALUES %10, %11
1483   //
1484   // Padding with undef if necessary:
1485   //
1486   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1487   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1488   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1489   // %7:_(s2) = G_IMPLICIT_DEF
1490   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1491   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1492   // %10:_(s12) = G_MERGE_VALUES %8, %9
1493 
1494   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1495   LLT GCDTy = LLT::scalar(GCD);
1496 
1497   SmallVector<Register, 8> Parts;
1498   SmallVector<Register, 8> NewMergeRegs;
1499   SmallVector<Register, 8> Unmerges;
1500   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1501 
1502   // Decompose the original operands into GCD-sized pieces where necessary.
1503   for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
1504     Register SrcReg = MI.getOperand(I).getReg();
1505     if (GCD == SrcSize) {
1506       Unmerges.push_back(SrcReg);
1507     } else {
1508       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1509       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1510         Unmerges.push_back(Unmerge.getReg(J));
1511     }
1512   }
1513 
1514   const int PartsPerGCD = WideSize / GCD;
1515 
1516   // Pad with undef to a multiple of the number of GCD-sized parts needed.
1517   if (static_cast<int>(Unmerges.size()) != NumMerge * PartsPerGCD) {
1518     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1519     for (int I = Unmerges.size(); I != NumMerge * PartsPerGCD; ++I)
1520       Unmerges.push_back(UndefReg);
1521   }
1522 
1523   // Build merges of each piece.
1524   ArrayRef<Register> Slicer(Unmerges);
1525   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1526     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1527     NewMergeRegs.push_back(Merge.getReg(0));
1528   }
1529 
1530   // A truncate may be necessary if the requested type doesn't evenly divide the
1531   // original result type.
1532   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1533     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1534   } else {
1535     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1536     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1537   }
1538 
1539   MI.eraseFromParent();
1540   return Legalized;
1541 }
1542 
1543 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1544   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1545   LLT OrigTy = MRI.getType(OrigReg);
1546   LLT LCMTy = getLCMType(WideTy, OrigTy);
1547 
1548   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1549   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1550 
1551   Register UnmergeSrc = WideReg;
1552 
1553   // Create a merge to the LCM type, padding with undef
1554   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1555   // =>
1556   // %1:_(<4 x s32>) = G_FOO
1557   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1558   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1559   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1560   if (NumMergeParts > 1) {
1561     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1562     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1563     MergeParts[0] = WideReg;
1564     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1565   }
1566 
1567   // Unmerge to the original register and pad with dead defs.
1568   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1569   UnmergeResults[0] = OrigReg;
1570   for (int I = 1; I != NumUnmergeParts; ++I)
1571     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1572 
1573   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1574   return WideReg;
1575 }
1576 
1577 LegalizerHelper::LegalizeResult
1578 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1579                                           LLT WideTy) {
1580   if (TypeIdx != 0)
1581     return UnableToLegalize;
1582 
1583   int NumDst = MI.getNumOperands() - 1;
1584   Register SrcReg = MI.getOperand(NumDst).getReg();
1585   LLT SrcTy = MRI.getType(SrcReg);
1586   if (SrcTy.isVector())
1587     return UnableToLegalize;
1588 
1589   Register Dst0Reg = MI.getOperand(0).getReg();
1590   LLT DstTy = MRI.getType(Dst0Reg);
1591   if (!DstTy.isScalar())
1592     return UnableToLegalize;
1593 
1594   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1595     if (SrcTy.isPointer()) {
1596       const DataLayout &DL = MIRBuilder.getDataLayout();
1597       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1598         LLVM_DEBUG(
1599             dbgs() << "Not casting non-integral address space integer\n");
1600         return UnableToLegalize;
1601       }
1602 
1603       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1604       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1605     }
1606 
1607     // Widen SrcTy to WideTy. This does not affect the result, but since the
1608     // target requested this size, it probably handles WideTy better than SrcTy
1609     // and this should reduce the total number of legalization artifacts.
1610     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1611       SrcTy = WideTy;
1612       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1613     }
1614 
1615     // There's no unmerge type to target. Directly extract the bits from the
1616     // source type.
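         // E.g. (illustrative, hypothetical registers) unmerging s32 into four
         // s8 values: %dstI:_(s8) = G_TRUNC (G_LSHR %src, 8 * I).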
1617     unsigned DstSize = DstTy.getSizeInBits();
1618 
1619     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1620     for (int I = 1; I != NumDst; ++I) {
1621       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1622       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1623       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1624     }
1625 
1626     MI.eraseFromParent();
1627     return Legalized;
1628   }
1629 
1630   // Extend the source to a wider type.
1631   LLT LCMTy = getLCMType(SrcTy, WideTy);
1632 
1633   Register WideSrc = SrcReg;
1634   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1635     // TODO: If this is an integral address space, cast to integer and anyext.
1636     if (SrcTy.isPointer()) {
1637       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1638       return UnableToLegalize;
1639     }
1640 
1641     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1642   }
1643 
1644   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1645 
1646   // Create a sequence of unmerges and merges to the original results. Since we
1647   // may have widened the source, we will need to pad the results with dead defs
1648   // to cover the source register.
1649   // e.g. widen s48 to s64:
1650   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1651   //
1652   // =>
1653   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1654   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1655   //  ; unpack to GCD type, with extra dead defs
1656   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1657   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1658   //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
1659   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1660   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1661   const LLT GCDTy = getGCDType(WideTy, DstTy);
1662   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1663   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1664 
1665   // Directly unmerge to the destination without going through a GCD type
1666   // if possible
1667   if (PartsPerRemerge == 1) {
1668     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1669 
1670     for (int I = 0; I != NumUnmerge; ++I) {
1671       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1672 
1673       for (int J = 0; J != PartsPerUnmerge; ++J) {
1674         int Idx = I * PartsPerUnmerge + J;
1675         if (Idx < NumDst)
1676           MIB.addDef(MI.getOperand(Idx).getReg());
1677         else {
1678           // Create dead def for excess components.
1679           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1680         }
1681       }
1682 
1683       MIB.addUse(Unmerge.getReg(I));
1684     }
1685   } else {
1686     SmallVector<Register, 16> Parts;
1687     for (int J = 0; J != NumUnmerge; ++J)
1688       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1689 
1690     SmallVector<Register, 8> RemergeParts;
1691     for (int I = 0; I != NumDst; ++I) {
1692       for (int J = 0; J < PartsPerRemerge; ++J) {
1693         const int Idx = I * PartsPerRemerge + J;
1694         RemergeParts.emplace_back(Parts[Idx]);
1695       }
1696 
1697       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1698       RemergeParts.clear();
1699     }
1700   }
1701 
1702   MI.eraseFromParent();
1703   return Legalized;
1704 }
1705 
1706 LegalizerHelper::LegalizeResult
1707 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1708                                     LLT WideTy) {
1709   Register DstReg = MI.getOperand(0).getReg();
1710   Register SrcReg = MI.getOperand(1).getReg();
1711   LLT SrcTy = MRI.getType(SrcReg);
1712 
1713   LLT DstTy = MRI.getType(DstReg);
1714   unsigned Offset = MI.getOperand(2).getImm();
1715 
1716   if (TypeIdx == 0) {
1717     if (SrcTy.isVector() || DstTy.isVector())
1718       return UnableToLegalize;
1719 
1720     SrcOp Src(SrcReg);
1721     if (SrcTy.isPointer()) {
1722       // Extracts from pointers can be handled only if they are really just
1723       // simple integers.
1724       const DataLayout &DL = MIRBuilder.getDataLayout();
1725       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1726         return UnableToLegalize;
1727 
1728       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1729       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1730       SrcTy = SrcAsIntTy;
1731     }
1732 
1733     if (DstTy.isPointer())
1734       return UnableToLegalize;
1735 
1736     if (Offset == 0) {
1737       // Avoid a shift in the degenerate case.
1738       MIRBuilder.buildTrunc(DstReg,
1739                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1740       MI.eraseFromParent();
1741       return Legalized;
1742     }
1743 
1744     // Do a shift in the source type.
1745     LLT ShiftTy = SrcTy;
1746     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1747       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1748       ShiftTy = WideTy;
1749     }
1750 
1751     auto LShr = MIRBuilder.buildLShr(
1752       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1753     MIRBuilder.buildTrunc(DstReg, LShr);
1754     MI.eraseFromParent();
1755     return Legalized;
1756   }
1757 
1758   if (SrcTy.isScalar()) {
1759     Observer.changingInstr(MI);
1760     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1761     Observer.changedInstr(MI);
1762     return Legalized;
1763   }
1764 
1765   if (!SrcTy.isVector())
1766     return UnableToLegalize;
1767 
1768   if (DstTy != SrcTy.getElementType())
1769     return UnableToLegalize;
1770 
1771   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1772     return UnableToLegalize;
1773 
1774   Observer.changingInstr(MI);
1775   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1776 
1777   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1778                           Offset);
1779   widenScalarDst(MI, WideTy.getScalarType(), 0);
1780   Observer.changedInstr(MI);
1781   return Legalized;
1782 }
1783 
1784 LegalizerHelper::LegalizeResult
1785 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1786                                    LLT WideTy) {
1787   if (TypeIdx != 0 || WideTy.isVector())
1788     return UnableToLegalize;
1789   Observer.changingInstr(MI);
1790   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1791   widenScalarDst(MI, WideTy);
1792   Observer.changedInstr(MI);
1793   return Legalized;
1794 }
1795 
1796 LegalizerHelper::LegalizeResult
1797 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1798                                            LLT WideTy) {
1799   if (TypeIdx == 1)
1800     return UnableToLegalize; // TODO
1801 
1802   unsigned Opcode;
1803   unsigned ExtOpcode;
1804   Optional<Register> CarryIn = None;
1805   switch (MI.getOpcode()) {
1806   default:
1807     llvm_unreachable("Unexpected opcode!");
1808   case TargetOpcode::G_SADDO:
1809     Opcode = TargetOpcode::G_ADD;
1810     ExtOpcode = TargetOpcode::G_SEXT;
1811     break;
1812   case TargetOpcode::G_SSUBO:
1813     Opcode = TargetOpcode::G_SUB;
1814     ExtOpcode = TargetOpcode::G_SEXT;
1815     break;
1816   case TargetOpcode::G_UADDO:
1817     Opcode = TargetOpcode::G_ADD;
1818     ExtOpcode = TargetOpcode::G_ZEXT;
1819     break;
1820   case TargetOpcode::G_USUBO:
1821     Opcode = TargetOpcode::G_SUB;
1822     ExtOpcode = TargetOpcode::G_ZEXT;
1823     break;
1824   case TargetOpcode::G_SADDE:
1825     Opcode = TargetOpcode::G_UADDE;
1826     ExtOpcode = TargetOpcode::G_SEXT;
1827     CarryIn = MI.getOperand(4).getReg();
1828     break;
1829   case TargetOpcode::G_SSUBE:
1830     Opcode = TargetOpcode::G_USUBE;
1831     ExtOpcode = TargetOpcode::G_SEXT;
1832     CarryIn = MI.getOperand(4).getReg();
1833     break;
1834   case TargetOpcode::G_UADDE:
1835     Opcode = TargetOpcode::G_UADDE;
1836     ExtOpcode = TargetOpcode::G_ZEXT;
1837     CarryIn = MI.getOperand(4).getReg();
1838     break;
1839   case TargetOpcode::G_USUBE:
1840     Opcode = TargetOpcode::G_USUBE;
1841     ExtOpcode = TargetOpcode::G_ZEXT;
1842     CarryIn = MI.getOperand(4).getReg();
1843     break;
1844   }
1845 
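       // A concrete sketch (illustrative MIR, hypothetical registers): G_SADDO
       // on s8 widened to s32 becomes
       //   %l:_(s32) = G_SEXT %lhs:_(s8)
       //   %r:_(s32) = G_SEXT %rhs:_(s8)
       //   %sum:_(s32) = G_ADD %l, %r
       //   %t:_(s8) = G_TRUNC %sum
       //   %x:_(s32) = G_SEXT %t
       //   %ov:_(s1) = G_ICMP ne, %sum, %x  ; overflow iff %sum doesn't fit s8
       //   %res:_(s8) = G_TRUNC %sum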
1846   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1847   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1848   // Do the arithmetic in the larger type.
1849   Register NewOp;
1850   if (CarryIn) {
1851     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1852     NewOp = MIRBuilder
1853                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1854                             {LHSExt, RHSExt, *CarryIn})
1855                 .getReg(0);
1856   } else {
1857     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1858   }
1859   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1860   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1861   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1862   // There is no overflow if the ExtOp is the same as NewOp.
1863   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1864   // Now trunc the NewOp to the original result.
1865   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1866   MI.eraseFromParent();
1867   return Legalized;
1868 }
1869 
1870 LegalizerHelper::LegalizeResult
1871 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1872                                          LLT WideTy) {
1873   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1874                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1875                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1876   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1877                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1878   // We can convert this to:
1879   //   1. Any extend iN to iM
1880   //   2. SHL by M-N
1881   //   3. [US][ADD|SUB|SHL]SAT
1882   //   4. L/ASHR by M-N
1883   //
1884   // It may be more efficient to lower this to a min and a max operation in
1885   // the higher precision arithmetic if the promoted operation isn't legal,
1886   // but this decision is up to the target's lowering request.
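       //
       // For instance (an illustrative sketch, hypothetical registers),
       // G_SADDSAT on s8 widened to s32 becomes:
       //   %a32:_(s32) = G_ANYEXT %a:_(s8)
       //   %b32:_(s32) = G_ANYEXT %b:_(s8)
       //   %k:_(s32) = G_CONSTANT i32 24
       //   %la:_(s32) = G_SHL %a32, %k
       //   %lb:_(s32) = G_SHL %b32, %k
       //   %sat:_(s32) = G_SADDSAT %la, %lb
       //   %sr:_(s32) = G_ASHR %sat, %k
       //   %res:_(s8) = G_TRUNC %sr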
1887   Register DstReg = MI.getOperand(0).getReg();
1888 
1889   unsigned NewBits = WideTy.getScalarSizeInBits();
1890   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1891 
1892   // For shifts, the RHS must be zero-extended to preserve its unsigned value,
1893   // and must not itself be shifted left, so the shift amount is unchanged.
1894   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1895   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1896                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1897   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1898   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1899   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1900 
1901   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1902                                         {ShiftL, ShiftR}, MI.getFlags());
1903 
1904   // Use a shift that will preserve the number of sign bits when the trunc is
1905   // folded away.
1906   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1907                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1908 
1909   MIRBuilder.buildTrunc(DstReg, Result);
1910   MI.eraseFromParent();
1911   return Legalized;
1912 }
1913 
1914 LegalizerHelper::LegalizeResult
1915 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1916                                  LLT WideTy) {
1917   if (TypeIdx == 1)
1918     return UnableToLegalize;
1919 
1920   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1921   Register Result = MI.getOperand(0).getReg();
1922   Register OriginalOverflow = MI.getOperand(1).getReg();
1923   Register LHS = MI.getOperand(2).getReg();
1924   Register RHS = MI.getOperand(3).getReg();
1925   LLT SrcTy = MRI.getType(LHS);
1926   LLT OverflowTy = MRI.getType(OriginalOverflow);
1927   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1928 
1929   // To determine if the result overflowed in the larger type, we extend the
1930   // input to the larger type, do the multiply (checking if it overflows),
1931   // then also check the high bits of the result to see if overflow happened
1932   // there.
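       //
       // Sketch (illustrative, hypothetical registers) of G_UMULO on s8
       // widened to s16:
       //   %l:_(s16) = G_ZEXT %lhs:_(s8)
       //   %r:_(s16) = G_ZEXT %rhs:_(s8)
       //   %m:_(s16), %o:_(s1) = G_UMULO %l, %r  ; %o unused here: 16 >= 2 * 8
       //   %res:_(s8) = G_TRUNC %m
       //   %z:_(s16) = G_ZEXT %res
       //   %ov:_(s1) = G_ICMP ne, %m, %z         ; nonzero high half => overflow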
1933   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1934   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1935   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1936 
1937   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1938                                     {LeftOperand, RightOperand});
1939   auto Mul = Mulo->getOperand(0);
1940   MIRBuilder.buildTrunc(Result, Mul);
1941 
1942   MachineInstrBuilder ExtResult;
1943   // Overflow occurred if it occurred in the larger type, or if the high part
1944   // of the result does not zero/sign-extend the low part.  Check this second
1945   // possibility first.
1946   if (IsSigned) {
1947     // For signed, overflow occurred when the high part does not sign-extend
1948     // the low part.
1949     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
1950   } else {
1951     // Unsigned overflow occurred when the high part does not zero-extend the
1952     // low part.
1953     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
1954   }
1955 
1956   // Multiplication cannot overflow if WideTy is at least twice the original
1957   // width, so we don't need to check the overflow result of the wider Mulo.
1958   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
1959     auto Overflow =
1960         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
1961     // Finally check if the multiplication in the larger type itself overflowed.
1962     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
1963   } else {
1964     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
1965   }
1966   MI.eraseFromParent();
1967   return Legalized;
1968 }
1969 
1970 LegalizerHelper::LegalizeResult
1971 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1972   switch (MI.getOpcode()) {
1973   default:
1974     return UnableToLegalize;
1975   case TargetOpcode::G_ATOMICRMW_XCHG:
1976   case TargetOpcode::G_ATOMICRMW_ADD:
1977   case TargetOpcode::G_ATOMICRMW_SUB:
1978   case TargetOpcode::G_ATOMICRMW_AND:
1979   case TargetOpcode::G_ATOMICRMW_OR:
1980   case TargetOpcode::G_ATOMICRMW_XOR:
1981   case TargetOpcode::G_ATOMICRMW_MIN:
1982   case TargetOpcode::G_ATOMICRMW_MAX:
1983   case TargetOpcode::G_ATOMICRMW_UMIN:
1984   case TargetOpcode::G_ATOMICRMW_UMAX:
1985     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
1986     Observer.changingInstr(MI);
1987     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1988     widenScalarDst(MI, WideTy, 0);
1989     Observer.changedInstr(MI);
1990     return Legalized;
1991   case TargetOpcode::G_ATOMIC_CMPXCHG:
1992     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
1993     Observer.changingInstr(MI);
1994     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1995     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
1996     widenScalarDst(MI, WideTy, 0);
1997     Observer.changedInstr(MI);
1998     return Legalized;
1999   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2000     if (TypeIdx == 0) {
2001       Observer.changingInstr(MI);
2002       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2003       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2004       widenScalarDst(MI, WideTy, 0);
2005       Observer.changedInstr(MI);
2006       return Legalized;
2007     }
2008     assert(TypeIdx == 1 &&
2009            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2010     Observer.changingInstr(MI);
2011     widenScalarDst(MI, WideTy, 1);
2012     Observer.changedInstr(MI);
2013     return Legalized;
2014   case TargetOpcode::G_EXTRACT:
2015     return widenScalarExtract(MI, TypeIdx, WideTy);
2016   case TargetOpcode::G_INSERT:
2017     return widenScalarInsert(MI, TypeIdx, WideTy);
2018   case TargetOpcode::G_MERGE_VALUES:
2019     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2020   case TargetOpcode::G_UNMERGE_VALUES:
2021     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2022   case TargetOpcode::G_SADDO:
2023   case TargetOpcode::G_SSUBO:
2024   case TargetOpcode::G_UADDO:
2025   case TargetOpcode::G_USUBO:
2026   case TargetOpcode::G_SADDE:
2027   case TargetOpcode::G_SSUBE:
2028   case TargetOpcode::G_UADDE:
2029   case TargetOpcode::G_USUBE:
2030     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2031   case TargetOpcode::G_UMULO:
2032   case TargetOpcode::G_SMULO:
2033     return widenScalarMulo(MI, TypeIdx, WideTy);
2034   case TargetOpcode::G_SADDSAT:
2035   case TargetOpcode::G_SSUBSAT:
2036   case TargetOpcode::G_SSHLSAT:
2037   case TargetOpcode::G_UADDSAT:
2038   case TargetOpcode::G_USUBSAT:
2039   case TargetOpcode::G_USHLSAT:
2040     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2041   case TargetOpcode::G_CTTZ:
2042   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2043   case TargetOpcode::G_CTLZ:
2044   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2045   case TargetOpcode::G_CTPOP: {
2046     if (TypeIdx == 0) {
2047       Observer.changingInstr(MI);
2048       widenScalarDst(MI, WideTy, 0);
2049       Observer.changedInstr(MI);
2050       return Legalized;
2051     }
2052 
2053     Register SrcReg = MI.getOperand(1).getReg();
2054 
2055     // First extend the input.
2056     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2057                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2058                           ? TargetOpcode::G_ANYEXT
2059                           : TargetOpcode::G_ZEXT;
2060     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2061     LLT CurTy = MRI.getType(SrcReg);
2062     unsigned NewOpc = MI.getOpcode();
2063     if (NewOpc == TargetOpcode::G_CTTZ) {
2064       // The count is the same in the larger type except if the original
2065       // value was zero.  This can be handled by setting the bit just off
2066       // the top of the original type.
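           // E.g. (illustrative) for s8 widened to s32, OR the input with
           // 0x100 so that a zero input still yields cttz == 8, the s8 result.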
2067       auto TopBit =
2068           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2069       MIBSrc = MIRBuilder.buildOr(
2070         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2071       // Now we know the operand is non-zero, use the more relaxed opcode.
2072       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2073     }
2074 
2075     // Perform the operation at the larger size.
2076     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2077     // This is already the correct result for CTPOP and the CTTZ variants.
2078     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2079         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2080       // The correct result is NewOp - (size difference between WideTy and CurTy).
2081       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2082       MIBNewOp = MIRBuilder.buildSub(
2083           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2084     }
2085 
2086     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2087     MI.eraseFromParent();
2088     return Legalized;
2089   }
2090   case TargetOpcode::G_BSWAP: {
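         // After byte-swapping in the wider type, the bytes of interest end up
         // in the high part, so shift them back down. Sketch (illustrative) for
         // s16 widened to s32:
         //   %x32:_(s32) = G_ANYEXT %x:_(s16)
         //   %sw:_(s32) = G_BSWAP %x32     ; wanted bytes are the top 16 bits
         //   %sh:_(s32) = G_LSHR %sw, 16
         //   %res:_(s16) = G_TRUNC %sh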
2091     Observer.changingInstr(MI);
2092     Register DstReg = MI.getOperand(0).getReg();
2093 
2094     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2095     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2096     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2097     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2098 
2099     MI.getOperand(0).setReg(DstExt);
2100 
2101     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2102 
2103     LLT Ty = MRI.getType(DstReg);
2104     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2105     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2106     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2107 
2108     MIRBuilder.buildTrunc(DstReg, ShrReg);
2109     Observer.changedInstr(MI);
2110     return Legalized;
2111   }
2112   case TargetOpcode::G_BITREVERSE: {
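         // Same trick as G_BSWAP above: the reversed bits land in the high
         // part of the wide register, so shift right by the width difference.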
2113     Observer.changingInstr(MI);
2114 
2115     Register DstReg = MI.getOperand(0).getReg();
2116     LLT Ty = MRI.getType(DstReg);
2117     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2118 
2119     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2121     MI.getOperand(0).setReg(DstExt);
2122     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2123 
2124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2125     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2126     MIRBuilder.buildTrunc(DstReg, Shift);
2127     Observer.changedInstr(MI);
2128     return Legalized;
2129   }
2130   case TargetOpcode::G_FREEZE:
2131     Observer.changingInstr(MI);
2132     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2133     widenScalarDst(MI, WideTy);
2134     Observer.changedInstr(MI);
2135     return Legalized;
2136 
2137   case TargetOpcode::G_ABS:
2138     Observer.changingInstr(MI);
2139     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2140     widenScalarDst(MI, WideTy);
2141     Observer.changedInstr(MI);
2142     return Legalized;
2143 
2144   case TargetOpcode::G_ADD:
2145   case TargetOpcode::G_AND:
2146   case TargetOpcode::G_MUL:
2147   case TargetOpcode::G_OR:
2148   case TargetOpcode::G_XOR:
2149   case TargetOpcode::G_SUB:
2150     // Perform operation at larger width (any extension is fine here, high bits
2151     // don't affect the result) and then truncate the result back to the
2152     // original type.
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2155     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2156     widenScalarDst(MI, WideTy);
2157     Observer.changedInstr(MI);
2158     return Legalized;
2159 
2160   case TargetOpcode::G_SBFX:
2161   case TargetOpcode::G_UBFX:
2162     Observer.changingInstr(MI);
2163 
2164     if (TypeIdx == 0) {
2165       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2166       widenScalarDst(MI, WideTy);
2167     } else {
2168       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2169       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2170     }
2171 
2172     Observer.changedInstr(MI);
2173     return Legalized;
2174 
2175   case TargetOpcode::G_SHL:
2176     Observer.changingInstr(MI);
2177 
2178     if (TypeIdx == 0) {
2179       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180       widenScalarDst(MI, WideTy);
2181     } else {
2182       assert(TypeIdx == 1);
2183       // The "number of bits to shift" operand must preserve its value as an
2184       // unsigned integer:
2185       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2186     }
2187 
2188     Observer.changedInstr(MI);
2189     return Legalized;
2190 
2191   case TargetOpcode::G_SDIV:
2192   case TargetOpcode::G_SREM:
2193   case TargetOpcode::G_SMIN:
2194   case TargetOpcode::G_SMAX:
2195     Observer.changingInstr(MI);
2196     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2197     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2198     widenScalarDst(MI, WideTy);
2199     Observer.changedInstr(MI);
2200     return Legalized;
2201 
2202   case TargetOpcode::G_SDIVREM:
2203     Observer.changingInstr(MI);
2204     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2205     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2206     widenScalarDst(MI, WideTy);
2207     widenScalarDst(MI, WideTy, 1);
2208     Observer.changedInstr(MI);
2209     return Legalized;
2210 
2211   case TargetOpcode::G_ASHR:
2212   case TargetOpcode::G_LSHR:
2213     Observer.changingInstr(MI);
2214 
2215     if (TypeIdx == 0) {
2216       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2217         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2218 
2219       widenScalarSrc(MI, WideTy, 1, CvtOp);
2220       widenScalarDst(MI, WideTy);
2221     } else {
2222       assert(TypeIdx == 1);
2223       // The "number of bits to shift" operand must preserve its value as an
2224       // unsigned integer:
2225       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2226     }
2227 
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230   case TargetOpcode::G_UDIV:
2231   case TargetOpcode::G_UREM:
2232   case TargetOpcode::G_UMIN:
2233   case TargetOpcode::G_UMAX:
2234     Observer.changingInstr(MI);
2235     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2236     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2237     widenScalarDst(MI, WideTy);
2238     Observer.changedInstr(MI);
2239     return Legalized;
2240 
2241   case TargetOpcode::G_UDIVREM:
2242     Observer.changingInstr(MI);
2243     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     widenScalarDst(MI, WideTy);
2246     widenScalarDst(MI, WideTy, 1);
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SELECT:
2251     Observer.changingInstr(MI);
2252     if (TypeIdx == 0) {
2253       // Perform operation at larger width (any extension is fine here, high
2254       // bits don't affect the result) and then truncate the result back to the
2255       // original type.
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2257       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2258       widenScalarDst(MI, WideTy);
2259     } else {
2260       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2261       // Explicit extension is required here since high bits affect the result.
2262       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2263     }
2264     Observer.changedInstr(MI);
2265     return Legalized;
2266 
2267   case TargetOpcode::G_FPTOSI:
2268   case TargetOpcode::G_FPTOUI:
2269     Observer.changingInstr(MI);
2270 
2271     if (TypeIdx == 0)
2272       widenScalarDst(MI, WideTy);
2273     else
2274       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2275 
2276     Observer.changedInstr(MI);
2277     return Legalized;
2278   case TargetOpcode::G_SITOFP:
2279     Observer.changingInstr(MI);
2280 
2281     if (TypeIdx == 0)
2282       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2283     else
2284       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2285 
2286     Observer.changedInstr(MI);
2287     return Legalized;
2288   case TargetOpcode::G_UITOFP:
2289     Observer.changingInstr(MI);
2290 
2291     if (TypeIdx == 0)
2292       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2293     else
2294       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295 
2296     Observer.changedInstr(MI);
2297     return Legalized;
2298   case TargetOpcode::G_LOAD:
2299   case TargetOpcode::G_SEXTLOAD:
2300   case TargetOpcode::G_ZEXTLOAD:
2301     Observer.changingInstr(MI);
2302     widenScalarDst(MI, WideTy);
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305 
2306   case TargetOpcode::G_STORE: {
2307     if (TypeIdx != 0)
2308       return UnableToLegalize;
2309 
2310     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2311     if (!Ty.isScalar())
2312       return UnableToLegalize;
2313 
2314     Observer.changingInstr(MI);
2315 
2316     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2317       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2318     widenScalarSrc(MI, WideTy, 0, ExtType);
2319 
2320     Observer.changedInstr(MI);
2321     return Legalized;
2322   }
2323   case TargetOpcode::G_CONSTANT: {
2324     MachineOperand &SrcMO = MI.getOperand(1);
2325     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2326     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2327         MRI.getType(MI.getOperand(0).getReg()));
2328     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2329             ExtOpc == TargetOpcode::G_ANYEXT) &&
2330            "Illegal Extend");
2331     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2332     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2333                            ? SrcVal.sext(WideTy.getSizeInBits())
2334                            : SrcVal.zext(WideTy.getSizeInBits());
2335     Observer.changingInstr(MI);
2336     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2337 
2338     widenScalarDst(MI, WideTy);
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341   }
2342   case TargetOpcode::G_FCONSTANT: {
2343     MachineOperand &SrcMO = MI.getOperand(1);
2344     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2345     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2346     bool LosesInfo;
2347     switch (WideTy.getSizeInBits()) {
2348     case 32:
2349       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2350                   &LosesInfo);
2351       break;
2352     case 64:
2353       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2354                   &LosesInfo);
2355       break;
2356     default:
2357       return UnableToLegalize;
2358     }
2359 
2360     assert(!LosesInfo && "extend should always be lossless");
2361 
2362     Observer.changingInstr(MI);
2363     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2364 
2365     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2366     Observer.changedInstr(MI);
2367     return Legalized;
2368   }
2369   case TargetOpcode::G_IMPLICIT_DEF: {
2370     Observer.changingInstr(MI);
2371     widenScalarDst(MI, WideTy);
2372     Observer.changedInstr(MI);
2373     return Legalized;
2374   }
2375   case TargetOpcode::G_BRCOND:
2376     Observer.changingInstr(MI);
2377     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_FCMP:
2382     Observer.changingInstr(MI);
2383     if (TypeIdx == 0)
2384       widenScalarDst(MI, WideTy);
2385     else {
2386       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2387       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2388     }
2389     Observer.changedInstr(MI);
2390     return Legalized;
2391 
2392   case TargetOpcode::G_ICMP:
2393     Observer.changingInstr(MI);
2394     if (TypeIdx == 0)
2395       widenScalarDst(MI, WideTy);
2396     else {
2397       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2398                                MI.getOperand(1).getPredicate()))
2399                                ? TargetOpcode::G_SEXT
2400                                : TargetOpcode::G_ZEXT;
2401       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2402       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2403     }
2404     Observer.changedInstr(MI);
2405     return Legalized;
2406 
2407   case TargetOpcode::G_PTR_ADD:
2408     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2409     Observer.changingInstr(MI);
2410     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2411     Observer.changedInstr(MI);
2412     return Legalized;
2413 
2414   case TargetOpcode::G_PHI: {
2415     assert(TypeIdx == 0 && "Expecting only Idx 0");
2416 
2417     Observer.changingInstr(MI);
2418     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2419       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2420       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2421       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2422     }
2423 
2424     MachineBasicBlock &MBB = *MI.getParent();
2425     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2426     widenScalarDst(MI, WideTy);
2427     Observer.changedInstr(MI);
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2431     if (TypeIdx == 0) {
2432       Register VecReg = MI.getOperand(1).getReg();
2433       LLT VecTy = MRI.getType(VecReg);
2434       Observer.changingInstr(MI);
2435 
2436       widenScalarSrc(
2437           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2438           TargetOpcode::G_ANYEXT);
2439 
2440       widenScalarDst(MI, WideTy, 0);
2441       Observer.changedInstr(MI);
2442       return Legalized;
2443     }
2444 
2445     if (TypeIdx != 2)
2446       return UnableToLegalize;
2447     Observer.changingInstr(MI);
2448     // TODO: Probably should be zext
2449     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2450     Observer.changedInstr(MI);
2451     return Legalized;
2452   }
2453   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2454     if (TypeIdx == 1) {
2455       Observer.changingInstr(MI);
2456 
2457       Register VecReg = MI.getOperand(1).getReg();
2458       LLT VecTy = MRI.getType(VecReg);
2459       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2460 
2461       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2462       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2463       widenScalarDst(MI, WideVecTy, 0);
2464       Observer.changedInstr(MI);
2465       return Legalized;
2466     }
2467 
2468     if (TypeIdx == 2) {
2469       Observer.changingInstr(MI);
2470       // TODO: Probably should be zext
2471       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2472       Observer.changedInstr(MI);
2473       return Legalized;
2474     }
2475 
2476     return UnableToLegalize;
2477   }
2478   case TargetOpcode::G_FADD:
2479   case TargetOpcode::G_FMUL:
2480   case TargetOpcode::G_FSUB:
2481   case TargetOpcode::G_FMA:
2482   case TargetOpcode::G_FMAD:
2483   case TargetOpcode::G_FNEG:
2484   case TargetOpcode::G_FABS:
2485   case TargetOpcode::G_FCANONICALIZE:
2486   case TargetOpcode::G_FMINNUM:
2487   case TargetOpcode::G_FMAXNUM:
2488   case TargetOpcode::G_FMINNUM_IEEE:
2489   case TargetOpcode::G_FMAXNUM_IEEE:
2490   case TargetOpcode::G_FMINIMUM:
2491   case TargetOpcode::G_FMAXIMUM:
2492   case TargetOpcode::G_FDIV:
2493   case TargetOpcode::G_FREM:
2494   case TargetOpcode::G_FCEIL:
2495   case TargetOpcode::G_FFLOOR:
2496   case TargetOpcode::G_FCOS:
2497   case TargetOpcode::G_FSIN:
2498   case TargetOpcode::G_FLOG10:
2499   case TargetOpcode::G_FLOG:
2500   case TargetOpcode::G_FLOG2:
2501   case TargetOpcode::G_FRINT:
2502   case TargetOpcode::G_FNEARBYINT:
2503   case TargetOpcode::G_FSQRT:
2504   case TargetOpcode::G_FEXP:
2505   case TargetOpcode::G_FEXP2:
2506   case TargetOpcode::G_FPOW:
2507   case TargetOpcode::G_INTRINSIC_TRUNC:
2508   case TargetOpcode::G_INTRINSIC_ROUND:
2509   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2510     assert(TypeIdx == 0);
2511     Observer.changingInstr(MI);
2512 
2513     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2514       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2515 
2516     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2517     Observer.changedInstr(MI);
2518     return Legalized;
2519   case TargetOpcode::G_FPOWI: {
2520     if (TypeIdx != 0)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2524     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INTTOPTR:
2529     if (TypeIdx != 1)
2530       return UnableToLegalize;
2531 
2532     Observer.changingInstr(MI);
2533     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2534     Observer.changedInstr(MI);
2535     return Legalized;
2536   case TargetOpcode::G_PTRTOINT:
2537     if (TypeIdx != 0)
2538       return UnableToLegalize;
2539 
2540     Observer.changingInstr(MI);
2541     widenScalarDst(MI, WideTy, 0);
2542     Observer.changedInstr(MI);
2543     return Legalized;
2544   case TargetOpcode::G_BUILD_VECTOR: {
2545     Observer.changingInstr(MI);
2546 
2547     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2548     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2549       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2550 
2551     // Avoid changing the result vector type if the source element type was
2552     // requested.
2553     if (TypeIdx == 1) {
2554       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2555     } else {
2556       widenScalarDst(MI, WideTy, 0);
2557     }
2558 
2559     Observer.changedInstr(MI);
2560     return Legalized;
2561   }
2562   case TargetOpcode::G_SEXT_INREG:
2563     if (TypeIdx != 0)
2564       return UnableToLegalize;
2565 
2566     Observer.changingInstr(MI);
2567     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2568     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2569     Observer.changedInstr(MI);
2570     return Legalized;
2571   case TargetOpcode::G_PTRMASK: {
2572     if (TypeIdx != 1)
2573       return UnableToLegalize;
2574     Observer.changingInstr(MI);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     Observer.changedInstr(MI);
2577     return Legalized;
2578   }
2579   }
2580 }
2581 
2582 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2583                              MachineIRBuilder &B, Register Src, LLT Ty) {
2584   auto Unmerge = B.buildUnmerge(Ty, Src);
2585   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2586     Pieces.push_back(Unmerge.getReg(I));
2587 }
2588 
2589 LegalizerHelper::LegalizeResult
2590 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2591   Register Dst = MI.getOperand(0).getReg();
2592   Register Src = MI.getOperand(1).getReg();
2593   LLT DstTy = MRI.getType(Dst);
2594   LLT SrcTy = MRI.getType(Src);
2595 
2596   if (SrcTy.isVector()) {
2597     LLT SrcEltTy = SrcTy.getElementType();
2598     SmallVector<Register, 8> SrcRegs;
2599 
2600     if (DstTy.isVector()) {
2601       int NumDstElt = DstTy.getNumElements();
2602       int NumSrcElt = SrcTy.getNumElements();
2603 
2604       LLT DstEltTy = DstTy.getElementType();
2605       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2606       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2607 
2608       // If there's an element size mismatch, insert intermediate casts to match
2609       // the result element type.
2610       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2611         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2612         //
2613         // =>
2614         //
2615         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2616         // %4:_(<2 x s8>) = G_BITCAST %2
2617         // %5:_(<2 x s8>) = G_BITCAST %3
2618         // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
2619         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2620         SrcPartTy = SrcEltTy;
2621       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2622         //
2623         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2624         //
2625         // =>
2626         //
2627         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2628         // %4:_(s16) = G_BITCAST %2
2629         // %5:_(s16) = G_BITCAST %3
2630         // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
2631         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2632         DstCastTy = DstEltTy;
2633       }
2634 
2635       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2636       for (Register &SrcReg : SrcRegs)
2637         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2638     } else
2639       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2640 
2641     MIRBuilder.buildMerge(Dst, SrcRegs);
2642     MI.eraseFromParent();
2643     return Legalized;
2644   }
2645 
2646   if (DstTy.isVector()) {
2647     SmallVector<Register, 8> SrcRegs;
2648     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2649     MIRBuilder.buildMerge(Dst, SrcRegs);
2650     MI.eraseFromParent();
2651     return Legalized;
2652   }
2653 
2654   return UnableToLegalize;
2655 }
2656 
2657 /// Figure out the bit offset into a register when coercing a vector index for
2658 /// the wide element type. This is only for the case when promoting a vector
2659 /// to one with larger elements.
2660 ///
2662 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2663 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
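     ///
     /// E.g. (illustrative) bitcasting <8 x s8> to <2 x s32>: the element size
     /// ratio is 4, so %offset_idx = %idx & 3 and %offset_bits = %offset_idx << 3.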
2664 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2665                                                    Register Idx,
2666                                                    unsigned NewEltSize,
2667                                                    unsigned OldEltSize) {
2668   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2669   LLT IdxTy = B.getMRI()->getType(Idx);
2670 
2671   // Now figure out the amount we need to shift to get the target bits.
2672   auto OffsetMask = B.buildConstant(
2673       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2674   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2675   return B.buildShl(IdxTy, OffsetIdx,
2676                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2677 }
2678 
2679 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2680 /// is casting to a vector with a smaller element size, perform multiple element
2681 /// extracts and merge the results. If this is coercing to a vector with larger
2682 /// elements, index the bitcasted vector and extract the target element with bit
2683 /// operations. This is intended to force the indexing in the native register
2684 /// size for architectures that can dynamically index the register file.
2685 LegalizerHelper::LegalizeResult
2686 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2687                                          LLT CastTy) {
2688   if (TypeIdx != 1)
2689     return UnableToLegalize;
2690 
2691   Register Dst = MI.getOperand(0).getReg();
2692   Register SrcVec = MI.getOperand(1).getReg();
2693   Register Idx = MI.getOperand(2).getReg();
2694   LLT SrcVecTy = MRI.getType(SrcVec);
2695   LLT IdxTy = MRI.getType(Idx);
2696 
2697   LLT SrcEltTy = SrcVecTy.getElementType();
2698   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2699   unsigned OldNumElts = SrcVecTy.getNumElements();
2700 
2701   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2702   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2703 
2704   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2705   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2706   if (NewNumElts > OldNumElts) {
2707     // Decreasing the vector element size.
2708     //
2709     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2710     //  =>
2711     //  v4i32:castx = bitcast x:v2i64
2712     //
2713     // i64 = bitcast
2714     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2715     //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
2716     //
2717     if (NewNumElts % OldNumElts != 0)
2718       return UnableToLegalize;
2719 
2720     // Type of the intermediate result vector.
2721     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2722     LLT MidTy =
2723         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2724 
2725     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2726 
2727     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2728     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2729 
2730     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2731       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2732       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2733       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2734       NewOps[I] = Elt.getReg(0);
2735     }
2736 
2737     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2738     MIRBuilder.buildBitcast(Dst, NewVec);
2739     MI.eraseFromParent();
2740     return Legalized;
2741   }
2742 
2743   if (NewNumElts < OldNumElts) {
2744     if (NewEltSize % OldEltSize != 0)
2745       return UnableToLegalize;
2746 
2747     // This only depends on powers of 2 because we use bit tricks to figure out
2748     // the bit offset we need to shift to get the target element. A general
2749     // expansion could emit division/multiply.
2750     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2751       return UnableToLegalize;
2752 
2753     // Increasing the vector element size.
2754     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2755     //
2756     //   =>
2757     //
2758     // %cast = G_BITCAST %vec
2759     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2760     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2761     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2762     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2763     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2764     // %elt = G_TRUNC %elt_bits
2765 
2766     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2767     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2768 
2769     // Divide to get the index in the wider element type.
2770     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2771 
2772     Register WideElt = CastVec;
2773     if (CastTy.isVector()) {
2774       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2775                                                      ScaledIdx).getReg(0);
2776     }
2777 
2778     // Compute the bit offset into the register of the target element.
2779     Register OffsetBits = getBitcastWiderVectorElementOffset(
2780       MIRBuilder, Idx, NewEltSize, OldEltSize);
2781 
2782     // Shift the wide element to get the target element.
2783     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2784     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2785     MI.eraseFromParent();
2786     return Legalized;
2787   }
2788 
2789   return UnableToLegalize;
2790 }
2791 
2792 /// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits,
2793 /// preserving the other bits of \p TargetReg:
2794 ///
2795 /// (InsertReg << Offset) | (TargetReg & ~(((1 << InsertReg.size()) - 1) << Offset))
2796 static Register buildBitFieldInsert(MachineIRBuilder &B,
2797                                     Register TargetReg, Register InsertReg,
2798                                     Register OffsetBits) {
2799   LLT TargetTy = B.getMRI()->getType(TargetReg);
2800   LLT InsertTy = B.getMRI()->getType(InsertReg);
2801   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2802   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2803 
2804   // Produce a bitmask of the value to insert
2805   auto EltMask = B.buildConstant(
2806     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2807                                    InsertTy.getSizeInBits()));
2808   // Shift it into position
2809   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2810   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2811 
2812   // Clear out the bits in the wide element
2813   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2814 
2815   // The value to insert already has zeros in its high bits from the zext, so
2816   // OR it into the masked wide element.
2817   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2818 }
2819 
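// A worked instance of buildBitFieldInsert's formula, with illustrative
// values only: inserting an s8 value at bit offset 16 into an s32 target.
//
//   Mask    = LowBitsSet(8) << 16      = 0x00FF0000
//   Cleared = TargetReg & ~Mask        ; old bits removed
//   Result  = Cleared | (ZExt(InsertReg) << 16)
//
// E.g. TargetReg = 0xAABBCCDD and InsertReg = 0x11 give 0xAA11CCDD.
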
2820 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2821 /// is increasing the element size, perform the indexing in the target element
2822 /// type, and use bit operations to insert at the element position. This is
2823 /// intended for architectures that can dynamically index the register file and
2824 /// want to force indexing in the native register size.
2825 LegalizerHelper::LegalizeResult
2826 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2827                                         LLT CastTy) {
2828   if (TypeIdx != 0)
2829     return UnableToLegalize;
2830 
2831   Register Dst = MI.getOperand(0).getReg();
2832   Register SrcVec = MI.getOperand(1).getReg();
2833   Register Val = MI.getOperand(2).getReg();
2834   Register Idx = MI.getOperand(3).getReg();
2835 
2836   LLT VecTy = MRI.getType(Dst);
2837   LLT IdxTy = MRI.getType(Idx);
2838 
2839   LLT VecEltTy = VecTy.getElementType();
2840   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2841   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2842   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2843 
2844   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2845   unsigned OldNumElts = VecTy.getNumElements();
2846 
2847   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2848   if (NewNumElts < OldNumElts) {
2849     if (NewEltSize % OldEltSize != 0)
2850       return UnableToLegalize;
2851 
2852     // This only depends on powers of 2 because we use bit tricks to figure out
2853     // the bit offset we need to shift to get the target element. A general
2854     // expansion could emit division/multiply.
2855     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2856       return UnableToLegalize;
2857 
2858     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2859     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2860 
2861     // Divide to get the index in the wider element type.
2862     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2863 
2864     Register ExtractedElt = CastVec;
2865     if (CastTy.isVector()) {
2866       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2867                                                           ScaledIdx).getReg(0);
2868     }
2869 
2870     // Compute the bit offset into the register of the target element.
2871     Register OffsetBits = getBitcastWiderVectorElementOffset(
2872       MIRBuilder, Idx, NewEltSize, OldEltSize);
2873 
2874     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2875                                                Val, OffsetBits);
2876     if (CastTy.isVector()) {
2877       InsertedElt = MIRBuilder.buildInsertVectorElement(
2878         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2879     }
2880 
2881     MIRBuilder.buildBitcast(Dst, InsertedElt);
2882     MI.eraseFromParent();
2883     return Legalized;
2884   }
2885 
2886   return UnableToLegalize;
2887 }
2888 
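// Sketch of the overall insert flow above for a hypothetical <16 x s8>
// insert through a <4 x s32> cast type (illustrative registers; the bitfield
// insert stands for the sequence emitted by buildBitFieldInsert):
//
// %cast:_(<4 x s32>) = G_BITCAST %vec(<16 x s8>)
// %scaled_idx:_(s32) = G_LSHR %idx, 2
// %wide:_(s32) = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
// %new_wide:_(s32) = ... bitfield insert of %val at the computed bit offset
// %new_vec:_(<4 x s32>) = G_INSERT_VECTOR_ELT %cast, %new_wide, %scaled_idx
// %dst:_(<16 x s8>) = G_BITCAST %new_vec
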
2889 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2890   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2891   Register DstReg = LoadMI.getDstReg();
2892   Register PtrReg = LoadMI.getPointerReg();
2893   LLT DstTy = MRI.getType(DstReg);
2894   MachineMemOperand &MMO = LoadMI.getMMO();
2895   LLT MemTy = MMO.getMemoryType();
2896   MachineFunction &MF = MIRBuilder.getMF();
2897 
2898   unsigned MemSizeInBits = MemTy.getSizeInBits();
2899   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2900 
2901   if (MemSizeInBits != MemStoreSizeInBits) {
2902     if (MemTy.isVector())
2903       return UnableToLegalize;
2904 
2905     // Promote to a byte-sized load if not loading an integral number of
2906     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2907     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2908     MachineMemOperand *NewMMO =
2909         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2910 
2911     Register LoadReg = DstReg;
2912     LLT LoadTy = DstTy;
2913 
2914     // If this wasn't already an extending load, we need to widen the result
2915     // register to avoid creating a load with a narrower result than the source.
2916     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2917       LoadTy = WideMemTy;
2918       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2919     }
2920 
2921     if (isa<GSExtLoad>(LoadMI)) {
2922       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2923       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2924     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2925       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2926       // The extra bits are guaranteed to be zero, since we stored them
2927       // that way.  A zext load of the wide type thus gives a zext of MemTy.
2928       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
2929     } else {
2930       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
2931     }
2932 
2933     if (DstTy != LoadTy)
2934       MIRBuilder.buildTrunc(DstReg, LoadReg);
2935 
2936     LoadMI.eraseFromParent();
2937     return Legalized;
2938   }
2939 
2940   // Big endian lowering not implemented.
2941   if (MIRBuilder.getDataLayout().isBigEndian())
2942     return UnableToLegalize;
2943 
2944   // This load needs splitting into power of 2 sized loads.
2945   //
2946   // Our strategy here is to generate anyextending loads for the smaller
2947   // types up to the next power-of-2 result type, and then combine the two
2948   // larger result values together, before truncating back down to the non-pow-2
2949   // type.
2950   // E.g. v1 = i24 load =>
2951   // v2 = i32 zextload (2 byte)
2952   // v3 = i32 load (1 byte)
2953   // v4 = i32 shl v3, 16
2954   // v5 = i32 or v4, v2
2955   // v1 = i24 trunc v5
2956   // By doing this we generate the correct truncate which should get
2957   // combined away as an artifact with a matching extend.
2958 
2959   uint64_t LargeSplitSize, SmallSplitSize;
2960 
2961   if (!isPowerOf2_32(MemSizeInBits)) {
2962     // Split into the largest power-of-2 piece plus the remainder.
2963     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
2964     SmallSplitSize = MemSizeInBits - LargeSplitSize;
2965   } else {
2966     // This is already a power of 2, but we still need to split this in half.
2967     //
2968     // Assume we're being asked to decompose an unaligned load.
2969     // TODO: If this requires multiple splits, handle them all at once.
2970     auto &Ctx = MF.getFunction().getContext();
2971     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
2972       return UnableToLegalize;
2973 
2974     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
2975   }
2976 
2977   if (MemTy.isVector()) {
2978     // TODO: Handle vector extloads
2979     if (MemTy != DstTy)
2980       return UnableToLegalize;
2981 
2982     // TODO: We can do better than scalarizing the vector and at least split it
2983     // in half.
2984     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
2985   }
2986 
2987   MachineMemOperand *LargeMMO =
2988       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
2989   MachineMemOperand *SmallMMO =
2990       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
2991 
2992   LLT PtrTy = MRI.getType(PtrReg);
2993   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
2994   LLT AnyExtTy = LLT::scalar(AnyExtSize);
2995   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
2996                                              PtrReg, *LargeMMO);
2997 
2998   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
2999                                             LargeSplitSize / 8);
3000   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3001   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3002   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3003                                              SmallPtr, *SmallMMO);
3004 
3005   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3006   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3007 
3008   if (AnyExtTy == DstTy)
3009     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3010   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3011     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3012     MIRBuilder.buildTrunc(DstReg, {Or});
3013   } else {
3014     assert(DstTy.isPointer() && "expected pointer");
3015     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3016 
3017     // FIXME: We currently consider this to be illegal for non-integral address
3018     // spaces, but we still need a way to reinterpret the bits.
3019     MIRBuilder.buildIntToPtr(DstReg, Or);
3020   }
3021 
3022   LoadMI.eraseFromParent();
3023   return Legalized;
3024 }
3025 
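// An illustrative instance of the power-of-2 split path above (hypothetical
// registers, little-endian as checked above): an unaligned s32 load the
// target rejects is halved into two s16 accesses.
//
// %lo:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load (s16))
// %ptr2:_(p0) = G_PTR_ADD %ptr, 2
// %hi:_(s32) = G_LOAD %ptr2(p0) :: (load (s16))   ; anyext of the high half
// %shl:_(s32) = G_SHL %hi, 16
// %val:_(s32) = G_OR %shl, %lo
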
3026 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3027   // Lower a non-power of 2 store into multiple pow-2 stores.
3028   // E.g. split an i24 store into an i16 store + i8 store.
3029   // We do this by first extending the stored value to the next largest power
3030   // of 2 type, and then using truncating stores to store the components.
3031   // As with G_LOAD, this generates an extend that can be combined away as an
3032   // artifact instead of leaving behind extracts.
3033   Register SrcReg = StoreMI.getValueReg();
3034   Register PtrReg = StoreMI.getPointerReg();
3035   LLT SrcTy = MRI.getType(SrcReg);
3036   MachineFunction &MF = MIRBuilder.getMF();
3037   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3038   LLT MemTy = MMO.getMemoryType();
3039 
3040   unsigned StoreWidth = MemTy.getSizeInBits();
3041   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3042 
3043   if (StoreWidth != StoreSizeInBits) {
3044     if (SrcTy.isVector())
3045       return UnableToLegalize;
3046 
3047     // Promote to a byte-sized store with upper bits zero if not
3048     // storing an integral number of bytes.  For example, promote
3049     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3050     LLT WideTy = LLT::scalar(StoreSizeInBits);
3051 
3052     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3053       // Avoid creating a store with a source narrower than the memory type.
3054       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3055       SrcTy = WideTy;
3056     }
3057 
3058     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3059 
3060     MachineMemOperand *NewMMO =
3061         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3062     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3063     StoreMI.eraseFromParent();
3064     return Legalized;
3065   }
3066 
3067   if (MemTy.isVector()) {
3068     // TODO: Handle vector trunc stores
3069     if (MemTy != SrcTy)
3070       return UnableToLegalize;
3071 
3072     // TODO: We can do better than scalarizing the vector and at least split it
3073     // in half.
3074     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3075   }
3076 
3077   unsigned MemSizeInBits = MemTy.getSizeInBits();
3078   uint64_t LargeSplitSize, SmallSplitSize;
3079 
3080   if (!isPowerOf2_32(MemSizeInBits)) {
3081     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3082     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3083   } else {
3084     auto &Ctx = MF.getFunction().getContext();
3085     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3086       return UnableToLegalize; // Don't know what we're being asked to do.
3087 
3088     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3089   }
3090 
3091   // Extend to the next pow-2. If this store was itself the result of lowering,
3092   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3093   // that's wider than the stored size.
3094   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3095   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3096 
3097   if (SrcTy.isPointer()) {
3098     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3099     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3100   }
3101 
3102   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3103 
3104   // Obtain the smaller value by shifting away the larger value.
3105   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3106   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3107 
3108   // Generate the PtrAdd and truncating stores.
3109   LLT PtrTy = MRI.getType(PtrReg);
3110   auto OffsetCst = MIRBuilder.buildConstant(
3111     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3112   auto SmallPtr =
3113     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3114 
3115   MachineMemOperand *LargeMMO =
3116     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3117   MachineMemOperand *SmallMMO =
3118     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3119   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3120   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3121   StoreMI.eraseFromParent();
3122   return Legalized;
3123 }
3124 
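// An illustrative instance of the non-power-of-2 split above (hypothetical
// registers, little-endian): an s24 store becomes an s16 store plus an s8
// store of the shifted-down remainder.
//
// %ext:_(s32) = G_ANYEXT %val(s24)
// %hi:_(s32) = G_LSHR %ext, 16
// G_STORE %ext(s32), %ptr(p0) :: (store (s16))
// %ptr2:_(p0) = G_PTR_ADD %ptr, 2
// G_STORE %hi(s32), %ptr2(p0) :: (store (s8))
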
3125 LegalizerHelper::LegalizeResult
3126 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3127   switch (MI.getOpcode()) {
3128   case TargetOpcode::G_LOAD: {
3129     if (TypeIdx != 0)
3130       return UnableToLegalize;
3131     MachineMemOperand &MMO = **MI.memoperands_begin();
3132 
3133     // Not sure how to interpret a bitcast of an extending load.
3134     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3135       return UnableToLegalize;
3136 
3137     Observer.changingInstr(MI);
3138     bitcastDst(MI, CastTy, 0);
3139     MMO.setType(CastTy);
3140     Observer.changedInstr(MI);
3141     return Legalized;
3142   }
3143   case TargetOpcode::G_STORE: {
3144     if (TypeIdx != 0)
3145       return UnableToLegalize;
3146 
3147     MachineMemOperand &MMO = **MI.memoperands_begin();
3148 
3149     // Not sure how to interpret a bitcast of a truncating store.
3150     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3151       return UnableToLegalize;
3152 
3153     Observer.changingInstr(MI);
3154     bitcastSrc(MI, CastTy, 0);
3155     MMO.setType(CastTy);
3156     Observer.changedInstr(MI);
3157     return Legalized;
3158   }
3159   case TargetOpcode::G_SELECT: {
3160     if (TypeIdx != 0)
3161       return UnableToLegalize;
3162 
3163     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3164       LLVM_DEBUG(
3165           dbgs() << "bitcast action not implemented for vector select\n");
3166       return UnableToLegalize;
3167     }
3168 
3169     Observer.changingInstr(MI);
3170     bitcastSrc(MI, CastTy, 2);
3171     bitcastSrc(MI, CastTy, 3);
3172     bitcastDst(MI, CastTy, 0);
3173     Observer.changedInstr(MI);
3174     return Legalized;
3175   }
3176   case TargetOpcode::G_AND:
3177   case TargetOpcode::G_OR:
3178   case TargetOpcode::G_XOR: {
3179     Observer.changingInstr(MI);
3180     bitcastSrc(MI, CastTy, 1);
3181     bitcastSrc(MI, CastTy, 2);
3182     bitcastDst(MI, CastTy, 0);
3183     Observer.changedInstr(MI);
3184     return Legalized;
3185   }
3186   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3187     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3188   case TargetOpcode::G_INSERT_VECTOR_ELT:
3189     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3190   default:
3191     return UnableToLegalize;
3192   }
3193 }
3194 
3195 // Legalize an instruction by changing the opcode in place.
3196 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3197   Observer.changingInstr(MI);
3198   MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3199   Observer.changedInstr(MI);
3200 }
3201 
3202 LegalizerHelper::LegalizeResult
3203 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3204   using namespace TargetOpcode;
3205 
3206   switch(MI.getOpcode()) {
3207   default:
3208     return UnableToLegalize;
3209   case TargetOpcode::G_BITCAST:
3210     return lowerBitcast(MI);
3211   case TargetOpcode::G_SREM:
3212   case TargetOpcode::G_UREM: {
3213     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3214     auto Quot =
3215         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3216                               {MI.getOperand(1), MI.getOperand(2)});
3217 
3218     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3219     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3220     MI.eraseFromParent();
3221     return Legalized;
3222   }
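  // For reference, the remainder expansion above in MIR form (hypothetical
  // registers):
  //
  // %rem:_(s32) = G_UREM %a, %b
  //   =>
  // %quot:_(s32) = G_UDIV %a, %b
  // %prod:_(s32) = G_MUL %quot, %b
  // %rem:_(s32) = G_SUB %a, %prod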
3223   case TargetOpcode::G_SADDO:
3224   case TargetOpcode::G_SSUBO:
3225     return lowerSADDO_SSUBO(MI);
3226   case TargetOpcode::G_UMULH:
3227   case TargetOpcode::G_SMULH:
3228     return lowerSMULH_UMULH(MI);
3229   case TargetOpcode::G_SMULO:
3230   case TargetOpcode::G_UMULO: {
3231     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3232     // result.
3233     Register Res = MI.getOperand(0).getReg();
3234     Register Overflow = MI.getOperand(1).getReg();
3235     Register LHS = MI.getOperand(2).getReg();
3236     Register RHS = MI.getOperand(3).getReg();
3237     LLT Ty = MRI.getType(Res);
3238 
3239     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3240                           ? TargetOpcode::G_SMULH
3241                           : TargetOpcode::G_UMULH;
3242 
3243     Observer.changingInstr(MI);
3244     const auto &TII = MIRBuilder.getTII();
3245     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3246     MI.RemoveOperand(1);
3247     Observer.changedInstr(MI);
3248 
3249     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3250     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3251 
3252     // Move insert point forward so we can use the Res register if needed.
3253     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3254 
3255     // For *signed* multiply, overflow is detected by checking:
3256     // (hi != (lo >> bitwidth-1))
3257     if (Opcode == TargetOpcode::G_SMULH) {
3258       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3259       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3260       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3261     } else {
3262       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3263     }
3264     return Legalized;
3265   }
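  // For reference, the signed case above in MIR form (hypothetical
  // registers); the high half must equal the sign-extension of the low half,
  // or the multiply overflowed:
  //
  // %res:_(s32), %ovf:_(s1) = G_SMULO %a, %b
  //   =>
  // %res:_(s32) = G_MUL %a, %b
  // %hi:_(s32) = G_SMULH %a, %b
  // %sign:_(s32) = G_ASHR %res, 31      ; all zeros or all ones
  // %ovf:_(s1) = G_ICMP intpred(ne), %hi, %sign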
3266   case TargetOpcode::G_FNEG: {
3267     Register Res = MI.getOperand(0).getReg();
3268     LLT Ty = MRI.getType(Res);
3269 
3270     // TODO: Handle vector types once we are able to
3271     // represent them.
3272     if (Ty.isVector())
3273       return UnableToLegalize;
3274     auto SignMask =
3275         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3276     Register SubByReg = MI.getOperand(1).getReg();
3277     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3278     MI.eraseFromParent();
3279     return Legalized;
3280   }
3281   case TargetOpcode::G_FSUB: {
3282     Register Res = MI.getOperand(0).getReg();
3283     LLT Ty = MRI.getType(Res);
3284 
3285     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3286     // First, check if G_FNEG is marked as Lower. If so, we may
3287     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3288     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3289       return UnableToLegalize;
3290     Register LHS = MI.getOperand(1).getReg();
3291     Register RHS = MI.getOperand(2).getReg();
3292     Register Neg = MRI.createGenericVirtualRegister(Ty);
3293     MIRBuilder.buildFNeg(Neg, RHS);
3294     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_FMAD:
3299     return lowerFMad(MI);
3300   case TargetOpcode::G_FFLOOR:
3301     return lowerFFloor(MI);
3302   case TargetOpcode::G_INTRINSIC_ROUND:
3303     return lowerIntrinsicRound(MI);
3304   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3305     // Since round even is the assumed rounding mode for unconstrained FP
3306     // operations, rint and roundeven are the same operation.
3307     changeOpcode(MI, TargetOpcode::G_FRINT);
3308     return Legalized;
3309   }
3310   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3311     Register OldValRes = MI.getOperand(0).getReg();
3312     Register SuccessRes = MI.getOperand(1).getReg();
3313     Register Addr = MI.getOperand(2).getReg();
3314     Register CmpVal = MI.getOperand(3).getReg();
3315     Register NewVal = MI.getOperand(4).getReg();
3316     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3317                                   **MI.memoperands_begin());
3318     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3319     MI.eraseFromParent();
3320     return Legalized;
3321   }
3322   case TargetOpcode::G_LOAD:
3323   case TargetOpcode::G_SEXTLOAD:
3324   case TargetOpcode::G_ZEXTLOAD:
3325     return lowerLoad(cast<GAnyLoad>(MI));
3326   case TargetOpcode::G_STORE:
3327     return lowerStore(cast<GStore>(MI));
3328   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3329   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3330   case TargetOpcode::G_CTLZ:
3331   case TargetOpcode::G_CTTZ:
3332   case TargetOpcode::G_CTPOP:
3333     return lowerBitCount(MI);
3334   case G_UADDO: {
3335     Register Res = MI.getOperand(0).getReg();
3336     Register CarryOut = MI.getOperand(1).getReg();
3337     Register LHS = MI.getOperand(2).getReg();
3338     Register RHS = MI.getOperand(3).getReg();
3339 
3340     MIRBuilder.buildAdd(Res, LHS, RHS);
3341     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3342 
3343     MI.eraseFromParent();
3344     return Legalized;
3345   }
3346   case G_UADDE: {
3347     Register Res = MI.getOperand(0).getReg();
3348     Register CarryOut = MI.getOperand(1).getReg();
3349     Register LHS = MI.getOperand(2).getReg();
3350     Register RHS = MI.getOperand(3).getReg();
3351     Register CarryIn = MI.getOperand(4).getReg();
3352     LLT Ty = MRI.getType(Res);
3353 
3354     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3355     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3356     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3357     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3358 
3359     MI.eraseFromParent();
3360     return Legalized;
3361   }
3362   case G_USUBO: {
3363     Register Res = MI.getOperand(0).getReg();
3364     Register BorrowOut = MI.getOperand(1).getReg();
3365     Register LHS = MI.getOperand(2).getReg();
3366     Register RHS = MI.getOperand(3).getReg();
3367 
3368     MIRBuilder.buildSub(Res, LHS, RHS);
3369     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3370 
3371     MI.eraseFromParent();
3372     return Legalized;
3373   }
3374   case G_USUBE: {
3375     Register Res = MI.getOperand(0).getReg();
3376     Register BorrowOut = MI.getOperand(1).getReg();
3377     Register LHS = MI.getOperand(2).getReg();
3378     Register RHS = MI.getOperand(3).getReg();
3379     Register BorrowIn = MI.getOperand(4).getReg();
3380     const LLT CondTy = MRI.getType(BorrowOut);
3381     const LLT Ty = MRI.getType(Res);
3382 
3383     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3384     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3385     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3386 
3387     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3388     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3389     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3390 
3391     MI.eraseFromParent();
3392     return Legalized;
3393   }
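  // The select above encodes the borrow-out rule: a borrow occurs when
  // LHS < RHS, and the tie LHS == RHS borrows exactly when BorrowIn is set
  // (e.g. for s8 values, 5 - 5 - 1 wraps to 0xFF, so BorrowOut must be 1).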
3394   case G_UITOFP:
3395     return lowerUITOFP(MI);
3396   case G_SITOFP:
3397     return lowerSITOFP(MI);
3398   case G_FPTOUI:
3399     return lowerFPTOUI(MI);
3400   case G_FPTOSI:
3401     return lowerFPTOSI(MI);
3402   case G_FPTRUNC:
3403     return lowerFPTRUNC(MI);
3404   case G_FPOWI:
3405     return lowerFPOWI(MI);
3406   case G_SMIN:
3407   case G_SMAX:
3408   case G_UMIN:
3409   case G_UMAX:
3410     return lowerMinMax(MI);
3411   case G_FCOPYSIGN:
3412     return lowerFCopySign(MI);
3413   case G_FMINNUM:
3414   case G_FMAXNUM:
3415     return lowerFMinNumMaxNum(MI);
3416   case G_MERGE_VALUES:
3417     return lowerMergeValues(MI);
3418   case G_UNMERGE_VALUES:
3419     return lowerUnmergeValues(MI);
3420   case TargetOpcode::G_SEXT_INREG: {
3421     assert(MI.getOperand(2).isImm() && "Expected immediate");
3422     int64_t SizeInBits = MI.getOperand(2).getImm();
3423 
3424     Register DstReg = MI.getOperand(0).getReg();
3425     Register SrcReg = MI.getOperand(1).getReg();
3426     LLT DstTy = MRI.getType(DstReg);
3427     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3428 
3429     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3430     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3431     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3432     MI.eraseFromParent();
3433     return Legalized;
3434   }
3435   case G_EXTRACT_VECTOR_ELT:
3436   case G_INSERT_VECTOR_ELT:
3437     return lowerExtractInsertVectorElt(MI);
3438   case G_SHUFFLE_VECTOR:
3439     return lowerShuffleVector(MI);
3440   case G_DYN_STACKALLOC:
3441     return lowerDynStackAlloc(MI);
3442   case G_EXTRACT:
3443     return lowerExtract(MI);
3444   case G_INSERT:
3445     return lowerInsert(MI);
3446   case G_BSWAP:
3447     return lowerBswap(MI);
3448   case G_BITREVERSE:
3449     return lowerBitreverse(MI);
3450   case G_READ_REGISTER:
3451   case G_WRITE_REGISTER:
3452     return lowerReadWriteRegister(MI);
3453   case G_UADDSAT:
3454   case G_USUBSAT: {
3455     // Try to make a reasonable guess about which lowering strategy to use. The
3456     // target can override this by requesting custom lowering and calling the
3457     // implementation functions directly.
3458     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3459     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3460       return lowerAddSubSatToMinMax(MI);
3461     return lowerAddSubSatToAddoSubo(MI);
3462   }
3463   case G_SADDSAT:
3464   case G_SSUBSAT: {
3465     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3466 
3467     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3468     // since it's a shorter expansion. However, we would need to figure out the
3469     // preferred boolean type for the carry out for the query.
3470     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3471       return lowerAddSubSatToMinMax(MI);
3472     return lowerAddSubSatToAddoSubo(MI);
3473   }
3474   case G_SSHLSAT:
3475   case G_USHLSAT:
3476     return lowerShlSat(MI);
3477   case G_ABS:
3478     return lowerAbsToAddXor(MI);
3479   case G_SELECT:
3480     return lowerSelect(MI);
3481   case G_SDIVREM:
3482   case G_UDIVREM:
3483     return lowerDIVREM(MI);
3484   case G_FSHL:
3485   case G_FSHR:
3486     return lowerFunnelShift(MI);
3487   case G_ROTL:
3488   case G_ROTR:
3489     return lowerRotate(MI);
3490   case G_MEMSET:
3491   case G_MEMCPY:
3492   case G_MEMMOVE:
3493     return lowerMemCpyFamily(MI);
3494   case G_MEMCPY_INLINE:
3495     return lowerMemcpyInline(MI);
3496   GISEL_VECREDUCE_CASES_NONSEQ
3497     return lowerVectorReduction(MI);
3498   }
3499 }
3500 
3501 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3502                                                   Align MinAlign) const {
3503   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3504   // datalayout for the preferred alignment. Also there should be a target hook
3505   // for this to allow targets to reduce the alignment and ignore the
3506   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3507   // the type.
3508   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3509 }
3510 
3511 MachineInstrBuilder
3512 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3513                                       MachinePointerInfo &PtrInfo) {
3514   MachineFunction &MF = MIRBuilder.getMF();
3515   const DataLayout &DL = MIRBuilder.getDataLayout();
3516   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3517 
3518   unsigned AddrSpace = DL.getAllocaAddrSpace();
3519   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3520 
3521   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3522   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3523 }
3524 
3525 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3526                                         LLT VecTy) {
3527   int64_t IdxVal;
3528   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3529     return IdxReg;
3530 
3531   LLT IdxTy = B.getMRI()->getType(IdxReg);
3532   unsigned NElts = VecTy.getNumElements();
3533   if (isPowerOf2_32(NElts)) {
3534     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3535     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3536   }
3537 
3538   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3539       .getReg(0);
3540 }
3541 
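// Illustrative results of the clamp above (hypothetical registers):
//
//   4 elements (power of 2):   %clamped:_(s32) = G_AND %idx, 3
//   3 elements (not pow-2):    %c2:_(s32) = G_CONSTANT i32 2
//                              %clamped:_(s32) = G_UMIN %idx, %c2
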
3542 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3543                                                   Register Index) {
3544   LLT EltTy = VecTy.getElementType();
3545 
3546   // Calculate the element offset and add it to the pointer.
3547   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3548   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3549          "Converting bits to bytes lost precision");
3550 
3551   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3552 
3553   LLT IdxTy = MRI.getType(Index);
3554   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3555                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3556 
3557   LLT PtrTy = MRI.getType(VecPtr);
3558   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3559 }
3560 
3561 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3562     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3563   Register DstReg = MI.getOperand(0).getReg();
3564   LLT DstTy = MRI.getType(DstReg);
3565   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3566 
3567   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3568 
3569   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3570   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3571 
3572   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3573   MI.eraseFromParent();
3574   return Legalized;
3575 }
3576 
3577 // Handle splitting vector operations which need to have the same number of
3578 // elements in each type index, but each type index may have a different element
3579 // type.
3580 //
3581 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3582 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3583 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3584 //
3585 // Also handles some irregular breakdown cases, e.g.
3586 //       <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3587 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3588 //             s64 = G_SHL s64, s32
3589 LegalizerHelper::LegalizeResult
3590 LegalizerHelper::fewerElementsVectorMultiEltType(
3591   MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
3592   if (TypeIdx != 0)
3593     return UnableToLegalize;
3594 
3595   const LLT NarrowTy0 = NarrowTyArg;
3596   const Register DstReg = MI.getOperand(0).getReg();
3597   LLT DstTy = MRI.getType(DstReg);
3598   LLT LeftoverTy0;
3599 
3600   // All of the operands need to have the same number of elements, so if we can
3601   // determine a type breakdown for the result type, we can for all of the
3602   // source types.
3603   int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
3604   if (NumParts < 0)
3605     return UnableToLegalize;
3606 
3607   SmallVector<MachineInstrBuilder, 4> NewInsts;
3608 
3609   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3610   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3611 
3612   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
3613     Register SrcReg = MI.getOperand(I).getReg();
3614     LLT SrcTyI = MRI.getType(SrcReg);
3615     const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
3616                                             : ElementCount::getFixed(1);
3617     LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
3618     LLT LeftoverTyI;
3619 
3620     // Split this operand into the requested typed registers, and any leftover
3621     // required to reproduce the original type.
3622     if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
3623                       LeftoverRegs))
3624       return UnableToLegalize;
3625 
3626     if (I == 1) {
3627       // For the first operand, create an instruction for each part and setup
3628       // the result.
3629       for (Register PartReg : PartRegs) {
3630         Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3631         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3632                                .addDef(PartDstReg)
3633                                .addUse(PartReg));
3634         DstRegs.push_back(PartDstReg);
3635       }
3636 
3637       for (Register LeftoverReg : LeftoverRegs) {
3638         Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
3639         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3640                                .addDef(PartDstReg)
3641                                .addUse(LeftoverReg));
3642         LeftoverDstRegs.push_back(PartDstReg);
3643       }
3644     } else {
3645       assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());
3646 
3647       // Add the newly created operand splits to the existing instructions. The
3648       // odd-sized pieces are ordered after the requested NarrowTyArg sized
3649       // pieces.
3650       unsigned InstCount = 0;
3651       for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
3652         NewInsts[InstCount++].addUse(PartRegs[J]);
3653       for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
3654         NewInsts[InstCount++].addUse(LeftoverRegs[J]);
3655     }
3656 
3657     PartRegs.clear();
3658     LeftoverRegs.clear();
3659   }
3660 
3661   // Insert the newly built operations and rebuild the result register.
3662   for (auto &MIB : NewInsts)
3663     MIRBuilder.insertInstr(MIB);
3664 
3665   insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);
3666 
3667   MI.eraseFromParent();
3668   return Legalized;
3669 }
3670 
3671 LegalizerHelper::LegalizeResult
3672 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3673                                           LLT NarrowTy) {
3674   if (TypeIdx != 0)
3675     return UnableToLegalize;
3676 
3677   Register DstReg = MI.getOperand(0).getReg();
3678   Register SrcReg = MI.getOperand(1).getReg();
3679   LLT DstTy = MRI.getType(DstReg);
3680   LLT SrcTy = MRI.getType(SrcReg);
3681 
3682   LLT NarrowTy0 = NarrowTy;
3683   LLT NarrowTy1;
3684   unsigned NumParts;
3685 
3686   if (NarrowTy.isVector()) {
3687     // Uneven breakdown not handled.
3688     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3689     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3690       return UnableToLegalize;
3691 
3692     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3693   } else {
3694     NumParts = DstTy.getNumElements();
3695     NarrowTy1 = SrcTy.getElementType();
3696   }
3697 
3698   SmallVector<Register, 4> SrcRegs, DstRegs;
3699   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3700 
3701   for (unsigned I = 0; I < NumParts; ++I) {
3702     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3703     MachineInstr *NewInst =
3704         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3705 
3706     NewInst->setFlags(MI.getFlags());
3707     DstRegs.push_back(DstReg);
3708   }
3709 
3710   if (NarrowTy.isVector())
3711     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3712   else
3713     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3714 
3715   MI.eraseFromParent();
3716   return Legalized;
3717 }
3718 
3719 LegalizerHelper::LegalizeResult
3720 LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
3721                                         LLT NarrowTy) {
3722   Register DstReg = MI.getOperand(0).getReg();
3723   Register Src0Reg = MI.getOperand(2).getReg();
3724   LLT DstTy = MRI.getType(DstReg);
3725   LLT SrcTy = MRI.getType(Src0Reg);
3726 
3727   unsigned NumParts;
3728   LLT NarrowTy0, NarrowTy1;
3729 
3730   if (TypeIdx == 0) {
3731     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3732     unsigned OldElts = DstTy.getNumElements();
3733 
3734     NarrowTy0 = NarrowTy;
3735     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
3736     NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
3737                                                   SrcTy.getScalarSizeInBits())
3738                                     : SrcTy.getElementType();
3739 
3740   } else {
3741     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3742     unsigned OldElts = SrcTy.getNumElements();
3743 
3744     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) :
3745       NarrowTy.getNumElements();
3746     NarrowTy0 =
3747         LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
3748     NarrowTy1 = NarrowTy;
3749   }
3750 
3751   // FIXME: Don't know how to handle the situation where the small vectors
3752   // aren't all the same size yet.
3753   if (NarrowTy1.isVector() &&
3754       NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
3755     return UnableToLegalize;
3756 
3757   CmpInst::Predicate Pred
3758     = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3759 
3760   SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
3761   extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
3762   extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);
3763 
3764   for (unsigned I = 0; I < NumParts; ++I) {
3765     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3766     DstRegs.push_back(DstReg);
3767 
3768     if (MI.getOpcode() == TargetOpcode::G_ICMP)
3769       MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3770     else {
3771       MachineInstr *NewCmp
3772         = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3773       NewCmp->setFlags(MI.getFlags());
3774     }
3775   }
3776 
3777   if (NarrowTy1.isVector())
3778     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3779   else
3780     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3781 
3782   MI.eraseFromParent();
3783   return Legalized;
3784 }
3785 
3786 LegalizerHelper::LegalizeResult
3787 LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
3788                                            LLT NarrowTy) {
3789   Register DstReg = MI.getOperand(0).getReg();
3790   Register CondReg = MI.getOperand(1).getReg();
3791 
3792   unsigned NumParts = 0;
3793   LLT NarrowTy0, NarrowTy1;
3794 
3795   LLT DstTy = MRI.getType(DstReg);
3796   LLT CondTy = MRI.getType(CondReg);
3797   unsigned Size = DstTy.getSizeInBits();
3798 
3799   assert(TypeIdx == 0 || CondTy.isVector());
3800 
3801   if (TypeIdx == 0) {
3802     NarrowTy0 = NarrowTy;
3803     NarrowTy1 = CondTy;
3804 
3805     unsigned NarrowSize = NarrowTy0.getSizeInBits();
3806     // FIXME: Don't know how to handle the situation where the small vectors
3807     // aren't all the same size yet.
3808     if (Size % NarrowSize != 0)
3809       return UnableToLegalize;
3810 
3811     NumParts = Size / NarrowSize;
3812 
3813     // Need to break down the condition type
3814     if (CondTy.isVector()) {
3815       if (CondTy.getNumElements() == NumParts)
3816         NarrowTy1 = CondTy.getElementType();
3817       else
3818         NarrowTy1 =
3819             LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
3820                         CondTy.getScalarSizeInBits());
3821     }
3822   } else {
3823     NumParts = CondTy.getNumElements();
3824     if (NarrowTy.isVector()) {
3825       // TODO: Handle vector NarrowTy for the condition, including the uneven
3826       // breakdown case; for now it is always rejected.
3827       return UnableToLegalize;
3830     } else {
3831       NarrowTy0 = DstTy.getElementType();
3832       NarrowTy1 = NarrowTy;
3833     }
3834   }
3835 
3836   SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
3837   if (CondTy.isVector())
3838     extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);
3839 
3840   extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
3841   extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);
3842 
3843   for (unsigned i = 0; i < NumParts; ++i) {
3844     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3845     MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
3846                            Src1Regs[i], Src2Regs[i]);
3847     DstRegs.push_back(DstReg);
3848   }
3849 
3850   if (NarrowTy0.isVector())
3851     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3852   else
3853     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3854 
3855   MI.eraseFromParent();
3856   return Legalized;
3857 }
3858 
3859 LegalizerHelper::LegalizeResult
3860 LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
3861                                         LLT NarrowTy) {
3862   const Register DstReg = MI.getOperand(0).getReg();
3863   LLT PhiTy = MRI.getType(DstReg);
3864   LLT LeftoverTy;
3865 
3866   // All of the operands need to have the same number of elements, so if we can
3867   // determine a type breakdown for the result type, we can for all of the
3868   // source types.
3869   int NumParts, NumLeftover;
3870   std::tie(NumParts, NumLeftover)
3871     = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
3872   if (NumParts < 0)
3873     return UnableToLegalize;
3874 
3875   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3876   SmallVector<MachineInstrBuilder, 4> NewInsts;
3877 
3878   const int TotalNumParts = NumParts + NumLeftover;
3879 
3880   // Insert the new phis in the result block first.
3881   for (int I = 0; I != TotalNumParts; ++I) {
3882     LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
3883     Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
3884     NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
3885                        .addDef(PartDstReg));
3886     if (I < NumParts)
3887       DstRegs.push_back(PartDstReg);
3888     else
3889       LeftoverDstRegs.push_back(PartDstReg);
3890   }
3891 
3892   MachineBasicBlock *MBB = MI.getParent();
3893   MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
3894   insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);
3895 
3896   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3897 
3898   // Insert code to extract the incoming values in each predecessor block.
3899   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3900     PartRegs.clear();
3901     LeftoverRegs.clear();
3902 
3903     Register SrcReg = MI.getOperand(I).getReg();
3904     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3905     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3906 
3907     LLT Unused;
3908     if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
3909                       LeftoverRegs))
3910       return UnableToLegalize;
3911 
3912     // Add the newly created operand splits to the existing instructions. The
3913     // odd-sized pieces are ordered after the requested NarrowTy sized
3914     // pieces.
3915     for (int J = 0; J != TotalNumParts; ++J) {
3916       MachineInstrBuilder MIB = NewInsts[J];
3917       MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
3918       MIB.addMBB(&OpMBB);
3919     }
3920   }
3921 
3922   MI.eraseFromParent();
3923   return Legalized;
3924 }
3925 
3926 LegalizerHelper::LegalizeResult
3927 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3928                                                   unsigned TypeIdx,
3929                                                   LLT NarrowTy) {
3930   if (TypeIdx != 1)
3931     return UnableToLegalize;
3932 
3933   const int NumDst = MI.getNumOperands() - 1;
3934   const Register SrcReg = MI.getOperand(NumDst).getReg();
3935   LLT SrcTy = MRI.getType(SrcReg);
3936 
3937   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3938 
3939   // TODO: Create sequence of extracts.
3940   if (DstTy == NarrowTy)
3941     return UnableToLegalize;
3942 
3943   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3944   if (DstTy == GCDTy) {
3945     // This would just be a copy of the same unmerge.
3946     // TODO: Create extracts, pad with undef and create intermediate merges.
3947     return UnableToLegalize;
3948   }
3949 
3950   auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
3951   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3952   const int PartsPerUnmerge = NumDst / NumUnmerge;
3953 
3954   for (int I = 0; I != NumUnmerge; ++I) {
3955     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3956 
3957     for (int J = 0; J != PartsPerUnmerge; ++J)
3958       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3959     MIB.addUse(Unmerge.getReg(I));
3960   }
3961 
3962   MI.eraseFromParent();
3963   return Legalized;
3964 }
3965 
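// A sketch of the two-level unmerge above (illustrative types and
// registers): splitting a scalar unmerge of <4 x s16> with NarrowTy =
// <2 x s16>, where GCDTy is <2 x s16> and PartsPerUnmerge is 2.
//
// %0:_(s16), %1:_(s16), %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %v(<4 x s16>)
//   =>
// %lo:_(<2 x s16>), %hi:_(<2 x s16>) = G_UNMERGE_VALUES %v(<4 x s16>)
// %0:_(s16), %1:_(s16) = G_UNMERGE_VALUES %lo
// %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %hi
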
3966 LegalizerHelper::LegalizeResult
3967 LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
3968                                          LLT NarrowTy) {
3969   Register Result = MI.getOperand(0).getReg();
3970   Register Overflow = MI.getOperand(1).getReg();
3971   Register LHS = MI.getOperand(2).getReg();
3972   Register RHS = MI.getOperand(3).getReg();
3973 
3974   LLT SrcTy = MRI.getType(LHS);
3975   if (!SrcTy.isVector())
3976     return UnableToLegalize;
3977 
3978   LLT ElementType = SrcTy.getElementType();
3979   LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
3980   const ElementCount NumResult = SrcTy.getElementCount();
3981   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3982 
3983   // Unmerge the operands to smaller parts of GCD type.
3984   auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
3985   auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);
3986 
3987   const int NumOps = UnmergeLHS->getNumOperands() - 1;
3988   const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
3989   LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
3990   LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);
3991 
3992   // Perform the operation over unmerged parts.
3993   SmallVector<Register, 8> ResultParts;
3994   SmallVector<Register, 8> OverflowParts;
3995   for (int I = 0; I != NumOps; ++I) {
3996     Register Operand1 = UnmergeLHS->getOperand(I).getReg();
3997     Register Operand2 = UnmergeRHS->getOperand(I).getReg();
3998     auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
3999                                          {Operand1, Operand2});
4000     ResultParts.push_back(PartMul->getOperand(0).getReg());
4001     OverflowParts.push_back(PartMul->getOperand(1).getReg());
4002   }
4003 
4004   LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
4005   LLT OverflowLCMTy =
4006       LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);
4007 
4008   // Recombine the pieces to the original result and overflow registers.
4009   buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
4010   buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
4011   MI.eraseFromParent();
4012   return Legalized;
4013 }
4014 
4015 // Handle FewerElementsVector for a G_BUILD_VECTOR or G_CONCAT_VECTORS that
4016 // produces a vector.
4017 //
4018 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
4019 // undef as necessary.
4020 //
4021 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
4022 //   -> <2 x s16>
4023 //
4024 // %4:_(s16) = G_IMPLICIT_DEF
4025 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
4026 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
4027 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
4028 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
4029 // %3:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %8
4030 LegalizerHelper::LegalizeResult
4031 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4032                                           LLT NarrowTy) {
4033   Register DstReg = MI.getOperand(0).getReg();
4034   LLT DstTy = MRI.getType(DstReg);
4035   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
4036   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
4037 
4038   // Break into a common type
4039   SmallVector<Register, 16> Parts;
4040   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
4041     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
4042 
4043   // Build the requested new merge, padding with undef.
4044   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
4045                                   TargetOpcode::G_ANYEXT);
4046 
4047   // Pack into the original result register.
4048   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4049 
4050   MI.eraseFromParent();
4051   return Legalized;
4052 }
4053 
4054 LegalizerHelper::LegalizeResult
4055 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4056                                                            unsigned TypeIdx,
4057                                                            LLT NarrowVecTy) {
4058   Register DstReg = MI.getOperand(0).getReg();
4059   Register SrcVec = MI.getOperand(1).getReg();
4060   Register InsertVal;
4061   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4062 
4063   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4064   if (IsInsert)
4065     InsertVal = MI.getOperand(2).getReg();
4066 
4067   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4068 
4069   // TODO: Handle total scalarization case.
4070   if (!NarrowVecTy.isVector())
4071     return UnableToLegalize;
4072 
4073   LLT VecTy = MRI.getType(SrcVec);
4074 
4075   // If the index is a constant, we can break this down as you would expect,
4076   // indexing directly into the target-sized pieces.
4077   int64_t IdxVal;
4078   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4079   if (MaybeCst) {
4080     IdxVal = MaybeCst->Value.getSExtValue();
4081     // Avoid out-of-bounds indexing into the pieces.
4082     if (IdxVal >= VecTy.getNumElements()) {
4083       MIRBuilder.buildUndef(DstReg);
4084       MI.eraseFromParent();
4085       return Legalized;
4086     }
4087 
4088     SmallVector<Register, 8> VecParts;
4089     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4090 
4091     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4092     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4093                                     TargetOpcode::G_ANYEXT);
4094 
4095     unsigned NewNumElts = NarrowVecTy.getNumElements();
4096 
4097     LLT IdxTy = MRI.getType(Idx);
4098     int64_t PartIdx = IdxVal / NewNumElts;
4099     auto NewIdx =
4100         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4101 
4102     if (IsInsert) {
4103       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4104 
4105       // Use the adjusted index to insert into one of the subvectors.
4106       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4107           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4108       VecParts[PartIdx] = InsertPart.getReg(0);
4109 
4110       // Recombine the inserted subvector with the others to reform the result
4111       // vector.
4112       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4113     } else {
4114       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4115     }
4116 
4117     MI.eraseFromParent();
4118     return Legalized;
4119   }
4120 
4121   // With a variable index, we can't perform the operation in a smaller type, so
4122   // we're forced to expand this.
4123   //
4124   // TODO: We could emit a chain of compare/select to figure out which piece to
4125   // index.
4126   return lowerExtractInsertVectorElt(MI);
4127 }
4128 
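// Worked constant-index case for the path above (illustrative only):
// extracting element 5 from <8 x s16> with NarrowVecTy = <4 x s16> selects
// piece PartIdx = 5 / 4 = 1 and re-indexes with 5 - 4 * 1 = 1.
//
// %lo:_(<4 x s16>), %hi:_(<4 x s16>) = G_UNMERGE_VALUES %vec(<8 x s16>)
// %one:_(s32) = G_CONSTANT i32 1
// %elt:_(s16) = G_EXTRACT_VECTOR_ELT %hi, %one
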
4129 LegalizerHelper::LegalizeResult
4130 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4131                                       LLT NarrowTy) {
4132   // FIXME: Don't know how to handle secondary types yet.
4133   if (TypeIdx != 0)
4134     return UnableToLegalize;
4135 
4136   // This implementation doesn't work for atomics. Give up instead of doing
4137   // something invalid.
4138   if (LdStMI.isAtomic())
4139     return UnableToLegalize;
4140 
4141   bool IsLoad = isa<GLoad>(LdStMI);
4142   Register ValReg = LdStMI.getReg(0);
4143   Register AddrReg = LdStMI.getPointerReg();
4144   LLT ValTy = MRI.getType(ValReg);
4145 
4146   // FIXME: Do we need a distinct NarrowMemory legalize action?
4147   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4148     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4149     return UnableToLegalize;
4150   }
4151 
4152   int NumParts = -1;
4153   int NumLeftover = -1;
4154   LLT LeftoverTy;
4155   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4156   if (IsLoad) {
4157     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4158   } else {
4159     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4160                      NarrowLeftoverRegs)) {
4161       NumParts = NarrowRegs.size();
4162       NumLeftover = NarrowLeftoverRegs.size();
4163     }
4164   }
4165 
4166   if (NumParts == -1)
4167     return UnableToLegalize;
4168 
4169   LLT PtrTy = MRI.getType(AddrReg);
4170   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4171 
4172   unsigned TotalSize = ValTy.getSizeInBits();
4173 
4174   // Split the load/store into PartTy sized pieces starting at Offset. If this
4175   // is a load, return the new registers in ValRegs. For a store, each element
4176   // of ValRegs should have type PartTy. Returns the next offset that needs to
4177   // be handled.
4178   auto MMO = LdStMI.getMMO();
4179   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4180                              unsigned Offset) -> unsigned {
4181     MachineFunction &MF = MIRBuilder.getMF();
4182     unsigned PartSize = PartTy.getSizeInBits();
4183     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4184          Offset += PartSize, ++Idx) {
4185       unsigned ByteOffset = Offset / 8;
4186       Register NewAddrReg;
4187 
4188       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4189 
4190       MachineMemOperand *NewMMO =
4191           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4192 
4193       if (IsLoad) {
4194         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4195         ValRegs.push_back(Dst);
4196         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4197       } else {
4198         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4199       }
4200     }
4201 
4202     return Offset;
4203   };
4204 
4205   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4206 
4207   // Handle the rest of the register if this isn't an even type breakdown.
4208   if (LeftoverTy.isValid())
4209     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4210 
4211   if (IsLoad) {
4212     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4213                 LeftoverTy, NarrowLeftoverRegs);
4214   }
4215 
4216   LdStMI.eraseFromParent();
4217   return Legalized;
4218 }
4219 
4220 LegalizerHelper::LegalizeResult
4221 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned TypeIdx,
4222                                       LLT NarrowTy) {
4223   assert(TypeIdx == 0 && "only one type index expected");
4224 
4225   const unsigned Opc = MI.getOpcode();
4226   const int NumDefOps = MI.getNumExplicitDefs();
4227   const int NumSrcOps = MI.getNumOperands() - NumDefOps;
4228   const unsigned Flags = MI.getFlags();
4229   const unsigned NarrowSize = NarrowTy.getSizeInBits();
4230   const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
4231 
4232   assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
4233                                      "result and 1-3 sources or 2 results and "
4234                                      "1-2 sources");
4235 
4236   SmallVector<Register, 2> DstRegs;
4237   for (int I = 0; I < NumDefOps; ++I)
4238     DstRegs.push_back(MI.getOperand(I).getReg());
4239 
4240   // First, check whether we are narrowing (changing the element type) or
4241   // reducing the number of vector elements.
4242   const LLT DstTy = MRI.getType(DstRegs[0]);
4243   const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
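       // For example (illustrative): with DstTy = v4s32, a NarrowTy of s16 is
       // a narrowing (the scalar type changes), while a NarrowTy of v2s32
       // merely reduces the element count.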
4244 
4245   SmallVector<Register, 8> ExtractedRegs[3];
4246   SmallVector<Register, 8> Parts;
4247 
4248   // Break down all the sources into NarrowTy pieces we can operate on. This may
4249   // involve creating merges to a wider type, padded with undef.
4250   for (int I = 0; I != NumSrcOps; ++I) {
4251     Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
4252     LLT SrcTy = MRI.getType(SrcReg);
4253 
4254     // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
4255     // For fewerElements, this is a smaller vector with the same element type.
4256     LLT OpNarrowTy;
4257     if (IsNarrow) {
4258       OpNarrowTy = NarrowScalarTy;
4259 
4260       // In case of narrowing, we need to cast vectors to scalars for this to
4261       // work properly.
4262       // FIXME: Can we do without the bitcast here if we're narrowing?
4263       if (SrcTy.isVector()) {
4264         SrcTy = LLT::scalar(SrcTy.getSizeInBits());
4265         SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
4266       }
4267     } else {
4268       auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
4269                                           : ElementCount::getFixed(1);
4270       OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
4271     }
4272 
4273     LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
4274 
4275     // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
4276     buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
4277                         TargetOpcode::G_ANYEXT);
4278   }
4279 
4280   SmallVector<Register, 8> ResultRegs[2];
4281 
4282   // Input operands for each sub-instruction.
4283   SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());
4284 
4285   int NumParts = ExtractedRegs[0].size();
4286   const unsigned DstSize = DstTy.getSizeInBits();
4287   const LLT DstScalarTy = LLT::scalar(DstSize);
4288 
4289   // Narrowing needs to use scalar types.
4290   LLT DstLCMTy, NarrowDstTy;
4291   if (IsNarrow) {
4292     DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
4293     NarrowDstTy = NarrowScalarTy;
4294   } else {
4295     DstLCMTy = getLCMType(DstTy, NarrowTy);
4296     NarrowDstTy = NarrowTy;
4297   }
4298 
4299   // We widened the source registers to satisfy merge/unmerge size
4300   // constraints. We'll have some extra fully undef parts.
4301   const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
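       // E.g. (illustrative): DstSize = 96 and NarrowSize = 64 give
       // NumRealParts = 2, while the widened value spans three s64 pieces, so
       // one piece is left fully undef.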
4302 
4303   for (int I = 0; I != NumRealParts; ++I) {
4304     // Emit this instruction on each of the split pieces.
4305     for (int J = 0; J != NumSrcOps; ++J)
4306       InputRegs[J] = ExtractedRegs[J][I];
4307 
4308     MachineInstrBuilder Inst;
4309     if (NumDefOps == 1)
4310       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
4311     else
4312       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
4313                                    Flags);
4314 
4315     for (int J = 0; J != NumDefOps; ++J)
4316       ResultRegs[J].push_back(Inst.getReg(J));
4317   }
4318 
4319   // Fill out the widened result with undef instead of creating instructions
4320   // with undef inputs.
4321   int NumUndefParts = NumParts - NumRealParts;
4322   if (NumUndefParts != 0) {
4323     Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
4324     for (int I = 0; I != NumDefOps; ++I)
4325       ResultRegs[I].append(NumUndefParts, Undef);
4326   }
4327 
4328   // Extract the possibly padded result. Use a scratch register if we need to do
4329   // a final bitcast, otherwise use the original result register.
4330   Register MergeDstReg;
4331   for (int I = 0; I != NumDefOps; ++I) {
4332     if (IsNarrow && DstTy.isVector())
4333       MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
4334     else
4335       MergeDstReg = DstRegs[I];
4336 
4337     buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);
4338 
4339     // Recast to vector if we narrowed a vector.
4340     if (IsNarrow && DstTy.isVector())
4341       MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
4342   }
4343 
4344   MI.eraseFromParent();
4345   return Legalized;
4346 }
4347 
4348 LegalizerHelper::LegalizeResult
4349 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4350                                               LLT NarrowTy) {
4351   Register DstReg = MI.getOperand(0).getReg();
4352   Register SrcReg = MI.getOperand(1).getReg();
4353   int64_t Imm = MI.getOperand(2).getImm();
4354 
4355   LLT DstTy = MRI.getType(DstReg);
4356 
4357   SmallVector<Register, 8> Parts;
4358   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4359   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4360 
4361   for (Register &R : Parts)
4362     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4363 
4364   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4365 
4366   MI.eraseFromParent();
4367   return Legalized;
4368 }
4369 
4370 LegalizerHelper::LegalizeResult
4371 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4372                                      LLT NarrowTy) {
4373   using namespace TargetOpcode;
4374 
4375   switch (MI.getOpcode()) {
4376   case G_IMPLICIT_DEF:
4377     return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
4378   case G_TRUNC:
4379   case G_AND:
4380   case G_OR:
4381   case G_XOR:
4382   case G_ADD:
4383   case G_SUB:
4384   case G_MUL:
4385   case G_PTR_ADD:
4386   case G_SMULH:
4387   case G_UMULH:
4388   case G_FADD:
4389   case G_FMUL:
4390   case G_FSUB:
4391   case G_FNEG:
4392   case G_FABS:
4393   case G_FCANONICALIZE:
4394   case G_FDIV:
4395   case G_FREM:
4396   case G_FMA:
4397   case G_FMAD:
4398   case G_FPOW:
4399   case G_FEXP:
4400   case G_FEXP2:
4401   case G_FLOG:
4402   case G_FLOG2:
4403   case G_FLOG10:
4404   case G_FNEARBYINT:
4405   case G_FCEIL:
4406   case G_FFLOOR:
4407   case G_FRINT:
4408   case G_INTRINSIC_ROUND:
4409   case G_INTRINSIC_ROUNDEVEN:
4410   case G_INTRINSIC_TRUNC:
4411   case G_FCOS:
4412   case G_FSIN:
4413   case G_FSQRT:
4414   case G_BSWAP:
4415   case G_BITREVERSE:
4416   case G_SDIV:
4417   case G_UDIV:
4418   case G_SREM:
4419   case G_UREM:
4420   case G_SDIVREM:
4421   case G_UDIVREM:
4422   case G_SMIN:
4423   case G_SMAX:
4424   case G_UMIN:
4425   case G_UMAX:
4426   case G_ABS:
4427   case G_FMINNUM:
4428   case G_FMAXNUM:
4429   case G_FMINNUM_IEEE:
4430   case G_FMAXNUM_IEEE:
4431   case G_FMINIMUM:
4432   case G_FMAXIMUM:
4433   case G_FSHL:
4434   case G_FSHR:
4435   case G_ROTL:
4436   case G_ROTR:
4437   case G_FREEZE:
4438   case G_SADDSAT:
4439   case G_SSUBSAT:
4440   case G_UADDSAT:
4441   case G_USUBSAT:
4442     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
4443   case G_UMULO:
4444   case G_SMULO:
4445     return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
4446   case G_SHL:
4447   case G_LSHR:
4448   case G_ASHR:
4449   case G_SSHLSAT:
4450   case G_USHLSAT:
4451   case G_CTLZ:
4452   case G_CTLZ_ZERO_UNDEF:
4453   case G_CTTZ:
4454   case G_CTTZ_ZERO_UNDEF:
4455   case G_CTPOP:
4456   case G_FCOPYSIGN:
4457     return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
4458   case G_ZEXT:
4459   case G_SEXT:
4460   case G_ANYEXT:
4461   case G_FPEXT:
4462   case G_FPTRUNC:
4463   case G_SITOFP:
4464   case G_UITOFP:
4465   case G_FPTOSI:
4466   case G_FPTOUI:
4467   case G_INTTOPTR:
4468   case G_PTRTOINT:
4469   case G_ADDRSPACE_CAST:
4470     return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
4471   case G_ICMP:
4472   case G_FCMP:
4473     return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
4474   case G_SELECT:
4475     return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
4476   case G_PHI:
4477     return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
4478   case G_UNMERGE_VALUES:
4479     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4480   case G_BUILD_VECTOR:
4481     assert(TypeIdx == 0 && "not a vector type index");
4482     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4483   case G_CONCAT_VECTORS:
4484     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4485       return UnableToLegalize;
4486     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4487   case G_EXTRACT_VECTOR_ELT:
4488   case G_INSERT_VECTOR_ELT:
4489     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4490   case G_LOAD:
4491   case G_STORE:
4492     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4493   case G_SEXT_INREG:
4494     return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
4495   GISEL_VECREDUCE_CASES_NONSEQ
4496     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4497   case G_SHUFFLE_VECTOR:
4498     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4499   default:
4500     return UnableToLegalize;
4501   }
4502 }
4503 
4504 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4505     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
4506   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4507   if (TypeIdx != 0)
4508     return UnableToLegalize;
4509 
4510   Register DstReg = MI.getOperand(0).getReg();
4511   Register Src1Reg = MI.getOperand(1).getReg();
4512   Register Src2Reg = MI.getOperand(2).getReg();
4513   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4514   LLT DstTy = MRI.getType(DstReg);
4515   LLT Src1Ty = MRI.getType(Src1Reg);
4516   LLT Src2Ty = MRI.getType(Src2Reg);
4517   // The shuffle should be canonicalized by now.
4518   if (DstTy != Src1Ty)
4519     return UnableToLegalize;
4520   if (DstTy != Src2Ty)
4521     return UnableToLegalize;
4522 
4523   if (!isPowerOf2_32(DstTy.getNumElements()))
4524     return UnableToLegalize;
4525 
4526   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4527   // Further legalization attempts will be needed to split it further.
4528   NarrowTy =
4529       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4530   unsigned NewElts = NarrowTy.getNumElements();
4531 
4532   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4533   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4534   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4535   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4536                         SplitSrc2Regs[1]};
4537 
4538   Register Hi, Lo;
4539 
4540   // If Lo or Hi uses elements from at most two of the four input vectors, then
4541   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4542   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
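       // For example (illustrative): splitting a v4 shuffle with mask
       // <0, 1, 4, 5>, the Lo half reads only Inputs[0] and the Hi half only
       // Inputs[2], so each half becomes a shuffle of one input plus undef.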
4543   SmallVector<int, 16> Ops;
4544   for (unsigned High = 0; High < 2; ++High) {
4545     Register &Output = High ? Hi : Lo;
4546 
4547     // Build a shuffle mask for the output, discovering on the fly which
4548     // input vectors to use as shuffle operands (recorded in InputUsed).
4549     // If building a suitable shuffle vector proves too hard, then bail
4550     // out with UseBuildVector set.
4551     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4552     unsigned FirstMaskIdx = High * NewElts;
4553     bool UseBuildVector = false;
4554     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4555       // The mask element.  This indexes into the input.
4556       int Idx = Mask[FirstMaskIdx + MaskOffset];
4557 
4558       // The input vector this mask element indexes into.
4559       unsigned Input = (unsigned)Idx / NewElts;
4560 
4561       if (Input >= array_lengthof(Inputs)) {
4562         // The mask element does not index into any input vector.
4563         Ops.push_back(-1);
4564         continue;
4565       }
4566 
4567       // Turn the index into an offset from the start of the input vector.
4568       Idx -= Input * NewElts;
4569 
4570       // Find or create a shuffle vector operand to hold this input.
4571       unsigned OpNo;
4572       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4573         if (InputUsed[OpNo] == Input) {
4574           // This input vector is already an operand.
4575           break;
4576         } else if (InputUsed[OpNo] == -1U) {
4577           // Create a new operand for this input vector.
4578           InputUsed[OpNo] = Input;
4579           break;
4580         }
4581       }
4582 
4583       if (OpNo >= array_lengthof(InputUsed)) {
4584         // More than two input vectors used!  Give up on trying to create a
4585         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4586         UseBuildVector = true;
4587         break;
4588       }
4589 
4590       // Add the mask index for the new shuffle vector.
4591       Ops.push_back(Idx + OpNo * NewElts);
4592     }
4593 
4594     if (UseBuildVector) {
4595       LLT EltTy = NarrowTy.getElementType();
4596       SmallVector<Register, 16> SVOps;
4597 
4598       // Extract the input elements by hand.
4599       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4600         // The mask element.  This indexes into the input.
4601         int Idx = Mask[FirstMaskIdx + MaskOffset];
4602 
4603         // The input vector this mask element indexes into.
4604         unsigned Input = (unsigned)Idx / NewElts;
4605 
4606         if (Input >= array_lengthof(Inputs)) {
4607           // The mask element is "undef" or indexes off the end of the input.
4608           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4609           continue;
4610         }
4611 
4612         // Turn the index into an offset from the start of the input vector.
4613         Idx -= Input * NewElts;
4614 
4615         // Extract the vector element by hand.
4616         SVOps.push_back(MIRBuilder
4617                             .buildExtractVectorElement(
4618                                 EltTy, Inputs[Input],
4619                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4620                             .getReg(0));
4621       }
4622 
4623       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4624       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4625     } else if (InputUsed[0] == -1U) {
4626       // No input vectors were used! The result is undefined.
4627       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4628     } else {
4629       Register Op0 = Inputs[InputUsed[0]];
4630       // If only one input was used, use an undefined vector for the other.
4631       Register Op1 = InputUsed[1] == -1U
4632                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4633                          : Inputs[InputUsed[1]];
4634       // At least one input vector was used. Create a new shuffle vector.
4635       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4636     }
4637 
4638     Ops.clear();
4639   }
4640 
4641   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4642   MI.eraseFromParent();
4643   return Legalized;
4644 }
4645 
4646 static unsigned getScalarOpcForReduction(unsigned Opc) {
4647   unsigned ScalarOpc;
4648   switch (Opc) {
4649   case TargetOpcode::G_VECREDUCE_FADD:
4650     ScalarOpc = TargetOpcode::G_FADD;
4651     break;
4652   case TargetOpcode::G_VECREDUCE_FMUL:
4653     ScalarOpc = TargetOpcode::G_FMUL;
4654     break;
4655   case TargetOpcode::G_VECREDUCE_FMAX:
4656     ScalarOpc = TargetOpcode::G_FMAXNUM;
4657     break;
4658   case TargetOpcode::G_VECREDUCE_FMIN:
4659     ScalarOpc = TargetOpcode::G_FMINNUM;
4660     break;
4661   case TargetOpcode::G_VECREDUCE_ADD:
4662     ScalarOpc = TargetOpcode::G_ADD;
4663     break;
4664   case TargetOpcode::G_VECREDUCE_MUL:
4665     ScalarOpc = TargetOpcode::G_MUL;
4666     break;
4667   case TargetOpcode::G_VECREDUCE_AND:
4668     ScalarOpc = TargetOpcode::G_AND;
4669     break;
4670   case TargetOpcode::G_VECREDUCE_OR:
4671     ScalarOpc = TargetOpcode::G_OR;
4672     break;
4673   case TargetOpcode::G_VECREDUCE_XOR:
4674     ScalarOpc = TargetOpcode::G_XOR;
4675     break;
4676   case TargetOpcode::G_VECREDUCE_SMAX:
4677     ScalarOpc = TargetOpcode::G_SMAX;
4678     break;
4679   case TargetOpcode::G_VECREDUCE_SMIN:
4680     ScalarOpc = TargetOpcode::G_SMIN;
4681     break;
4682   case TargetOpcode::G_VECREDUCE_UMAX:
4683     ScalarOpc = TargetOpcode::G_UMAX;
4684     break;
4685   case TargetOpcode::G_VECREDUCE_UMIN:
4686     ScalarOpc = TargetOpcode::G_UMIN;
4687     break;
4688   default:
4689     llvm_unreachable("Unhandled reduction");
4690   }
4691   return ScalarOpc;
4692 }
4693 
4694 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4695     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
4696   unsigned Opc = MI.getOpcode();
4697   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4698          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4699          "Sequential reductions not expected");
4700 
4701   if (TypeIdx != 1)
4702     return UnableToLegalize;
4703 
4704   // The semantics of the normal non-sequential reductions allow us to freely
4705   // re-associate the operation.
4706   Register SrcReg = MI.getOperand(1).getReg();
4707   LLT SrcTy = MRI.getType(SrcReg);
4708   Register DstReg = MI.getOperand(0).getReg();
4709   LLT DstTy = MRI.getType(DstReg);
4710 
4711   if (NarrowTy.isVector() &&
4712       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4713     return UnableToLegalize;
4714 
4715   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4716   SmallVector<Register> SplitSrcs;
4717   // If NarrowTy is a scalar then we're being asked to scalarize.
4718   const unsigned NumParts =
4719       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4720                           : SrcTy.getNumElements();
4721 
4722   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4723   if (NarrowTy.isScalar()) {
4724     if (DstTy != NarrowTy)
4725       return UnableToLegalize; // FIXME: handle implicit extensions.
4726 
4727     if (isPowerOf2_32(NumParts)) {
4728       // Generate a tree of scalar operations to reduce the critical path.
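           // E.g. (illustrative) with four scalar parts s0..s3 this builds
           //   t0 = op(s0, s1); t1 = op(s2, s3); result = op(t0, t1)
           // which has depth 2 instead of the sequential chain's depth 3.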
4729       SmallVector<Register> PartialResults;
4730       unsigned NumPartsLeft = NumParts;
4731       while (NumPartsLeft > 1) {
4732         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4733           PartialResults.emplace_back(
4734               MIRBuilder
4735                   .buildInstr(ScalarOpc, {NarrowTy},
4736                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4737                   .getReg(0));
4738         }
4739         SplitSrcs = PartialResults;
4740         PartialResults.clear();
4741         NumPartsLeft = SplitSrcs.size();
4742       }
4743       assert(SplitSrcs.size() == 1);
4744       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4745       MI.eraseFromParent();
4746       return Legalized;
4747     }
4748     // If we can't generate a tree, then just do sequential operations.
4749     Register Acc = SplitSrcs[0];
4750     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4751       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4752                 .getReg(0);
4753     MIRBuilder.buildCopy(DstReg, Acc);
4754     MI.eraseFromParent();
4755     return Legalized;
4756   }
4757   // If the types involved are powers of 2, we can generate intermediate
4758   // vector ops before generating a final reduction operation. Do this check
4759   // before building the partial reductions so they aren't left behind as
4760   // dead instructions if this path is taken.
4761   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4762       isPowerOf2_32(NarrowTy.getNumElements()))
4763     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4764 
4765   SmallVector<Register> PartialReductions;
4766   for (unsigned Part = 0; Part < NumParts; ++Part) {
4767     PartialReductions.push_back(
4768         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4769   }
4770 
4771   Register Acc = PartialReductions[0];
4772   for (unsigned Part = 1; Part < NumParts; ++Part) {
4773     if (Part == NumParts - 1) {
4774       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4775                             {Acc, PartialReductions[Part]});
4776     } else {
4777       Acc = MIRBuilder
4778                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4779                 .getReg(0);
4780     }
4781   }
4782   MI.eraseFromParent();
4783   return Legalized;
4784 }
4785 
4786 LegalizerHelper::LegalizeResult
4787 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4788                                         LLT SrcTy, LLT NarrowTy,
4789                                         unsigned ScalarOpc) {
4790   SmallVector<Register> SplitSrcs;
4791   // Split the sources into NarrowTy size pieces.
4792   extractParts(SrcReg, NarrowTy,
4793                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4794   // We're going to do a tree reduction using vector operations until we have
4795   // one NarrowTy size value left.
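       // E.g. (illustrative): reducing v8s32 with NarrowTy = v2s32 splits the
       // source into four v2s32 pieces, combines them pairwise (4 -> 2 -> 1),
       // and then runs the original reduction on the last v2s32 value.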
4796   while (SplitSrcs.size() > 1) {
4797     SmallVector<Register> PartialRdxs;
4798     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4799       Register LHS = SplitSrcs[Idx];
4800       Register RHS = SplitSrcs[Idx + 1];
4801       // Create the intermediate vector op.
4802       Register Res =
4803           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4804       PartialRdxs.push_back(Res);
4805     }
4806     SplitSrcs = std::move(PartialRdxs);
4807   }
4808   // Finally generate the requested NarrowTy based reduction.
4809   Observer.changingInstr(MI);
4810   MI.getOperand(1).setReg(SplitSrcs[0]);
4811   Observer.changedInstr(MI);
4812   return Legalized;
4813 }
4814 
4815 LegalizerHelper::LegalizeResult
4816 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4817                                              const LLT HalfTy, const LLT AmtTy) {
4819   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4820   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4821   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4822 
4823   if (Amt.isZero()) {
4824     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4825     MI.eraseFromParent();
4826     return Legalized;
4827   }
4828 
4829   LLT NVT = HalfTy;
4830   unsigned NVTBits = HalfTy.getSizeInBits();
4831   unsigned VTBits = 2 * NVTBits;
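       // Worked example (illustrative, with NVTBits = 32, i.e. splitting a
       // 64-bit shift): G_SHL by 40 gives Lo = 0 and Hi = InL << 8, while
       // G_SHL by 12 gives Lo = InL << 12 and Hi = (InH << 12) | (InL >> 20).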
4832 
4833   SrcOp Lo(Register(0)), Hi(Register(0));
4834   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4835     if (Amt.ugt(VTBits)) {
4836       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4837     } else if (Amt.ugt(NVTBits)) {
4838       Lo = MIRBuilder.buildConstant(NVT, 0);
4839       Hi = MIRBuilder.buildShl(NVT, InL,
4840                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4841     } else if (Amt == NVTBits) {
4842       Lo = MIRBuilder.buildConstant(NVT, 0);
4843       Hi = InL;
4844     } else {
4845       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4846       auto OrLHS =
4847           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4848       auto OrRHS = MIRBuilder.buildLShr(
4849           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4850       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4851     }
4852   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4853     if (Amt.ugt(VTBits)) {
4854       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4855     } else if (Amt.ugt(NVTBits)) {
4856       Lo = MIRBuilder.buildLShr(NVT, InH,
4857                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4858       Hi = MIRBuilder.buildConstant(NVT, 0);
4859     } else if (Amt == NVTBits) {
4860       Lo = InH;
4861       Hi = MIRBuilder.buildConstant(NVT, 0);
4862     } else {
4863       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4864 
4865       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4866       auto OrRHS = MIRBuilder.buildShl(
4867           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4868 
4869       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4870       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4871     }
4872   } else {
4873     if (Amt.ugt(VTBits)) {
4874       Hi = Lo = MIRBuilder.buildAShr(
4875           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4876     } else if (Amt.ugt(NVTBits)) {
4877       Lo = MIRBuilder.buildAShr(NVT, InH,
4878                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4879       Hi = MIRBuilder.buildAShr(NVT, InH,
4880                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4881     } else if (Amt == NVTBits) {
4882       Lo = InH;
4883       Hi = MIRBuilder.buildAShr(NVT, InH,
4884                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4885     } else {
4886       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4887 
4888       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4889       auto OrRHS = MIRBuilder.buildShl(
4890           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4891 
4892       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4893       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4894     }
4895   }
4896 
4897   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4898   MI.eraseFromParent();
4899 
4900   return Legalized;
4901 }
4902 
4903 // TODO: Optimize if constant shift amount.
4904 LegalizerHelper::LegalizeResult
4905 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4906                                    LLT RequestedTy) {
4907   if (TypeIdx == 1) {
4908     Observer.changingInstr(MI);
4909     narrowScalarSrc(MI, RequestedTy, 2);
4910     Observer.changedInstr(MI);
4911     return Legalized;
4912   }
4913 
4914   Register DstReg = MI.getOperand(0).getReg();
4915   LLT DstTy = MRI.getType(DstReg);
4916   if (DstTy.isVector())
4917     return UnableToLegalize;
4918 
4919   Register Amt = MI.getOperand(2).getReg();
4920   LLT ShiftAmtTy = MRI.getType(Amt);
4921   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4922   if (DstEltSize % 2 != 0)
4923     return UnableToLegalize;
4924 
4925   // Ignore the input type. We can only go to exactly half the size of the
4926   // input. If that isn't small enough, the resulting pieces will be further
4927   // legalized.
4928   const unsigned NewBitSize = DstEltSize / 2;
4929   const LLT HalfTy = LLT::scalar(NewBitSize);
4930   const LLT CondTy = LLT::scalar(1);
4931 
4932   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4933     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4934                                        ShiftAmtTy);
4935   }
4936 
4937   // TODO: Expand with known bits.
4938 
4939   // Handle the fully general expansion by an unknown amount.
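       // In pseudocode, the G_SHL case below computes (the right-shift cases
       // are analogous):
       //   Lo = IsShort ? (InL << Amt) : 0
       //   Hi = IsZero ? InH
       //      : IsShort ? (InH << Amt) | (InL >> (NewBitSize - Amt))
       //                : InL << (Amt - NewBitSize)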
4940   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4941 
4942   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4943   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4944   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4945 
4946   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4947   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4948 
4949   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4950   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4951   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4952 
4953   Register ResultRegs[2];
4954   switch (MI.getOpcode()) {
4955   case TargetOpcode::G_SHL: {
4956     // Short: ShAmt < NewBitSize
4957     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4958 
4959     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4960     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4961     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4962 
4963     // Long: ShAmt >= NewBitSize
4964     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4965     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4966 
4967     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4968     auto Hi = MIRBuilder.buildSelect(
4969         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4970 
4971     ResultRegs[0] = Lo.getReg(0);
4972     ResultRegs[1] = Hi.getReg(0);
4973     break;
4974   }
4975   case TargetOpcode::G_LSHR:
4976   case TargetOpcode::G_ASHR: {
4977     // Short: ShAmt < NewBitSize
4978     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4979 
4980     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4981     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4982     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4983 
4984     // Long: ShAmt >= NewBitSize
4985     MachineInstrBuilder HiL;
4986     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4987       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4988     } else {
4989       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4990       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4991     }
4992     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4993                                      {InH, AmtExcess});     // Lo from Hi part.
4994 
4995     auto Lo = MIRBuilder.buildSelect(
4996         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4997 
4998     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4999 
5000     ResultRegs[0] = Lo.getReg(0);
5001     ResultRegs[1] = Hi.getReg(0);
5002     break;
5003   }
5004   default:
5005     llvm_unreachable("not a shift");
5006   }
5007 
5008   MIRBuilder.buildMerge(DstReg, ResultRegs);
5009   MI.eraseFromParent();
5010   return Legalized;
5011 }
5012 
5013 LegalizerHelper::LegalizeResult
5014 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5015                                        LLT MoreTy) {
5016   assert(TypeIdx == 0 && "Expecting only Idx 0");
5017 
5018   Observer.changingInstr(MI);
5019   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5020     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5021     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5022     moreElementsVectorSrc(MI, MoreTy, I);
5023   }
5024 
5025   MachineBasicBlock &MBB = *MI.getParent();
5026   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5027   moreElementsVectorDst(MI, MoreTy, 0);
5028   Observer.changedInstr(MI);
5029   return Legalized;
5030 }
5031 
5032 LegalizerHelper::LegalizeResult
5033 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5034                                     LLT MoreTy) {
5035   unsigned Opc = MI.getOpcode();
5036   switch (Opc) {
5037   case TargetOpcode::G_IMPLICIT_DEF:
5038   case TargetOpcode::G_LOAD: {
5039     if (TypeIdx != 0)
5040       return UnableToLegalize;
5041     Observer.changingInstr(MI);
5042     moreElementsVectorDst(MI, MoreTy, 0);
5043     Observer.changedInstr(MI);
5044     return Legalized;
5045   }
5046   case TargetOpcode::G_STORE:
5047     if (TypeIdx != 0)
5048       return UnableToLegalize;
5049     Observer.changingInstr(MI);
5050     moreElementsVectorSrc(MI, MoreTy, 0);
5051     Observer.changedInstr(MI);
5052     return Legalized;
5053   case TargetOpcode::G_AND:
5054   case TargetOpcode::G_OR:
5055   case TargetOpcode::G_XOR:
5056   case TargetOpcode::G_SMIN:
5057   case TargetOpcode::G_SMAX:
5058   case TargetOpcode::G_UMIN:
5059   case TargetOpcode::G_UMAX:
5060   case TargetOpcode::G_FMINNUM:
5061   case TargetOpcode::G_FMAXNUM:
5062   case TargetOpcode::G_FMINNUM_IEEE:
5063   case TargetOpcode::G_FMAXNUM_IEEE:
5064   case TargetOpcode::G_FMINIMUM:
5065   case TargetOpcode::G_FMAXIMUM: {
5066     Observer.changingInstr(MI);
5067     moreElementsVectorSrc(MI, MoreTy, 1);
5068     moreElementsVectorSrc(MI, MoreTy, 2);
5069     moreElementsVectorDst(MI, MoreTy, 0);
5070     Observer.changedInstr(MI);
5071     return Legalized;
5072   }
5073   case TargetOpcode::G_EXTRACT:
5074     if (TypeIdx != 1)
5075       return UnableToLegalize;
5076     Observer.changingInstr(MI);
5077     moreElementsVectorSrc(MI, MoreTy, 1);
5078     Observer.changedInstr(MI);
5079     return Legalized;
5080   case TargetOpcode::G_INSERT:
5081   case TargetOpcode::G_FREEZE:
5082     if (TypeIdx != 0)
5083       return UnableToLegalize;
5084     Observer.changingInstr(MI);
5085     moreElementsVectorSrc(MI, MoreTy, 1);
5086     moreElementsVectorDst(MI, MoreTy, 0);
5087     Observer.changedInstr(MI);
5088     return Legalized;
5089   case TargetOpcode::G_SELECT:
5090     if (TypeIdx != 0)
5091       return UnableToLegalize;
5092     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
5093       return UnableToLegalize;
5094 
5095     Observer.changingInstr(MI);
5096     moreElementsVectorSrc(MI, MoreTy, 2);
5097     moreElementsVectorSrc(MI, MoreTy, 3);
5098     moreElementsVectorDst(MI, MoreTy, 0);
5099     Observer.changedInstr(MI);
5100     return Legalized;
5101   case TargetOpcode::G_UNMERGE_VALUES: {
5102     if (TypeIdx != 1)
5103       return UnableToLegalize;
5104 
5105     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
5106     int NumDst = MI.getNumOperands() - 1;
5107     moreElementsVectorSrc(MI, MoreTy, NumDst);
5108 
5109     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5110     for (int I = 0; I != NumDst; ++I)
5111       MIB.addDef(MI.getOperand(I).getReg());
5112 
5113     int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
5114     for (int I = NumDst; I != NewNumDst; ++I)
5115       MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
5116 
5117     MIB.addUse(MI.getOperand(NumDst).getReg());
5118     MI.eraseFromParent();
5119     return Legalized;
5120   }
5121   case TargetOpcode::G_PHI:
5122     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5123   case TargetOpcode::G_SHUFFLE_VECTOR:
5124     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5125   default:
5126     return UnableToLegalize;
5127   }
5128 }
5129 
5130 LegalizerHelper::LegalizeResult
5131 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5132                                            unsigned TypeIdx, LLT MoreTy) {
5133   if (TypeIdx != 0)
5134     return UnableToLegalize;
5135 
5136   Register DstReg = MI.getOperand(0).getReg();
5137   Register Src1Reg = MI.getOperand(1).getReg();
5138   Register Src2Reg = MI.getOperand(2).getReg();
5139   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5140   LLT DstTy = MRI.getType(DstReg);
5141   LLT Src1Ty = MRI.getType(Src1Reg);
5142   LLT Src2Ty = MRI.getType(Src2Reg);
5143   unsigned NumElts = DstTy.getNumElements();
5144   unsigned WidenNumElts = MoreTy.getNumElements();
5145 
5146   // Expect a canonicalized shuffle.
5147   if (DstTy != Src1Ty || DstTy != Src2Ty)
5148     return UnableToLegalize;
5149 
5150   moreElementsVectorSrc(MI, MoreTy, 1);
5151   moreElementsVectorSrc(MI, MoreTy, 2);
5152 
5153   // Adjust mask based on new input vector length.
5154   SmallVector<int, 16> NewMask;
5155   for (unsigned I = 0; I != NumElts; ++I) {
5156     int Idx = Mask[I];
5157     if (Idx < static_cast<int>(NumElts))
5158       NewMask.push_back(Idx);
5159     else
5160       NewMask.push_back(Idx - NumElts + WidenNumElts);
5161   }
5162   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5163     NewMask.push_back(-1);
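       // E.g. (illustrative): widening a v2 shuffle with mask <1, 2> to v4
       // yields <1, 4, -1, -1>, since index 2 referred to element 0 of the
       // second input, which now starts at index WidenNumElts = 4.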
5164   moreElementsVectorDst(MI, MoreTy, 0);
5165   MIRBuilder.setInstrAndDebugLoc(MI);
5166   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5167                                 MI.getOperand(1).getReg(),
5168                                 MI.getOperand(2).getReg(), NewMask);
5169   MI.eraseFromParent();
5170   return Legalized;
5171 }
5172 
5173 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5174                                         ArrayRef<Register> Src1Regs,
5175                                         ArrayRef<Register> Src2Regs,
5176                                         LLT NarrowTy) {
5177   MachineIRBuilder &B = MIRBuilder;
5178   unsigned SrcParts = Src1Regs.size();
5179   unsigned DstParts = DstRegs.size();
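       // This implements schoolbook long multiplication on NarrowTy-sized
       // digits. For example (illustrative) with two parts per source:
       //   Dst[0] = mul(L0, R0)
       //   Dst[1] = mul(L1, R0) + mul(L0, R1) + umulh(L0, R0)
       // Carries from the intermediate additions are accumulated with G_UADDO
       // and fed into the next part, except for the topmost part, which they
       // cannot affect.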
5180 
5181   unsigned DstIdx = 0; // Low bits of the result.
5182   Register FactorSum =
5183       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5184   DstRegs[DstIdx] = FactorSum;
5185 
5186   unsigned CarrySumPrevDstIdx;
5187   SmallVector<Register, 4> Factors;
5188 
5189   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5190     // Collect low parts of muls for DstIdx.
5191     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5192          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5193       MachineInstrBuilder Mul =
5194           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5195       Factors.push_back(Mul.getReg(0));
5196     }
5197     // Collect high parts of muls from previous DstIdx.
5198     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5199          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5200       MachineInstrBuilder Umulh =
5201           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5202       Factors.push_back(Umulh.getReg(0));
5203     }
5204     // Add CarrySum from additions calculated for previous DstIdx.
5205     if (DstIdx != 1) {
5206       Factors.push_back(CarrySumPrevDstIdx);
5207     }
5208 
5209     Register CarrySum;
5210     // Add all factors and accumulate all carries into CarrySum.
5211     if (DstIdx != DstParts - 1) {
5212       MachineInstrBuilder Uaddo =
5213           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5214       FactorSum = Uaddo.getReg(0);
5215       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5216       for (unsigned i = 2; i < Factors.size(); ++i) {
5217         MachineInstrBuilder Uaddo =
5218             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5219         FactorSum = Uaddo.getReg(0);
5220         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5221         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5222       }
5223     } else {
5224       // No value is calculated for the next index, so no CarrySum is needed.
5225       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5226       for (unsigned i = 2; i < Factors.size(); ++i)
5227         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5228     }
5229 
5230     CarrySumPrevDstIdx = CarrySum;
5231     DstRegs[DstIdx] = FactorSum;
5232     Factors.clear();
5233   }
5234 }
5235 
5236 LegalizerHelper::LegalizeResult
5237 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5238                                     LLT NarrowTy) {
5239   if (TypeIdx != 0)
5240     return UnableToLegalize;
5241 
5242   Register DstReg = MI.getOperand(0).getReg();
5243   LLT DstType = MRI.getType(DstReg);
5244   // FIXME: add support for vector types
5245   if (DstType.isVector())
5246     return UnableToLegalize;
5247 
5248   unsigned Opcode = MI.getOpcode();
5249   unsigned OpO, OpE, OpF;
5250   switch (Opcode) {
5251   case TargetOpcode::G_SADDO:
5252   case TargetOpcode::G_SADDE:
5253   case TargetOpcode::G_UADDO:
5254   case TargetOpcode::G_UADDE:
5255   case TargetOpcode::G_ADD:
5256     OpO = TargetOpcode::G_UADDO;
5257     OpE = TargetOpcode::G_UADDE;
5258     OpF = TargetOpcode::G_UADDE;
5259     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5260       OpF = TargetOpcode::G_SADDE;
5261     break;
5262   case TargetOpcode::G_SSUBO:
5263   case TargetOpcode::G_SSUBE:
5264   case TargetOpcode::G_USUBO:
5265   case TargetOpcode::G_USUBE:
5266   case TargetOpcode::G_SUB:
5267     OpO = TargetOpcode::G_USUBO;
5268     OpE = TargetOpcode::G_USUBE;
5269     OpF = TargetOpcode::G_USUBE;
5270     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5271       OpF = TargetOpcode::G_SSUBE;
5272     break;
5273   default:
5274     llvm_unreachable("Unexpected add/sub opcode!");
5275   }
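       // For example (illustrative): narrowing a 64-bit G_ADD to 32 bits
       // yields
       //   Lo, C1 = G_UADDO Src1.lo, Src2.lo
       //   Hi, C2 = G_UADDE Src1.hi, Src2.hi, C1
       // with C2 forwarded to the carry-out def when the original op has one.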
5276 
5277   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5278   unsigned NumDefs = MI.getNumExplicitDefs();
5279   Register Src1 = MI.getOperand(NumDefs).getReg();
5280   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5281   Register CarryDst, CarryIn;
5282   if (NumDefs == 2)
5283     CarryDst = MI.getOperand(1).getReg();
5284   if (MI.getNumOperands() == NumDefs + 3)
5285     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5286 
5287   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5288   LLT LeftoverTy, DummyTy;
5289   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5290   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5291   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5292 
5293   int NarrowParts = Src1Regs.size();
5294   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5295     Src1Regs.push_back(Src1Left[I]);
5296     Src2Regs.push_back(Src2Left[I]);
5297   }
5298   DstRegs.reserve(Src1Regs.size());
5299 
5300   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5301     Register DstReg =
5302         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5303     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5304     // Forward the final carry-out to the destination register.
5305     if (i == e - 1 && CarryDst)
5306       CarryOut = CarryDst;
5307 
5308     if (!CarryIn) {
5309       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5310                             {Src1Regs[i], Src2Regs[i]});
5311     } else if (i == e - 1) {
5312       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5313                             {Src1Regs[i], Src2Regs[i], CarryIn});
5314     } else {
5315       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5316                             {Src1Regs[i], Src2Regs[i], CarryIn});
5317     }
5318 
5319     DstRegs.push_back(DstReg);
5320     CarryIn = CarryOut;
5321   }
5322   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5323               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5324               makeArrayRef(DstRegs).drop_front(NarrowParts));
5325 
5326   MI.eraseFromParent();
5327   return Legalized;
5328 }
5329 
5330 LegalizerHelper::LegalizeResult
5331 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5332   Register DstReg = MI.getOperand(0).getReg();
5333   Register Src1 = MI.getOperand(1).getReg();
5334   Register Src2 = MI.getOperand(2).getReg();
5335 
5336   LLT Ty = MRI.getType(DstReg);
5337   if (Ty.isVector())
5338     return UnableToLegalize;
5339 
5340   unsigned Size = Ty.getSizeInBits();
5341   unsigned NarrowSize = NarrowTy.getSizeInBits();
5342   if (Size % NarrowSize != 0)
5343     return UnableToLegalize;
5344 
5345   unsigned NumParts = Size / NarrowSize;
5346   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5347   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5348 
5349   SmallVector<Register, 2> Src1Parts, Src2Parts;
5350   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5351   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5352   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5353   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5354 
5355   // Take only the high half of the registers if this is a high mul.
5356   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5357   MIRBuilder.buildMerge(DstReg, DstRegs);
5358   MI.eraseFromParent();
5359   return Legalized;
5360 }
5361 
5362 LegalizerHelper::LegalizeResult
5363 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5364                                    LLT NarrowTy) {
5365   if (TypeIdx != 0)
5366     return UnableToLegalize;
5367 
5368   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5369 
5370   Register Src = MI.getOperand(1).getReg();
5371   LLT SrcTy = MRI.getType(Src);
5372 
5373   // If all finite floats fit into the narrowed integer type, we can just swap
5374   // out the result type. This is practically only useful for conversions from
5375   // half to integers of at least 16 bits, so just handle the one case.
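       // (The largest finite half value is 65504, which fits in 16 bits
       // unsigned but needs 17 bits signed, hence the bound below.)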
5376   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5377       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5378     return UnableToLegalize;
5379 
5380   Observer.changingInstr(MI);
5381   narrowScalarDst(MI, NarrowTy, 0,
5382                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5383   Observer.changedInstr(MI);
5384   return Legalized;
5385 }
5386 
5387 LegalizerHelper::LegalizeResult
5388 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5389                                      LLT NarrowTy) {
5390   if (TypeIdx != 1)
5391     return UnableToLegalize;
5392 
5393   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5394 
5395   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5396   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5397   // NarrowSize.
5398   if (SizeOp1 % NarrowSize != 0)
5399     return UnableToLegalize;
5400   int NumParts = SizeOp1 / NarrowSize;
5401 
5402   SmallVector<Register, 2> SrcRegs, DstRegs;
5403   SmallVector<uint64_t, 2> Indexes;
5404   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5405 
5406   Register OpReg = MI.getOperand(0).getReg();
5407   uint64_t OpStart = MI.getOperand(2).getImm();
5408   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5409   for (int i = 0; i < NumParts; ++i) {
5410     unsigned SrcStart = i * NarrowSize;
5411 
5412     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5413       // No part of the extract uses this subregister, ignore it.
5414       continue;
5415     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5416       // The entire subregister is extracted, forward the value.
5417       DstRegs.push_back(SrcRegs[i]);
5418       continue;
5419     }
5420 
5421     // Determine the offset within this source part at which the extracted
5422     // range begins (ExtractOffset) and how many bits it supplies (SegSize).
5423     int64_t ExtractOffset;
5424     uint64_t SegSize;
5425     if (OpStart < SrcStart) {
5426       ExtractOffset = 0;
5427       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5428     } else {
5429       ExtractOffset = OpStart - SrcStart;
5430       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5431     }
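         // E.g. (illustrative): when extracting s16 at bit offset 24 from an
         // s64 split into s32 parts, part 0 supplies bits [24,32) with
         // ExtractOffset = 24 and SegSize = 8, and part 1 supplies bits
         // [32,40) with ExtractOffset = 0 and SegSize = 8.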
5432 
5433     Register SegReg = SrcRegs[i];
5434     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5435       // A genuine extract is needed.
5436       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5437       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5438     }
5439 
5440     DstRegs.push_back(SegReg);
5441   }
5442 
5443   Register DstReg = MI.getOperand(0).getReg();
5444   if (MRI.getType(DstReg).isVector())
5445     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5446   else if (DstRegs.size() > 1)
5447     MIRBuilder.buildMerge(DstReg, DstRegs);
5448   else
5449     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5450   MI.eraseFromParent();
5451   return Legalized;
5452 }
5453 
5454 LegalizerHelper::LegalizeResult
5455 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5456                                     LLT NarrowTy) {
5457   // FIXME: Don't know how to handle secondary types yet.
5458   if (TypeIdx != 0)
5459     return UnableToLegalize;
5460 
5461   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5462   SmallVector<uint64_t, 2> Indexes;
5463   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5464   LLT LeftoverTy;
5465   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5466                LeftoverRegs);
5467 
5468   for (Register Reg : LeftoverRegs)
5469     SrcRegs.push_back(Reg);
5470 
5471   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5472   Register OpReg = MI.getOperand(2).getReg();
5473   uint64_t OpStart = MI.getOperand(3).getImm();
5474   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5475   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5476     unsigned DstStart = I * NarrowSize;
5477 
5478     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5479       // The entire subregister is defined by this insert, forward the new
5480       // value.
5481       DstRegs.push_back(OpReg);
5482       continue;
5483     }
5484 
5485     Register SrcReg = SrcRegs[I];
5486     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5487       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5488       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5489       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5490     }
5491 
5492     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5493       // No part of the insert affects this subregister, forward the original.
5494       DstRegs.push_back(SrcReg);
5495       continue;
5496     }
5497 
5498     // Determine which slice of OpReg overlaps this part (ExtractOffset and
5499     // SegSize) and where within the part it is inserted (InsertOffset).
5500     int64_t ExtractOffset, InsertOffset;
5501     uint64_t SegSize;
5502     if (OpStart < DstStart) {
5503       InsertOffset = 0;
5504       ExtractOffset = DstStart - OpStart;
5505       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5506     } else {
5507       InsertOffset = OpStart - DstStart;
5508       ExtractOffset = 0;
5509       SegSize =
5510         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5511     }
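         // E.g. (illustrative): when inserting s16 at bit offset 24 into an
         // s64 split into s32 parts, bits [0,8) of OpReg land in part 0 at
         // InsertOffset = 24 and bits [8,16) land in part 1 at
         // InsertOffset = 0.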
5512 
5513     Register SegReg = OpReg;
5514     if (ExtractOffset != 0 || SegSize != OpSize) {
5515       // A genuine extract is needed.
5516       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5517       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5518     }
5519 
5520     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5521     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5522     DstRegs.push_back(DstReg);
5523   }
5524 
5525   uint64_t WideSize = DstRegs.size() * NarrowSize;
5526   Register DstReg = MI.getOperand(0).getReg();
5527   if (WideSize > RegTy.getSizeInBits()) {
5528     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5529     MIRBuilder.buildMerge(MergeReg, DstRegs);
5530     MIRBuilder.buildTrunc(DstReg, MergeReg);
5531   } else
5532     MIRBuilder.buildMerge(DstReg, DstRegs);
5533 
5534   MI.eraseFromParent();
5535   return Legalized;
5536 }
5537 
5538 LegalizerHelper::LegalizeResult
5539 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5540                                    LLT NarrowTy) {
5541   Register DstReg = MI.getOperand(0).getReg();
5542   LLT DstTy = MRI.getType(DstReg);
5543 
5544   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5545 
5546   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5547   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5548   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5549   LLT LeftoverTy;
5550   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5551                     Src0Regs, Src0LeftoverRegs))
5552     return UnableToLegalize;
5553 
5554   LLT Unused;
5555   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5556                     Src1Regs, Src1LeftoverRegs))
5557     llvm_unreachable("inconsistent extractParts result");
5558 
5559   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5560     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5561                                         {Src0Regs[I], Src1Regs[I]});
5562     DstRegs.push_back(Inst.getReg(0));
5563   }
5564 
5565   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5566     auto Inst = MIRBuilder.buildInstr(
5567       MI.getOpcode(),
5568       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5569     DstLeftoverRegs.push_back(Inst.getReg(0));
5570   }
5571 
5572   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5573               LeftoverTy, DstLeftoverRegs);
5574 
5575   MI.eraseFromParent();
5576   return Legalized;
5577 }
5578 
5579 LegalizerHelper::LegalizeResult
5580 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5581                                  LLT NarrowTy) {
5582   if (TypeIdx != 0)
5583     return UnableToLegalize;
5584 
5585   Register DstReg = MI.getOperand(0).getReg();
5586   Register SrcReg = MI.getOperand(1).getReg();
5587 
5588   LLT DstTy = MRI.getType(DstReg);
5589   if (DstTy.isVector())
5590     return UnableToLegalize;
5591 
5592   SmallVector<Register, 8> Parts;
5593   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5594   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5595   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5596 
5597   MI.eraseFromParent();
5598   return Legalized;
5599 }
5600 
5601 LegalizerHelper::LegalizeResult
5602 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5603                                     LLT NarrowTy) {
5604   if (TypeIdx != 0)
5605     return UnableToLegalize;
5606 
5607   Register CondReg = MI.getOperand(1).getReg();
5608   LLT CondTy = MRI.getType(CondReg);
5609   if (CondTy.isVector()) // TODO: Handle vselect
5610     return UnableToLegalize;
5611 
5612   Register DstReg = MI.getOperand(0).getReg();
5613   LLT DstTy = MRI.getType(DstReg);
5614 
5615   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5616   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5617   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5618   LLT LeftoverTy;
5619   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5620                     Src1Regs, Src1LeftoverRegs))
5621     return UnableToLegalize;
5622 
5623   LLT Unused;
5624   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5625                     Src2Regs, Src2LeftoverRegs))
5626     llvm_unreachable("inconsistent extractParts result");
5627 
5628   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5629     auto Select = MIRBuilder.buildSelect(NarrowTy,
5630                                          CondReg, Src1Regs[I], Src2Regs[I]);
5631     DstRegs.push_back(Select.getReg(0));
5632   }
5633 
5634   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5635     auto Select = MIRBuilder.buildSelect(
5636       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5637     DstLeftoverRegs.push_back(Select.getReg(0));
5638   }
5639 
5640   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5641               LeftoverTy, DstLeftoverRegs);
5642 
5643   MI.eraseFromParent();
5644   return Legalized;
5645 }
5646 
5647 LegalizerHelper::LegalizeResult
5648 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5649                                   LLT NarrowTy) {
5650   if (TypeIdx != 1)
5651     return UnableToLegalize;
5652 
5653   Register DstReg = MI.getOperand(0).getReg();
5654   Register SrcReg = MI.getOperand(1).getReg();
5655   LLT DstTy = MRI.getType(DstReg);
5656   LLT SrcTy = MRI.getType(SrcReg);
5657   unsigned NarrowSize = NarrowTy.getSizeInBits();
5658 
5659   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5660     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5661 
5662     MachineIRBuilder &B = MIRBuilder;
5663     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5664     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
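    // Illustrative example: for s64 narrowed to s32, Src = 0x0000000000001234
    // gives Hi = 0 and Lo = 0x1234, so the result is 32 + ctlz32(0x1234) =
    // 32 + 19 = 51 leading zeros.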
5665     auto C_0 = B.buildConstant(NarrowTy, 0);
5666     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5667                                 UnmergeSrc.getReg(1), C_0);
5668     auto LoCTLZ = IsUndef ?
5669       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5670       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5671     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5672     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5673     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5674     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5675 
5676     MI.eraseFromParent();
5677     return Legalized;
5678   }
5679 
5680   return UnableToLegalize;
5681 }
5682 
5683 LegalizerHelper::LegalizeResult
5684 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5685                                   LLT NarrowTy) {
5686   if (TypeIdx != 1)
5687     return UnableToLegalize;
5688 
5689   Register DstReg = MI.getOperand(0).getReg();
5690   Register SrcReg = MI.getOperand(1).getReg();
5691   LLT DstTy = MRI.getType(DstReg);
5692   LLT SrcTy = MRI.getType(SrcReg);
5693   unsigned NarrowSize = NarrowTy.getSizeInBits();
5694 
5695   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5696     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5697 
5698     MachineIRBuilder &B = MIRBuilder;
5699     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5700     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
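    // Illustrative example: for s64 narrowed to s32, Src = 0x0000000100000000
    // gives Lo = 0 and Hi = 1, so the result is cttz32(1) + 32 = 0 + 32 = 32
    // trailing zeros.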
5701     auto C_0 = B.buildConstant(NarrowTy, 0);
5702     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5703                                 UnmergeSrc.getReg(0), C_0);
5704     auto HiCTTZ = IsUndef ?
5705       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5706       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5707     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5708     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5709     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5710     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5711 
5712     MI.eraseFromParent();
5713     return Legalized;
5714   }
5715 
5716   return UnableToLegalize;
5717 }
5718 
5719 LegalizerHelper::LegalizeResult
5720 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5721                                    LLT NarrowTy) {
5722   if (TypeIdx != 1)
5723     return UnableToLegalize;
5724 
5725   Register DstReg = MI.getOperand(0).getReg();
5726   LLT DstTy = MRI.getType(DstReg);
5727   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5728   unsigned NarrowSize = NarrowTy.getSizeInBits();
5729 
5730   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5731     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
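    // ctpop(Hi:Lo) is simply ctpop(Hi) + ctpop(Lo); both partial counts fit
    // in DstTy, so no masking is needed.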
5732 
5733     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5734     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5735     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5736 
5737     MI.eraseFromParent();
5738     return Legalized;
5739   }
5740 
5741   return UnableToLegalize;
5742 }
5743 
5744 LegalizerHelper::LegalizeResult
5745 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5746   unsigned Opc = MI.getOpcode();
5747   const auto &TII = MIRBuilder.getTII();
5748   auto isSupported = [this](const LegalityQuery &Q) {
5749     auto QAction = LI.getAction(Q).Action;
5750     return QAction == Legal || QAction == Libcall || QAction == Custom;
5751   };
5752   switch (Opc) {
5753   default:
5754     return UnableToLegalize;
5755   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5756     // This trivially expands to CTLZ.
5757     Observer.changingInstr(MI);
5758     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5759     Observer.changedInstr(MI);
5760     return Legalized;
5761   }
5762   case TargetOpcode::G_CTLZ: {
5763     Register DstReg = MI.getOperand(0).getReg();
5764     Register SrcReg = MI.getOperand(1).getReg();
5765     LLT DstTy = MRI.getType(DstReg);
5766     LLT SrcTy = MRI.getType(SrcReg);
5767     unsigned Len = SrcTy.getSizeInBits();
5768 
5769     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5770       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5771       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5772       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5773       auto ICmp = MIRBuilder.buildICmp(
5774           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5775       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5776       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5777       MI.eraseFromParent();
5778       return Legalized;
5779     }
5780     // for now, we do this:
5781     // NewLen = NextPowerOf2(Len);
5782     // x = x | (x >> 1);
5783     // x = x | (x >> 2);
5784     // ...
    // x = x | (x >> 16);
    // x = x | (x >> 32); // for 64-bit input
    // (with shift amounts up to NewLen/2)
5788     // return Len - popcount(x);
5789     //
5790     // Ref: "Hacker's Delight" by Henry Warren
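    // Illustrative example for an 8-bit source: x = 0b00010000 smears to
    // 0b00011111 after the or/shift steps, popcount gives 5, and
    // 8 - 5 = 3 leading zeros.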
5791     Register Op = SrcReg;
5792     unsigned NewLen = PowerOf2Ceil(Len);
5793     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5794       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5795       auto MIBOp = MIRBuilder.buildOr(
5796           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5797       Op = MIBOp.getReg(0);
5798     }
5799     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5800     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5801                         MIBPop);
5802     MI.eraseFromParent();
5803     return Legalized;
5804   }
5805   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5806     // This trivially expands to CTTZ.
5807     Observer.changingInstr(MI);
5808     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5809     Observer.changedInstr(MI);
5810     return Legalized;
5811   }
5812   case TargetOpcode::G_CTTZ: {
5813     Register DstReg = MI.getOperand(0).getReg();
5814     Register SrcReg = MI.getOperand(1).getReg();
5815     LLT DstTy = MRI.getType(DstReg);
5816     LLT SrcTy = MRI.getType(SrcReg);
5817 
5818     unsigned Len = SrcTy.getSizeInBits();
5819     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select for
      // zero.
5822       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5823       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5824       auto ICmp = MIRBuilder.buildICmp(
5825           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5826       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5827       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5828       MI.eraseFromParent();
5829       return Legalized;
5830     }
5831     // for now, we use: { return popcount(~x & (x - 1)); }
5832     // unless the target has ctlz but not ctpop, in which case we use:
    // { return Len - ctlz(~x & (x - 1)); }
5834     // Ref: "Hacker's Delight" by Henry Warren
5835     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5836     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5837     auto MIBTmp = MIRBuilder.buildAnd(
5838         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5839     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5840         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5841       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5842       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5843                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5844       MI.eraseFromParent();
5845       return Legalized;
5846     }
5847     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5848     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5849     return Legalized;
5850   }
5851   case TargetOpcode::G_CTPOP: {
5852     Register SrcReg = MI.getOperand(1).getReg();
5853     LLT Ty = MRI.getType(SrcReg);
5854     unsigned Size = Ty.getSizeInBits();
5855     MachineIRBuilder &B = MIRBuilder;
5856 
    // Count set bits in blocks of 2 bits. The default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use the following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives the same result in blocks of 2 with one instruction less.
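    // Illustrative example on a single 2-bit block: val = 0b11 gives
    // 3 - ((3 >> 1) & 1) = 2 set bits, and val = 0b10 gives 2 - 1 = 1.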
5862     auto C_1 = B.buildConstant(Ty, 1);
5863     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5864     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5865     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5866     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5867     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5868 
    // To get the count in blocks of 4, add the values from adjacent blocks of 2.
5870     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5871     auto C_2 = B.buildConstant(Ty, 2);
5872     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5873     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5874     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5875     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5876     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5877     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5878 
    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
    // before the addition, since each count sits in the range {0,...,8} and
    // 4 bits are enough to hold such values. After the addition the high 4
    // bits still hold the count of the high 4-bit block; clear them to get
    // the 8-bit result.
5883     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5884     auto C_4 = B.buildConstant(Ty, 4);
5885     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5886     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5887     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5888     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5889     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5890 
    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
    // by this bitmask sets the 8 MSBs of ResTmp to the sum of all B8Counts in
    // the 8-bit blocks.
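    // Illustrative example for Size = 32: B8Count = 0x01020304 * 0x01010101 =
    // 0x0a090704, leaving 1 + 2 + 3 + 4 = 10 = 0x0a in the top byte for the
    // final shift to extract.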
5894     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5895     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5896 
5897     // Shift count result from 8 high bits to low bits.
5898     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5899     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5900 
5901     MI.eraseFromParent();
5902     return Legalized;
5903   }
5904   }
5905 }
5906 
5907 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5908 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5909                                         Register Reg, unsigned BW) {
5910   return matchUnaryPredicate(
5911       MRI, Reg,
5912       [=](const Constant *C) {
5913         // Null constant here means an undef.
5914         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5915         return !CI || CI->getValue().urem(BW) != 0;
5916       },
5917       /*AllowUndefs*/ true);
5918 }
5919 
5920 LegalizerHelper::LegalizeResult
5921 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5922   Register Dst = MI.getOperand(0).getReg();
5923   Register X = MI.getOperand(1).getReg();
5924   Register Y = MI.getOperand(2).getReg();
5925   Register Z = MI.getOperand(3).getReg();
5926   LLT Ty = MRI.getType(Dst);
5927   LLT ShTy = MRI.getType(Z);
5928 
5929   unsigned BW = Ty.getScalarSizeInBits();
5930 
5931   if (!isPowerOf2_32(BW))
5932     return UnableToLegalize;
5933 
5934   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5935   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5936 
5937   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5938     // fshl X, Y, Z -> fshr X, Y, -Z
5939     // fshr X, Y, Z -> fshl X, Y, -Z
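    // Illustrative example: with BW = 8 and Z = 3, fshl(X, Y, 3) =
    // (X << 3) | (Y >> 5) = fshr(X, Y, 5), and -3 & 7 == 5.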
5940     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    // The negation must be done in the shift amount type; Zero and Z are both
    // ShTy values.
    Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
5942   } else {
5943     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5944     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5945     auto One = MIRBuilder.buildConstant(ShTy, 1);
5946     if (IsFSHL) {
5947       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5948       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5949     } else {
5950       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5951       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5952     }
5953 
5954     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5955   }
5956 
5957   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5958   MI.eraseFromParent();
5959   return Legalized;
5960 }
5961 
5962 LegalizerHelper::LegalizeResult
5963 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5964   Register Dst = MI.getOperand(0).getReg();
5965   Register X = MI.getOperand(1).getReg();
5966   Register Y = MI.getOperand(2).getReg();
5967   Register Z = MI.getOperand(3).getReg();
5968   LLT Ty = MRI.getType(Dst);
5969   LLT ShTy = MRI.getType(Z);
5970 
5971   const unsigned BW = Ty.getScalarSizeInBits();
5972   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5973 
5974   Register ShX, ShY;
5975   Register ShAmt, InvShAmt;
5976 
5977   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5978   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5979     // fshl: X << C | Y >> (BW - C)
5980     // fshr: X << (BW - C) | Y >> C
5981     // where C = Z % BW is not zero
5982     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5983     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5984     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5985     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5986     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5987   } else {
5988     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5989     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
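    // Splitting the second shift as "by 1, then by BW - 1 - C" keeps every
    // shift amount strictly below BW, so C == 0 cannot produce an undefined
    // shift by BW.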
5990     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5991     if (isPowerOf2_32(BW)) {
5992       // Z % BW -> Z & (BW - 1)
5993       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5994       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5995       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5996       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5997     } else {
5998       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5999       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6000       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6001     }
6002 
6003     auto One = MIRBuilder.buildConstant(ShTy, 1);
6004     if (IsFSHL) {
6005       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6006       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6007       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6008     } else {
6009       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6010       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6011       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6012     }
6013   }
6014 
6015   MIRBuilder.buildOr(Dst, ShX, ShY);
6016   MI.eraseFromParent();
6017   return Legalized;
6018 }
6019 
6020 LegalizerHelper::LegalizeResult
6021 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6022   // These operations approximately do the following (while avoiding undefined
6023   // shifts by BW):
6024   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6025   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6026   Register Dst = MI.getOperand(0).getReg();
6027   LLT Ty = MRI.getType(Dst);
6028   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6029 
6030   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6031   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6032 
6033   // TODO: Use smarter heuristic that accounts for vector legalization.
6034   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6035     return lowerFunnelShiftAsShifts(MI);
6036 
  // This only works for powers of 2; fall back to shifts if it fails.
6038   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6039   if (Result == UnableToLegalize)
6040     return lowerFunnelShiftAsShifts(MI);
6041   return Result;
6042 }
6043 
6044 LegalizerHelper::LegalizeResult
6045 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6046   Register Dst = MI.getOperand(0).getReg();
6047   Register Src = MI.getOperand(1).getReg();
6048   Register Amt = MI.getOperand(2).getReg();
6049   LLT AmtTy = MRI.getType(Amt);
6050   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6051   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6052   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6053   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6054   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6055   MI.eraseFromParent();
6056   return Legalized;
6057 }
6058 
6059 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6060   Register Dst = MI.getOperand(0).getReg();
6061   Register Src = MI.getOperand(1).getReg();
6062   Register Amt = MI.getOperand(2).getReg();
6063   LLT DstTy = MRI.getType(Dst);
6064   LLT SrcTy = MRI.getType(Src);
6065   LLT AmtTy = MRI.getType(Amt);
6066 
6067   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6068   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6069 
6070   MIRBuilder.setInstrAndDebugLoc(MI);
6071 
6072   // If a rotate in the other direction is supported, use it.
6073   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6074   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6075       isPowerOf2_32(EltSizeInBits))
6076     return lowerRotateWithReverseRotate(MI);
6077 
6078   // If a funnel shift is supported, use it.
6079   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6080   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6081   bool IsFShLegal = false;
6082   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
6083       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
6084     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6085                                 Register R3) {
6086       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
6087       MI.eraseFromParent();
6088       return Legalized;
6089     };
    // Prefer a funnel shift in the requested direction; otherwise negate the
    // amount and use the reverse funnel shift.
6091     if (IsFShLegal) {
6092       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6093     } else if (isPowerOf2_32(EltSizeInBits)) {
6094       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
6095       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6096     }
6097   }
6098 
6099   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6100   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6101   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6102   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6103   Register ShVal;
6104   Register RevShiftVal;
6105   if (isPowerOf2_32(EltSizeInBits)) {
6106     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6107     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
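    // Since -c & (w - 1) == (w - c) & (w - 1), both shift amounts stay below
    // w; c == 0 degenerates to two shifts by zero instead of an undefined
    // shift by w.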
6108     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6109     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6110     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6111     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6112     RevShiftVal =
6113         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6114   } else {
6115     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6116     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6117     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6118     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6119     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6120     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6121     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6122     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6123     RevShiftVal =
6124         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6125   }
6126   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6127   MI.eraseFromParent();
6128   return Legalized;
6129 }
6130 
6131 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6132 // representation.
6133 LegalizerHelper::LegalizeResult
6134 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6135   Register Dst = MI.getOperand(0).getReg();
6136   Register Src = MI.getOperand(1).getReg();
6137   const LLT S64 = LLT::scalar(64);
6138   const LLT S32 = LLT::scalar(32);
6139   const LLT S1 = LLT::scalar(1);
6140 
6141   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6142 
6143   // unsigned cul2f(ulong u) {
6144   //   uint lz = clz(u);
6145   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6146   //   u = (u << lz) & 0x7fffffffffffffffUL;
6147   //   ulong t = u & 0xffffffffffUL;
6148   //   uint v = (e << 23) | (uint)(u >> 40);
6149   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6150   //   return as_float(v + r);
6151   // }
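  // Illustrative example: u = 1 gives lz = 63, e = 127 + 63 - 63 = 127, the
  // shift by lz drops the leading one so t = 0, and v = 127 << 23 =
  // 0x3f800000 = 1.0f with r = 0.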
6152 
6153   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6154   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6155 
6156   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6157 
6158   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6159   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6160 
6161   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6162   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6163 
6164   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6165   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6166 
6167   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6168 
6169   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6170   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6171 
6172   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6173   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6174   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6175 
6176   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6177   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6178   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6179   auto One = MIRBuilder.buildConstant(S32, 1);
6180 
6181   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6182   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6183   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6184   MIRBuilder.buildAdd(Dst, V, R);
6185 
6186   MI.eraseFromParent();
6187   return Legalized;
6188 }
6189 
6190 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6191   Register Dst = MI.getOperand(0).getReg();
6192   Register Src = MI.getOperand(1).getReg();
6193   LLT DstTy = MRI.getType(Dst);
6194   LLT SrcTy = MRI.getType(Src);
6195 
6196   if (SrcTy == LLT::scalar(1)) {
6197     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6198     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6199     MIRBuilder.buildSelect(Dst, Src, True, False);
6200     MI.eraseFromParent();
6201     return Legalized;
6202   }
6203 
6204   if (SrcTy != LLT::scalar(64))
6205     return UnableToLegalize;
6206 
6207   if (DstTy == LLT::scalar(32)) {
6208     // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonable depending on the available instructions. If a target
6210     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6211     // intermediate type, this is probably worse.
6212     return lowerU64ToF32BitOps(MI);
6213   }
6214 
6215   return UnableToLegalize;
6216 }
6217 
6218 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6219   Register Dst = MI.getOperand(0).getReg();
6220   Register Src = MI.getOperand(1).getReg();
6221   LLT DstTy = MRI.getType(Dst);
6222   LLT SrcTy = MRI.getType(Src);
6223 
6224   const LLT S64 = LLT::scalar(64);
6225   const LLT S32 = LLT::scalar(32);
6226   const LLT S1 = LLT::scalar(1);
6227 
6228   if (SrcTy == S1) {
6229     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6230     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6231     MIRBuilder.buildSelect(Dst, Src, True, False);
6232     MI.eraseFromParent();
6233     return Legalized;
6234   }
6235 
6236   if (SrcTy != S64)
6237     return UnableToLegalize;
6238 
6239   if (DstTy == S32) {
6240     // signed cl2f(long l) {
6241     //   long s = l >> 63;
6242     //   float r = cul2f((l + s) ^ s);
6243     //   return s ? -r : r;
6244     // }
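    // (l + s) ^ s is the absolute value: s is 0 for non-negative l, and for
    // negative l it is all ones, so (l - 1) ^ -1 == ~(l - 1) == -l.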
6245     Register L = Src;
6246     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6247     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6248 
6249     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6250     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6251     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6252 
6253     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6254     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6255                                             MIRBuilder.buildConstant(S64, 0));
6256     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6257     MI.eraseFromParent();
6258     return Legalized;
6259   }
6260 
6261   return UnableToLegalize;
6262 }
6263 
6264 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6265   Register Dst = MI.getOperand(0).getReg();
6266   Register Src = MI.getOperand(1).getReg();
6267   LLT DstTy = MRI.getType(Dst);
6268   LLT SrcTy = MRI.getType(Src);
6269   const LLT S64 = LLT::scalar(64);
6270   const LLT S32 = LLT::scalar(32);
6271 
6272   if (SrcTy != S64 && SrcTy != S32)
6273     return UnableToLegalize;
6274   if (DstTy != S32 && DstTy != S64)
6275     return UnableToLegalize;
6276 
  // FPTOSI gives the same result as FPTOUI for positive signed integers.
  // FPTOUI additionally needs to handle fp values that convert to unsigned
  // integers greater than or equal to 2^31 for an i32 destination or 2^63
  // for an i64 destination. Call this threshold 2^Exp.
6280 
6281   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6282   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6283                                                 : APFloat::IEEEdouble(),
6284                     APInt::getZero(SrcTy.getSizeInBits()));
6285   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6286 
6287   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6288 
6289   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp values greater than or equal to the Threshold (2^Exp), we use
  // FPTOSI on (Value - 2^Exp) and add 2^Exp back by setting the highest bit
  // of the result to 1.
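  // Illustrative example for f64 -> i32: Src = 3e9 is not less than 2^31, so
  // we compute FPTOSI(3e9 - 2^31) = 852516352 and the xor sets the sign bit,
  // yielding 3000000000.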
6292   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6293   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6294   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6295   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6296 
6297   const LLT S1 = LLT::scalar(1);
6298 
6299   MachineInstrBuilder FCMP =
6300       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6301   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6302 
6303   MI.eraseFromParent();
6304   return Legalized;
6305 }
6306 
6307 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6308   Register Dst = MI.getOperand(0).getReg();
6309   Register Src = MI.getOperand(1).getReg();
6310   LLT DstTy = MRI.getType(Dst);
6311   LLT SrcTy = MRI.getType(Src);
6312   const LLT S64 = LLT::scalar(64);
6313   const LLT S32 = LLT::scalar(32);
6314 
6315   // FIXME: Only f32 to i64 conversions are supported.
6316   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6317     return UnableToLegalize;
6318 
6319   // Expand f32 -> i64 conversion
6320   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6321   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
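  // The mantissa (with the implicit bit 0x00800000 or'd back in) is shifted
  // left or right by |exponent - 23| to form the magnitude, and the sign is
  // applied afterwards via (R ^ Sign) - Sign.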
6322 
6323   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6324 
6325   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6326   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6327 
6328   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6329   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6330 
6331   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6332                                            APInt::getSignMask(SrcEltBits));
6333   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6334   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6335   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6336   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6337 
6338   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6339   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6340   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6341 
6342   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6343   R = MIRBuilder.buildZExt(DstTy, R);
6344 
6345   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6346   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6347   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6348   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6349 
6350   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6351   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6352 
6353   const LLT S1 = LLT::scalar(1);
6354   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6355                                     S1, Exponent, ExponentLoBit);
6356 
6357   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6358 
6359   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6360   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6361 
6362   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6363 
6364   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6365                                           S1, Exponent, ZeroSrcTy);
6366 
6367   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6368   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6369 
6370   MI.eraseFromParent();
6371   return Legalized;
6372 }
6373 
6374 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6375 LegalizerHelper::LegalizeResult
6376 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6377   Register Dst = MI.getOperand(0).getReg();
6378   Register Src = MI.getOperand(1).getReg();
6379 
6380   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6381     return UnableToLegalize;
6382 
6383   const unsigned ExpMask = 0x7ff;
6384   const unsigned ExpBiasf64 = 1023;
6385   const unsigned ExpBiasf16 = 15;
6386   const LLT S32 = LLT::scalar(32);
6387   const LLT S1 = LLT::scalar(1);
6388 
6389   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6390   Register U = Unmerge.getReg(0);
6391   Register UH = Unmerge.getReg(1);
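  // U is the low 32 bits of the f64; UH is the high 32 bits (sign bit, 11
  // exponent bits and the top 20 mantissa bits).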
6392 
6393   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6394   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6395 
6396   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6397   // add the f16 bias (15) to get the biased exponent for the f16 format.
6398   E = MIRBuilder.buildAdd(
6399     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6400 
6401   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6402   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6403 
6404   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6405                                        MIRBuilder.buildConstant(S32, 0x1ff));
6406   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6407 
6408   auto Zero = MIRBuilder.buildConstant(S32, 0);
6409   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6410   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6411   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6412 
6413   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6414   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6415   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6416   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6417 
6418   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6419   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6420 
6421   // N = M | (E << 12);
6422   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6423   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6424 
6425   // B = clamp(1-E, 0, 13);
6426   auto One = MIRBuilder.buildConstant(S32, 1);
6427   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6428   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6429   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6430 
6431   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6432                                        MIRBuilder.buildConstant(S32, 0x1000));
6433 
6434   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6435   auto D0 = MIRBuilder.buildShl(S32, D, B);
6436 
6437   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6438                                              D0, SigSetHigh);
6439   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6440   D = MIRBuilder.buildOr(S32, D, D1);
6441 
6442   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6443   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6444 
6445   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6446   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6447 
6448   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6449                                        MIRBuilder.buildConstant(S32, 3));
6450   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6451 
6452   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6453                                        MIRBuilder.buildConstant(S32, 5));
6454   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6455 
6456   V1 = MIRBuilder.buildOr(S32, V0, V1);
6457   V = MIRBuilder.buildAdd(S32, V, V1);
6458 
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
6460                                        E, MIRBuilder.buildConstant(S32, 30));
6461   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6462                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6463 
  // E == 1039 is the adjusted all-ones f64 exponent (2047 - 1023 + 15),
  // i.e. the input is Inf or NaN.
  auto CmpEEq1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEEq1039, I, V);
6467 
6468   // Extract the sign bit.
6469   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6470   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6471 
6472   // Insert the sign bit
6473   V = MIRBuilder.buildOr(S32, Sign, V);
6474 
6475   MIRBuilder.buildTrunc(Dst, V);
6476   MI.eraseFromParent();
6477   return Legalized;
6478 }
6479 
6480 LegalizerHelper::LegalizeResult
6481 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6482   Register Dst = MI.getOperand(0).getReg();
6483   Register Src = MI.getOperand(1).getReg();
6484 
6485   LLT DstTy = MRI.getType(Dst);
6486   LLT SrcTy = MRI.getType(Src);
6487   const LLT S64 = LLT::scalar(64);
6488   const LLT S16 = LLT::scalar(16);
6489 
6490   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6491     return lowerFPTRUNC_F64_TO_F16(MI);
6492 
6493   return UnableToLegalize;
6494 }
6495 
6496 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6497 // multiplication tree.
6498 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6499   Register Dst = MI.getOperand(0).getReg();
6500   Register Src0 = MI.getOperand(1).getReg();
6501   Register Src1 = MI.getOperand(2).getReg();
6502   LLT Ty = MRI.getType(Dst);
6503 
6504   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6505   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6506   MI.eraseFromParent();
6507   return Legalized;
6508 }
6509 
6510 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6511   switch (Opc) {
6512   case TargetOpcode::G_SMIN:
6513     return CmpInst::ICMP_SLT;
6514   case TargetOpcode::G_SMAX:
6515     return CmpInst::ICMP_SGT;
6516   case TargetOpcode::G_UMIN:
6517     return CmpInst::ICMP_ULT;
6518   case TargetOpcode::G_UMAX:
6519     return CmpInst::ICMP_UGT;
6520   default:
6521     llvm_unreachable("not in integer min/max");
6522   }
6523 }
6524 
6525 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6526   Register Dst = MI.getOperand(0).getReg();
6527   Register Src0 = MI.getOperand(1).getReg();
6528   Register Src1 = MI.getOperand(2).getReg();
6529 
6530   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6531   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6532 
6533   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6534   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6535 
6536   MI.eraseFromParent();
6537   return Legalized;
6538 }
6539 
6540 LegalizerHelper::LegalizeResult
6541 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6542   Register Dst = MI.getOperand(0).getReg();
6543   Register Src0 = MI.getOperand(1).getReg();
6544   Register Src1 = MI.getOperand(2).getReg();
6545 
6546   const LLT Src0Ty = MRI.getType(Src0);
6547   const LLT Src1Ty = MRI.getType(Src1);
6548 
6549   const int Src0Size = Src0Ty.getScalarSizeInBits();
6550   const int Src1Size = Src1Ty.getScalarSizeInBits();
6551 
6552   auto SignBitMask = MIRBuilder.buildConstant(
6553     Src0Ty, APInt::getSignMask(Src0Size));
6554 
6555   auto NotSignBitMask = MIRBuilder.buildConstant(
6556     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6557 
6558   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6559   Register And1;
6560   if (Src0Ty == Src1Ty) {
6561     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6562   } else if (Src0Size > Src1Size) {
6563     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6564     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6565     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6566     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6567   } else {
6568     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6569     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6570     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6571     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6572   }
6573 
6574   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6575   // constants are a nan and -0.0, but the final result should preserve
6576   // everything.
6577   unsigned Flags = MI.getFlags();
6578   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6579 
6580   MI.eraseFromParent();
6581   return Legalized;
6582 }
6583 
6584 LegalizerHelper::LegalizeResult
6585 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6586   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6587     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6588 
6589   Register Dst = MI.getOperand(0).getReg();
6590   Register Src0 = MI.getOperand(1).getReg();
6591   Register Src1 = MI.getOperand(2).getReg();
6592   LLT Ty = MRI.getType(Dst);
6593 
6594   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6595     // Insert canonicalizes if it's possible we need to quiet to get correct
6596     // sNaN behavior.
6597 
    // Note this must be done here, and not as an optimization combine, in the
    // absence of a dedicated quiet-snan instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
6601     if (!isKnownNeverSNaN(Src0, MRI))
6602       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6603 
6604     if (!isKnownNeverSNaN(Src1, MRI))
6605       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6606   }
6607 
6608   // If there are no nans, it's safe to simply replace this with the non-IEEE
6609   // version.
6610   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6611   MI.eraseFromParent();
6612   return Legalized;
6613 }
6614 
6615 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6616   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6617   Register DstReg = MI.getOperand(0).getReg();
6618   LLT Ty = MRI.getType(DstReg);
6619   unsigned Flags = MI.getFlags();
6620 
6621   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6622                                   Flags);
6623   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6624   MI.eraseFromParent();
6625   return Legalized;
6626 }
6627 
6628 LegalizerHelper::LegalizeResult
6629 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6630   Register DstReg = MI.getOperand(0).getReg();
6631   Register X = MI.getOperand(1).getReg();
6632   const unsigned Flags = MI.getFlags();
6633   const LLT Ty = MRI.getType(DstReg);
6634   const LLT CondTy = Ty.changeElementSize(1);
6635 
6636   // round(x) =>
6637   //  t = trunc(x);
6638   //  d = fabs(x - t);
6639   //  o = copysign(1.0f, x);
6640   //  return t + (d >= 0.5 ? o : 0.0);
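  // Illustrative example: round(-2.5) gives t = -2.0, d = 0.5 and o = -1.0,
  // so the result is -2.0 + -1.0 = -3.0 (ties round away from zero).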
6641 
6642   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6643 
6644   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6645   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6646   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6647   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6648   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6649   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6650 
6651   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6652                                   Flags);
6653   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6654 
6655   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6656 
6657   MI.eraseFromParent();
6658   return Legalized;
6659 }
6660 
6661 LegalizerHelper::LegalizeResult
6662 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6663   Register DstReg = MI.getOperand(0).getReg();
6664   Register SrcReg = MI.getOperand(1).getReg();
6665   unsigned Flags = MI.getFlags();
6666   LLT Ty = MRI.getType(DstReg);
6667   const LLT CondTy = Ty.changeElementSize(1);
6668 
6669   // result = trunc(src);
6670   // if (src < 0.0 && src != result)
6671   //   result += -1.0.
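  // The adjustment is materialized via G_SITOFP of the i1 condition: true
  // sign-extends to -1, giving -1.0, and false gives 0.0.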
6672 
6673   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6674   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6675 
6676   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6677                                   SrcReg, Zero, Flags);
6678   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6679                                       SrcReg, Trunc, Flags);
6680   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6681   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6682 
6683   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6684   MI.eraseFromParent();
6685   return Legalized;
6686 }
6687 
6688 LegalizerHelper::LegalizeResult
6689 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6690   const unsigned NumOps = MI.getNumOperands();
6691   Register DstReg = MI.getOperand(0).getReg();
6692   Register Src0Reg = MI.getOperand(1).getReg();
6693   LLT DstTy = MRI.getType(DstReg);
6694   LLT SrcTy = MRI.getType(Src0Reg);
6695   unsigned PartSize = SrcTy.getSizeInBits();
6696 
6697   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6698   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
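  // Illustrative example: merging four s8 parts into s32 produces
  // zext(p0) | (zext(p1) << 8) | (zext(p2) << 16) | (zext(p3) << 24).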
6699 
6700   for (unsigned I = 2; I != NumOps; ++I) {
6701     const unsigned Offset = (I - 1) * PartSize;
6702 
6703     Register SrcReg = MI.getOperand(I).getReg();
6704     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6705 
6706     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6707       MRI.createGenericVirtualRegister(WideTy);
6708 
6709     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6710     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6711     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6712     ResultReg = NextResult;
6713   }
6714 
6715   if (DstTy.isPointer()) {
6716     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6717           DstTy.getAddressSpace())) {
6718       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6719       return UnableToLegalize;
6720     }
6721 
6722     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6723   }
6724 
6725   MI.eraseFromParent();
6726   return Legalized;
6727 }
6728 
6729 LegalizerHelper::LegalizeResult
6730 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6731   const unsigned NumDst = MI.getNumOperands() - 1;
6732   Register SrcReg = MI.getOperand(NumDst).getReg();
6733   Register Dst0Reg = MI.getOperand(0).getReg();
6734   LLT DstTy = MRI.getType(Dst0Reg);
6735   if (DstTy.isPointer())
6736     return UnableToLegalize; // TODO
6737 
6738   SrcReg = coerceToScalar(SrcReg);
6739   if (!SrcReg)
6740     return UnableToLegalize;
6741 
6742   // Expand scalarizing unmerge as bitcast to integer and shift.
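  // Illustrative example: unmerging s32 into four s8 pieces produces
  // dst[I] = trunc(src >> (I * 8)).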
6743   LLT IntTy = MRI.getType(SrcReg);
6744 
6745   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6746 
6747   const unsigned DstSize = DstTy.getSizeInBits();
6748   unsigned Offset = DstSize;
6749   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6750     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6751     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6752     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6753   }
6754 
6755   MI.eraseFromParent();
6756   return Legalized;
6757 }
6758 
6759 /// Lower a vector extract or insert by writing the vector to a stack temporary
6760 /// and reloading the element or vector.
6761 ///
6762 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6763 ///  =>
6764 ///  %stack_temp = G_FRAME_INDEX
6765 ///  G_STORE %vec, %stack_temp
6766 ///  %idx = clamp(%idx, %vec.getNumElements())
6767 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6768 ///  %dst = G_LOAD %element_ptr
6769 LegalizerHelper::LegalizeResult
6770 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6771   Register DstReg = MI.getOperand(0).getReg();
6772   Register SrcVec = MI.getOperand(1).getReg();
6773   Register InsertVal;
6774   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6775     InsertVal = MI.getOperand(2).getReg();
6776 
6777   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6778 
6779   LLT VecTy = MRI.getType(SrcVec);
6780   LLT EltTy = VecTy.getElementType();
6781   if (!EltTy.isByteSized()) { // Not implemented.
6782     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6783     return UnableToLegalize;
6784   }
6785 
6786   unsigned EltBytes = EltTy.getSizeInBytes();
6787   Align VecAlign = getStackTemporaryAlignment(VecTy);
6788   Align EltAlign;
6789 
6790   MachinePointerInfo PtrInfo;
6791   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6792                                         VecAlign, PtrInfo);
6793   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6794 
6795   // Get the pointer to the element, and be sure not to hit undefined behavior
6796   // if the index is out of bounds.
6797   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6798 
6799   int64_t IdxVal;
6800   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6801     int64_t Offset = IdxVal * EltBytes;
6802     PtrInfo = PtrInfo.getWithOffset(Offset);
6803     EltAlign = commonAlignment(VecAlign, Offset);
6804   } else {
6805     // We lose information with a variable offset.
6806     EltAlign = getStackTemporaryAlignment(EltTy);
6807     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6808   }
6809 
6810   if (InsertVal) {
6811     // Write the inserted element
6812     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6813 
6814     // Reload the whole vector.
6815     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6816   } else {
6817     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6818   }
6819 
6820   MI.eraseFromParent();
6821   return Legalized;
6822 }
6823 
6824 LegalizerHelper::LegalizeResult
6825 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6826   Register DstReg = MI.getOperand(0).getReg();
6827   Register Src0Reg = MI.getOperand(1).getReg();
6828   Register Src1Reg = MI.getOperand(2).getReg();
6829   LLT Src0Ty = MRI.getType(Src0Reg);
6830   LLT DstTy = MRI.getType(DstReg);
6831   LLT IdxTy = LLT::scalar(32);
6832 
6833   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6834 
6835   if (DstTy.isScalar()) {
6836     if (Src0Ty.isVector())
6837       return UnableToLegalize;
6838 
6839     // This is just a SELECT.
6840     assert(Mask.size() == 1 && "Expected a single mask element");
6841     Register Val;
6842     if (Mask[0] < 0 || Mask[0] > 1)
6843       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6844     else
6845       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6846     MIRBuilder.buildCopy(DstReg, Val);
6847     MI.eraseFromParent();
6848     return Legalized;
6849   }
6850 
6851   Register Undef;
6852   SmallVector<Register, 32> BuildVec;
6853   LLT EltTy = DstTy.getElementType();
6854 
6855   for (int Idx : Mask) {
6856     if (Idx < 0) {
6857       if (!Undef.isValid())
6858         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6859       BuildVec.push_back(Undef);
6860       continue;
6861     }
6862 
6863     if (Src0Ty.isScalar()) {
6864       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6865     } else {
6866       int NumElts = Src0Ty.getNumElements();
6867       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6868       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6869       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6870       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6871       BuildVec.push_back(Extract.getReg(0));
6872     }
6873   }
6874 
6875   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6876   MI.eraseFromParent();
6877   return Legalized;
6878 }
6879 
6880 LegalizerHelper::LegalizeResult
6881 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6882   const auto &MF = *MI.getMF();
6883   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6884   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6885     return UnableToLegalize;
6886 
6887   Register Dst = MI.getOperand(0).getReg();
6888   Register AllocSize = MI.getOperand(1).getReg();
6889   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6890 
6891   LLT PtrTy = MRI.getType(Dst);
6892   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6893 
6894   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6895   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6896   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6897 
6898   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6899   // have to generate an extra instruction to negate the alloc and then use
6900   // G_PTR_ADD to add the negative offset.
6901   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6902   if (Alignment > Align(1)) {
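    // Clearing the low bits rounds the new SP down to the requested
    // alignment; the stack grows down, so rounding down keeps the allocation
    // large enough. E.g. Alignment = 16 gives a mask of ~15.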
6903     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6904     AlignMask.negate();
6905     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6906     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6907   }
6908 
6909   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6910   MIRBuilder.buildCopy(SPReg, SPTmp);
6911   MIRBuilder.buildCopy(Dst, SPTmp);
6912 
6913   MI.eraseFromParent();
6914   return Legalized;
6915 }
6916 
6917 LegalizerHelper::LegalizeResult
6918 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6919   Register Dst = MI.getOperand(0).getReg();
6920   Register Src = MI.getOperand(1).getReg();
6921   unsigned Offset = MI.getOperand(2).getImm();
6922 
6923   LLT DstTy = MRI.getType(Dst);
6924   LLT SrcTy = MRI.getType(Src);
6925 
6926   if (DstTy.isScalar() &&
6927       (SrcTy.isScalar() ||
6928        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6929     LLT SrcIntTy = SrcTy;
6930     if (!SrcTy.isScalar()) {
6931       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6932       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6933     }
6934 
6935     if (Offset == 0)
6936       MIRBuilder.buildTrunc(Dst, Src);
6937     else {
6938       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6939       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6940       MIRBuilder.buildTrunc(Dst, Shr);
6941     }
6942 
6943     MI.eraseFromParent();
6944     return Legalized;
6945   }
6946 
6947   return UnableToLegalize;
6948 }
6949 
6950 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6951   Register Dst = MI.getOperand(0).getReg();
6952   Register Src = MI.getOperand(1).getReg();
6953   Register InsertSrc = MI.getOperand(2).getReg();
6954   uint64_t Offset = MI.getOperand(3).getImm();
6955 
6956   LLT DstTy = MRI.getType(Src);
6957   LLT InsertTy = MRI.getType(InsertSrc);
6958 
6959   if (InsertTy.isVector() ||
6960       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6961     return UnableToLegalize;
6962 
6963   const DataLayout &DL = MIRBuilder.getDataLayout();
6964   if ((DstTy.isPointer() &&
6965        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6966       (InsertTy.isPointer() &&
6967        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6968     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6969     return UnableToLegalize;
6970   }
6971 
6972   LLT IntDstTy = DstTy;
6973 
6974   if (!DstTy.isScalar()) {
6975     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6976     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6977   }
6978 
6979   if (!InsertTy.isScalar()) {
6980     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6981     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6982   }
6983 
6984   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6985   if (Offset != 0) {
6986     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6987     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6988   }
6989 
6990   APInt MaskVal = APInt::getBitsSetWithWrap(
6991       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6992 
6993   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6994   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6995   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6996 
6997   MIRBuilder.buildCast(Dst, Or);
6998   MI.eraseFromParent();
6999   return Legalized;
7000 }
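// Worked example (illustrative): for a G_INSERT of an s16 into an s64 at
// offset 16, getBitsSetWithWrap(64, 32, 16) wraps around and sets bits
// [32, 64) and [0, 16), giving Mask = 0xFFFFFFFF0000FFFF. The AND clears the
// 16-bit window in Src and the OR fills it with the shifted, zero-extended
// insert value.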
7001 
7002 LegalizerHelper::LegalizeResult
7003 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7004   Register Dst0 = MI.getOperand(0).getReg();
7005   Register Dst1 = MI.getOperand(1).getReg();
7006   Register LHS = MI.getOperand(2).getReg();
7007   Register RHS = MI.getOperand(3).getReg();
7008   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7009 
7010   LLT Ty = MRI.getType(Dst0);
7011   LLT BoolTy = MRI.getType(Dst1);
7012 
7013   if (IsAdd)
7014     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7015   else
7016     MIRBuilder.buildSub(Dst0, LHS, RHS);
7017 
7018   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7019 
7020   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7021 
7022   // In the absence of overflow, an addition's result is less than LHS if
7023   // and only if RHS is negative, and a subtraction's result is less than
7024   // LHS if and only if RHS is (non-zero) positive.
7025   // If the comparison of the result against LHS disagrees with that sign
7026   // test on RHS, the operation overflowed, so XORing the two conditions
7027   // yields the overflow flag.
7028   auto ResultLowerThanLHS =
7029       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7030   auto ConditionRHS = MIRBuilder.buildICmp(
7031       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7032 
7033   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7034   MI.eraseFromParent();
7035   return Legalized;
7036 }
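// Worked example (illustrative, s8): G_SADDO with LHS = 100 and RHS = 50
// wraps to Dst0 = -106. ResultLowerThanLHS (-106 < 100) is true while
// ConditionRHS (50 < 0) is false, so their XOR sets the overflow flag.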
7037 
7038 LegalizerHelper::LegalizeResult
7039 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7040   Register Res = MI.getOperand(0).getReg();
7041   Register LHS = MI.getOperand(1).getReg();
7042   Register RHS = MI.getOperand(2).getReg();
7043   LLT Ty = MRI.getType(Res);
7044   bool IsSigned;
7045   bool IsAdd;
7046   unsigned BaseOp;
7047   switch (MI.getOpcode()) {
7048   default:
7049     llvm_unreachable("unexpected addsat/subsat opcode");
7050   case TargetOpcode::G_UADDSAT:
7051     IsSigned = false;
7052     IsAdd = true;
7053     BaseOp = TargetOpcode::G_ADD;
7054     break;
7055   case TargetOpcode::G_SADDSAT:
7056     IsSigned = true;
7057     IsAdd = true;
7058     BaseOp = TargetOpcode::G_ADD;
7059     break;
7060   case TargetOpcode::G_USUBSAT:
7061     IsSigned = false;
7062     IsAdd = false;
7063     BaseOp = TargetOpcode::G_SUB;
7064     break;
7065   case TargetOpcode::G_SSUBSAT:
7066     IsSigned = true;
7067     IsAdd = false;
7068     BaseOp = TargetOpcode::G_SUB;
7069     break;
7070   }
7071 
7072   if (IsSigned) {
7073     // sadd.sat(a, b) ->
7074     //   hi = 0x7fffffff - smax(a, 0)
7075     //   lo = 0x80000000 - smin(a, 0)
7076     //   a + smin(smax(lo, b), hi)
7077     // ssub.sat(a, b) ->
7078     //   lo = smax(a, -1) - 0x7fffffff
7079     //   hi = smin(a, -1) - 0x80000000
7080     //   a - smin(smax(lo, b), hi)
7081     // TODO: AMDGPU can use a "median of 3" instruction here:
7082     //   a +/- med3(lo, b, hi)
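    // Worked example (illustrative, s8): sadd.sat(100, 50) gives
    // hi = 127 - smax(100, 0) = 27 and lo = -128 - smin(100, 0) = -128;
    // clamping b = 50 to [-128, 27] yields 27, and 100 + 27 = 127, the
    // saturated result.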
7083     uint64_t NumBits = Ty.getScalarSizeInBits();
7084     auto MaxVal =
7085         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7086     auto MinVal =
7087         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7088     MachineInstrBuilder Hi, Lo;
7089     if (IsAdd) {
7090       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7091       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7092       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7093     } else {
7094       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7095       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7096                                MaxVal);
7097       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7098                                MinVal);
7099     }
7100     auto RHSClamped =
7101         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7102     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7103   } else {
7104     // uadd.sat(a, b) -> a + umin(~a, b)
7105     // usub.sat(a, b) -> a - umin(a, b)
7106     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7107     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7108     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7109   }
7110 
7111   MI.eraseFromParent();
7112   return Legalized;
7113 }
7114 
7115 LegalizerHelper::LegalizeResult
7116 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7117   Register Res = MI.getOperand(0).getReg();
7118   Register LHS = MI.getOperand(1).getReg();
7119   Register RHS = MI.getOperand(2).getReg();
7120   LLT Ty = MRI.getType(Res);
7121   LLT BoolTy = Ty.changeElementSize(1);
7122   bool IsSigned;
7123   bool IsAdd;
7124   unsigned OverflowOp;
7125   switch (MI.getOpcode()) {
7126   default:
7127     llvm_unreachable("unexpected addsat/subsat opcode");
7128   case TargetOpcode::G_UADDSAT:
7129     IsSigned = false;
7130     IsAdd = true;
7131     OverflowOp = TargetOpcode::G_UADDO;
7132     break;
7133   case TargetOpcode::G_SADDSAT:
7134     IsSigned = true;
7135     IsAdd = true;
7136     OverflowOp = TargetOpcode::G_SADDO;
7137     break;
7138   case TargetOpcode::G_USUBSAT:
7139     IsSigned = false;
7140     IsAdd = false;
7141     OverflowOp = TargetOpcode::G_USUBO;
7142     break;
7143   case TargetOpcode::G_SSUBSAT:
7144     IsSigned = true;
7145     IsAdd = false;
7146     OverflowOp = TargetOpcode::G_SSUBO;
7147     break;
7148   }
7149 
7150   auto OverflowRes =
7151       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7152   Register Tmp = OverflowRes.getReg(0);
7153   Register Ov = OverflowRes.getReg(1);
7154   MachineInstrBuilder Clamp;
7155   if (IsSigned) {
7156     // sadd.sat(a, b) ->
7157     //   {tmp, ov} = saddo(a, b)
7158     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
7159     // ssub.sat(a, b) ->
7160     //   {tmp, ov} = ssubo(a, b)
7161     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
7162     uint64_t NumBits = Ty.getScalarSizeInBits();
7163     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7164     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7165     auto MinVal =
7166         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7167     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7168   } else {
7169     // uadd.sat(a, b) ->
7170     //   {tmp, ov} = uaddo(a, b)
7171     //   ov ? 0xffffffff : tmp
7172     // usub.sat(a, b) ->
7173     //   {tmp, ov} = usubo(a, b)
7174     //   ov ? 0 : tmp
7175     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7176   }
7177   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7178 
7179   MI.eraseFromParent();
7180   return Legalized;
7181 }
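// Worked example (illustrative, s8): sadd.sat(100, 50) computes
// {tmp, ov} = saddo(100, 50) = {-106, 1}; Clamp = (tmp >>s 7) + (-128) =
// -1 + (-128), which wraps to 127, and the select on ov returns 127.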
7182 
7183 LegalizerHelper::LegalizeResult
7184 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7185   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7186           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7187          "Expected shlsat opcode!");
7188   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7189   Register Res = MI.getOperand(0).getReg();
7190   Register LHS = MI.getOperand(1).getReg();
7191   Register RHS = MI.getOperand(2).getReg();
7192   LLT Ty = MRI.getType(Res);
7193   LLT BoolTy = Ty.changeElementSize(1);
7194 
7195   unsigned BW = Ty.getScalarSizeInBits();
7196   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7197   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7198                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7199 
7200   MachineInstrBuilder SatVal;
7201   if (IsSigned) {
7202     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7203     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7204     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7205                                     MIRBuilder.buildConstant(Ty, 0));
7206     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7207   } else {
7208     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7209   }
7210   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7211   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7212 
7213   MI.eraseFromParent();
7214   return Legalized;
7215 }
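// Worked example (illustrative, s8): ushl.sat(0x40, 2) computes
// Result = 0x40 << 2 = 0x00; shifting back gives Orig = 0x00 != 0x40, so Ov
// is set and the saturation value 0xFF is selected instead of Result.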
7216 
7217 LegalizerHelper::LegalizeResult
7218 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7219   Register Dst = MI.getOperand(0).getReg();
7220   Register Src = MI.getOperand(1).getReg();
7221   const LLT Ty = MRI.getType(Src);
7222   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7223   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7224 
7225   // Swap most and least significant byte, set remaining bytes in Res to zero.
7226   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7227   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7228   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7229   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7230 
7231   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7232   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7233     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7234     APInt APMask(SizeInBytes * 8, 0xFFull << (i * 8));
7235     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7236     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7237     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7238     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7239     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7240     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7241     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7242     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7243     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7244     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7245   }
7246   Res.getInstr()->getOperand(0).setReg(Dst);
7247 
7248   MI.eraseFromParent();
7249   return Legalized;
7250 }
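// Worked example (illustrative, s32): BaseShiftAmt = 24, so Res starts as
// (Src << 24) | (Src >> 24), swapping bytes 0 and 3. The single loop
// iteration (i = 1) then moves byte 1 up via (Src & 0xFF00) << 8 and byte 2
// down via (Src >> 8) & 0xFF00, completing the 4-byte swap.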
7251 
7252 // { (Src & Mask) >> N } | { (Src << N) & Mask }
7253 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7254                                  MachineInstrBuilder Src, APInt Mask) {
7255   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7256   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7257   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7258   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7259   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7260   return B.buildOr(Dst, LHS, RHS);
7261 }
7262 
7263 LegalizerHelper::LegalizeResult
7264 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7265   Register Dst = MI.getOperand(0).getReg();
7266   Register Src = MI.getOperand(1).getReg();
7267   const LLT Ty = MRI.getType(Src);
7268   unsigned Size = Ty.getSizeInBits();
7269 
7270   MachineInstrBuilder BSWAP =
7271       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7272 
7273   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7274   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7275   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7276   MachineInstrBuilder Swap4 =
7277       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7278 
7279   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7280   //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
7281   // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7282   MachineInstrBuilder Swap2 =
7283       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7284 
7285   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7286   //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
7287   // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7288   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7289 
7290   MI.eraseFromParent();
7291   return Legalized;
7292 }
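// Worked example (illustrative): within each byte abcdefgh, Swap4 yields
// efghabcd, Swap2 then ghefcdab, and Swap1 finally hgfedcba; combined with
// the initial G_BSWAP this reverses every bit of the value.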
7293 
7294 LegalizerHelper::LegalizeResult
7295 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7296   MachineFunction &MF = MIRBuilder.getMF();
7297 
7298   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7299   int NameOpIdx = IsRead ? 1 : 0;
7300   int ValRegIndex = IsRead ? 0 : 1;
7301 
7302   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7303   const LLT Ty = MRI.getType(ValReg);
7304   const MDString *RegStr = cast<MDString>(
7305     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7306 
7307   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7308   if (!PhysReg.isValid())
7309     return UnableToLegalize;
7310 
7311   if (IsRead)
7312     MIRBuilder.buildCopy(ValReg, PhysReg);
7313   else
7314     MIRBuilder.buildCopy(PhysReg, ValReg);
7315 
7316   MI.eraseFromParent();
7317   return Legalized;
7318 }
7319 
7320 LegalizerHelper::LegalizeResult
7321 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7322   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7323   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7324   Register Result = MI.getOperand(0).getReg();
7325   LLT OrigTy = MRI.getType(Result);
7326   auto SizeInBits = OrigTy.getScalarSizeInBits();
7327   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7328 
7329   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7330   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7331   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7332   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7333 
7334   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7335   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7336   MIRBuilder.buildTrunc(Result, Shifted);
7337 
7338   MI.eraseFromParent();
7339   return Legalized;
7340 }
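// Worked example (illustrative, s8): G_UMULH of 200 * 200 zero-extends both
// operands to s16, multiplies to 40000 (0x9C40), shifts right by 8 and
// truncates, yielding the high half 0x9C (156).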
7341 
7342 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7343   // Implement vector G_SELECT in terms of XOR, AND, OR.
7344   Register DstReg = MI.getOperand(0).getReg();
7345   Register MaskReg = MI.getOperand(1).getReg();
7346   Register Op1Reg = MI.getOperand(2).getReg();
7347   Register Op2Reg = MI.getOperand(3).getReg();
7348   LLT DstTy = MRI.getType(DstReg);
7349   LLT MaskTy = MRI.getType(MaskReg);
7350   LLT Op1Ty = MRI.getType(Op1Reg);
7351   if (!DstTy.isVector())
7352     return UnableToLegalize;
7353 
7354   // Vector selects can have a scalar predicate. If so, splat it into a
7355   // vector mask and return, so a later legalization attempt can retry.
7356   if (MaskTy.isScalar()) {
7357     Register MaskElt = MaskReg;
7358     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7359       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7360     // Generate a vector splat idiom to be pattern matched later.
7361     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7362     Observer.changingInstr(MI);
7363     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7364     Observer.changedInstr(MI);
7365     return Legalized;
7366   }
7367 
7368   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7369     return UnableToLegalize;
7370   }
7371 
7372   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7373   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7374   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7375   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7376   MI.eraseFromParent();
7377   return Legalized;
7378 }
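// Worked example (illustrative): with an all-ones/all-zeros per-lane mask M,
// the lowering computes (Op1 & M) | (Op2 & ~M), selecting Op1 in lanes where
// M is set and Op2 elsewhere.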
7379 
7380 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7381   // Split DIVREM into individual instructions.
7382   unsigned Opcode = MI.getOpcode();
7383 
7384   MIRBuilder.buildInstr(
7385       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7386                                         : TargetOpcode::G_UDIV,
7387       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7388   MIRBuilder.buildInstr(
7389       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7390                                         : TargetOpcode::G_UREM,
7391       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7392   MI.eraseFromParent();
7393   return Legalized;
7394 }
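// Worked example (illustrative):
//   %q:s32, %r:s32 = G_SDIVREM %a:s32, %b:s32
// is split into
//   %q:s32 = G_SDIV %a:s32, %b:s32
//   %r:s32 = G_SREM %a:s32, %b:s32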
7395 
7396 LegalizerHelper::LegalizeResult
7397 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7398   // Expand %res = G_ABS %a into:
7399   // %v1 = G_ASHR %a, scalar_size-1
7400   // %v2 = G_ADD %a, %v1
7401   // %res = G_XOR %v2, %v1
7402   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7403   Register OpReg = MI.getOperand(1).getReg();
7404   auto ShiftAmt =
7405       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7406   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7407   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7408   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7409   MI.eraseFromParent();
7410   return Legalized;
7411 }
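// Worked example (illustrative, s8): for %a = -5, the shift gives -1 (0xFF),
// the add gives -6 (0xFA), and the xor gives 5; for non-negative inputs the
// shift is 0, so the add and xor leave the value unchanged.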
7412 
7413 LegalizerHelper::LegalizeResult
7414 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7415   // Expand %res = G_ABS %a into:
7416   // %v1 = G_CONSTANT 0
7417   // %v2 = G_SUB %v1, %a
7418   // %res = G_SMAX %a, %v2
7419   Register SrcReg = MI.getOperand(1).getReg();
7420   LLT Ty = MRI.getType(SrcReg);
7421   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7422   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7423   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7424   MI.eraseFromParent();
7425   return Legalized;
7426 }
7427 
7428 LegalizerHelper::LegalizeResult
7429 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7430   Register SrcReg = MI.getOperand(1).getReg();
7431   LLT SrcTy = MRI.getType(SrcReg);
7432   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7433 
7434   // The source could be a scalar if the IR type was <1 x sN>.
7435   if (SrcTy.isScalar()) {
7436     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7437       return UnableToLegalize; // FIXME: handle extension.
7438     // This can be just a plain copy.
7439     Observer.changingInstr(MI);
7440     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7441     Observer.changedInstr(MI);
7442     return Legalized;
7443   }
7444   return UnableToLegalize;
7445 }
7446 
7447 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7448   // On Darwin, -Os means optimize for size without hurting performance, so
7449   // only really optimize for size when -Oz (MinSize) is used.
7450   if (MF.getTarget().getTargetTriple().isOSDarwin())
7451     return MF.getFunction().hasMinSize();
7452   return MF.getFunction().hasOptSize();
7453 }
7454 
7455 // Returns a list of types to use for memory op lowering in MemOps. A partial
7456 // port of findOptimalMemOpLowering in TargetLowering.
7457 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7458                                           unsigned Limit, const MemOp &Op,
7459                                           unsigned DstAS, unsigned SrcAS,
7460                                           const AttributeList &FuncAttributes,
7461                                           const TargetLowering &TLI) {
7462   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7463     return false;
7464 
7465   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7466 
7467   if (Ty == LLT()) {
7468     // Use the largest scalar type whose alignment constraints are satisfied.
7469     // We only need to check DstAlign here as SrcAlign is always greater
7470     // than or equal to DstAlign (or zero).
7471     Ty = LLT::scalar(64);
7472     if (Op.isFixedDstAlign())
7473       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7474              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7475         Ty = LLT::scalar(Ty.getSizeInBytes());
7476     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7477     // FIXME: check for the largest legal type we can load/store to.
7478   }
7479 
7480   unsigned NumMemOps = 0;
7481   uint64_t Size = Op.size();
7482   while (Size) {
7483     unsigned TySize = Ty.getSizeInBytes();
7484     while (TySize > Size) {
7485       // For now, only use non-vector loads / stores for the left-over pieces.
7486       LLT NewTy = Ty;
7487       // FIXME: check for mem op safety and legality of the types. Not all of
7488       // SDAGisms map cleanly to GISel concepts.
7489       if (NewTy.isVector())
7490         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7491       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7492       unsigned NewTySize = NewTy.getSizeInBytes();
7493       assert(NewTySize > 0 && "Could not find appropriate type");
7494 
7495       // If the new LLT cannot cover all of the remaining bits, then consider
7496       // issuing a (or a pair of) unaligned and overlapping load / store.
7497       bool Fast;
7498       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7499       MVT VT = getMVTForLLT(Ty);
7500       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7501           TLI.allowsMisalignedMemoryAccesses(
7502               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7503               MachineMemOperand::MONone, &Fast) &&
7504           Fast)
7505         TySize = Size;
7506       else {
7507         Ty = NewTy;
7508         TySize = NewTySize;
7509       }
7510     }
7511 
7512     if (++NumMemOps > Limit)
7513       return false;
7514 
7515     MemOps.push_back(Ty);
7516     Size -= TySize;
7517   }
7518 
7519   return true;
7520 }
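// Worked example (illustrative): for a 10-byte copy, assuming the target
// reports s64 as the optimal type and the unaligned-overlap shortcut does
// not fire, the loop above produces MemOps = {s64, s16}: one 8-byte access
// followed by a 2-byte access for the remainder.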
7521 
7522 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7523   if (Ty.isVector())
7524     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7525                                 Ty.getNumElements());
7526   return IntegerType::get(C, Ty.getSizeInBits());
7527 }
7528 
7529 // Get a vectorized representation of the memset value operand, GISel edition.
7530 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7531   MachineRegisterInfo &MRI = *MIB.getMRI();
7532   unsigned NumBits = Ty.getScalarSizeInBits();
7533   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7534   if (!Ty.isVector() && ValVRegAndVal) {
7535     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7536     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7537     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7538   }
7539 
7540   // Extend the byte value to the larger type, and then multiply by a magic
7541   // value 0x010101... in order to replicate it across every byte.
7542   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7543   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7544     return MIB.buildConstant(Ty, 0).getReg(0);
7545   }
7546 
7547   LLT ExtType = Ty.getScalarType();
7548   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7549   if (NumBits > 8) {
7550     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7551     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7552     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7553   }
7554 
7555   // For vector types create a G_BUILD_VECTOR.
7556   if (Ty.isVector())
7557     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7558 
7559   return Val;
7560 }
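// Worked example (illustrative): broadcasting a non-constant s8 value V to
// s32 zero-extends V and multiplies by the magic constant 0x01010101, so
// V = 0xAB becomes 0xABABABAB.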
7561 
7562 LegalizerHelper::LegalizeResult
7563 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7564                              uint64_t KnownLen, Align Alignment,
7565                              bool IsVolatile) {
7566   auto &MF = *MI.getParent()->getParent();
7567   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7568   auto &DL = MF.getDataLayout();
7569   LLVMContext &C = MF.getFunction().getContext();
7570 
7571   assert(KnownLen != 0 && "Have a zero length memset length!");
7572 
7573   bool DstAlignCanChange = false;
7574   MachineFrameInfo &MFI = MF.getFrameInfo();
7575   bool OptSize = shouldLowerMemFuncForSize(MF);
7576 
7577   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7578   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7579     DstAlignCanChange = true;
7580 
7581   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7582   std::vector<LLT> MemOps;
7583 
7584   const auto &DstMMO = **MI.memoperands_begin();
7585   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7586 
7587   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7588   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7589 
7590   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7591                                      MemOp::Set(KnownLen, DstAlignCanChange,
7592                                                 Alignment,
7593                                                 /*IsZeroMemset=*/IsZeroVal,
7594                                                 /*IsVolatile=*/IsVolatile),
7595                                      DstPtrInfo.getAddrSpace(), ~0u,
7596                                      MF.getFunction().getAttributes(), TLI))
7597     return UnableToLegalize;
7598 
7599   if (DstAlignCanChange) {
7600     // Get an estimate of the type from the LLT.
7601     Type *IRTy = getTypeForLLT(MemOps[0], C);
7602     Align NewAlign = DL.getABITypeAlign(IRTy);
7603     if (NewAlign > Alignment) {
7604       Alignment = NewAlign;
7605       unsigned FI = FIDef->getOperand(1).getIndex();
7606       // Give the stack frame object a larger alignment if needed.
7607       if (MFI.getObjectAlign(FI) < Alignment)
7608         MFI.setObjectAlignment(FI, Alignment);
7609     }
7610   }
7611 
7612   MachineIRBuilder MIB(MI);
7613   // Find the largest store and generate the bit pattern for it.
7614   LLT LargestTy = MemOps[0];
7615   for (unsigned i = 1; i < MemOps.size(); i++)
7616     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7617       LargestTy = MemOps[i];
7618 
7619   // The memset stored value is always defined as an s8, so in order to make it
7620   // work with larger store types we need to repeat the bit pattern across the
7621   // wider type.
7622   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7623 
7624   if (!MemSetValue)
7625     return UnableToLegalize;
7626 
7627   // Generate the stores. For each store type in the list, we generate the
7628   // matching store of that type to the destination address.
7629   LLT PtrTy = MRI.getType(Dst);
7630   unsigned DstOff = 0;
7631   unsigned Size = KnownLen;
7632   for (unsigned I = 0; I < MemOps.size(); I++) {
7633     LLT Ty = MemOps[I];
7634     unsigned TySize = Ty.getSizeInBytes();
7635     if (TySize > Size) {
7636       // Issuing an unaligned load / store pair that overlaps with the previous
7637       // pair. Adjust the offset accordingly.
7638       assert(I == MemOps.size() - 1 && I != 0);
7639       DstOff -= TySize - Size;
7640     }
7641 
7642     // If this store is smaller than the largest store, see whether we can
7643     // get the smaller value for free with a truncate.
7644     Register Value = MemSetValue;
7645     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7646       MVT VT = getMVTForLLT(Ty);
7647       MVT LargestVT = getMVTForLLT(LargestTy);
7648       if (!LargestTy.isVector() && !Ty.isVector() &&
7649           TLI.isTruncateFree(LargestVT, VT))
7650         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7651       else
7652         Value = getMemsetValue(Val, Ty, MIB);
7653       if (!Value)
7654         return UnableToLegalize;
7655     }
7656 
7657     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7658 
7659     Register Ptr = Dst;
7660     if (DstOff != 0) {
7661       auto Offset =
7662           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7663       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7664     }
7665 
7666     MIB.buildStore(Value, Ptr, *StoreMMO);
7667     DstOff += Ty.getSizeInBytes();
7668     Size -= TySize;
7669   }
7670 
7671   MI.eraseFromParent();
7672   return Legalized;
7673 }
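// Worked example (illustrative): if the target allows fast overlapping
// unaligned stores, a 7-byte memset can get MemOps = {s32, s32}; the loop
// above then stores 4 bytes at offset 0, backs DstOff up by one, and stores
// the final 4 bytes at offset 3, overlapping a single byte.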
7674 
7675 LegalizerHelper::LegalizeResult
7676 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7677   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7678 
7679   Register Dst = MI.getOperand(0).getReg();
7680   Register Src = MI.getOperand(1).getReg();
7681   Register Len = MI.getOperand(2).getReg();
7682 
7683   const auto *MMOIt = MI.memoperands_begin();
7684   const MachineMemOperand *MemOp = *MMOIt;
7685   bool IsVolatile = MemOp->isVolatile();
7686 
7687   // See if this is a constant length copy
7688   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7689   // FIXME: support dynamically sized G_MEMCPY_INLINE
7690   assert(LenVRegAndVal.hasValue() &&
7691          "inline memcpy with dynamic size is not yet supported");
7692   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7693   if (KnownLen == 0) {
7694     MI.eraseFromParent();
7695     return Legalized;
7696   }
7697 
7698   const auto &DstMMO = **MI.memoperands_begin();
7699   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7700   Align DstAlign = DstMMO.getBaseAlign();
7701   Align SrcAlign = SrcMMO.getBaseAlign();
7702 
7703   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7704                            IsVolatile);
7705 }
7706 
7707 LegalizerHelper::LegalizeResult
7708 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7709                                    uint64_t KnownLen, Align DstAlign,
7710                                    Align SrcAlign, bool IsVolatile) {
7711   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7712   return lowerMemcpy(MI, Dst, Src, KnownLen,
7713                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7714                      IsVolatile);
7715 }
7716 
7717 LegalizerHelper::LegalizeResult
7718 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7719                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7720                              Align SrcAlign, bool IsVolatile) {
7721   auto &MF = *MI.getParent()->getParent();
7722   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7723   auto &DL = MF.getDataLayout();
7724   LLVMContext &C = MF.getFunction().getContext();
7725 
7726   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7727 
7728   bool DstAlignCanChange = false;
7729   MachineFrameInfo &MFI = MF.getFrameInfo();
7730   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7731 
7732   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7733   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7734     DstAlignCanChange = true;
7735 
7736   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7737   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7738   // if the memcpy is in a tail call position.
7739 
7740   std::vector<LLT> MemOps;
7741 
7742   const auto &DstMMO = **MI.memoperands_begin();
7743   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7744   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7745   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7746 
7747   if (!findGISelOptimalMemOpLowering(
7748           MemOps, Limit,
7749           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7750                       IsVolatile),
7751           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7752           MF.getFunction().getAttributes(), TLI))
7753     return UnableToLegalize;
7754 
7755   if (DstAlignCanChange) {
7756     // Get an estimate of the type from the LLT.
7757     Type *IRTy = getTypeForLLT(MemOps[0], C);
7758     Align NewAlign = DL.getABITypeAlign(IRTy);
7759 
7760     // Don't promote to an alignment that would require dynamic stack
7761     // realignment.
7762     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7763     if (!TRI->hasStackRealignment(MF))
7764       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7765         NewAlign = NewAlign / 2;
7766 
7767     if (NewAlign > Alignment) {
7768       Alignment = NewAlign;
7769       unsigned FI = FIDef->getOperand(1).getIndex();
7770       // Give the stack frame object a larger alignment if needed.
7771       if (MFI.getObjectAlign(FI) < Alignment)
7772         MFI.setObjectAlignment(FI, Alignment);
7773     }
7774   }
7775 
7776   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7777 
7778   MachineIRBuilder MIB(MI);
7779   // Now we need to emit a load/store pair for each of the types we've
7780   // collected: for each type, generate a load of that width from the source
7781   // pointer, then a corresponding store of the loaded value to the dest
7782   // buffer. This can result in a sequence of loads and stores of mixed
7783   // types, depending on what the target specifies as good types to use.
7784   unsigned CurrOffset = 0;
7785   LLT PtrTy = MRI.getType(Src);
7786   unsigned Size = KnownLen;
7787   for (auto CopyTy : MemOps) {
7788     // Issuing an unaligned load / store pair that overlaps with the previous
7789     // pair. Adjust the offset accordingly.
7790     if (CopyTy.getSizeInBytes() > Size)
7791       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7792 
7793     // Construct MMOs for the accesses.
7794     auto *LoadMMO =
7795         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7796     auto *StoreMMO =
7797         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7798 
7799     // Create the load.
7800     Register LoadPtr = Src;
7801     Register Offset;
7802     if (CurrOffset != 0) {
7803       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
7804                    .getReg(0);
7805       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7806     }
7807     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7808 
7809     // Create the store.
7810     Register StorePtr =
7811         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7812     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7813     CurrOffset += CopyTy.getSizeInBytes();
7814     Size -= CopyTy.getSizeInBytes();
7815   }
7816 
7817   MI.eraseFromParent();
7818   return Legalized;
7819 }
7820 
7821 LegalizerHelper::LegalizeResult
7822 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7823                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7824                               bool IsVolatile) {
7825   auto &MF = *MI.getParent()->getParent();
7826   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7827   auto &DL = MF.getDataLayout();
7828   LLVMContext &C = MF.getFunction().getContext();
7829 
7830   assert(KnownLen != 0 && "Have a zero length memmove length!");
7831 
7832   bool DstAlignCanChange = false;
7833   MachineFrameInfo &MFI = MF.getFrameInfo();
7834   bool OptSize = shouldLowerMemFuncForSize(MF);
7835   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7836 
7837   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7838   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7839     DstAlignCanChange = true;
7840 
7841   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7842   std::vector<LLT> MemOps;
7843 
7844   const auto &DstMMO = **MI.memoperands_begin();
7845   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7846   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7847   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7848 
7849   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7850   // to a bug in its findOptimalMemOpLowering implementation. For now, do the
7851   // same thing here.
7852   if (!findGISelOptimalMemOpLowering(
7853           MemOps, Limit,
7854           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7855                       /*IsVolatile*/ true),
7856           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7857           MF.getFunction().getAttributes(), TLI))
7858     return UnableToLegalize;
7859 
7860   if (DstAlignCanChange) {
7861     // Get an estimate of the type from the LLT.
7862     Type *IRTy = getTypeForLLT(MemOps[0], C);
7863     Align NewAlign = DL.getABITypeAlign(IRTy);
7864 
7865     // Don't promote to an alignment that would require dynamic stack
7866     // realignment.
7867     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7868     if (!TRI->hasStackRealignment(MF))
7869       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7870         NewAlign = NewAlign / 2;
7871 
7872     if (NewAlign > Alignment) {
7873       Alignment = NewAlign;
7874       unsigned FI = FIDef->getOperand(1).getIndex();
7875       // Give the stack frame object a larger alignment if needed.
7876       if (MFI.getObjectAlign(FI) < Alignment)
7877         MFI.setObjectAlignment(FI, Alignment);
7878     }
7879   }
7880 
7881   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7882 
7883   MachineIRBuilder MIB(MI);
7884   // Memmove requires that we perform all the loads before issuing the stores.
7885   // Apart from that, this loop is pretty much doing the same thing as the
7886   // memcpy codegen function.
7887   unsigned CurrOffset = 0;
7888   LLT PtrTy = MRI.getType(Src);
7889   SmallVector<Register, 16> LoadVals;
7890   for (auto CopyTy : MemOps) {
7891     // Construct MMO for the load.
7892     auto *LoadMMO =
7893         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7894 
7895     // Create the load.
7896     Register LoadPtr = Src;
7897     if (CurrOffset != 0) {
7898       auto Offset =
7899           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7900       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7901     }
7902     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7903     CurrOffset += CopyTy.getSizeInBytes();
7904   }
7905 
7906   CurrOffset = 0;
7907   for (unsigned I = 0; I < MemOps.size(); ++I) {
7908     LLT CopyTy = MemOps[I];
7909     // Now store the values loaded.
7910     auto *StoreMMO =
7911         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7912 
7913     Register StorePtr = Dst;
7914     if (CurrOffset != 0) {
7915       auto Offset =
7916           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7917       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7918     }
7919     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7920     CurrOffset += CopyTy.getSizeInBytes();
7921   }
7922   MI.eraseFromParent();
7923   return Legalized;
7924 }
7925 
7926 LegalizerHelper::LegalizeResult
7927 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7928   const unsigned Opc = MI.getOpcode();
7929   // This combine is fairly complex so it's not written with a separate
7930   // matcher function.
7931   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7932           Opc == TargetOpcode::G_MEMSET) &&
7933          "Expected memcpy like instruction");
7934 
7935   auto MMOIt = MI.memoperands_begin();
7936   const MachineMemOperand *MemOp = *MMOIt;
7937 
7938   Align DstAlign = MemOp->getBaseAlign();
7939   Align SrcAlign;
7940   Register Dst = MI.getOperand(0).getReg();
7941   Register Src = MI.getOperand(1).getReg();
7942   Register Len = MI.getOperand(2).getReg();
7943 
7944   if (Opc != TargetOpcode::G_MEMSET) {
7945     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7946     MemOp = *(++MMOIt);
7947     SrcAlign = MemOp->getBaseAlign();
7948   }
7949 
7950   // See if this is a constant length copy
7951   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7952   if (!LenVRegAndVal)
7953     return UnableToLegalize;
7954   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7955 
7956   if (KnownLen == 0) {
7957     MI.eraseFromParent();
7958     return Legalized;
7959   }
7960 
7961   bool IsVolatile = MemOp->isVolatile();
7962   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7963     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7964                              IsVolatile);
7965 
7966   // Don't try to optimize volatile.
7967   if (IsVolatile)
7968     return UnableToLegalize;
7969 
7970   if (MaxLen && KnownLen > MaxLen)
7971     return UnableToLegalize;
7972 
7973   if (Opc == TargetOpcode::G_MEMCPY) {
7974     auto &MF = *MI.getParent()->getParent();
7975     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7976     bool OptSize = shouldLowerMemFuncForSize(MF);
7977     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7978     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7979                        IsVolatile);
7980   }
7981   if (Opc == TargetOpcode::G_MEMMOVE)
7982     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7983   if (Opc == TargetOpcode::G_MEMSET)
7984     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7985   return UnableToLegalize;
7986 }
7987