//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
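///
/// For example (illustrative): breaking an s88 \p OrigTy into s32 pieces
/// yields {2, 1}, with \p LeftoverTy set to s24: two s32 parts cover the low
/// 64 bits and a single s24 leftover covers the rest.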
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

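/// Return the IR floating point type matching a scalar LLT of the same size,
/// or nullptr if there is no direct equivalent.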
static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
  : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
    TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  // Perform irregular split. Leftover is last element of RegPieces.
  if (MainTy.isVector()) {
    SmallVector<Register, 8> RegPieces;
    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
      VRegs.push_back(RegPieces[i]);
    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
    LeftoverTy = MRI.getType(LeftoverRegs[0]);
    return true;
  }

  LeftoverTy = LLT::scalar(LeftoverSize);
  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

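/// Unmerge the vector \p Reg into pieces of \p NumElts elements each,
/// appending them to \p VRegs; a final smaller piece is produced when the
/// element count does not divide evenly.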
void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
                                         SmallVectorImpl<Register> &VRegs) {
  LLT RegTy = MRI.getType(Reg);
  assert(RegTy.isVector() && "Expected a vector type");

  LLT EltTy = RegTy.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  unsigned RegNumElts = RegTy.getNumElements();
  unsigned LeftoverNumElts = RegNumElts % NumElts;
  unsigned NumNarrowTyPieces = RegNumElts / NumElts;

  // Perfect split without leftover
  if (LeftoverNumElts == 0)
    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);

  // Irregular split. Provide direct access to all elements for the artifact
  // combiner by unmerging to individual elements, then build vectors with
  // NumElts elements each. The remaining element(s) form the leftover.
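  // For example (illustrative): splitting <7 x s8> into pieces of two
  // elements unmerges to seven s8 values, builds three <2 x s8> vectors from
  // the first six, and keeps the seventh s8 value as the leftover.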
  SmallVector<Register, 8> Elts;
  extractParts(Reg, EltTy, RegNumElts, Elts);

  unsigned Offset = 0;
  // Requested sub-vectors of NarrowTy.
  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
    VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
  }

  // Leftover element(s).
  if (LeftoverNumElts == 1) {
    VRegs.push_back(Elts[Offset]);
  } else {
    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
    VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
  }
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different numbers of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMerge(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to do
    // anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

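/// Build a sequence of merges from \p VRegs (pieces of type \p GCDTy) up to
/// the least common multiple type of \p DstTy and \p NarrowTy, padding with
/// \p PadStrategy (G_ANYEXT, G_ZEXT or G_SEXT) when the sources do not cover
/// the result exactly. On return, \p VRegs holds the \p NarrowTy-sized pieces
/// of the LCM type, which is also the return value.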
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding more
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}

// Useful for libcalls where all operands have the same type.
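// For instance, an s64 G_FADD maps to RTLIB::ADD_F64, which most targets
// lower to a call to "__adddf3".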
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;
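
    // Illustrative example: narrowing an s88 G_CONSTANT with NarrowTy = s32
    // emits G_CONSTANTs for bits [0,32) and [32,64) below, plus an s24
    // leftover constant for bits [64,88); insertParts then recombines them.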

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    // Should widen scalar first
    if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
      return UnableToLegalize;

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
    SmallVector<Register, 8> Parts;
    for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
      Parts.push_back(
          MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending, we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
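    //
    // Illustrative example: a G_SEXT_INREG of s128 to 40 bits with
    // NarrowTy = s32 keeps part 0 (bits [0,32)) unmodified, applies a
    // narrower G_SEXT_INREG of 40 % 32 = 8 bits to part 1 (which contains
    // bit 39), and fills parts 2 and 3 with an ashr of part 1 by 31.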
1307     if (SizeOp0 % NarrowSize != 0)
1308       return UnableToLegalize;
1309     int NumParts = SizeOp0 / NarrowSize;
1310 
1311     // List the registers where the destination will be scattered.
1312     SmallVector<Register, 2> DstRegs;
1313     // List the registers where the source will be split.
1314     SmallVector<Register, 2> SrcRegs;
1315 
1316     // Create all the temporary registers.
1317     for (int i = 0; i < NumParts; ++i) {
1318       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1319 
1320       SrcRegs.push_back(SrcReg);
1321     }
1322 
1323     // Explode the big arguments into smaller chunks.
1324     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1325 
1326     Register AshrCstReg =
1327         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1328             .getReg(0);
1329     Register FullExtensionReg = 0;
1330     Register PartialExtensionReg = 0;
1331 
1332     // Do the operation on each small part.
1333     for (int i = 0; i < NumParts; ++i) {
1334       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1335         DstRegs.push_back(SrcRegs[i]);
1336       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1337         assert(PartialExtensionReg &&
1338                "Expected to visit partial extension before full");
1339         if (FullExtensionReg) {
1340           DstRegs.push_back(FullExtensionReg);
1341           continue;
1342         }
1343         DstRegs.push_back(
1344             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1345                 .getReg(0));
1346         FullExtensionReg = DstRegs.back();
1347       } else {
1348         DstRegs.push_back(
1349             MIRBuilder
1350                 .buildInstr(
1351                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1352                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1353                 .getReg(0));
1354         PartialExtensionReg = DstRegs.back();
1355       }
1356     }
1357 
1358     // Gather the destination registers into the final destination.
1359     Register DstReg = MI.getOperand(0).getReg();
1360     MIRBuilder.buildMerge(DstReg, DstRegs);
1361     MI.eraseFromParent();
1362     return Legalized;
1363   }
1364   case TargetOpcode::G_BSWAP:
1365   case TargetOpcode::G_BITREVERSE: {
1366     if (SizeOp0 % NarrowSize != 0)
1367       return UnableToLegalize;
1368 
1369     Observer.changingInstr(MI);
1370     SmallVector<Register, 2> SrcRegs, DstRegs;
1371     unsigned NumParts = SizeOp0 / NarrowSize;
1372     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1373 
1374     for (unsigned i = 0; i < NumParts; ++i) {
1375       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1376                                            {SrcRegs[NumParts - 1 - i]});
1377       DstRegs.push_back(DstPart.getReg(0));
1378     }
1379 
1380     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1381 
1382     Observer.changedInstr(MI);
1383     MI.eraseFromParent();
1384     return Legalized;
1385   }
1386   case TargetOpcode::G_PTR_ADD:
1387   case TargetOpcode::G_PTRMASK: {
1388     if (TypeIdx != 1)
1389       return UnableToLegalize;
1390     Observer.changingInstr(MI);
1391     narrowScalarSrc(MI, NarrowTy, 2);
1392     Observer.changedInstr(MI);
1393     return Legalized;
1394   }
1395   case TargetOpcode::G_FPTOUI:
1396   case TargetOpcode::G_FPTOSI:
1397     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1398   case TargetOpcode::G_FPEXT:
1399     if (TypeIdx != 0)
1400       return UnableToLegalize;
1401     Observer.changingInstr(MI);
1402     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1403     Observer.changedInstr(MI);
1404     return Legalized;
1405   }
1406 }
1407 
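     // Coerce \p Val to an equivalently-sized scalar: pointers are converted
     // with G_PTRTOINT and vectors are bitcast (converting pointer elements to
     // integers first). Returns an invalid register for pointers in
     // non-integral address spaces.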
1408 Register LegalizerHelper::coerceToScalar(Register Val) {
1409   LLT Ty = MRI.getType(Val);
1410   if (Ty.isScalar())
1411     return Val;
1412 
1413   const DataLayout &DL = MIRBuilder.getDataLayout();
1414   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1415   if (Ty.isPointer()) {
1416     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1417       return Register();
1418     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1419   }
1420 
1421   Register NewVal = Val;
1422 
1423   assert(Ty.isVector());
1424   LLT EltTy = Ty.getElementType();
1425   if (EltTy.isPointer())
1426     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1427   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1428 }
1429 
1430 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1431                                      unsigned OpIdx, unsigned ExtOpcode) {
1432   MachineOperand &MO = MI.getOperand(OpIdx);
1433   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1434   MO.setReg(ExtB.getReg(0));
1435 }
1436 
1437 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1438                                       unsigned OpIdx) {
1439   MachineOperand &MO = MI.getOperand(OpIdx);
1440   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1441   MO.setReg(ExtB.getReg(0));
1442 }
1443 
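     // Widen the def at \p OpIdx to \p WideTy, inserting a \p TruncOpcode back
     // to the original narrow type immediately after \p MI so existing users of
     // the def are unaffected.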
1444 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1445                                      unsigned OpIdx, unsigned TruncOpcode) {
1446   MachineOperand &MO = MI.getOperand(OpIdx);
1447   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1448   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1449   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1450   MO.setReg(DstExt);
1451 }
1452 
1453 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1454                                       unsigned OpIdx, unsigned ExtOpcode) {
1455   MachineOperand &MO = MI.getOperand(OpIdx);
1456   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1457   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1458   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1459   MO.setReg(DstTrunc);
1460 }
1461 
1462 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1463                                             unsigned OpIdx) {
1464   MachineOperand &MO = MI.getOperand(OpIdx);
1465   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1466   Register Dst = MO.getReg();
1467   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1468   MO.setReg(DstExt);
1469   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1470 }
1471 
1472 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1473                                             unsigned OpIdx) {
1474   MachineOperand &MO = MI.getOperand(OpIdx);
1475   SmallVector<Register, 8> Regs;
1476   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1477 }
1478 
1479 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1480   MachineOperand &Op = MI.getOperand(OpIdx);
1481   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1482 }
1483 
1484 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1485   MachineOperand &MO = MI.getOperand(OpIdx);
1486   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1487   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1488   MIRBuilder.buildBitcast(MO, CastDst);
1489   MO.setReg(CastDst);
1490 }
1491 
1492 LegalizerHelper::LegalizeResult
1493 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1494                                         LLT WideTy) {
1495   if (TypeIdx != 1)
1496     return UnableToLegalize;
1497 
1498   Register DstReg = MI.getOperand(0).getReg();
1499   LLT DstTy = MRI.getType(DstReg);
1500   if (DstTy.isVector())
1501     return UnableToLegalize;
1502 
1503   Register Src1 = MI.getOperand(1).getReg();
1504   LLT SrcTy = MRI.getType(Src1);
1505   const int DstSize = DstTy.getSizeInBits();
1506   const int SrcSize = SrcTy.getSizeInBits();
1507   const int WideSize = WideTy.getSizeInBits();
1508   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1509 
1510   unsigned NumOps = MI.getNumOperands();
1511   unsigned NumSrc = MI.getNumOperands() - 1;
1512   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1513 
1514   if (WideSize >= DstSize) {
1515     // Directly pack the bits in the target type.
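         // e.g. %d:_(s24) = G_MERGE_VALUES %a:_(s8), %b:_(s8), %c:_(s8) with a
         // requested s32 becomes, as an illustrative sketch:
         //   %r0:_(s32) = G_ZEXT %a
         //   %r1:_(s32) = G_OR %r0, (G_ZEXT %b) << 8
         //   %r2:_(s32) = G_OR %r1, (G_ZEXT %c) << 16
         //   %d:_(s24) = G_TRUNC %r2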
1516     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1517 
1518     for (unsigned I = 2; I != NumOps; ++I) {
1519       const unsigned Offset = (I - 1) * PartSize;
1520 
1521       Register SrcReg = MI.getOperand(I).getReg();
1522       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1523 
1524       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1525 
1526       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1527         MRI.createGenericVirtualRegister(WideTy);
1528 
1529       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1530       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1531       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1532       ResultReg = NextResult;
1533     }
1534 
1535     if (WideSize > DstSize)
1536       MIRBuilder.buildTrunc(DstReg, ResultReg);
1537     else if (DstTy.isPointer())
1538       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1539 
1540     MI.eraseFromParent();
1541     return Legalized;
1542   }
1543 
1544   // Unmerge the original values to the GCD type, and recombine to the next
1545   // multiple greater than the original type.
1546   //
1547   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1548   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1549   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1550   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1551   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1552   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1553   // %12:_(s12) = G_MERGE_VALUES %10, %11
1554   //
1555   // Padding with undef if necessary:
1556   //
1557   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1558   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1559   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1560   // %7:_(s2) = G_IMPLICIT_DEF
1561   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1562   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1563   // %10:_(s12) = G_MERGE_VALUES %8, %9
1564 
1565   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1566   LLT GCDTy = LLT::scalar(GCD);
1567 
1568   SmallVector<Register, 8> Parts;
1569   SmallVector<Register, 8> NewMergeRegs;
1570   SmallVector<Register, 8> Unmerges;
1571   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1572 
1573   // Decompose the original operands if they don't evenly divide.
1574   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1575     Register SrcReg = MO.getReg();
1576     if (GCD == SrcSize) {
1577       Unmerges.push_back(SrcReg);
1578     } else {
1579       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1580       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1581         Unmerges.push_back(Unmerge.getReg(J));
1582     }
1583   }
1584 
1585   // Pad with undef to the next size that is a multiple of the requested size.
1586   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1587     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1588     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1589       Unmerges.push_back(UndefReg);
1590   }
1591 
1592   const int PartsPerGCD = WideSize / GCD;
1593 
1594   // Build merges of each piece.
1595   ArrayRef<Register> Slicer(Unmerges);
1596   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1597     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1598     NewMergeRegs.push_back(Merge.getReg(0));
1599   }
1600 
1601   // A truncate may be necessary if the requested type doesn't evenly divide the
1602   // original result type.
1603   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1604     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1605   } else {
1606     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1607     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1608   }
1609 
1610   MI.eraseFromParent();
1611   return Legalized;
1612 }
1613 
1614 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1615   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1616   LLT OrigTy = MRI.getType(OrigReg);
1617   LLT LCMTy = getLCMType(WideTy, OrigTy);
1618 
1619   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1620   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1621 
1622   Register UnmergeSrc = WideReg;
1623 
1624   // Create a merge to the LCM type, padding with undef
1625   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1626   // =>
1627   // %1:_(<4 x s32>) = G_FOO
1628   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1629   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1630   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1631   if (NumMergeParts > 1) {
1632     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1633     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1634     MergeParts[0] = WideReg;
1635     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1636   }
1637 
1638   // Unmerge to the original register and pad with dead defs.
1639   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1640   UnmergeResults[0] = OrigReg;
1641   for (int I = 1; I != NumUnmergeParts; ++I)
1642     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1643 
1644   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1645   return WideReg;
1646 }
1647 
1648 LegalizerHelper::LegalizeResult
1649 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1650                                           LLT WideTy) {
1651   if (TypeIdx != 0)
1652     return UnableToLegalize;
1653 
1654   int NumDst = MI.getNumOperands() - 1;
1655   Register SrcReg = MI.getOperand(NumDst).getReg();
1656   LLT SrcTy = MRI.getType(SrcReg);
1657   if (SrcTy.isVector())
1658     return UnableToLegalize;
1659 
1660   Register Dst0Reg = MI.getOperand(0).getReg();
1661   LLT DstTy = MRI.getType(Dst0Reg);
1662   if (!DstTy.isScalar())
1663     return UnableToLegalize;
1664 
1665   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1666     if (SrcTy.isPointer()) {
1667       const DataLayout &DL = MIRBuilder.getDataLayout();
1668       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1669         LLVM_DEBUG(
1670             dbgs() << "Not casting non-integral address space integer\n");
1671         return UnableToLegalize;
1672       }
1673 
1674       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1675       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1676     }
1677 
1678     // Widen SrcTy to WideTy. This does not affect the result, but since the
1679     // user requested this size, it is probably better handled than SrcTy and
1680     // should reduce the total number of legalization artifacts.
1681     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1682       SrcTy = WideTy;
1683       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1684     }
1685 
1686     // There's no unmerge type to target. Directly extract the bits from the
1687     // source type.
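         // e.g. %a:_(s8), %b:_(s8) = G_UNMERGE_VALUES %x:_(s16) with a
         // requested s32 becomes (an illustrative sketch):
         //   %wide:_(s32) = G_ANYEXT %x
         //   %a:_(s8) = G_TRUNC %wide
         //   %shr:_(s32) = G_LSHR %wide, 8
         //   %b:_(s8) = G_TRUNC %shr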
1688     unsigned DstSize = DstTy.getSizeInBits();
1689 
1690     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1691     for (int I = 1; I != NumDst; ++I) {
1692       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1693       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1694       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1695     }
1696 
1697     MI.eraseFromParent();
1698     return Legalized;
1699   }
1700 
1701   // Extend the source to a wider type.
1702   LLT LCMTy = getLCMType(SrcTy, WideTy);
1703 
1704   Register WideSrc = SrcReg;
1705   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1706     // TODO: If this is an integral address space, cast to integer and anyext.
1707     if (SrcTy.isPointer()) {
1708       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1709       return UnableToLegalize;
1710     }
1711 
1712     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1713   }
1714 
1715   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1716 
1717   // Create a sequence of unmerges and merges to the original results. Since we
1718   // may have widened the source, we will need to pad the results with dead defs
1719   // to cover the source register.
1720   // e.g. widen s48 to s64:
1721   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1722   //
1723   // =>
1724   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1725   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1726   //  ; unpack to GCD type, with extra dead defs
1727   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1728   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1729   //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
1730   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1731   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1732   const LLT GCDTy = getGCDType(WideTy, DstTy);
1733   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1734   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1735 
1736   // Directly unmerge to the destination without going through a GCD type
1737   // if possible.
1738   if (PartsPerRemerge == 1) {
1739     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1740 
1741     for (int I = 0; I != NumUnmerge; ++I) {
1742       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1743 
1744       for (int J = 0; J != PartsPerUnmerge; ++J) {
1745         int Idx = I * PartsPerUnmerge + J;
1746         if (Idx < NumDst)
1747           MIB.addDef(MI.getOperand(Idx).getReg());
1748         else {
1749           // Create dead def for excess components.
1750           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1751         }
1752       }
1753 
1754       MIB.addUse(Unmerge.getReg(I));
1755     }
1756   } else {
1757     SmallVector<Register, 16> Parts;
1758     for (int J = 0; J != NumUnmerge; ++J)
1759       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1760 
1761     SmallVector<Register, 8> RemergeParts;
1762     for (int I = 0; I != NumDst; ++I) {
1763       for (int J = 0; J < PartsPerRemerge; ++J) {
1764         const int Idx = I * PartsPerRemerge + J;
1765         RemergeParts.emplace_back(Parts[Idx]);
1766       }
1767 
1768       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1769       RemergeParts.clear();
1770     }
1771   }
1772 
1773   MI.eraseFromParent();
1774   return Legalized;
1775 }
1776 
1777 LegalizerHelper::LegalizeResult
1778 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1779                                     LLT WideTy) {
1780   Register DstReg = MI.getOperand(0).getReg();
1781   Register SrcReg = MI.getOperand(1).getReg();
1782   LLT SrcTy = MRI.getType(SrcReg);
1783 
1784   LLT DstTy = MRI.getType(DstReg);
1785   unsigned Offset = MI.getOperand(2).getImm();
1786 
1787   if (TypeIdx == 0) {
1788     if (SrcTy.isVector() || DstTy.isVector())
1789       return UnableToLegalize;
1790 
1791     SrcOp Src(SrcReg);
1792     if (SrcTy.isPointer()) {
1793       // Extracts from pointers can be handled only if they are really just
1794       // simple integers.
1795       const DataLayout &DL = MIRBuilder.getDataLayout();
1796       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1797         return UnableToLegalize;
1798 
1799       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1800       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1801       SrcTy = SrcAsIntTy;
1802     }
1803 
1804     if (DstTy.isPointer())
1805       return UnableToLegalize;
1806 
1807     if (Offset == 0) {
1808       // Avoid a shift in the degenerate case.
1809       MIRBuilder.buildTrunc(DstReg,
1810                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1811       MI.eraseFromParent();
1812       return Legalized;
1813     }
1814 
1815     // Do a shift in the source type.
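         // e.g. extracting s8 at offset 8 from s32: G_LSHR the (possibly
         // widened) source by 8, then G_TRUNC the result to s8.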
1816     LLT ShiftTy = SrcTy;
1817     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1818       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1819       ShiftTy = WideTy;
1820     }
1821 
1822     auto LShr = MIRBuilder.buildLShr(
1823       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1824     MIRBuilder.buildTrunc(DstReg, LShr);
1825     MI.eraseFromParent();
1826     return Legalized;
1827   }
1828 
1829   if (SrcTy.isScalar()) {
1830     Observer.changingInstr(MI);
1831     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1832     Observer.changedInstr(MI);
1833     return Legalized;
1834   }
1835 
1836   if (!SrcTy.isVector())
1837     return UnableToLegalize;
1838 
1839   if (DstTy != SrcTy.getElementType())
1840     return UnableToLegalize;
1841 
1842   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1843     return UnableToLegalize;
1844 
1845   Observer.changingInstr(MI);
1846   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1847 
1848   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1849                           Offset);
1850   widenScalarDst(MI, WideTy.getScalarType(), 0);
1851   Observer.changedInstr(MI);
1852   return Legalized;
1853 }
1854 
1855 LegalizerHelper::LegalizeResult
1856 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1857                                    LLT WideTy) {
1858   if (TypeIdx != 0 || WideTy.isVector())
1859     return UnableToLegalize;
1860   Observer.changingInstr(MI);
1861   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1862   widenScalarDst(MI, WideTy);
1863   Observer.changedInstr(MI);
1864   return Legalized;
1865 }
1866 
1867 LegalizerHelper::LegalizeResult
1868 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1869                                            LLT WideTy) {
1870   if (TypeIdx == 1)
1871     return UnableToLegalize; // TODO
1872 
1873   unsigned Opcode;
1874   unsigned ExtOpcode;
1875   Optional<Register> CarryIn = None;
1876   switch (MI.getOpcode()) {
1877   default:
1878     llvm_unreachable("Unexpected opcode!");
1879   case TargetOpcode::G_SADDO:
1880     Opcode = TargetOpcode::G_ADD;
1881     ExtOpcode = TargetOpcode::G_SEXT;
1882     break;
1883   case TargetOpcode::G_SSUBO:
1884     Opcode = TargetOpcode::G_SUB;
1885     ExtOpcode = TargetOpcode::G_SEXT;
1886     break;
1887   case TargetOpcode::G_UADDO:
1888     Opcode = TargetOpcode::G_ADD;
1889     ExtOpcode = TargetOpcode::G_ZEXT;
1890     break;
1891   case TargetOpcode::G_USUBO:
1892     Opcode = TargetOpcode::G_SUB;
1893     ExtOpcode = TargetOpcode::G_ZEXT;
1894     break;
1895   case TargetOpcode::G_SADDE:
1896     Opcode = TargetOpcode::G_UADDE;
1897     ExtOpcode = TargetOpcode::G_SEXT;
1898     CarryIn = MI.getOperand(4).getReg();
1899     break;
1900   case TargetOpcode::G_SSUBE:
1901     Opcode = TargetOpcode::G_USUBE;
1902     ExtOpcode = TargetOpcode::G_SEXT;
1903     CarryIn = MI.getOperand(4).getReg();
1904     break;
1905   case TargetOpcode::G_UADDE:
1906     Opcode = TargetOpcode::G_UADDE;
1907     ExtOpcode = TargetOpcode::G_ZEXT;
1908     CarryIn = MI.getOperand(4).getReg();
1909     break;
1910   case TargetOpcode::G_USUBE:
1911     Opcode = TargetOpcode::G_USUBE;
1912     ExtOpcode = TargetOpcode::G_ZEXT;
1913     CarryIn = MI.getOperand(4).getReg();
1914     break;
1915   }
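       // Widen both operands, do the arithmetic at the wider width, then compare
       // the wide result with a re-extended truncation of itself; a mismatch
       // means the narrow operation overflowed. An illustrative sketch for s8
       // G_SADDO widened to s32:
       //   %a32:_(s32) = G_SEXT %a:_(s8)
       //   %b32:_(s32) = G_SEXT %b:_(s8)
       //   %sum:_(s32) = G_ADD %a32, %b32
       //   %t:_(s8) = G_TRUNC %sum
       //   %t32:_(s32) = G_SEXT %t
       //   %ov:_(s1) = G_ICMP intpred(ne), %sum, %t32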
1916 
1917   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1918   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1919   // Do the arithmetic in the larger type.
1920   Register NewOp;
1921   if (CarryIn) {
1922     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1923     NewOp = MIRBuilder
1924                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1925                             {LHSExt, RHSExt, *CarryIn})
1926                 .getReg(0);
1927   } else {
1928     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1929   }
1930   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1931   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1932   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1933   // There is no overflow if the ExtOp is the same as NewOp.
1934   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1935   // Now trunc the NewOp to the original result.
1936   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1937   MI.eraseFromParent();
1938   return Legalized;
1939 }
1940 
1941 LegalizerHelper::LegalizeResult
1942 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1943                                          LLT WideTy) {
1944   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1945                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1946                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1947   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1948                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1949   // We can convert this to:
1950   //   1. Any extend iN to iM
1951   //   2. SHL by M-N
1952   //   3. [US][ADD|SUB|SHL]SAT
1953   //   4. L/ASHR by M-N
1954   //
1955   // It may be more efficient to lower this to a min and a max operation in
1956   // the higher precision arithmetic if the promoted operation isn't legal,
1957   // but this decision is up to the target's lowering request.
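       //
       // An illustrative sketch for s8 G_SADDSAT widened to s16 (so M - N = 8):
       //   %a16:_(s16) = G_ANYEXT %a:_(s8)
       //   %b16:_(s16) = G_ANYEXT %b:_(s8)
       //   %as:_(s16) = G_SHL %a16, 8
       //   %bs:_(s16) = G_SHL %b16, 8
       //   %sat:_(s16) = G_SADDSAT %as, %bs
       //   %shr:_(s16) = G_ASHR %sat, 8
       //   %d:_(s8) = G_TRUNC %shr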
1958   Register DstReg = MI.getOperand(0).getReg();
1959 
1960   unsigned NewBits = WideTy.getScalarSizeInBits();
1961   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1962 
1963   // For shifts, the RHS is a shift amount: it must be zero-extended to
1964   // preserve its unsigned value, and must not be left-shifted like the LHS.
1965   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1966   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1967                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1968   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1969   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1970   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1971 
1972   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1973                                         {ShiftL, ShiftR}, MI.getFlags());
1974 
1975   // Use a shift that will preserve the number of sign bits when the trunc is
1976   // folded away.
1977   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1978                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1979 
1980   MIRBuilder.buildTrunc(DstReg, Result);
1981   MI.eraseFromParent();
1982   return Legalized;
1983 }
1984 
1985 LegalizerHelper::LegalizeResult
1986 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1987                                  LLT WideTy) {
1988   if (TypeIdx == 1)
1989     return UnableToLegalize;
1990 
1991   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1992   Register Result = MI.getOperand(0).getReg();
1993   Register OriginalOverflow = MI.getOperand(1).getReg();
1994   Register LHS = MI.getOperand(2).getReg();
1995   Register RHS = MI.getOperand(3).getReg();
1996   LLT SrcTy = MRI.getType(LHS);
1997   LLT OverflowTy = MRI.getType(OriginalOverflow);
1998   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1999 
2000   // To determine if the result overflowed in the larger type, we extend the
2001   // input to the larger type, do the multiply (checking if it overflows),
2002   // then also check the high bits of the result to see if overflow happened
2003   // there.
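       //
       // An illustrative sketch for s8 G_UMULO widened to s16 (wide enough that
       // the s16 multiply itself cannot overflow, so only the high half of the
       // product needs checking):
       //   %a16:_(s16) = G_ZEXT %a:_(s8)
       //   %b16:_(s16) = G_ZEXT %b:_(s8)
       //   %mul:_(s16), %wov:_(s1) = G_UMULO %a16, %b16
       //   %res:_(s8) = G_TRUNC %mul
       //   %low:_(s16) = G_AND %mul, 255  ; zero-extend-in-reg of the low 8 bits
       //   %ov:_(s1) = G_ICMP intpred(ne), %mul, %low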
2004   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2005   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2006   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2007 
2008   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
2009                                     {LeftOperand, RightOperand});
2010   auto Mul = Mulo->getOperand(0);
2011   MIRBuilder.buildTrunc(Result, Mul);
2012 
2013   MachineInstrBuilder ExtResult;
2014   // Overflow occurred if it occurred in the larger type, or if the high part
2015   // of the result does not zero/sign-extend the low part.  Check this second
2016   // possibility first.
2017   if (IsSigned) {
2018     // For signed, overflow occurred when the high part does not sign-extend
2019     // the low part.
2020     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2021   } else {
2022     // Unsigned overflow occurred when the high part does not zero-extend the
2023     // low part.
2024     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2025   }
2026 
2027   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2028   // so we don't need to check the overflow result of the larger-type Mulo.
2029   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
2030     auto Overflow =
2031         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2032     // Finally check if the multiplication in the larger type itself overflowed.
2033     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2034   } else {
2035     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2036   }
2037   MI.eraseFromParent();
2038   return Legalized;
2039 }
2040 
2041 LegalizerHelper::LegalizeResult
2042 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2043   switch (MI.getOpcode()) {
2044   default:
2045     return UnableToLegalize;
2046   case TargetOpcode::G_ATOMICRMW_XCHG:
2047   case TargetOpcode::G_ATOMICRMW_ADD:
2048   case TargetOpcode::G_ATOMICRMW_SUB:
2049   case TargetOpcode::G_ATOMICRMW_AND:
2050   case TargetOpcode::G_ATOMICRMW_OR:
2051   case TargetOpcode::G_ATOMICRMW_XOR:
2052   case TargetOpcode::G_ATOMICRMW_MIN:
2053   case TargetOpcode::G_ATOMICRMW_MAX:
2054   case TargetOpcode::G_ATOMICRMW_UMIN:
2055   case TargetOpcode::G_ATOMICRMW_UMAX:
2056     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2057     Observer.changingInstr(MI);
2058     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2059     widenScalarDst(MI, WideTy, 0);
2060     Observer.changedInstr(MI);
2061     return Legalized;
2062   case TargetOpcode::G_ATOMIC_CMPXCHG:
2063     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2064     Observer.changingInstr(MI);
2065     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2066     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2067     widenScalarDst(MI, WideTy, 0);
2068     Observer.changedInstr(MI);
2069     return Legalized;
2070   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2071     if (TypeIdx == 0) {
2072       Observer.changingInstr(MI);
2073       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2074       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2075       widenScalarDst(MI, WideTy, 0);
2076       Observer.changedInstr(MI);
2077       return Legalized;
2078     }
2079     assert(TypeIdx == 1 &&
2080            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2081     Observer.changingInstr(MI);
2082     widenScalarDst(MI, WideTy, 1);
2083     Observer.changedInstr(MI);
2084     return Legalized;
2085   case TargetOpcode::G_EXTRACT:
2086     return widenScalarExtract(MI, TypeIdx, WideTy);
2087   case TargetOpcode::G_INSERT:
2088     return widenScalarInsert(MI, TypeIdx, WideTy);
2089   case TargetOpcode::G_MERGE_VALUES:
2090     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2091   case TargetOpcode::G_UNMERGE_VALUES:
2092     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2093   case TargetOpcode::G_SADDO:
2094   case TargetOpcode::G_SSUBO:
2095   case TargetOpcode::G_UADDO:
2096   case TargetOpcode::G_USUBO:
2097   case TargetOpcode::G_SADDE:
2098   case TargetOpcode::G_SSUBE:
2099   case TargetOpcode::G_UADDE:
2100   case TargetOpcode::G_USUBE:
2101     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2102   case TargetOpcode::G_UMULO:
2103   case TargetOpcode::G_SMULO:
2104     return widenScalarMulo(MI, TypeIdx, WideTy);
2105   case TargetOpcode::G_SADDSAT:
2106   case TargetOpcode::G_SSUBSAT:
2107   case TargetOpcode::G_SSHLSAT:
2108   case TargetOpcode::G_UADDSAT:
2109   case TargetOpcode::G_USUBSAT:
2110   case TargetOpcode::G_USHLSAT:
2111     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2112   case TargetOpcode::G_CTTZ:
2113   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2114   case TargetOpcode::G_CTLZ:
2115   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2116   case TargetOpcode::G_CTPOP: {
2117     if (TypeIdx == 0) {
2118       Observer.changingInstr(MI);
2119       widenScalarDst(MI, WideTy, 0);
2120       Observer.changedInstr(MI);
2121       return Legalized;
2122     }
2123 
2124     Register SrcReg = MI.getOperand(1).getReg();
2125 
2126     // First extend the input.
2127     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2128                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2129                           ? TargetOpcode::G_ANYEXT
2130                           : TargetOpcode::G_ZEXT;
2131     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2132     LLT CurTy = MRI.getType(SrcReg);
2133     unsigned NewOpc = MI.getOpcode();
2134     if (NewOpc == TargetOpcode::G_CTTZ) {
2135       // The count is the same in the larger type except if the original
2136       // value was zero.  This can be handled by setting the bit just off
2137       // the top of the original type.
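           // e.g. for s8 widened to s32, OR in 0x100 so a zero input still
           // produces a trailing-zero count of 8.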
2138       auto TopBit =
2139           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2140       MIBSrc = MIRBuilder.buildOr(
2141         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2142       // Now we know the operand is non-zero, use the more relaxed opcode.
2143       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2144     }
2145 
2146     // Perform the operation at the larger size.
2147     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2148     // This is already the correct result for CTPOP and CTTZs.
2149     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2150         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2151       // The correct result is NewOp - (WideTy size - CurTy size).
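           // e.g. an s8 value counted in s32 sees 24 extra leading zeros.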
2152       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2153       MIBNewOp = MIRBuilder.buildSub(
2154           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2155     }
2156 
2157     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2158     MI.eraseFromParent();
2159     return Legalized;
2160   }
2161   case TargetOpcode::G_BSWAP: {
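         // Byte-swap at the wider width; the original bytes then end up in the
         // high part of the wide register, so shift them back down and
         // truncate. An illustrative sketch for s16 widened to s32:
         //   %x32:_(s32) = G_ANYEXT %x:_(s16)
         //   %swap:_(s32) = G_BSWAP %x32
         //   %shr:_(s32) = G_LSHR %swap, 16
         //   %r:_(s16) = G_TRUNC %shr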
2162     Observer.changingInstr(MI);
2163     Register DstReg = MI.getOperand(0).getReg();
2164 
2165     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2166     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2167     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2168     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2169 
2170     MI.getOperand(0).setReg(DstExt);
2171 
2172     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2173 
2174     LLT Ty = MRI.getType(DstReg);
2175     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2176     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2177     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2178 
2179     MIRBuilder.buildTrunc(DstReg, ShrReg);
2180     Observer.changedInstr(MI);
2181     return Legalized;
2182   }
2183   case TargetOpcode::G_BITREVERSE: {
2184     Observer.changingInstr(MI);
2185 
2186     Register DstReg = MI.getOperand(0).getReg();
2187     LLT Ty = MRI.getType(DstReg);
2188     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
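         // As with G_BSWAP, the reversed bits of the anyext'd value land in the
         // high part of the wide register; shift them down by DiffBits.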
2189 
2190     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2191     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2192     MI.getOperand(0).setReg(DstExt);
2193     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2194 
2195     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2196     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2197     MIRBuilder.buildTrunc(DstReg, Shift);
2198     Observer.changedInstr(MI);
2199     return Legalized;
2200   }
2201   case TargetOpcode::G_FREEZE:
2202     Observer.changingInstr(MI);
2203     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2204     widenScalarDst(MI, WideTy);
2205     Observer.changedInstr(MI);
2206     return Legalized;
2207 
2208   case TargetOpcode::G_ABS:
2209     Observer.changingInstr(MI);
2210     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2211     widenScalarDst(MI, WideTy);
2212     Observer.changedInstr(MI);
2213     return Legalized;
2214 
2215   case TargetOpcode::G_ADD:
2216   case TargetOpcode::G_AND:
2217   case TargetOpcode::G_MUL:
2218   case TargetOpcode::G_OR:
2219   case TargetOpcode::G_XOR:
2220   case TargetOpcode::G_SUB:
2221     // Perform operation at larger width (any extension is fine here, high bits
2222     // don't affect the result) and then truncate the result back to the
2223     // original type.
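         // e.g. an s8 G_ADD becomes: G_ANYEXT both operands to s32, G_ADD at
         // s32, then G_TRUNC the sum back to s8.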
2224     Observer.changingInstr(MI);
2225     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2226     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2227     widenScalarDst(MI, WideTy);
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230 
2231   case TargetOpcode::G_SBFX:
2232   case TargetOpcode::G_UBFX:
2233     Observer.changingInstr(MI);
2234 
2235     if (TypeIdx == 0) {
2236       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2237       widenScalarDst(MI, WideTy);
2238     } else {
2239       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2240       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2241     }
2242 
2243     Observer.changedInstr(MI);
2244     return Legalized;
2245 
2246   case TargetOpcode::G_SHL:
2247     Observer.changingInstr(MI);
2248 
2249     if (TypeIdx == 0) {
2250       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2251       widenScalarDst(MI, WideTy);
2252     } else {
2253       assert(TypeIdx == 1);
2254       // The "number of bits to shift" operand must preserve its value as an
2255       // unsigned integer:
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2257     }
2258 
2259     Observer.changedInstr(MI);
2260     return Legalized;
2261 
2262   case TargetOpcode::G_SDIV:
2263   case TargetOpcode::G_SREM:
2264   case TargetOpcode::G_SMIN:
2265   case TargetOpcode::G_SMAX:
2266     Observer.changingInstr(MI);
2267     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2268     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2269     widenScalarDst(MI, WideTy);
2270     Observer.changedInstr(MI);
2271     return Legalized;
2272 
2273   case TargetOpcode::G_SDIVREM:
2274     Observer.changingInstr(MI);
2275     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2276     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2277     widenScalarDst(MI, WideTy);
2278     widenScalarDst(MI, WideTy, 1);
2279     Observer.changedInstr(MI);
2280     return Legalized;
2281 
2282   case TargetOpcode::G_ASHR:
2283   case TargetOpcode::G_LSHR:
2284     Observer.changingInstr(MI);
2285 
2286     if (TypeIdx == 0) {
2287       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2288         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2289 
2290       widenScalarSrc(MI, WideTy, 1, CvtOp);
2291       widenScalarDst(MI, WideTy);
2292     } else {
2293       assert(TypeIdx == 1);
2294       // The "number of bits to shift" operand must preserve its value as an
2295       // unsigned integer:
2296       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2297     }
2298 
2299     Observer.changedInstr(MI);
2300     return Legalized;
2301   case TargetOpcode::G_UDIV:
2302   case TargetOpcode::G_UREM:
2303   case TargetOpcode::G_UMIN:
2304   case TargetOpcode::G_UMAX:
2305     Observer.changingInstr(MI);
2306     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2307     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2308     widenScalarDst(MI, WideTy);
2309     Observer.changedInstr(MI);
2310     return Legalized;
2311 
2312   case TargetOpcode::G_UDIVREM:
2313     Observer.changingInstr(MI);
2314     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2315     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2316     widenScalarDst(MI, WideTy);
2317     widenScalarDst(MI, WideTy, 1);
2318     Observer.changedInstr(MI);
2319     return Legalized;
2320 
2321   case TargetOpcode::G_SELECT:
2322     Observer.changingInstr(MI);
2323     if (TypeIdx == 0) {
2324       // Perform operation at larger width (any extension is fine here, high
2325       // bits don't affect the result) and then truncate the result back to the
2326       // original type.
2327       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2328       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2329       widenScalarDst(MI, WideTy);
2330     } else {
2331       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2332       // Explicit extension is required here since high bits affect the result.
2333       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2334     }
2335     Observer.changedInstr(MI);
2336     return Legalized;
2337 
2338   case TargetOpcode::G_FPTOSI:
2339   case TargetOpcode::G_FPTOUI:
2340     Observer.changingInstr(MI);
2341 
2342     if (TypeIdx == 0)
2343       widenScalarDst(MI, WideTy);
2344     else
2345       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2346 
2347     Observer.changedInstr(MI);
2348     return Legalized;
2349   case TargetOpcode::G_SITOFP:
2350     Observer.changingInstr(MI);
2351 
2352     if (TypeIdx == 0)
2353       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2354     else
2355       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2356 
2357     Observer.changedInstr(MI);
2358     return Legalized;
2359   case TargetOpcode::G_UITOFP:
2360     Observer.changingInstr(MI);
2361 
2362     if (TypeIdx == 0)
2363       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2364     else
2365       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2366 
2367     Observer.changedInstr(MI);
2368     return Legalized;
2369   case TargetOpcode::G_LOAD:
2370   case TargetOpcode::G_SEXTLOAD:
2371   case TargetOpcode::G_ZEXTLOAD:
2372     Observer.changingInstr(MI);
2373     widenScalarDst(MI, WideTy);
2374     Observer.changedInstr(MI);
2375     return Legalized;
2376 
2377   case TargetOpcode::G_STORE: {
2378     if (TypeIdx != 0)
2379       return UnableToLegalize;
2380 
2381     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2382     if (!Ty.isScalar())
2383       return UnableToLegalize;
2384 
2385     Observer.changingInstr(MI);
2386 
2387     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2388       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2389     widenScalarSrc(MI, WideTy, 0, ExtType);
2390 
2391     Observer.changedInstr(MI);
2392     return Legalized;
2393   }
2394   case TargetOpcode::G_CONSTANT: {
2395     MachineOperand &SrcMO = MI.getOperand(1);
2396     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2397     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2398         MRI.getType(MI.getOperand(0).getReg()));
2399     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2400             ExtOpc == TargetOpcode::G_ANYEXT) &&
2401            "Illegal Extend");
2402     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2403     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2404                            ? SrcVal.sext(WideTy.getSizeInBits())
2405                            : SrcVal.zext(WideTy.getSizeInBits());
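         // e.g. an s8 G_CONSTANT -1 widened to s32 becomes 0xFFFFFFFF with
         // G_SEXT or 0xFF with G_ZEXT; the low 8 bits are identical either way.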
2406     Observer.changingInstr(MI);
2407     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2408 
2409     widenScalarDst(MI, WideTy);
2410     Observer.changedInstr(MI);
2411     return Legalized;
2412   }
2413   case TargetOpcode::G_FCONSTANT: {
2414     MachineOperand &SrcMO = MI.getOperand(1);
2415     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2416     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2417     bool LosesInfo;
2418     switch (WideTy.getSizeInBits()) {
2419     case 32:
2420       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2421                   &LosesInfo);
2422       break;
2423     case 64:
2424       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2425                   &LosesInfo);
2426       break;
2427     default:
2428       return UnableToLegalize;
2429     }
2430 
2431     assert(!LosesInfo && "extend should always be lossless");
2432 
2433     Observer.changingInstr(MI);
2434     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2435 
2436     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2437     Observer.changedInstr(MI);
2438     return Legalized;
2439   }
2440   case TargetOpcode::G_IMPLICIT_DEF: {
2441     Observer.changingInstr(MI);
2442     widenScalarDst(MI, WideTy);
2443     Observer.changedInstr(MI);
2444     return Legalized;
2445   }
2446   case TargetOpcode::G_BRCOND:
2447     Observer.changingInstr(MI);
2448     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2449     Observer.changedInstr(MI);
2450     return Legalized;
2451 
2452   case TargetOpcode::G_FCMP:
2453     Observer.changingInstr(MI);
2454     if (TypeIdx == 0)
2455       widenScalarDst(MI, WideTy);
2456     else {
2457       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2458       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2459     }
2460     Observer.changedInstr(MI);
2461     return Legalized;
2462 
2463   case TargetOpcode::G_ICMP:
2464     Observer.changingInstr(MI);
2465     if (TypeIdx == 0)
2466       widenScalarDst(MI, WideTy);
2467     else {
2468       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2469                                MI.getOperand(1).getPredicate()))
2470                                ? TargetOpcode::G_SEXT
2471                                : TargetOpcode::G_ZEXT;
2472       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2473       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2474     }
2475     Observer.changedInstr(MI);
2476     return Legalized;
2477 
2478   case TargetOpcode::G_PTR_ADD:
2479     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2480     Observer.changingInstr(MI);
2481     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2482     Observer.changedInstr(MI);
2483     return Legalized;
2484 
2485   case TargetOpcode::G_PHI: {
2486     assert(TypeIdx == 0 && "Expecting only Idx 0");
2487 
2488     Observer.changingInstr(MI);
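         // Extend each incoming value in its predecessor block, inserting just
         // before that block's terminator so the extended value is available on
         // the incoming edge.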
2489     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2490       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2491       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2492       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2493     }
2494 
2495     MachineBasicBlock &MBB = *MI.getParent();
2496     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2497     widenScalarDst(MI, WideTy);
2498     Observer.changedInstr(MI);
2499     return Legalized;
2500   }
2501   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2502     if (TypeIdx == 0) {
2503       Register VecReg = MI.getOperand(1).getReg();
2504       LLT VecTy = MRI.getType(VecReg);
2505       Observer.changingInstr(MI);
2506 
2507       widenScalarSrc(
2508           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2509           TargetOpcode::G_ANYEXT);
2510 
2511       widenScalarDst(MI, WideTy, 0);
2512       Observer.changedInstr(MI);
2513       return Legalized;
2514     }
2515 
2516     if (TypeIdx != 2)
2517       return UnableToLegalize;
2518     Observer.changingInstr(MI);
2519     // TODO: Probably should be zext
2520     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2521     Observer.changedInstr(MI);
2522     return Legalized;
2523   }
2524   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2525     if (TypeIdx == 1) {
2526       Observer.changingInstr(MI);
2527 
2528       Register VecReg = MI.getOperand(1).getReg();
2529       LLT VecTy = MRI.getType(VecReg);
2530       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2531 
2532       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2533       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2534       widenScalarDst(MI, WideVecTy, 0);
2535       Observer.changedInstr(MI);
2536       return Legalized;
2537     }
2538 
2539     if (TypeIdx == 2) {
2540       Observer.changingInstr(MI);
2541       // TODO: Probably should be zext
2542       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2543       Observer.changedInstr(MI);
2544       return Legalized;
2545     }
2546 
2547     return UnableToLegalize;
2548   }
2549   case TargetOpcode::G_FADD:
2550   case TargetOpcode::G_FMUL:
2551   case TargetOpcode::G_FSUB:
2552   case TargetOpcode::G_FMA:
2553   case TargetOpcode::G_FMAD:
2554   case TargetOpcode::G_FNEG:
2555   case TargetOpcode::G_FABS:
2556   case TargetOpcode::G_FCANONICALIZE:
2557   case TargetOpcode::G_FMINNUM:
2558   case TargetOpcode::G_FMAXNUM:
2559   case TargetOpcode::G_FMINNUM_IEEE:
2560   case TargetOpcode::G_FMAXNUM_IEEE:
2561   case TargetOpcode::G_FMINIMUM:
2562   case TargetOpcode::G_FMAXIMUM:
2563   case TargetOpcode::G_FDIV:
2564   case TargetOpcode::G_FREM:
2565   case TargetOpcode::G_FCEIL:
2566   case TargetOpcode::G_FFLOOR:
2567   case TargetOpcode::G_FCOS:
2568   case TargetOpcode::G_FSIN:
2569   case TargetOpcode::G_FLOG10:
2570   case TargetOpcode::G_FLOG:
2571   case TargetOpcode::G_FLOG2:
2572   case TargetOpcode::G_FRINT:
2573   case TargetOpcode::G_FNEARBYINT:
2574   case TargetOpcode::G_FSQRT:
2575   case TargetOpcode::G_FEXP:
2576   case TargetOpcode::G_FEXP2:
2577   case TargetOpcode::G_FPOW:
2578   case TargetOpcode::G_INTRINSIC_TRUNC:
2579   case TargetOpcode::G_INTRINSIC_ROUND:
2580   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2581     assert(TypeIdx == 0);
2582     Observer.changingInstr(MI);
2583 
2584     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2585       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2586 
2587     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2588     Observer.changedInstr(MI);
2589     return Legalized;
2590   case TargetOpcode::G_FPOWI: {
2591     if (TypeIdx != 0)
2592       return UnableToLegalize;
2593     Observer.changingInstr(MI);
2594     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2595     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2596     Observer.changedInstr(MI);
2597     return Legalized;
2598   }
2599   case TargetOpcode::G_INTTOPTR:
2600     if (TypeIdx != 1)
2601       return UnableToLegalize;
2602 
2603     Observer.changingInstr(MI);
2604     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2605     Observer.changedInstr(MI);
2606     return Legalized;
2607   case TargetOpcode::G_PTRTOINT:
2608     if (TypeIdx != 0)
2609       return UnableToLegalize;
2610 
2611     Observer.changingInstr(MI);
2612     widenScalarDst(MI, WideTy, 0);
2613     Observer.changedInstr(MI);
2614     return Legalized;
2615   case TargetOpcode::G_BUILD_VECTOR: {
2616     Observer.changingInstr(MI);
2617 
2618     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2619     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2620       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2621 
2622     // Avoid changing the result vector type if the source element type was
2623     // requested.
2624     if (TypeIdx == 1) {
2625       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2626     } else {
2627       widenScalarDst(MI, WideTy, 0);
2628     }
2629 
2630     Observer.changedInstr(MI);
2631     return Legalized;
2632   }
2633   case TargetOpcode::G_SEXT_INREG:
2634     if (TypeIdx != 0)
2635       return UnableToLegalize;
2636 
2637     Observer.changingInstr(MI);
2638     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2639     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2640     Observer.changedInstr(MI);
2641     return Legalized;
2642   case TargetOpcode::G_PTRMASK: {
2643     if (TypeIdx != 1)
2644       return UnableToLegalize;
2645     Observer.changingInstr(MI);
2646     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2647     Observer.changedInstr(MI);
2648     return Legalized;
2649   }
2650   }
2651 }
2652 
2653 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2654                              MachineIRBuilder &B, Register Src, LLT Ty) {
2655   auto Unmerge = B.buildUnmerge(Ty, Src);
2656   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2657     Pieces.push_back(Unmerge.getReg(I));
2658 }
2659 
2660 LegalizerHelper::LegalizeResult
2661 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2662   Register Dst = MI.getOperand(0).getReg();
2663   Register Src = MI.getOperand(1).getReg();
2664   LLT DstTy = MRI.getType(Dst);
2665   LLT SrcTy = MRI.getType(Src);
2666 
2667   if (SrcTy.isVector()) {
2668     LLT SrcEltTy = SrcTy.getElementType();
2669     SmallVector<Register, 8> SrcRegs;
2670 
2671     if (DstTy.isVector()) {
2672       int NumDstElt = DstTy.getNumElements();
2673       int NumSrcElt = SrcTy.getNumElements();
2674 
2675       LLT DstEltTy = DstTy.getElementType();
2676       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2677       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2678 
2679       // If there's an element size mismatch, insert intermediate casts to match
2680       // the result element type.
2681       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2682         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2683         //
2684         // =>
2685         //
2686         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2687         // %4:_(<2 x s8>) = G_BITCAST %2
2688         // %5:_(<2 x s8>) = G_BITCAST %3
2689         // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
2690         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2691         SrcPartTy = SrcEltTy;
2692       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2693         //
2694         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2695         //
2696         // =>
2697         //
2698         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2699         // %4:_(s16) = G_BITCAST %2
2700         // %5:_(s16) = G_BITCAST %3
2701         // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
2702         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2703         DstCastTy = DstEltTy;
2704       }
2705 
2706       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2707       for (Register &SrcReg : SrcRegs)
2708         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2709     } else
2710       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2711 
2712     MIRBuilder.buildMerge(Dst, SrcRegs);
2713     MI.eraseFromParent();
2714     return Legalized;
2715   }
2716 
2717   if (DstTy.isVector()) {
2718     SmallVector<Register, 8> SrcRegs;
2719     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2720     MIRBuilder.buildMerge(Dst, SrcRegs);
2721     MI.eraseFromParent();
2722     return Legalized;
2723   }
2724 
2725   return UnableToLegalize;
2726 }
2727 
2728 /// Figure out the bit offset into a register when coercing a vector index for
2729 /// the wide element type. This is only for the case when promoting a vector
2730 /// to one with larger elements.
2732 ///
2733 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2734 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
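     ///
     /// e.g. for s8 elements promoted to s32 (a ratio of 4), this computes
     /// %offset_idx = %idx & 3 and %offset_bits = %offset_idx * 8.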
2735 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2736                                                    Register Idx,
2737                                                    unsigned NewEltSize,
2738                                                    unsigned OldEltSize) {
2739   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2740   LLT IdxTy = B.getMRI()->getType(Idx);
2741 
2742   // Now figure out the amount we need to shift to get the target bits.
2743   auto OffsetMask = B.buildConstant(
2744       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2745   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2746   return B.buildShl(IdxTy, OffsetIdx,
2747                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2748 }
2749 
2750 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2751 /// is casting to a vector with a smaller element size, perform multiple element
2752 /// extracts and merge the results. If this is coercing to a vector with larger
2753 /// elements, index the bitcasted vector and extract the target element with bit
2754 /// operations. This is intended to force the indexing in the native register
2755 /// size for architectures that can dynamically index the register file.
2756 LegalizerHelper::LegalizeResult
2757 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2758                                          LLT CastTy) {
2759   if (TypeIdx != 1)
2760     return UnableToLegalize;
2761 
2762   Register Dst = MI.getOperand(0).getReg();
2763   Register SrcVec = MI.getOperand(1).getReg();
2764   Register Idx = MI.getOperand(2).getReg();
2765   LLT SrcVecTy = MRI.getType(SrcVec);
2766   LLT IdxTy = MRI.getType(Idx);
2767 
2768   LLT SrcEltTy = SrcVecTy.getElementType();
2769   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2770   unsigned OldNumElts = SrcVecTy.getNumElements();
2771 
2772   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2773   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2774 
2775   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2776   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2777   if (NewNumElts > OldNumElts) {
2778     // Decreasing the vector element size
2779     //
2780     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2781     //  =>
2782     //  v4i32:castx = bitcast x:v2i64
2783     //
2784     // i64 = bitcast
2785     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2786     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2787     //
2788     if (NewNumElts % OldNumElts != 0)
2789       return UnableToLegalize;
2790 
2791     // Type of the intermediate result vector.
2792     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2793     LLT MidTy =
2794         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2795 
2796     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2797 
2798     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2799     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2800 
2801     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2802       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2803       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2804       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2805       NewOps[I] = Elt.getReg(0);
2806     }
2807 
2808     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2809     MIRBuilder.buildBitcast(Dst, NewVec);
2810     MI.eraseFromParent();
2811     return Legalized;
2812   }
2813 
2814   if (NewNumElts < OldNumElts) {
2815     if (NewEltSize % OldEltSize != 0)
2816       return UnableToLegalize;
2817 
2818     // This only depends on powers of 2 because we use bit tricks to figure out
2819     // the bit offset we need to shift to get the target element. A general
2820     // expansion could emit division/multiply.
2821     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2822       return UnableToLegalize;
2823 
2824     // Increasing the vector element size.
2825     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2826     //
2827     //   =>
2828     //
2829     // %cast = G_BITCAST %vec
2830     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2831     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2832     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2833     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2834     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2835     // %elt = G_TRUNC %elt_bits
2836 
2837     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2838     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2839 
2840     // Divide to get the index in the wider element type.
2841     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2842 
2843     Register WideElt = CastVec;
2844     if (CastTy.isVector()) {
2845       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2846                                                      ScaledIdx).getReg(0);
2847     }
2848 
2849     // Compute the bit offset into the register of the target element.
2850     Register OffsetBits = getBitcastWiderVectorElementOffset(
2851       MIRBuilder, Idx, NewEltSize, OldEltSize);
2852 
2853     // Shift the wide element to get the target element.
2854     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2855     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2856     MI.eraseFromParent();
2857     return Legalized;
2858   }
2859 
2860   return UnableToLegalize;
2861 }
2862 
2863 /// Emit code to insert \p InsertReg into \p TargetReg at bit offset
2864 /// \p OffsetBits, while preserving the other bits in \p TargetReg:
2865 ///
2866 /// (InsertReg << Offset) | (TargetReg & ~(LowBits(InsertReg.size()) << Offset))
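///
/// For example, inserting an s8 value into an s32 target at OffsetBits = 8
/// computes (ZExt(InsertReg) << 8) | (TargetReg & ~(0xff << 8)).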
2867 static Register buildBitFieldInsert(MachineIRBuilder &B,
2868                                     Register TargetReg, Register InsertReg,
2869                                     Register OffsetBits) {
2870   LLT TargetTy = B.getMRI()->getType(TargetReg);
2871   LLT InsertTy = B.getMRI()->getType(InsertReg);
2872   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2873   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2874 
2875   // Produce a bitmask of the value to insert
2876   auto EltMask = B.buildConstant(
2877     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2878                                    InsertTy.getSizeInBits()));
2879   // Shift it into position
2880   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2881   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2882 
2883   // Clear out the bits in the wide element
2884   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2885 
2886   // The shifted insert value already has zeros in all other bit positions,
2887   // so OR it into the masked wide element.
2888   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2889 }
2890 
2891 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2892 /// is increasing the element size, perform the indexing in the target element
2893 /// type, and use bit operations to insert at the element position. This is
2894 /// intended for architectures that can dynamically index the register file and
2895 /// want to force indexing in the native register size.
2896 LegalizerHelper::LegalizeResult
2897 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2898                                         LLT CastTy) {
2899   if (TypeIdx != 0)
2900     return UnableToLegalize;
2901 
2902   Register Dst = MI.getOperand(0).getReg();
2903   Register SrcVec = MI.getOperand(1).getReg();
2904   Register Val = MI.getOperand(2).getReg();
2905   Register Idx = MI.getOperand(3).getReg();
2906 
2907   LLT VecTy = MRI.getType(Dst);
2908   LLT IdxTy = MRI.getType(Idx);
2909 
2910   LLT VecEltTy = VecTy.getElementType();
2911   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2912   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2913   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2914 
2915   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2916   unsigned OldNumElts = VecTy.getNumElements();
2917 
2918   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2919   if (NewNumElts < OldNumElts) {
2920     if (NewEltSize % OldEltSize != 0)
2921       return UnableToLegalize;
2922 
2923     // This only depends on powers of 2 because we use bit tricks to figure out
2924     // the bit offset we need to shift to get the target element. A general
2925     // expansion could emit division/multiply.
2926     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2927       return UnableToLegalize;
2928 
2929     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2930     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2931 
2932     // Divide to get the index in the wider element type.
2933     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2934 
2935     Register ExtractedElt = CastVec;
2936     if (CastTy.isVector()) {
2937       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2938                                                           ScaledIdx).getReg(0);
2939     }
2940 
2941     // Compute the bit offset into the register of the target element.
2942     Register OffsetBits = getBitcastWiderVectorElementOffset(
2943       MIRBuilder, Idx, NewEltSize, OldEltSize);
2944 
2945     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2946                                                Val, OffsetBits);
2947     if (CastTy.isVector()) {
2948       InsertedElt = MIRBuilder.buildInsertVectorElement(
2949         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2950     }
2951 
2952     MIRBuilder.buildBitcast(Dst, InsertedElt);
2953     MI.eraseFromParent();
2954     return Legalized;
2955   }
2956 
2957   return UnableToLegalize;
2958 }
2959 
2960 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2961   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2962   Register DstReg = LoadMI.getDstReg();
2963   Register PtrReg = LoadMI.getPointerReg();
2964   LLT DstTy = MRI.getType(DstReg);
2965   MachineMemOperand &MMO = LoadMI.getMMO();
2966   LLT MemTy = MMO.getMemoryType();
2967   MachineFunction &MF = MIRBuilder.getMF();
2968 
2969   unsigned MemSizeInBits = MemTy.getSizeInBits();
2970   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2971 
2972   if (MemSizeInBits != MemStoreSizeInBits) {
2973     if (MemTy.isVector())
2974       return UnableToLegalize;
2975 
2976     // Promote to a byte-sized load if not loading an integral number of
2977     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2978     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2979     MachineMemOperand *NewMMO =
2980         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2981 
2982     Register LoadReg = DstReg;
2983     LLT LoadTy = DstTy;
2984 
2985     // If this wasn't already an extending load, we need to widen the result
2986     // register to avoid creating a load with a narrower result than the source.
2987     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2988       LoadTy = WideMemTy;
2989       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2990     }
2991 
2992     if (isa<GSExtLoad>(LoadMI)) {
2993       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2994       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2995     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2996       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2997       // The extra bits are guaranteed to be zero, since we stored them that
2998       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2999       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3000     } else {
3001       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3002     }
3003 
3004     if (DstTy != LoadTy)
3005       MIRBuilder.buildTrunc(DstReg, LoadReg);
3006 
3007     LoadMI.eraseFromParent();
3008     return Legalized;
3009   }
3010 
3011   // Big endian lowering not implemented.
3012   if (MIRBuilder.getDataLayout().isBigEndian())
3013     return UnableToLegalize;
3014 
3015   // This load needs splitting into power of 2 sized loads.
3016   //
3017   // Our strategy here is to generate anyextending loads for the smaller
3018   // types up to the next power-of-2 result type, and then combine the two larger
3019   // result values together, before truncating back down to the non-pow-2
3020   // type.
3021   // E.g. v1 = i24 load =>
3022   // v2 = i32 zextload (2 byte)
3023   // v3 = i32 load (1 byte)
3024   // v4 = i32 shl v3, 16
3025   // v5 = i32 or v4, v2
3026   // v1 = i24 trunc v5
3027   // By doing this we generate the correct truncate which should get
3028   // combined away as an artifact with a matching extend.
3029 
3030   uint64_t LargeSplitSize, SmallSplitSize;
3031 
3032   if (!isPowerOf2_32(MemSizeInBits)) {
3033     // This load needs splitting into power of 2 sized loads.
3034     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
3035     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3036   } else {
3037     // This is already a power of 2, but we still need to split this in half.
3038     //
3039     // Assume we're being asked to decompose an unaligned load.
3040     // TODO: If this requires multiple splits, handle them all at once.
3041     auto &Ctx = MF.getFunction().getContext();
3042     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3043       return UnableToLegalize;
3044 
3045     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3046   }
3047 
3048   if (MemTy.isVector()) {
3049     // TODO: Handle vector extloads
3050     if (MemTy != DstTy)
3051       return UnableToLegalize;
3052 
3053     // TODO: We can do better than scalarizing the vector and at least split it
3054     // in half.
3055     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3056   }
3057 
3058   MachineMemOperand *LargeMMO =
3059       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3060   MachineMemOperand *SmallMMO =
3061       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3062 
3063   LLT PtrTy = MRI.getType(PtrReg);
3064   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3065   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3066   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3067                                              PtrReg, *LargeMMO);
3068 
3069   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3070                                             LargeSplitSize / 8);
3071   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3072   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3073   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3074                                              SmallPtr, *SmallMMO);
3075 
3076   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3077   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3078 
3079   if (AnyExtTy == DstTy)
3080     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3081   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3082     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3083     MIRBuilder.buildTrunc(DstReg, {Or});
3084   } else {
3085     assert(DstTy.isPointer() && "expected pointer");
3086     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3087 
3088     // FIXME: We currently consider this to be illegal for non-integral address
3089     // spaces, but we still need a way to reinterpret the bits.
3090     MIRBuilder.buildIntToPtr(DstReg, Or);
3091   }
3092 
3093   LoadMI.eraseFromParent();
3094   return Legalized;
3095 }
3096 
3097 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3098   // Lower a non-power of 2 store into multiple pow-2 stores.
3099   // E.g. split an i24 store into an i16 store + i8 store.
3100   // We do this by first extending the stored value to the next largest power
3101   // of 2 type, and then using truncating stores to store the components.
3102   // By doing this, as with G_LOAD, we generate an extend that can be
3103   // artifact-combined away instead of leaving behind extracts.
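  //
  // E.g. i24 store of v1 (a sketch, assuming a little-endian layout):
  //   v2 = i32 anyext v1
  //   i16-truncating store of v2 (2 byte)
  //   v3 = i32 lshr v2, 16
  //   i8-truncating store of v3 at ptr + 2 (1 byte)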
3104   Register SrcReg = StoreMI.getValueReg();
3105   Register PtrReg = StoreMI.getPointerReg();
3106   LLT SrcTy = MRI.getType(SrcReg);
3107   MachineFunction &MF = MIRBuilder.getMF();
3108   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3109   LLT MemTy = MMO.getMemoryType();
3110 
3111   unsigned StoreWidth = MemTy.getSizeInBits();
3112   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3113 
3114   if (StoreWidth != StoreSizeInBits) {
3115     if (SrcTy.isVector())
3116       return UnableToLegalize;
3117 
3118     // Promote to a byte-sized store with upper bits zero if not
3119     // storing an integral number of bytes.  For example, promote
3120     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3121     LLT WideTy = LLT::scalar(StoreSizeInBits);
3122 
3123     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3124       // Avoid creating a store with a narrower source than result.
3125       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3126       SrcTy = WideTy;
3127     }
3128 
3129     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3130 
3131     MachineMemOperand *NewMMO =
3132         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3133     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3134     StoreMI.eraseFromParent();
3135     return Legalized;
3136   }
3137 
3138   if (MemTy.isVector()) {
3139     // TODO: Handle vector trunc stores
3140     if (MemTy != SrcTy)
3141       return UnableToLegalize;
3142 
3143     // TODO: We can do better than scalarizing the vector and at least split it
3144     // in half.
3145     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3146   }
3147 
3148   unsigned MemSizeInBits = MemTy.getSizeInBits();
3149   uint64_t LargeSplitSize, SmallSplitSize;
3150 
3151   if (!isPowerOf2_32(MemSizeInBits)) {
3152     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3153     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3154   } else {
3155     auto &Ctx = MF.getFunction().getContext();
3156     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3157       return UnableToLegalize; // Don't know what we're being asked to do.
3158 
3159     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3160   }
3161 
3162   // Extend to the next pow-2. If this store was itself the result of lowering,
3163   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3164   // that's wider than the stored size.
3165   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3166   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3167 
3168   if (SrcTy.isPointer()) {
3169     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3170     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3171   }
3172 
3173   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3174 
3175   // Obtain the smaller value by shifting away the larger value.
3176   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3177   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3178 
3179   // Generate the PtrAdd and truncating stores.
3180   LLT PtrTy = MRI.getType(PtrReg);
3181   auto OffsetCst = MIRBuilder.buildConstant(
3182     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3183   auto SmallPtr =
3184     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3185 
3186   MachineMemOperand *LargeMMO =
3187     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3188   MachineMemOperand *SmallMMO =
3189     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3190   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3191   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3192   StoreMI.eraseFromParent();
3193   return Legalized;
3194 }
3195 
3196 LegalizerHelper::LegalizeResult
3197 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3198   switch (MI.getOpcode()) {
3199   case TargetOpcode::G_LOAD: {
3200     if (TypeIdx != 0)
3201       return UnableToLegalize;
3202     MachineMemOperand &MMO = **MI.memoperands_begin();
3203 
3204     // Not sure how to interpret a bitcast of an extending load.
3205     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3206       return UnableToLegalize;
3207 
3208     Observer.changingInstr(MI);
3209     bitcastDst(MI, CastTy, 0);
3210     MMO.setType(CastTy);
3211     Observer.changedInstr(MI);
3212     return Legalized;
3213   }
3214   case TargetOpcode::G_STORE: {
3215     if (TypeIdx != 0)
3216       return UnableToLegalize;
3217 
3218     MachineMemOperand &MMO = **MI.memoperands_begin();
3219 
3220     // Not sure how to interpret a bitcast of a truncating store.
3221     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3222       return UnableToLegalize;
3223 
3224     Observer.changingInstr(MI);
3225     bitcastSrc(MI, CastTy, 0);
3226     MMO.setType(CastTy);
3227     Observer.changedInstr(MI);
3228     return Legalized;
3229   }
3230   case TargetOpcode::G_SELECT: {
3231     if (TypeIdx != 0)
3232       return UnableToLegalize;
3233 
3234     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3235       LLVM_DEBUG(
3236           dbgs() << "bitcast action not implemented for vector select\n");
3237       return UnableToLegalize;
3238     }
3239 
3240     Observer.changingInstr(MI);
3241     bitcastSrc(MI, CastTy, 2);
3242     bitcastSrc(MI, CastTy, 3);
3243     bitcastDst(MI, CastTy, 0);
3244     Observer.changedInstr(MI);
3245     return Legalized;
3246   }
3247   case TargetOpcode::G_AND:
3248   case TargetOpcode::G_OR:
3249   case TargetOpcode::G_XOR: {
3250     Observer.changingInstr(MI);
3251     bitcastSrc(MI, CastTy, 1);
3252     bitcastSrc(MI, CastTy, 2);
3253     bitcastDst(MI, CastTy, 0);
3254     Observer.changedInstr(MI);
3255     return Legalized;
3256   }
3257   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3258     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3259   case TargetOpcode::G_INSERT_VECTOR_ELT:
3260     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3261   default:
3262     return UnableToLegalize;
3263   }
3264 }
3265 
3266 // Legalize an instruction by changing the opcode in place.
3267 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3268     Observer.changingInstr(MI);
3269     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3270     Observer.changedInstr(MI);
3271 }
3272 
3273 LegalizerHelper::LegalizeResult
3274 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3275   using namespace TargetOpcode;
3276 
3277   switch(MI.getOpcode()) {
3278   default:
3279     return UnableToLegalize;
3280   case TargetOpcode::G_BITCAST:
3281     return lowerBitcast(MI);
3282   case TargetOpcode::G_SREM:
3283   case TargetOpcode::G_UREM: {
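    // Lower x % y to x - (x / y) * y, using the signedness-matching division.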
3284     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3285     auto Quot =
3286         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3287                               {MI.getOperand(1), MI.getOperand(2)});
3288 
3289     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3290     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3291     MI.eraseFromParent();
3292     return Legalized;
3293   }
3294   case TargetOpcode::G_SADDO:
3295   case TargetOpcode::G_SSUBO:
3296     return lowerSADDO_SSUBO(MI);
3297   case TargetOpcode::G_UMULH:
3298   case TargetOpcode::G_SMULH:
3299     return lowerSMULH_UMULH(MI);
3300   case TargetOpcode::G_SMULO:
3301   case TargetOpcode::G_UMULO: {
3302     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3303     // result.
3304     Register Res = MI.getOperand(0).getReg();
3305     Register Overflow = MI.getOperand(1).getReg();
3306     Register LHS = MI.getOperand(2).getReg();
3307     Register RHS = MI.getOperand(3).getReg();
3308     LLT Ty = MRI.getType(Res);
3309 
3310     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3311                           ? TargetOpcode::G_SMULH
3312                           : TargetOpcode::G_UMULH;
3313 
3314     Observer.changingInstr(MI);
3315     const auto &TII = MIRBuilder.getTII();
3316     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3317     MI.RemoveOperand(1);
3318     Observer.changedInstr(MI);
3319 
3320     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3321     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3322 
3323     // Move insert point forward so we can use the Res register if needed.
3324     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3325 
3326     // For *signed* multiply, overflow is detected by checking:
3327     // (hi != (lo >> bitwidth-1))
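    // i.e. the high part must equal the sign-extension of the low part. For
    // unsigned multiply, any nonzero high part indicates overflow.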
3328     if (Opcode == TargetOpcode::G_SMULH) {
3329       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3330       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3331       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3332     } else {
3333       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3334     }
3335     return Legalized;
3336   }
3337   case TargetOpcode::G_FNEG: {
3338     Register Res = MI.getOperand(0).getReg();
3339     LLT Ty = MRI.getType(Res);
3340 
3341     // TODO: Handle vector types once we are able to
3342     // represent them.
3343     if (Ty.isVector())
3344       return UnableToLegalize;
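    // Lower fneg by flipping the sign bit: fneg(x) = x ^ SignMask.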
3345     auto SignMask =
3346         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3347     Register SubByReg = MI.getOperand(1).getReg();
3348     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3349     MI.eraseFromParent();
3350     return Legalized;
3351   }
3352   case TargetOpcode::G_FSUB: {
3353     Register Res = MI.getOperand(0).getReg();
3354     LLT Ty = MRI.getType(Res);
3355 
3356     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3357     // First, check if G_FNEG is marked as Lower. If so, we may
3358     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3359     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3360       return UnableToLegalize;
3361     Register LHS = MI.getOperand(1).getReg();
3362     Register RHS = MI.getOperand(2).getReg();
3363     Register Neg = MRI.createGenericVirtualRegister(Ty);
3364     MIRBuilder.buildFNeg(Neg, RHS);
3365     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3366     MI.eraseFromParent();
3367     return Legalized;
3368   }
3369   case TargetOpcode::G_FMAD:
3370     return lowerFMad(MI);
3371   case TargetOpcode::G_FFLOOR:
3372     return lowerFFloor(MI);
3373   case TargetOpcode::G_INTRINSIC_ROUND:
3374     return lowerIntrinsicRound(MI);
3375   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3376     // Since round even is the assumed rounding mode for unconstrained FP
3377     // operations, rint and roundeven are the same operation.
3378     changeOpcode(MI, TargetOpcode::G_FRINT);
3379     return Legalized;
3380   }
3381   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3382     Register OldValRes = MI.getOperand(0).getReg();
3383     Register SuccessRes = MI.getOperand(1).getReg();
3384     Register Addr = MI.getOperand(2).getReg();
3385     Register CmpVal = MI.getOperand(3).getReg();
3386     Register NewVal = MI.getOperand(4).getReg();
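    // Lower to a plain cmpxchg; the success flag is recovered by comparing
    // the returned old value against the expected value.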
3387     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3388                                   **MI.memoperands_begin());
3389     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3390     MI.eraseFromParent();
3391     return Legalized;
3392   }
3393   case TargetOpcode::G_LOAD:
3394   case TargetOpcode::G_SEXTLOAD:
3395   case TargetOpcode::G_ZEXTLOAD:
3396     return lowerLoad(cast<GAnyLoad>(MI));
3397   case TargetOpcode::G_STORE:
3398     return lowerStore(cast<GStore>(MI));
3399   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3400   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3401   case TargetOpcode::G_CTLZ:
3402   case TargetOpcode::G_CTTZ:
3403   case TargetOpcode::G_CTPOP:
3404     return lowerBitCount(MI);
3405   case G_UADDO: {
3406     Register Res = MI.getOperand(0).getReg();
3407     Register CarryOut = MI.getOperand(1).getReg();
3408     Register LHS = MI.getOperand(2).getReg();
3409     Register RHS = MI.getOperand(3).getReg();
3410 
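    // Compute the sum, then detect unsigned overflow: the addition wrapped
    // iff the result is strictly less than one of the operands.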
3411     MIRBuilder.buildAdd(Res, LHS, RHS);
3412     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3413 
3414     MI.eraseFromParent();
3415     return Legalized;
3416   }
3417   case G_UADDE: {
3418     Register Res = MI.getOperand(0).getReg();
3419     Register CarryOut = MI.getOperand(1).getReg();
3420     Register LHS = MI.getOperand(2).getReg();
3421     Register RHS = MI.getOperand(3).getReg();
3422     Register CarryIn = MI.getOperand(4).getReg();
3423     LLT Ty = MRI.getType(Res);
3424 
3425     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3426     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3427     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3428     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3429 
3430     MI.eraseFromParent();
3431     return Legalized;
3432   }
3433   case G_USUBO: {
3434     Register Res = MI.getOperand(0).getReg();
3435     Register BorrowOut = MI.getOperand(1).getReg();
3436     Register LHS = MI.getOperand(2).getReg();
3437     Register RHS = MI.getOperand(3).getReg();
3438 
3439     MIRBuilder.buildSub(Res, LHS, RHS);
3440     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3441 
3442     MI.eraseFromParent();
3443     return Legalized;
3444   }
3445   case G_USUBE: {
3446     Register Res = MI.getOperand(0).getReg();
3447     Register BorrowOut = MI.getOperand(1).getReg();
3448     Register LHS = MI.getOperand(2).getReg();
3449     Register RHS = MI.getOperand(3).getReg();
3450     Register BorrowIn = MI.getOperand(4).getReg();
3451     const LLT CondTy = MRI.getType(BorrowOut);
3452     const LLT Ty = MRI.getType(Res);
3453 
3454     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3455     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3456     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3457 
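    // The subtraction borrows iff LHS < RHS, or LHS == RHS and the incoming
    // borrow is set.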
3458     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3459     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3460     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3461 
3462     MI.eraseFromParent();
3463     return Legalized;
3464   }
3465   case G_UITOFP:
3466     return lowerUITOFP(MI);
3467   case G_SITOFP:
3468     return lowerSITOFP(MI);
3469   case G_FPTOUI:
3470     return lowerFPTOUI(MI);
3471   case G_FPTOSI:
3472     return lowerFPTOSI(MI);
3473   case G_FPTRUNC:
3474     return lowerFPTRUNC(MI);
3475   case G_FPOWI:
3476     return lowerFPOWI(MI);
3477   case G_SMIN:
3478   case G_SMAX:
3479   case G_UMIN:
3480   case G_UMAX:
3481     return lowerMinMax(MI);
3482   case G_FCOPYSIGN:
3483     return lowerFCopySign(MI);
3484   case G_FMINNUM:
3485   case G_FMAXNUM:
3486     return lowerFMinNumMaxNum(MI);
3487   case G_MERGE_VALUES:
3488     return lowerMergeValues(MI);
3489   case G_UNMERGE_VALUES:
3490     return lowerUnmergeValues(MI);
3491   case TargetOpcode::G_SEXT_INREG: {
3492     assert(MI.getOperand(2).isImm() && "Expected immediate");
3493     int64_t SizeInBits = MI.getOperand(2).getImm();
3494 
3495     Register DstReg = MI.getOperand(0).getReg();
3496     Register SrcReg = MI.getOperand(1).getReg();
3497     LLT DstTy = MRI.getType(DstReg);
3498     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3499 
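    // Sign-extend from SizeInBits by shifting the value up to the top of the
    // register and arithmetic-shifting it back down.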
3500     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3501     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3502     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3503     MI.eraseFromParent();
3504     return Legalized;
3505   }
3506   case G_EXTRACT_VECTOR_ELT:
3507   case G_INSERT_VECTOR_ELT:
3508     return lowerExtractInsertVectorElt(MI);
3509   case G_SHUFFLE_VECTOR:
3510     return lowerShuffleVector(MI);
3511   case G_DYN_STACKALLOC:
3512     return lowerDynStackAlloc(MI);
3513   case G_EXTRACT:
3514     return lowerExtract(MI);
3515   case G_INSERT:
3516     return lowerInsert(MI);
3517   case G_BSWAP:
3518     return lowerBswap(MI);
3519   case G_BITREVERSE:
3520     return lowerBitreverse(MI);
3521   case G_READ_REGISTER:
3522   case G_WRITE_REGISTER:
3523     return lowerReadWriteRegister(MI);
3524   case G_UADDSAT:
3525   case G_USUBSAT: {
3526     // Try to make a reasonable guess about which lowering strategy to use. The
3527     // target can override this by using custom lowering and calling the
3528     // implementation functions directly.
3529     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3530     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3531       return lowerAddSubSatToMinMax(MI);
3532     return lowerAddSubSatToAddoSubo(MI);
3533   }
3534   case G_SADDSAT:
3535   case G_SSUBSAT: {
3536     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3537 
3538     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3539     // since it's a shorter expansion. However, we would need to figure out the
3540     // preferred boolean type for the carry out for the query.
3541     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3542       return lowerAddSubSatToMinMax(MI);
3543     return lowerAddSubSatToAddoSubo(MI);
3544   }
3545   case G_SSHLSAT:
3546   case G_USHLSAT:
3547     return lowerShlSat(MI);
3548   case G_ABS:
3549     return lowerAbsToAddXor(MI);
3550   case G_SELECT:
3551     return lowerSelect(MI);
3552   case G_SDIVREM:
3553   case G_UDIVREM:
3554     return lowerDIVREM(MI);
3555   case G_FSHL:
3556   case G_FSHR:
3557     return lowerFunnelShift(MI);
3558   case G_ROTL:
3559   case G_ROTR:
3560     return lowerRotate(MI);
3561   case G_MEMSET:
3562   case G_MEMCPY:
3563   case G_MEMMOVE:
3564     return lowerMemCpyFamily(MI);
3565   case G_MEMCPY_INLINE:
3566     return lowerMemcpyInline(MI);
3567   GISEL_VECREDUCE_CASES_NONSEQ
3568     return lowerVectorReduction(MI);
3569   }
3570 }
3571 
3572 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3573                                                   Align MinAlign) const {
3574   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3575   // datalayout for the preferred alignment. Also there should be a target hook
3576   // for this to allow targets to reduce the alignment and ignore the
3577   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3578   // the type.
3579   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3580 }
3581 
3582 MachineInstrBuilder
3583 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3584                                       MachinePointerInfo &PtrInfo) {
3585   MachineFunction &MF = MIRBuilder.getMF();
3586   const DataLayout &DL = MIRBuilder.getDataLayout();
3587   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3588 
3589   unsigned AddrSpace = DL.getAllocaAddrSpace();
3590   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3591 
3592   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3593   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3594 }
3595 
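/// Return \p IdxReg clamped to a valid element index for \p VecTy. Constant
/// indices are returned unchanged; a power-of-2 element count is reduced with
/// a cheap mask, other counts are clamped with an unsigned min against
/// NElts - 1.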
3596 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3597                                         LLT VecTy) {
3598   int64_t IdxVal;
3599   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3600     return IdxReg;
3601 
3602   LLT IdxTy = B.getMRI()->getType(IdxReg);
3603   unsigned NElts = VecTy.getNumElements();
3604   if (isPowerOf2_32(NElts)) {
3605     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3606     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3607   }
3608 
3609   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3610       .getReg(0);
3611 }
3612 
3613 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3614                                                   Register Index) {
3615   LLT EltTy = VecTy.getElementType();
3616 
3617   // Calculate the element offset and add it to the pointer.
3618   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3619   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3620          "Converting bits to bytes lost precision");
3621 
3622   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3623 
3624   LLT IdxTy = MRI.getType(Index);
3625   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3626                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3627 
3628   LLT PtrTy = MRI.getType(VecPtr);
3629   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3630 }
3631 
3632 #ifndef NDEBUG
3633 /// Check that all vector operands have the same number of elements. Other
3634 /// operands should be listed in \p NonVecOpIndices.
3635 static bool hasSameNumEltsOnAllVectorOperands(
3636     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3637     std::initializer_list<unsigned> NonVecOpIndices) {
3638   if (MI.getNumMemOperands() != 0)
3639     return false;
3640 
3641   LLT VecTy = MRI.getType(MI.getReg(0));
3642   if (!VecTy.isVector())
3643     return false;
3644   unsigned NumElts = VecTy.getNumElements();
3645 
3646   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3647     MachineOperand &Op = MI.getOperand(OpIdx);
3648     if (!Op.isReg()) {
3649       if (!is_contained(NonVecOpIndices, OpIdx))
3650         return false;
3651       continue;
3652     }
3653 
3654     LLT Ty = MRI.getType(Op.getReg());
3655     if (!Ty.isVector()) {
3656       if (!is_contained(NonVecOpIndices, OpIdx))
3657         return false;
3659       continue;
3660     }
3661 
3662     if (Ty.getNumElements() != NumElts)
3663       return false;
3664   }
3665 
3666   return true;
3667 }
3668 #endif
3669 
3670 /// Fill \p DstOps with DstOps that, combined, have the same number of elements
3671 /// as \p Ty. Each DstOp is either a scalar (when \p NumElts = 1) or a vector
3672 /// with \p NumElts elements. When Ty.getNumElements() is not a multiple of
3673 /// \p NumElts, the last DstOp (the leftover) has fewer than \p NumElts elements.
3674 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3675                        unsigned NumElts) {
3676   LLT LeftoverTy;
3677   assert(Ty.isVector() && "Expected vector type");
3678   LLT EltTy = Ty.getElementType();
3679   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3680   int NumParts, NumLeftover;
3681   std::tie(NumParts, NumLeftover) =
3682       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3683 
3684   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3685   for (int i = 0; i < NumParts; ++i) {
3686     DstOps.push_back(NarrowTy);
3687   }
3688 
3689   if (LeftoverTy.isValid()) {
3690     assert(NumLeftover == 1 && "expected exactly one leftover");
3691     DstOps.push_back(LeftoverTy);
3692   }
3693 }
3694 
3695 /// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N SrcOps
3696 /// made from \p Op, depending on the operand type.
3697 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3698                            MachineOperand &Op) {
3699   for (unsigned i = 0; i < N; ++i) {
3700     if (Op.isReg())
3701       Ops.push_back(Op.getReg());
3702     else if (Op.isImm())
3703       Ops.push_back(Op.getImm());
3704     else if (Op.isPredicate())
3705       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3706     else
3707       llvm_unreachable("Unsupported type");
3708   }
3709 }
3710 
3711 // Handle splitting vector operations which need to have the same number of
3712 // elements in each type index, but each type index may have a different element
3713 // type.
3714 //
3715 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3716 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3717 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3718 //
3719 // Also handles some irregular breakdown cases, e.g.
3720 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3721 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3722 //             s64 = G_SHL s64, s32
3723 LegalizerHelper::LegalizeResult
3724 LegalizerHelper::fewerElementsVectorMultiEltType(
3725     GenericMachineInstr &MI, unsigned NumElts,
3726     std::initializer_list<unsigned> NonVecOpIndices) {
3727   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3728          "Non-compatible opcode or not specified non-vector operands");
3729   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3730 
3731   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3732   unsigned NumDefs = MI.getNumDefs();
3733 
3734   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3735   // Build instructions with DstOps to use the instruction found by CSE
3736   // directly (with a vreg destination, CSE copies it into the given vreg).
3737   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3738   // Output registers will be taken from created instructions.
3739   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3740   for (unsigned i = 0; i < NumDefs; ++i) {
3741     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3742   }
3743 
3744   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3745   // Operands listed in NonVecOpIndices will be used as is without splitting;
3746   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3747   // scalar condition (op 1), immediate in sext_inreg (op 2).
3748   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3749   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3750        ++UseIdx, ++UseNo) {
3751     if (is_contained(NonVecOpIndices, UseIdx)) {
3752       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3753                      MI.getOperand(UseIdx));
3754     } else {
3755       SmallVector<Register, 8> SplitPieces;
3756       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3757       for (auto Reg : SplitPieces)
3758         InputOpsPieces[UseNo].push_back(Reg);
3759     }
3760   }
3761 
3762   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3763 
3764   // Take i-th piece of each input operand split and build sub-vector/scalar
3765   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3766   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3767     SmallVector<DstOp, 2> Defs;
3768     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3769       Defs.push_back(OutputOpsPieces[DstNo][i]);
3770 
3771     SmallVector<SrcOp, 3> Uses;
3772     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3773       Uses.push_back(InputOpsPieces[InputNo][i]);
3774 
3775     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3776     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3777       OutputRegs[DstNo].push_back(I.getReg(DstNo));
3778   }
3779 
3780   // Merge small outputs into MI's output for each def operand.
3781   if (NumLeftovers) {
3782     for (unsigned i = 0; i < NumDefs; ++i)
3783       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3784   } else {
3785     for (unsigned i = 0; i < NumDefs; ++i)
3786       MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]);
3787   }
3788 
3789   MI.eraseFromParent();
3790   return Legalized;
3791 }
3792 
3793 LegalizerHelper::LegalizeResult
3794 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3795                                         unsigned NumElts) {
3796   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3797 
3798   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3799   unsigned NumDefs = MI.getNumDefs();
3800 
3801   SmallVector<DstOp, 8> OutputOpsPieces;
3802   SmallVector<Register, 8> OutputRegs;
3803   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3804 
3805   // Instructions that perform the register split will be inserted in the
3806   // basic block where the register is defined (given by the next operand).
3807   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3808   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3809        UseIdx += 2, ++UseNo) {
3810     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3811     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3812     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3813   }
3814 
3815   // Build PHIs with fewer elements.
3816   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3817   MIRBuilder.setInsertPt(*MI.getParent(), MI);
3818   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3819     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3820     Phi.addDef(
3821         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3822     OutputRegs.push_back(Phi.getReg(0));
3823 
3824     for (unsigned j = 0; j < NumInputs / 2; ++j) {
3825       Phi.addUse(InputOpsPieces[j][i]);
3826       Phi.add(MI.getOperand(1 + j * 2 + 1));
3827     }
3828   }
3829 
3830   // Merge small outputs into MI's def.
3831   if (NumLeftovers) {
3832     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3833   } else {
3834     MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
3835   }
3836 
3837   MI.eraseFromParent();
3838   return Legalized;
3839 }
3840 
3841 LegalizerHelper::LegalizeResult
3842 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3843                                                   unsigned TypeIdx,
3844                                                   LLT NarrowTy) {
3845   const int NumDst = MI.getNumOperands() - 1;
3846   const Register SrcReg = MI.getOperand(NumDst).getReg();
3847   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3848   LLT SrcTy = MRI.getType(SrcReg);
3849 
3850   if (TypeIdx != 1 || NarrowTy == DstTy)
3851     return UnableToLegalize;
3852 
3853   // Requires compatible types. Otherwise SrcReg should have been defined by a
3854   // merge-like instruction that would get artifact-combined. Most likely the
3855   // instruction that defines SrcReg has to perform more/fewer elements
3856   // legalization compatible with NarrowTy.
3857   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3858   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3859 
3860   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3861       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3862     return UnableToLegalize;
3863 
3864   // This is most likely DstTy (smaller than the register size) packed in SrcTy
3865   // (larger than the register size), and since the unmerge was not combined it
3866   // will be lowered to bit-sequence extracts from a register. Unpack SrcTy to
3867   // NarrowTy (register size) pieces first, then unpack each NarrowTy piece to DstTy.
3868 
3869   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3870   //
3871   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3872   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3873   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3874   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3875   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3876   const int PartsPerUnmerge = NumDst / NumUnmerge;
3877 
3878   for (int I = 0; I != NumUnmerge; ++I) {
3879     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3880 
3881     for (int J = 0; J != PartsPerUnmerge; ++J)
3882       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3883     MIB.addUse(Unmerge.getReg(I));
3884   }
3885 
3886   MI.eraseFromParent();
3887   return Legalized;
3888 }
3889 
3890 LegalizerHelper::LegalizeResult
3891 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3892                                           LLT NarrowTy) {
3893   Register DstReg = MI.getOperand(0).getReg();
3894   LLT DstTy = MRI.getType(DstReg);
3895   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
3896   // Requires compatible types. Otherwise the user of DstReg did not perform the
3897   // unmerge that should have been artifact-combined. Most likely the instruction
3898   // using DstReg must do more/fewer elements legalization compatible with NarrowTy.
3899   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3900   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3901   if (NarrowTy == SrcTy)
3902     return UnableToLegalize;
3903 
3904   // This attempts to lower part of an LCMTy merge/unmerge sequence. It is
3905   // intended for old MIR tests; since the changes to the more/fewer elements
3906   // handling, such MIR should no longer be generated from LLVM IR, because the
3907   // LCMTy approach was replaced with merge/unmerge to vector elements.
3908   if (TypeIdx == 1) {
3909     assert(SrcTy.isVector() && "Expected vector types");
3910     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3911     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3912         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3913       return UnableToLegalize;
3914     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3915     //
3916     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3917     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3918     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3919     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3920     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3921     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3922 
3923     SmallVector<Register, 8> Elts;
3924     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3925     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3926       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3927       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3928         Elts.push_back(Unmerge.getReg(j));
3929     }
3930 
3931     SmallVector<Register, 8> NarrowTyElts;
3932     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3933     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3934     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3935          ++i, Offset += NumNarrowTyElts) {
3936       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3937       NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
3938     }
3939 
3940     MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3941     MI.eraseFromParent();
3942     return Legalized;
3943   }
3944 
3945   assert(TypeIdx == 0 && "Bad type index");
3946   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3947       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3948     return UnableToLegalize;
3949 
3950   // This is most likely SrcTy (smaller than the register size) packed in DstTy
3951   // (larger than the register size), and since the merge was not combined it
3952   // will be lowered to bit-sequence packing into a register. Merge SrcTy into
3953   // NarrowTy (register size) pieces first, then merge each NarrowTy piece to DstTy.
3954 
3955   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3956   //
3957   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3958   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3959   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3960   SmallVector<Register, 8> NarrowTyElts;
3961   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3962   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3963   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3964   for (unsigned i = 0; i < NumParts; ++i) {
3965     SmallVector<Register, 8> Sources;
3966     for (unsigned j = 0; j < NumElts; ++j)
3967       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3968     NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
3969   }
3970 
3971   MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3972   MI.eraseFromParent();
3973   return Legalized;
3974 }
3975 
3976 LegalizerHelper::LegalizeResult
3977 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3978                                                            unsigned TypeIdx,
3979                                                            LLT NarrowVecTy) {
3980   Register DstReg = MI.getOperand(0).getReg();
3981   Register SrcVec = MI.getOperand(1).getReg();
3982   Register InsertVal;
3983   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3984 
3985   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3986   if (IsInsert)
3987     InsertVal = MI.getOperand(2).getReg();
3988 
3989   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3990 
3991   // TODO: Handle total scalarization case.
3992   if (!NarrowVecTy.isVector())
3993     return UnableToLegalize;
3994 
3995   LLT VecTy = MRI.getType(SrcVec);
3996 
3997   // If the index is a constant, we can really break this down as you would
3998   // expect, and index into the target size pieces.
3999   int64_t IdxVal;
4000   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4001   if (MaybeCst) {
4002     IdxVal = MaybeCst->Value.getSExtValue();
4003     // Avoid out-of-bounds indexing of the pieces.
4004     if (IdxVal >= VecTy.getNumElements()) {
4005       MIRBuilder.buildUndef(DstReg);
4006       MI.eraseFromParent();
4007       return Legalized;
4008     }
4009 
4010     SmallVector<Register, 8> VecParts;
4011     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4012 
4013     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4014     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4015                                     TargetOpcode::G_ANYEXT);
4016 
4017     unsigned NewNumElts = NarrowVecTy.getNumElements();
4018 
4019     LLT IdxTy = MRI.getType(Idx);
4020     int64_t PartIdx = IdxVal / NewNumElts;
4021     auto NewIdx =
4022         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4023 
4024     if (IsInsert) {
4025       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4026 
4027       // Use the adjusted index to insert into one of the subvectors.
4028       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4029           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4030       VecParts[PartIdx] = InsertPart.getReg(0);
4031 
4032       // Recombine the inserted subvector with the others to reform the result
4033       // vector.
4034       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4035     } else {
4036       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4037     }
4038 
4039     MI.eraseFromParent();
4040     return Legalized;
4041   }
4042 
4043   // With a variable index, we can't perform the operation in a smaller type, so
4044   // we're forced to expand this.
4045   //
4046   // TODO: We could emit a chain of compare/select to figure out which piece to
4047   // index.
4048   return lowerExtractInsertVectorElt(MI);
4049 }
4050 
4051 LegalizerHelper::LegalizeResult
4052 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4053                                       LLT NarrowTy) {
4054   // FIXME: Don't know how to handle secondary types yet.
4055   if (TypeIdx != 0)
4056     return UnableToLegalize;
4057 
4058   // This implementation doesn't work for atomics. Give up instead of doing
4059   // something invalid.
4060   if (LdStMI.isAtomic())
4061     return UnableToLegalize;
4062 
4063   bool IsLoad = isa<GLoad>(LdStMI);
4064   Register ValReg = LdStMI.getReg(0);
4065   Register AddrReg = LdStMI.getPointerReg();
4066   LLT ValTy = MRI.getType(ValReg);
4067 
4068   // FIXME: Do we need a distinct NarrowMemory legalize action?
4069   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4070     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4071     return UnableToLegalize;
4072   }
4073 
4074   int NumParts = -1;
4075   int NumLeftover = -1;
4076   LLT LeftoverTy;
4077   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4078   if (IsLoad) {
4079     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4080   } else {
4081     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4082                      NarrowLeftoverRegs)) {
4083       NumParts = NarrowRegs.size();
4084       NumLeftover = NarrowLeftoverRegs.size();
4085     }
4086   }
4087 
4088   if (NumParts == -1)
4089     return UnableToLegalize;
4090 
4091   LLT PtrTy = MRI.getType(AddrReg);
4092   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4093 
4094   unsigned TotalSize = ValTy.getSizeInBits();
4095 
4096   // Split the load/store into PartTy-sized pieces starting at Offset. If this
4097   // is a load, return the new registers in ValRegs. For a store, each element
4098   // of ValRegs should be PartTy. Returns the next offset that needs to be
4099   // handled.
4100   auto MMO = LdStMI.getMMO();
4101   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4102                              unsigned Offset) -> unsigned {
4103     MachineFunction &MF = MIRBuilder.getMF();
4104     unsigned PartSize = PartTy.getSizeInBits();
4105     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4106          Offset += PartSize, ++Idx) {
4107       unsigned ByteOffset = Offset / 8;
4108       Register NewAddrReg;
4109 
4110       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4111 
4112       MachineMemOperand *NewMMO =
4113           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4114 
4115       if (IsLoad) {
4116         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4117         ValRegs.push_back(Dst);
4118         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4119       } else {
4120         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4121       }
4122     }
4123 
4124     return Offset;
4125   };
4126 
4127   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4128 
4129   // Handle the rest of the register if this isn't an even type breakdown.
4130   if (LeftoverTy.isValid())
4131     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4132 
4133   if (IsLoad) {
4134     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4135                 LeftoverTy, NarrowLeftoverRegs);
4136   }
4137 
4138   LdStMI.eraseFromParent();
4139   return Legalized;
4140 }
4141 
4142 LegalizerHelper::LegalizeResult
4143 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4144                                      LLT NarrowTy) {
4145   using namespace TargetOpcode;
4146   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4147   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4148 
4149   switch (MI.getOpcode()) {
4150   case G_IMPLICIT_DEF:
4151   case G_TRUNC:
4152   case G_AND:
4153   case G_OR:
4154   case G_XOR:
4155   case G_ADD:
4156   case G_SUB:
4157   case G_MUL:
4158   case G_PTR_ADD:
4159   case G_SMULH:
4160   case G_UMULH:
4161   case G_FADD:
4162   case G_FMUL:
4163   case G_FSUB:
4164   case G_FNEG:
4165   case G_FABS:
4166   case G_FCANONICALIZE:
4167   case G_FDIV:
4168   case G_FREM:
4169   case G_FMA:
4170   case G_FMAD:
4171   case G_FPOW:
4172   case G_FEXP:
4173   case G_FEXP2:
4174   case G_FLOG:
4175   case G_FLOG2:
4176   case G_FLOG10:
4177   case G_FNEARBYINT:
4178   case G_FCEIL:
4179   case G_FFLOOR:
4180   case G_FRINT:
4181   case G_INTRINSIC_ROUND:
4182   case G_INTRINSIC_ROUNDEVEN:
4183   case G_INTRINSIC_TRUNC:
4184   case G_FCOS:
4185   case G_FSIN:
4186   case G_FSQRT:
4187   case G_BSWAP:
4188   case G_BITREVERSE:
4189   case G_SDIV:
4190   case G_UDIV:
4191   case G_SREM:
4192   case G_UREM:
4193   case G_SDIVREM:
4194   case G_UDIVREM:
4195   case G_SMIN:
4196   case G_SMAX:
4197   case G_UMIN:
4198   case G_UMAX:
4199   case G_ABS:
4200   case G_FMINNUM:
4201   case G_FMAXNUM:
4202   case G_FMINNUM_IEEE:
4203   case G_FMAXNUM_IEEE:
4204   case G_FMINIMUM:
4205   case G_FMAXIMUM:
4206   case G_FSHL:
4207   case G_FSHR:
4208   case G_ROTL:
4209   case G_ROTR:
4210   case G_FREEZE:
4211   case G_SADDSAT:
4212   case G_SSUBSAT:
4213   case G_UADDSAT:
4214   case G_USUBSAT:
4215   case G_UMULO:
4216   case G_SMULO:
4217   case G_SHL:
4218   case G_LSHR:
4219   case G_ASHR:
4220   case G_SSHLSAT:
4221   case G_USHLSAT:
4222   case G_CTLZ:
4223   case G_CTLZ_ZERO_UNDEF:
4224   case G_CTTZ:
4225   case G_CTTZ_ZERO_UNDEF:
4226   case G_CTPOP:
4227   case G_FCOPYSIGN:
4228   case G_ZEXT:
4229   case G_SEXT:
4230   case G_ANYEXT:
4231   case G_FPEXT:
4232   case G_FPTRUNC:
4233   case G_SITOFP:
4234   case G_UITOFP:
4235   case G_FPTOSI:
4236   case G_FPTOUI:
4237   case G_INTTOPTR:
4238   case G_PTRTOINT:
4239   case G_ADDRSPACE_CAST:
4240     return fewerElementsVectorMultiEltType(GMI, NumElts);
4241   case G_ICMP:
4242   case G_FCMP:
4243     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
4244   case G_SELECT:
4245     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4246       return fewerElementsVectorMultiEltType(GMI, NumElts);
4247     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4248   case G_PHI:
4249     return fewerElementsVectorPhi(GMI, NumElts);
4250   case G_UNMERGE_VALUES:
4251     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4252   case G_BUILD_VECTOR:
4253     assert(TypeIdx == 0 && "not a vector type index");
4254     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4255   case G_CONCAT_VECTORS:
4256     if (TypeIdx != 1) // TODO: This probably already works as expected.
4257       return UnableToLegalize;
4258     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4259   case G_EXTRACT_VECTOR_ELT:
4260   case G_INSERT_VECTOR_ELT:
4261     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4262   case G_LOAD:
4263   case G_STORE:
4264     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4265   case G_SEXT_INREG:
4266     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4267   GISEL_VECREDUCE_CASES_NONSEQ
4268     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4269   case G_SHUFFLE_VECTOR:
4270     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4271   default:
4272     return UnableToLegalize;
4273   }
4274 }
4275 
4276 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4277     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4278   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4279   if (TypeIdx != 0)
4280     return UnableToLegalize;
4281 
4282   Register DstReg = MI.getOperand(0).getReg();
4283   Register Src1Reg = MI.getOperand(1).getReg();
4284   Register Src2Reg = MI.getOperand(2).getReg();
4285   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4286   LLT DstTy = MRI.getType(DstReg);
4287   LLT Src1Ty = MRI.getType(Src1Reg);
4288   LLT Src2Ty = MRI.getType(Src2Reg);
4289   // The shuffle should be canonicalized by now.
4290   if (DstTy != Src1Ty)
4291     return UnableToLegalize;
4292   if (DstTy != Src2Ty)
4293     return UnableToLegalize;
4294 
4295   if (!isPowerOf2_32(DstTy.getNumElements()))
4296     return UnableToLegalize;
4297 
4298   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4299   // Further legalization attempts will be needed to split further.
4300   NarrowTy =
4301       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4302   unsigned NewElts = NarrowTy.getNumElements();
4303 
4304   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4305   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4306   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4307   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4308                         SplitSrc2Regs[1]};
4309 
4310   Register Hi, Lo;
4311 
4312   // If Lo or Hi uses elements from at most two of the four input vectors, then
4313   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4314   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
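       // Worked example (illustrative): splitting a <4 x s32> shuffle with
       // mask <0,4,1,5> gives NewElts = 2 and Inputs = {Src1Lo, Src1Hi,
       // Src2Lo, Src2Hi}. The Lo half reads mask entries {0,4}, which touch
       // only Src1Lo and Src2Lo, so it becomes a shuffle of those two inputs
       // with mask <0,2>; the Hi half reads {1,5} and becomes a shuffle of
       // the same inputs with mask <1,3>.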
4315   SmallVector<int, 16> Ops;
4316   for (unsigned High = 0; High < 2; ++High) {
4317     Register &Output = High ? Hi : Lo;
4318 
4319     // Build a shuffle mask for the output, discovering on the fly which
4320     // input vectors to use as shuffle operands (recorded in InputUsed).
4321     // If building a suitable shuffle vector proves too hard, then bail
4322     // out with UseBuildVector set.
4323     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4324     unsigned FirstMaskIdx = High * NewElts;
4325     bool UseBuildVector = false;
4326     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4327       // The mask element.  This indexes into the input.
4328       int Idx = Mask[FirstMaskIdx + MaskOffset];
4329 
4330       // The input vector this mask element indexes into.
4331       unsigned Input = (unsigned)Idx / NewElts;
4332 
4333       if (Input >= array_lengthof(Inputs)) {
4334         // The mask element does not index into any input vector.
4335         Ops.push_back(-1);
4336         continue;
4337       }
4338 
4339       // Turn the index into an offset from the start of the input vector.
4340       Idx -= Input * NewElts;
4341 
4342       // Find or create a shuffle vector operand to hold this input.
4343       unsigned OpNo;
4344       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4345         if (InputUsed[OpNo] == Input) {
4346           // This input vector is already an operand.
4347           break;
4348         } else if (InputUsed[OpNo] == -1U) {
4349           // Create a new operand for this input vector.
4350           InputUsed[OpNo] = Input;
4351           break;
4352         }
4353       }
4354 
4355       if (OpNo >= array_lengthof(InputUsed)) {
4356         // More than two input vectors used!  Give up on trying to create a
4357         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4358         UseBuildVector = true;
4359         break;
4360       }
4361 
4362       // Add the mask index for the new shuffle vector.
4363       Ops.push_back(Idx + OpNo * NewElts);
4364     }
4365 
4366     if (UseBuildVector) {
4367       LLT EltTy = NarrowTy.getElementType();
4368       SmallVector<Register, 16> SVOps;
4369 
4370       // Extract the input elements by hand.
4371       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4372         // The mask element.  This indexes into the input.
4373         int Idx = Mask[FirstMaskIdx + MaskOffset];
4374 
4375         // The input vector this mask element indexes into.
4376         unsigned Input = (unsigned)Idx / NewElts;
4377 
4378         if (Input >= array_lengthof(Inputs)) {
4379           // The mask element is "undef" or indexes off the end of the input.
4380           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4381           continue;
4382         }
4383 
4384         // Turn the index into an offset from the start of the input vector.
4385         Idx -= Input * NewElts;
4386 
4387         // Extract the vector element by hand.
4388         SVOps.push_back(MIRBuilder
4389                             .buildExtractVectorElement(
4390                                 EltTy, Inputs[Input],
4391                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4392                             .getReg(0));
4393       }
4394 
4395       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4396       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4397     } else if (InputUsed[0] == -1U) {
4398       // No input vectors were used! The result is undefined.
4399       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4400     } else {
4401       Register Op0 = Inputs[InputUsed[0]];
4402       // If only one input was used, use an undefined vector for the other.
4403       Register Op1 = InputUsed[1] == -1U
4404                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4405                          : Inputs[InputUsed[1]];
4406       // At least one input vector was used. Create a new shuffle vector.
4407       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4408     }
4409 
4410     Ops.clear();
4411   }
4412 
4413   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4414   MI.eraseFromParent();
4415   return Legalized;
4416 }
4417 
4418 static unsigned getScalarOpcForReduction(unsigned Opc) {
4419   unsigned ScalarOpc;
4420   switch (Opc) {
4421   case TargetOpcode::G_VECREDUCE_FADD:
4422     ScalarOpc = TargetOpcode::G_FADD;
4423     break;
4424   case TargetOpcode::G_VECREDUCE_FMUL:
4425     ScalarOpc = TargetOpcode::G_FMUL;
4426     break;
4427   case TargetOpcode::G_VECREDUCE_FMAX:
4428     ScalarOpc = TargetOpcode::G_FMAXNUM;
4429     break;
4430   case TargetOpcode::G_VECREDUCE_FMIN:
4431     ScalarOpc = TargetOpcode::G_FMINNUM;
4432     break;
4433   case TargetOpcode::G_VECREDUCE_ADD:
4434     ScalarOpc = TargetOpcode::G_ADD;
4435     break;
4436   case TargetOpcode::G_VECREDUCE_MUL:
4437     ScalarOpc = TargetOpcode::G_MUL;
4438     break;
4439   case TargetOpcode::G_VECREDUCE_AND:
4440     ScalarOpc = TargetOpcode::G_AND;
4441     break;
4442   case TargetOpcode::G_VECREDUCE_OR:
4443     ScalarOpc = TargetOpcode::G_OR;
4444     break;
4445   case TargetOpcode::G_VECREDUCE_XOR:
4446     ScalarOpc = TargetOpcode::G_XOR;
4447     break;
4448   case TargetOpcode::G_VECREDUCE_SMAX:
4449     ScalarOpc = TargetOpcode::G_SMAX;
4450     break;
4451   case TargetOpcode::G_VECREDUCE_SMIN:
4452     ScalarOpc = TargetOpcode::G_SMIN;
4453     break;
4454   case TargetOpcode::G_VECREDUCE_UMAX:
4455     ScalarOpc = TargetOpcode::G_UMAX;
4456     break;
4457   case TargetOpcode::G_VECREDUCE_UMIN:
4458     ScalarOpc = TargetOpcode::G_UMIN;
4459     break;
4460   default:
4461     llvm_unreachable("Unhandled reduction");
4462   }
4463   return ScalarOpc;
4464 }
4465 
4466 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4467     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4468   unsigned Opc = MI.getOpcode();
4469   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4470          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4471          "Sequential reductions not expected");
4472 
4473   if (TypeIdx != 1)
4474     return UnableToLegalize;
4475 
4476   // The semantics of the normal non-sequential reductions allow us to freely
4477   // re-associate the operation.
4478   Register SrcReg = MI.getOperand(1).getReg();
4479   LLT SrcTy = MRI.getType(SrcReg);
4480   Register DstReg = MI.getOperand(0).getReg();
4481   LLT DstTy = MRI.getType(DstReg);
4482 
4483   if (NarrowTy.isVector() &&
4484       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4485     return UnableToLegalize;
4486 
4487   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4488   SmallVector<Register> SplitSrcs;
4489   // If NarrowTy is a scalar then we're being asked to scalarize.
4490   const unsigned NumParts =
4491       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4492                           : SrcTy.getNumElements();
4493 
4494   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4495   if (NarrowTy.isScalar()) {
4496     if (DstTy != NarrowTy)
4497       return UnableToLegalize; // FIXME: handle implicit extensions.
4498 
4499     if (isPowerOf2_32(NumParts)) {
4500       // Generate a tree of scalar operations to reduce the critical path.
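           // E.g. (illustrative), with four scalar parts this builds
           // (p0 op p1) op (p2 op p3): still three operations, but with a
           // dependence depth of two instead of three.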
4501       SmallVector<Register> PartialResults;
4502       unsigned NumPartsLeft = NumParts;
4503       while (NumPartsLeft > 1) {
4504         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4505           PartialResults.emplace_back(
4506               MIRBuilder
4507                   .buildInstr(ScalarOpc, {NarrowTy},
4508                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4509                   .getReg(0));
4510         }
4511         SplitSrcs = PartialResults;
4512         PartialResults.clear();
4513         NumPartsLeft = SplitSrcs.size();
4514       }
4515       assert(SplitSrcs.size() == 1);
4516       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4517       MI.eraseFromParent();
4518       return Legalized;
4519     }
4520     // If we can't generate a tree, then just do sequential operations.
4521     Register Acc = SplitSrcs[0];
4522     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4523       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4524                 .getReg(0);
4525     MIRBuilder.buildCopy(DstReg, Acc);
4526     MI.eraseFromParent();
4527     return Legalized;
4528   }
4529   SmallVector<Register> PartialReductions;
4530   for (unsigned Part = 0; Part < NumParts; ++Part) {
4531     PartialReductions.push_back(
4532         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4533   }
4534
4536   // If the types involved are powers of 2, we can generate intermediate vector
4537   // ops before generating a final reduction operation.
4538   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4539       isPowerOf2_32(NarrowTy.getNumElements())) {
4540     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4541   }
4542 
4543   Register Acc = PartialReductions[0];
4544   for (unsigned Part = 1; Part < NumParts; ++Part) {
4545     if (Part == NumParts - 1) {
4546       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4547                             {Acc, PartialReductions[Part]});
4548     } else {
4549       Acc = MIRBuilder
4550                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4551                 .getReg(0);
4552     }
4553   }
4554   MI.eraseFromParent();
4555   return Legalized;
4556 }
4557 
4558 LegalizerHelper::LegalizeResult
4559 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4560                                         LLT SrcTy, LLT NarrowTy,
4561                                         unsigned ScalarOpc) {
4562   SmallVector<Register> SplitSrcs;
4563   // Split the sources into NarrowTy size pieces.
4564   extractParts(SrcReg, NarrowTy,
4565                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4566   // We're going to do a tree reduction using vector operations until we have
4567   // one NarrowTy size value left.
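       // Illustrative example: reducing <8 x s32> with NarrowTy = <2 x s32>
       // starts from four pieces, pairs them into two, then into one, and
       // finally runs the original reduction opcode on that last piece.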
4568   while (SplitSrcs.size() > 1) {
4569     SmallVector<Register> PartialRdxs;
4570     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4571       Register LHS = SplitSrcs[Idx];
4572       Register RHS = SplitSrcs[Idx + 1];
4573       // Create the intermediate vector op.
4574       Register Res =
4575           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4576       PartialRdxs.push_back(Res);
4577     }
4578     SplitSrcs = std::move(PartialRdxs);
4579   }
4580   // Finally generate the requested NarrowTy based reduction.
4581   Observer.changingInstr(MI);
4582   MI.getOperand(1).setReg(SplitSrcs[0]);
4583   Observer.changedInstr(MI);
4584   return Legalized;
4585 }
4586 
4587 LegalizerHelper::LegalizeResult
4588 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4589                                              const LLT HalfTy, const LLT AmtTy) {
4591   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4592   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4593   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4594 
4595   if (Amt.isZero()) {
4596     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4597     MI.eraseFromParent();
4598     return Legalized;
4599   }
4600 
4601   LLT NVT = HalfTy;
4602   unsigned NVTBits = HalfTy.getSizeInBits();
4603   unsigned VTBits = 2 * NVTBits;
4604 
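       // Worked example (illustrative): a G_SHL of an s64 by the constant 40
       // with NVTBits = 32 takes the Amt.ugt(NVTBits) path below, giving
       // Lo = 0 and Hi = InL << 8, since only bits from the low input half
       // can reach the high result half.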
4605   SrcOp Lo(Register(0)), Hi(Register(0));
4606   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4607     if (Amt.ugt(VTBits)) {
4608       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4609     } else if (Amt.ugt(NVTBits)) {
4610       Lo = MIRBuilder.buildConstant(NVT, 0);
4611       Hi = MIRBuilder.buildShl(NVT, InL,
4612                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4613     } else if (Amt == NVTBits) {
4614       Lo = MIRBuilder.buildConstant(NVT, 0);
4615       Hi = InL;
4616     } else {
4617       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4618       auto OrLHS =
4619           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4620       auto OrRHS = MIRBuilder.buildLShr(
4621           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4622       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4623     }
4624   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4625     if (Amt.ugt(VTBits)) {
4626       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4627     } else if (Amt.ugt(NVTBits)) {
4628       Lo = MIRBuilder.buildLShr(NVT, InH,
4629                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4630       Hi = MIRBuilder.buildConstant(NVT, 0);
4631     } else if (Amt == NVTBits) {
4632       Lo = InH;
4633       Hi = MIRBuilder.buildConstant(NVT, 0);
4634     } else {
4635       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4636 
4637       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4638       auto OrRHS = MIRBuilder.buildShl(
4639           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4640 
4641       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4642       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4643     }
4644   } else {
4645     if (Amt.ugt(VTBits)) {
4646       Hi = Lo = MIRBuilder.buildAShr(
4647           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4648     } else if (Amt.ugt(NVTBits)) {
4649       Lo = MIRBuilder.buildAShr(NVT, InH,
4650                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4651       Hi = MIRBuilder.buildAShr(NVT, InH,
4652                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4653     } else if (Amt == NVTBits) {
4654       Lo = InH;
4655       Hi = MIRBuilder.buildAShr(NVT, InH,
4656                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4657     } else {
4658       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4659 
4660       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4661       auto OrRHS = MIRBuilder.buildShl(
4662           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4663 
4664       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4665       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4666     }
4667   }
4668 
4669   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4670   MI.eraseFromParent();
4671 
4672   return Legalized;
4673 }
4674 
4675 // TODO: Optimize if constant shift amount.
4676 LegalizerHelper::LegalizeResult
4677 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4678                                    LLT RequestedTy) {
4679   if (TypeIdx == 1) {
4680     Observer.changingInstr(MI);
4681     narrowScalarSrc(MI, RequestedTy, 2);
4682     Observer.changedInstr(MI);
4683     return Legalized;
4684   }
4685 
4686   Register DstReg = MI.getOperand(0).getReg();
4687   LLT DstTy = MRI.getType(DstReg);
4688   if (DstTy.isVector())
4689     return UnableToLegalize;
4690 
4691   Register Amt = MI.getOperand(2).getReg();
4692   LLT ShiftAmtTy = MRI.getType(Amt);
4693   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4694   if (DstEltSize % 2 != 0)
4695     return UnableToLegalize;
4696 
4697   // Ignore the input type. We can only go to exactly half the size of the
4698   // input. If that isn't small enough, the resulting pieces will be further
4699   // legalized.
4700   const unsigned NewBitSize = DstEltSize / 2;
4701   const LLT HalfTy = LLT::scalar(NewBitSize);
4702   const LLT CondTy = LLT::scalar(1);
4703 
4704   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4705     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4706                                        ShiftAmtTy);
4707   }
4708 
4709   // TODO: Expand with known bits.
4710 
4711   // Handle the fully general expansion by an unknown amount.
4712   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4713 
4714   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4715   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4716   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4717 
4718   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4719   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4720 
4721   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4722   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4723   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4724 
4725   Register ResultRegs[2];
4726   switch (MI.getOpcode()) {
4727   case TargetOpcode::G_SHL: {
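         // In pseudo-code (a sketch of the selects built below):
         //   Lo = Amt < NewBitSize ? InL << Amt : 0
         //   Hi = Amt == 0 ? InH
         //      : Amt < NewBitSize ? (InH << Amt) | (InL >> (NewBitSize - Amt))
         //      : InL << (Amt - NewBitSize)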
4728     // Short: ShAmt < NewBitSize
4729     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4730 
4731     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4732     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4733     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4734 
4735     // Long: ShAmt >= NewBitSize
4736     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4737     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4738 
4739     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4740     auto Hi = MIRBuilder.buildSelect(
4741         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4742 
4743     ResultRegs[0] = Lo.getReg(0);
4744     ResultRegs[1] = Hi.getReg(0);
4745     break;
4746   }
4747   case TargetOpcode::G_LSHR:
4748   case TargetOpcode::G_ASHR: {
4749     // Short: ShAmt < NewBitSize
4750     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4751 
4752     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4753     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4754     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4755 
4756     // Long: ShAmt >= NewBitSize
4757     MachineInstrBuilder HiL;
4758     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4759       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4760     } else {
4761       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4762       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4763     }
4764     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4765                                      {InH, AmtExcess});     // Lo from Hi part.
4766 
4767     auto Lo = MIRBuilder.buildSelect(
4768         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4769 
4770     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4771 
4772     ResultRegs[0] = Lo.getReg(0);
4773     ResultRegs[1] = Hi.getReg(0);
4774     break;
4775   }
4776   default:
4777     llvm_unreachable("not a shift");
4778   }
4779 
4780   MIRBuilder.buildMerge(DstReg, ResultRegs);
4781   MI.eraseFromParent();
4782   return Legalized;
4783 }
4784 
4785 LegalizerHelper::LegalizeResult
4786 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4787                                        LLT MoreTy) {
4788   assert(TypeIdx == 0 && "Expecting only Idx 0");
4789 
4790   Observer.changingInstr(MI);
4791   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4792     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4793     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4794     moreElementsVectorSrc(MI, MoreTy, I);
4795   }
4796 
4797   MachineBasicBlock &MBB = *MI.getParent();
4798   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4799   moreElementsVectorDst(MI, MoreTy, 0);
4800   Observer.changedInstr(MI);
4801   return Legalized;
4802 }
4803 
4804 LegalizerHelper::LegalizeResult
4805 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
4806                                     LLT MoreTy) {
4807   unsigned Opc = MI.getOpcode();
4808   switch (Opc) {
4809   case TargetOpcode::G_IMPLICIT_DEF:
4810   case TargetOpcode::G_LOAD: {
4811     if (TypeIdx != 0)
4812       return UnableToLegalize;
4813     Observer.changingInstr(MI);
4814     moreElementsVectorDst(MI, MoreTy, 0);
4815     Observer.changedInstr(MI);
4816     return Legalized;
4817   }
4818   case TargetOpcode::G_STORE:
4819     if (TypeIdx != 0)
4820       return UnableToLegalize;
4821     Observer.changingInstr(MI);
4822     moreElementsVectorSrc(MI, MoreTy, 0);
4823     Observer.changedInstr(MI);
4824     return Legalized;
4825   case TargetOpcode::G_AND:
4826   case TargetOpcode::G_OR:
4827   case TargetOpcode::G_XOR:
4828   case TargetOpcode::G_ADD:
4829   case TargetOpcode::G_SUB:
4830   case TargetOpcode::G_MUL:
4831   case TargetOpcode::G_FADD:
4832   case TargetOpcode::G_FMUL:
4833   case TargetOpcode::G_UADDSAT:
4834   case TargetOpcode::G_USUBSAT:
4835   case TargetOpcode::G_SADDSAT:
4836   case TargetOpcode::G_SSUBSAT:
4837   case TargetOpcode::G_SMIN:
4838   case TargetOpcode::G_SMAX:
4839   case TargetOpcode::G_UMIN:
4840   case TargetOpcode::G_UMAX:
4841   case TargetOpcode::G_FMINNUM:
4842   case TargetOpcode::G_FMAXNUM:
4843   case TargetOpcode::G_FMINNUM_IEEE:
4844   case TargetOpcode::G_FMAXNUM_IEEE:
4845   case TargetOpcode::G_FMINIMUM:
4846   case TargetOpcode::G_FMAXIMUM: {
4847     Observer.changingInstr(MI);
4848     moreElementsVectorSrc(MI, MoreTy, 1);
4849     moreElementsVectorSrc(MI, MoreTy, 2);
4850     moreElementsVectorDst(MI, MoreTy, 0);
4851     Observer.changedInstr(MI);
4852     return Legalized;
4853   }
4854   case TargetOpcode::G_FMA:
4855   case TargetOpcode::G_FSHR:
4856   case TargetOpcode::G_FSHL: {
4857     Observer.changingInstr(MI);
4858     moreElementsVectorSrc(MI, MoreTy, 1);
4859     moreElementsVectorSrc(MI, MoreTy, 2);
4860     moreElementsVectorSrc(MI, MoreTy, 3);
4861     moreElementsVectorDst(MI, MoreTy, 0);
4862     Observer.changedInstr(MI);
4863     return Legalized;
4864   }
4865   case TargetOpcode::G_EXTRACT:
4866     if (TypeIdx != 1)
4867       return UnableToLegalize;
4868     Observer.changingInstr(MI);
4869     moreElementsVectorSrc(MI, MoreTy, 1);
4870     Observer.changedInstr(MI);
4871     return Legalized;
4872   case TargetOpcode::G_INSERT:
4873   case TargetOpcode::G_FREEZE:
4874   case TargetOpcode::G_FNEG:
4875   case TargetOpcode::G_FABS:
4876   case TargetOpcode::G_BSWAP:
4877   case TargetOpcode::G_FCANONICALIZE:
4878   case TargetOpcode::G_SEXT_INREG:
4879     if (TypeIdx != 0)
4880       return UnableToLegalize;
4881     Observer.changingInstr(MI);
4882     moreElementsVectorSrc(MI, MoreTy, 1);
4883     moreElementsVectorDst(MI, MoreTy, 0);
4884     Observer.changedInstr(MI);
4885     return Legalized;
4886   case TargetOpcode::G_SELECT:
4887     if (TypeIdx != 0)
4888       return UnableToLegalize;
4889     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4890       return UnableToLegalize;
4891 
4892     Observer.changingInstr(MI);
4893     moreElementsVectorSrc(MI, MoreTy, 2);
4894     moreElementsVectorSrc(MI, MoreTy, 3);
4895     moreElementsVectorDst(MI, MoreTy, 0);
4896     Observer.changedInstr(MI);
4897     return Legalized;
4898   case TargetOpcode::G_UNMERGE_VALUES:
4899     return UnableToLegalize;
4900   case TargetOpcode::G_PHI:
4901     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
4902   case TargetOpcode::G_SHUFFLE_VECTOR:
4903     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
4904   case TargetOpcode::G_BUILD_VECTOR: {
4905     SmallVector<SrcOp, 8> Elts;
4906     for (auto Op : MI.uses()) {
4907       Elts.push_back(Op.getReg());
4908     }
4909 
4910     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
4911       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
4912     }
4913 
4914     MIRBuilder.buildDeleteTrailingVectorElements(
4915         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
4916     MI.eraseFromParent();
4917     return Legalized;
4918   }
4919   case TargetOpcode::G_TRUNC: {
4920     Observer.changingInstr(MI);
4921     moreElementsVectorSrc(MI, MoreTy, 1);
4922     moreElementsVectorDst(MI, MoreTy, 0);
4923     Observer.changedInstr(MI);
4924     return Legalized;
4925   }
4926   default:
4927     return UnableToLegalize;
4928   }
4929 }
4930 
4931 LegalizerHelper::LegalizeResult
4932 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
4933                                            unsigned int TypeIdx, LLT MoreTy) {
4934   if (TypeIdx != 0)
4935     return UnableToLegalize;
4936 
4937   Register DstReg = MI.getOperand(0).getReg();
4938   Register Src1Reg = MI.getOperand(1).getReg();
4939   Register Src2Reg = MI.getOperand(2).getReg();
4940   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4941   LLT DstTy = MRI.getType(DstReg);
4942   LLT Src1Ty = MRI.getType(Src1Reg);
4943   LLT Src2Ty = MRI.getType(Src2Reg);
4944   unsigned NumElts = DstTy.getNumElements();
4945   unsigned WidenNumElts = MoreTy.getNumElements();
4946 
4947   // Expect a canonicalized shuffle.
4948   if (DstTy != Src1Ty || DstTy != Src2Ty)
4949     return UnableToLegalize;
4950 
4951   moreElementsVectorSrc(MI, MoreTy, 1);
4952   moreElementsVectorSrc(MI, MoreTy, 2);
4953 
4954   // Adjust mask based on new input vector length.
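       // E.g. (illustrative): widening a <2 x s32> shuffle to <4 x s32> with
       // mask <1,2> keeps index 1 (first input) but remaps index 2 (second
       // input) to 2 - 2 + 4 = 4, then pads with -1: <1,4,-1,-1>.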
4955   SmallVector<int, 16> NewMask;
4956   for (unsigned I = 0; I != NumElts; ++I) {
4957     int Idx = Mask[I];
4958     if (Idx < static_cast<int>(NumElts))
4959       NewMask.push_back(Idx);
4960     else
4961       NewMask.push_back(Idx - NumElts + WidenNumElts);
4962   }
4963   for (unsigned I = NumElts; I != WidenNumElts; ++I)
4964     NewMask.push_back(-1);
4965   moreElementsVectorDst(MI, MoreTy, 0);
4966   MIRBuilder.setInstrAndDebugLoc(MI);
4967   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4968                                 MI.getOperand(1).getReg(),
4969                                 MI.getOperand(2).getReg(), NewMask);
4970   MI.eraseFromParent();
4971   return Legalized;
4972 }
4973 
4974 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
4975                                         ArrayRef<Register> Src1Regs,
4976                                         ArrayRef<Register> Src2Regs,
4977                                         LLT NarrowTy) {
4978   MachineIRBuilder &B = MIRBuilder;
4979   unsigned SrcParts = Src1Regs.size();
4980   unsigned DstParts = DstRegs.size();
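       // This is schoolbook long multiplication on NarrowTy-sized digits.
       // Illustrative s64 * s64 with s32 parts, writing (x1:x0) * (y1:y0):
       //   DstRegs[0] = mul(x0, y0)
       //   DstRegs[1] = mul(x1, y0) + mul(x0, y1) + umulh(x0, y0)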
4981 
4982   unsigned DstIdx = 0; // Low bits of the result.
4983   Register FactorSum =
4984       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
4985   DstRegs[DstIdx] = FactorSum;
4986 
4987   unsigned CarrySumPrevDstIdx;
4988   SmallVector<Register, 4> Factors;
4989 
4990   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
4991     // Collect low parts of muls for DstIdx.
4992     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
4993          i <= std::min(DstIdx, SrcParts - 1); ++i) {
4994       MachineInstrBuilder Mul =
4995           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
4996       Factors.push_back(Mul.getReg(0));
4997     }
4998     // Collect high parts of muls from previous DstIdx.
4999     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5000          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5001       MachineInstrBuilder Umulh =
5002           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5003       Factors.push_back(Umulh.getReg(0));
5004     }
5005     // Add CarrySum from additions calculated for previous DstIdx.
5006     if (DstIdx != 1) {
5007       Factors.push_back(CarrySumPrevDstIdx);
5008     }
5009 
5010     Register CarrySum;
5011     // Add all factors and accumulate all carries into CarrySum.
5012     if (DstIdx != DstParts - 1) {
5013       MachineInstrBuilder Uaddo =
5014           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5015       FactorSum = Uaddo.getReg(0);
5016       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5017       for (unsigned i = 2; i < Factors.size(); ++i) {
5018         MachineInstrBuilder Uaddo =
5019             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5020         FactorSum = Uaddo.getReg(0);
5021         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5022         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5023       }
5024     } else {
5025       // Since the value for the next index is not calculated, neither is CarrySum.
5026       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5027       for (unsigned i = 2; i < Factors.size(); ++i)
5028         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5029     }
5030 
5031     CarrySumPrevDstIdx = CarrySum;
5032     DstRegs[DstIdx] = FactorSum;
5033     Factors.clear();
5034   }
5035 }
5036 
5037 LegalizerHelper::LegalizeResult
5038 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5039                                     LLT NarrowTy) {
5040   if (TypeIdx != 0)
5041     return UnableToLegalize;
5042 
5043   Register DstReg = MI.getOperand(0).getReg();
5044   LLT DstType = MRI.getType(DstReg);
5045   // FIXME: add support for vector types
5046   if (DstType.isVector())
5047     return UnableToLegalize;
5048 
5049   unsigned Opcode = MI.getOpcode();
5050   unsigned OpO, OpE, OpF;
5051   switch (Opcode) {
5052   case TargetOpcode::G_SADDO:
5053   case TargetOpcode::G_SADDE:
5054   case TargetOpcode::G_UADDO:
5055   case TargetOpcode::G_UADDE:
5056   case TargetOpcode::G_ADD:
5057     OpO = TargetOpcode::G_UADDO;
5058     OpE = TargetOpcode::G_UADDE;
5059     OpF = TargetOpcode::G_UADDE;
5060     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5061       OpF = TargetOpcode::G_SADDE;
5062     break;
5063   case TargetOpcode::G_SSUBO:
5064   case TargetOpcode::G_SSUBE:
5065   case TargetOpcode::G_USUBO:
5066   case TargetOpcode::G_USUBE:
5067   case TargetOpcode::G_SUB:
5068     OpO = TargetOpcode::G_USUBO;
5069     OpE = TargetOpcode::G_USUBE;
5070     OpF = TargetOpcode::G_USUBE;
5071     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5072       OpF = TargetOpcode::G_SSUBE;
5073     break;
5074   default:
5075     llvm_unreachable("Unexpected add/sub opcode!");
5076   }
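       // E.g. (illustrative): for a 64-bit G_ADD narrowed to s32 this selects
       // G_UADDO for the lowest piece and G_UADDE for the remaining pieces,
       // chaining each carry-out into the next piece's carry-in below.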
5077 
5078   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5079   unsigned NumDefs = MI.getNumExplicitDefs();
5080   Register Src1 = MI.getOperand(NumDefs).getReg();
5081   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5082   Register CarryDst, CarryIn;
5083   if (NumDefs == 2)
5084     CarryDst = MI.getOperand(1).getReg();
5085   if (MI.getNumOperands() == NumDefs + 3)
5086     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5087 
5088   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5089   LLT LeftoverTy, DummyTy;
5090   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5091   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5092   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5093 
5094   int NarrowParts = Src1Regs.size();
5095   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5096     Src1Regs.push_back(Src1Left[I]);
5097     Src2Regs.push_back(Src2Left[I]);
5098   }
5099   DstRegs.reserve(Src1Regs.size());
5100 
5101   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5102     Register DstReg =
5103         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5104     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5105     // Forward the final carry-out to the destination register.
5106     if (i == e - 1 && CarryDst)
5107       CarryOut = CarryDst;
5108 
5109     if (!CarryIn) {
5110       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5111                             {Src1Regs[i], Src2Regs[i]});
5112     } else if (i == e - 1) {
5113       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5114                             {Src1Regs[i], Src2Regs[i], CarryIn});
5115     } else {
5116       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5117                             {Src1Regs[i], Src2Regs[i], CarryIn});
5118     }
5119 
5120     DstRegs.push_back(DstReg);
5121     CarryIn = CarryOut;
5122   }
5123   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5124               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5125               makeArrayRef(DstRegs).drop_front(NarrowParts));
5126 
5127   MI.eraseFromParent();
5128   return Legalized;
5129 }
5130 
5131 LegalizerHelper::LegalizeResult
5132 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5133   Register DstReg = MI.getOperand(0).getReg();
5134   Register Src1 = MI.getOperand(1).getReg();
5135   Register Src2 = MI.getOperand(2).getReg();
5136 
5137   LLT Ty = MRI.getType(DstReg);
5138   if (Ty.isVector())
5139     return UnableToLegalize;
5140 
5141   unsigned Size = Ty.getSizeInBits();
5142   unsigned NarrowSize = NarrowTy.getSizeInBits();
5143   if (Size % NarrowSize != 0)
5144     return UnableToLegalize;
5145 
5146   unsigned NumParts = Size / NarrowSize;
5147   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5148   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
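       // Illustrative example: G_UMULH on s64 with NarrowTy = s32 computes
       // the full 128-bit product as four s32 parts and keeps the upper two.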
5149 
5150   SmallVector<Register, 2> Src1Parts, Src2Parts;
5151   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5152   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5153   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5154   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5155 
5156   // Take only high half of registers if this is high mul.
5157   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5158   MIRBuilder.buildMerge(DstReg, DstRegs);
5159   MI.eraseFromParent();
5160   return Legalized;
5161 }
5162 
5163 LegalizerHelper::LegalizeResult
5164 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5165                                    LLT NarrowTy) {
5166   if (TypeIdx != 0)
5167     return UnableToLegalize;
5168 
5169   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5170 
5171   Register Src = MI.getOperand(1).getReg();
5172   LLT SrcTy = MRI.getType(Src);
5173 
5174   // If all finite floats fit into the narrowed integer type, we can just swap
5175   // out the result type. This is practically only useful for conversions from
5176   // half to integers of at least 16 bits, so just handle the one case.
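       // E.g. the largest finite half is 65504, which fits in 16 unsigned or
       // 17 signed integer bits, so the conversion can be performed at
       // NarrowTy and then extended to the original result width.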
5177   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5178       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5179     return UnableToLegalize;
5180 
5181   Observer.changingInstr(MI);
5182   narrowScalarDst(MI, NarrowTy, 0,
5183                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5184   Observer.changedInstr(MI);
5185   return Legalized;
5186 }
5187 
5188 LegalizerHelper::LegalizeResult
5189 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5190                                      LLT NarrowTy) {
5191   if (TypeIdx != 1)
5192     return UnableToLegalize;
5193 
5194   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5195 
5196   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5197   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5198   // NarrowSize.
5199   if (SizeOp1 % NarrowSize != 0)
5200     return UnableToLegalize;
5201   int NumParts = SizeOp1 / NarrowSize;
5202 
5203   SmallVector<Register, 2> SrcRegs, DstRegs;
5204   SmallVector<uint64_t, 2> Indexes;
5205   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5206 
5207   Register OpReg = MI.getOperand(0).getReg();
5208   uint64_t OpStart = MI.getOperand(2).getImm();
5209   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5210   for (int i = 0; i < NumParts; ++i) {
5211     unsigned SrcStart = i * NarrowSize;
5212 
5213     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5214       // No part of the extract uses this subregister, ignore it.
5215       continue;
5216     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5217       // The entire subregister is extracted, forward the value.
5218       DstRegs.push_back(SrcRegs[i]);
5219       continue;
5220     }
5221 
5222     // Compute where the overlap with the extracted range begins within this
5223     // subregister (ExtractOffset) and how many bits it covers (SegSize).
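         // E.g. (illustrative): extracting s16 at bit offset 24 from an s64
         // split into s32 pieces takes bits [24,32) of piece 0 and bits
         // [0,8) of piece 1, producing two s8 segments that are re-merged
         // below.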
5224     int64_t ExtractOffset;
5225     uint64_t SegSize;
5226     if (OpStart < SrcStart) {
5227       ExtractOffset = 0;
5228       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5229     } else {
5230       ExtractOffset = OpStart - SrcStart;
5231       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5232     }
5233 
5234     Register SegReg = SrcRegs[i];
5235     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5236       // A genuine extract is needed.
5237       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5238       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5239     }
5240 
5241     DstRegs.push_back(SegReg);
5242   }
5243 
5244   Register DstReg = MI.getOperand(0).getReg();
5245   if (MRI.getType(DstReg).isVector())
5246     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5247   else if (DstRegs.size() > 1)
5248     MIRBuilder.buildMerge(DstReg, DstRegs);
5249   else
5250     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5251   MI.eraseFromParent();
5252   return Legalized;
5253 }
5254 
5255 LegalizerHelper::LegalizeResult
5256 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5257                                     LLT NarrowTy) {
5258   // FIXME: Don't know how to handle secondary types yet.
5259   if (TypeIdx != 0)
5260     return UnableToLegalize;
5261 
5262   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5263   SmallVector<uint64_t, 2> Indexes;
5264   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5265   LLT LeftoverTy;
5266   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5267                LeftoverRegs);
5268 
5269   for (Register Reg : LeftoverRegs)
5270     SrcRegs.push_back(Reg);
5271 
5272   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5273   Register OpReg = MI.getOperand(2).getReg();
5274   uint64_t OpStart = MI.getOperand(3).getImm();
5275   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5276   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5277     unsigned DstStart = I * NarrowSize;
5278 
5279     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5280       // The entire subregister is defined by this insert, forward the new
5281       // value.
5282       DstRegs.push_back(OpReg);
5283       continue;
5284     }
5285 
5286     Register SrcReg = SrcRegs[I];
5287     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5288       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5289       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5290       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5291     }
5292 
5293     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5294       // No part of the insert affects this subregister, forward the original.
5295       DstRegs.push_back(SrcReg);
5296       continue;
5297     }
5298 
5299     // Compute the overlap between this destination subregister and the
5300     // inserted value: its position within the subregister (InsertOffset),
5301     // its position within OpReg (ExtractOffset), and its size (SegSize).
5301     int64_t ExtractOffset, InsertOffset;
5302     uint64_t SegSize;
5303     if (OpStart < DstStart) {
5304       InsertOffset = 0;
5305       ExtractOffset = DstStart - OpStart;
5306       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5307     } else {
5308       InsertOffset = OpStart - DstStart;
5309       ExtractOffset = 0;
5310       SegSize =
5311         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5312     }
5313 
5314     Register SegReg = OpReg;
5315     if (ExtractOffset != 0 || SegSize != OpSize) {
5316       // A genuine extract is needed.
5317       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5318       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5319     }
5320 
5321     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5322     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5323     DstRegs.push_back(DstReg);
5324   }
5325 
5326   uint64_t WideSize = DstRegs.size() * NarrowSize;
5327   Register DstReg = MI.getOperand(0).getReg();
5328   if (WideSize > RegTy.getSizeInBits()) {
5329     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5330     MIRBuilder.buildMerge(MergeReg, DstRegs);
5331     MIRBuilder.buildTrunc(DstReg, MergeReg);
5332   } else
5333     MIRBuilder.buildMerge(DstReg, DstRegs);
5334 
5335   MI.eraseFromParent();
5336   return Legalized;
5337 }
5338 
5339 LegalizerHelper::LegalizeResult
5340 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5341                                    LLT NarrowTy) {
5342   Register DstReg = MI.getOperand(0).getReg();
5343   LLT DstTy = MRI.getType(DstReg);
5344 
5345   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5346 
5347   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5348   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5349   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5350   LLT LeftoverTy;
5351   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5352                     Src0Regs, Src0LeftoverRegs))
5353     return UnableToLegalize;
5354 
5355   LLT Unused;
5356   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5357                     Src1Regs, Src1LeftoverRegs))
5358     llvm_unreachable("inconsistent extractParts result");
5359 
5360   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5361     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5362                                         {Src0Regs[I], Src1Regs[I]});
5363     DstRegs.push_back(Inst.getReg(0));
5364   }
5365 
5366   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5367     auto Inst = MIRBuilder.buildInstr(
5368       MI.getOpcode(),
5369       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5370     DstLeftoverRegs.push_back(Inst.getReg(0));
5371   }
5372 
5373   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5374               LeftoverTy, DstLeftoverRegs);
5375 
5376   MI.eraseFromParent();
5377   return Legalized;
5378 }
5379 
5380 LegalizerHelper::LegalizeResult
5381 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5382                                  LLT NarrowTy) {
5383   if (TypeIdx != 0)
5384     return UnableToLegalize;
5385 
5386   Register DstReg = MI.getOperand(0).getReg();
5387   Register SrcReg = MI.getOperand(1).getReg();
5388 
5389   LLT DstTy = MRI.getType(DstReg);
5390   if (DstTy.isVector())
5391     return UnableToLegalize;
5392 
5393   SmallVector<Register, 8> Parts;
5394   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5395   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
                                       MI.getOpcode());
5396   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5397 
5398   MI.eraseFromParent();
5399   return Legalized;
5400 }
5401 
5402 LegalizerHelper::LegalizeResult
5403 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5404                                     LLT NarrowTy) {
5405   if (TypeIdx != 0)
5406     return UnableToLegalize;
5407 
5408   Register CondReg = MI.getOperand(1).getReg();
5409   LLT CondTy = MRI.getType(CondReg);
5410   if (CondTy.isVector()) // TODO: Handle vselect
5411     return UnableToLegalize;
5412 
5413   Register DstReg = MI.getOperand(0).getReg();
5414   LLT DstTy = MRI.getType(DstReg);
5415 
5416   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5417   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5418   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5419   LLT LeftoverTy;
5420   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5421                     Src1Regs, Src1LeftoverRegs))
5422     return UnableToLegalize;
5423 
5424   LLT Unused;
5425   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5426                     Src2Regs, Src2LeftoverRegs))
5427     llvm_unreachable("inconsistent extractParts result");
5428 
5429   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5430     auto Select = MIRBuilder.buildSelect(NarrowTy,
5431                                          CondReg, Src1Regs[I], Src2Regs[I]);
5432     DstRegs.push_back(Select.getReg(0));
5433   }
5434 
5435   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5436     auto Select = MIRBuilder.buildSelect(
5437       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5438     DstLeftoverRegs.push_back(Select.getReg(0));
5439   }
5440 
5441   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5442               LeftoverTy, DstLeftoverRegs);
5443 
5444   MI.eraseFromParent();
5445   return Legalized;
5446 }
5447 
5448 LegalizerHelper::LegalizeResult
5449 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5450                                   LLT NarrowTy) {
5451   if (TypeIdx != 1)
5452     return UnableToLegalize;
5453 
5454   Register DstReg = MI.getOperand(0).getReg();
5455   Register SrcReg = MI.getOperand(1).getReg();
5456   LLT DstTy = MRI.getType(DstReg);
5457   LLT SrcTy = MRI.getType(SrcReg);
5458   unsigned NarrowSize = NarrowTy.getSizeInBits();
5459 
5460   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5461     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5462 
5463     MachineIRBuilder &B = MIRBuilder;
5464     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5465     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
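         // E.g. for s64 0x0000000000FF0000 with NarrowSize = 32: Hi == 0, so
         // the result is 32 + ctlz(0x00FF0000) = 32 + 8 = 40.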
5466     auto C_0 = B.buildConstant(NarrowTy, 0);
5467     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5468                                 UnmergeSrc.getReg(1), C_0);
5469     auto LoCTLZ = IsUndef ?
5470       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5471       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5472     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5473     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5474     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5475     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5476 
5477     MI.eraseFromParent();
5478     return Legalized;
5479   }
5480 
5481   return UnableToLegalize;
5482 }
5483 
5484 LegalizerHelper::LegalizeResult
5485 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5486                                   LLT NarrowTy) {
5487   if (TypeIdx != 1)
5488     return UnableToLegalize;
5489 
5490   Register DstReg = MI.getOperand(0).getReg();
5491   Register SrcReg = MI.getOperand(1).getReg();
5492   LLT DstTy = MRI.getType(DstReg);
5493   LLT SrcTy = MRI.getType(SrcReg);
5494   unsigned NarrowSize = NarrowTy.getSizeInBits();
5495 
5496   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5497     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5498 
5499     MachineIRBuilder &B = MIRBuilder;
5500     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5501     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
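         // E.g. for s64 0x0000000100000000 with NarrowSize = 32: Lo == 0, so
         // the result is cttz(0x00000001) + 32 = 0 + 32 = 32.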
5502     auto C_0 = B.buildConstant(NarrowTy, 0);
5503     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5504                                 UnmergeSrc.getReg(0), C_0);
5505     auto HiCTTZ = IsUndef ?
5506       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5507       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5508     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5509     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5510     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5511     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5512 
5513     MI.eraseFromParent();
5514     return Legalized;
5515   }
5516 
5517   return UnableToLegalize;
5518 }
5519 
5520 LegalizerHelper::LegalizeResult
5521 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5522                                    LLT NarrowTy) {
5523   if (TypeIdx != 1)
5524     return UnableToLegalize;
5525 
5526   Register DstReg = MI.getOperand(0).getReg();
5527   LLT DstTy = MRI.getType(DstReg);
5528   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5529   unsigned NarrowSize = NarrowTy.getSizeInBits();
5530 
5531   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5532     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5533 
5534     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5535     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5536     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5537 
5538     MI.eraseFromParent();
5539     return Legalized;
5540   }
5541 
5542   return UnableToLegalize;
5543 }
5544 
5545 LegalizerHelper::LegalizeResult
5546 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5547   unsigned Opc = MI.getOpcode();
5548   const auto &TII = MIRBuilder.getTII();
5549   auto isSupported = [this](const LegalityQuery &Q) {
5550     auto QAction = LI.getAction(Q).Action;
5551     return QAction == Legal || QAction == Libcall || QAction == Custom;
5552   };
5553   switch (Opc) {
5554   default:
5555     return UnableToLegalize;
5556   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5557     // This trivially expands to CTLZ.
5558     Observer.changingInstr(MI);
5559     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5560     Observer.changedInstr(MI);
5561     return Legalized;
5562   }
5563   case TargetOpcode::G_CTLZ: {
5564     Register DstReg = MI.getOperand(0).getReg();
5565     Register SrcReg = MI.getOperand(1).getReg();
5566     LLT DstTy = MRI.getType(DstReg);
5567     LLT SrcTy = MRI.getType(SrcReg);
5568     unsigned Len = SrcTy.getSizeInBits();
5569 
5570     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5571       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5572       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5573       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5574       auto ICmp = MIRBuilder.buildICmp(
5575           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5576       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5577       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5578       MI.eraseFromParent();
5579       return Legalized;
5580     }
5581     // For now, we do this:
5582     // NewLen = NextPowerOf2(Len);
5583     // x = x | (x >> 1);
5584     // x = x | (x >> 2);
5585     // ...
5586     // x = x | (x >> 16);
5587     // x = x | (x >> 32); // for 64-bit input
5588     // ... repeating with shift amounts up to NewLen/2.
5589     // return Len - popcount(x);
5590     //
5591     // Ref: "Hacker's Delight" by Henry Warren
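         // e.g. for Len = 32 and x = 0x00000F00 the loop ORs in shifts by
         // 1, 2, 4, 8 and 16, smearing x to 0x00000FFF; popcount gives 12 and
         // 32 - 12 == 20 == ctlz(0x00000F00).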
5592     Register Op = SrcReg;
5593     unsigned NewLen = PowerOf2Ceil(Len);
5594     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5595       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5596       auto MIBOp = MIRBuilder.buildOr(
5597           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5598       Op = MIBOp.getReg(0);
5599     }
5600     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5601     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5602                         MIBPop);
5603     MI.eraseFromParent();
5604     return Legalized;
5605   }
5606   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5607     // This trivially expands to CTTZ.
5608     Observer.changingInstr(MI);
5609     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5610     Observer.changedInstr(MI);
5611     return Legalized;
5612   }
5613   case TargetOpcode::G_CTTZ: {
5614     Register DstReg = MI.getOperand(0).getReg();
5615     Register SrcReg = MI.getOperand(1).getReg();
5616     LLT DstTy = MRI.getType(DstReg);
5617     LLT SrcTy = MRI.getType(SrcReg);
5618 
5619     unsigned Len = SrcTy.getSizeInBits();
5620     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5621       // If CTTZ_ZERO_UNDEF is supported, emit that and a select for the
5622       // zero-input case.
5623       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5624       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5625       auto ICmp = MIRBuilder.buildICmp(
5626           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5627       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5628       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5629       MI.eraseFromParent();
5630       return Legalized;
5631     }
5632     // For now, we use: { return popcount(~x & (x - 1)); }
5633     // unless the target has ctlz but not ctpop, in which case we use:
5634     // { return Len - ctlz(~x & (x - 1)); }
5635     // Ref: "Hacker's Delight" by Henry Warren
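         // e.g. x = 8 (0b1000): ~x & (x - 1) == 0b0111, whose popcount is
         // 3 == cttz(8).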
5636     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5637     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5638     auto MIBTmp = MIRBuilder.buildAnd(
5639         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5640     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5641         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5642       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5643       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5644                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5645       MI.eraseFromParent();
5646       return Legalized;
5647     }
5648     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5649     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5650     return Legalized;
5651   }
5652   case TargetOpcode::G_CTPOP: {
5653     Register SrcReg = MI.getOperand(1).getReg();
5654     LLT Ty = MRI.getType(SrcReg);
5655     unsigned Size = Ty.getSizeInBits();
5656     MachineIRBuilder &B = MIRBuilder;
5657 
5658     // Count set bits in blocks of 2 bits. The default approach would be
5659     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5660     // We use the following formula instead:
5661     // B2Count = val - { (val >> 1) & 0x55555555 }
5662     // since it gives the same result in blocks of 2 with one instruction less.
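         // Checking each 2-bit block: 0b00 -> 0 - 0 = 0, 0b01 -> 1 - 0 = 1,
         // 0b10 -> 2 - 1 = 1, 0b11 -> 3 - 1 = 2, the same as the masked add.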
5663     auto C_1 = B.buildConstant(Ty, 1);
5664     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5665     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5666     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5667     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5668     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5669 
5670     // To get the count in blocks of 4, add the values from adjacent blocks of 2.
5671     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5672     auto C_2 = B.buildConstant(Ty, 2);
5673     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5674     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5675     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5676     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5677     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5678     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5679 
5680     // For the count in blocks of 8 bits we don't have to mask the high 4 bits
5681     // before the addition, since each count lies in the range {0,...,8} and 4
5682     // bits are enough to hold it. After the addition the high 4 bits still hold
5683     // the count of the high 4-bit block; set them to zero to get the 8-bit result.
5684     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5685     auto C_4 = B.buildConstant(Ty, 4);
5686     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5687     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5688     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5689     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5690     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5691 
5692     assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
5693     // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
5694     // by this bitmask sets the top 8 bits of ResTmp to the sum of all B8Counts.
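         // e.g. for Size = 32, if the bytes of B8Count are b3:b2:b1:b0, then
         // B8Count * 0x01010101 carries b0 + b1 + b2 + b3 in its top byte, and
         // the shift by Size - 8 below moves that sum down to bit 0.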
5695     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5696     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5697 
5698     // Shift count result from 8 high bits to low bits.
5699     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5700     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5701 
5702     MI.eraseFromParent();
5703     return Legalized;
5704   }
5705   }
5706 }
5707 
5708 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5709 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5710                                         Register Reg, unsigned BW) {
5711   return matchUnaryPredicate(
5712       MRI, Reg,
5713       [=](const Constant *C) {
5714         // Null constant here means an undef.
5715         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5716         return !CI || CI->getValue().urem(BW) != 0;
5717       },
5718       /*AllowUndefs*/ true);
5719 }
5720 
5721 LegalizerHelper::LegalizeResult
5722 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5723   Register Dst = MI.getOperand(0).getReg();
5724   Register X = MI.getOperand(1).getReg();
5725   Register Y = MI.getOperand(2).getReg();
5726   Register Z = MI.getOperand(3).getReg();
5727   LLT Ty = MRI.getType(Dst);
5728   LLT ShTy = MRI.getType(Z);
5729 
5730   unsigned BW = Ty.getScalarSizeInBits();
5731 
5732   if (!isPowerOf2_32(BW))
5733     return UnableToLegalize;
5734 
5735   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5736   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5737 
5738   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5739     // fshl X, Y, Z -> fshr X, Y, -Z
5740     // fshr X, Y, Z -> fshl X, Y, -Z
5741     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5742     Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
5743   } else {
5744     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5745     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
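         // Since ~Z == (BW - 1) - Z (mod BW) rather than BW - Z, the operands
         // are pre-shifted by one bit to make up the difference; unlike the
         // negation above, this form is also correct when Z % BW == 0.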
5746     auto One = MIRBuilder.buildConstant(ShTy, 1);
5747     if (IsFSHL) {
5748       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5749       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5750     } else {
5751       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5752       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5753     }
5754 
5755     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5756   }
5757 
5758   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5759   MI.eraseFromParent();
5760   return Legalized;
5761 }
5762 
5763 LegalizerHelper::LegalizeResult
5764 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5765   Register Dst = MI.getOperand(0).getReg();
5766   Register X = MI.getOperand(1).getReg();
5767   Register Y = MI.getOperand(2).getReg();
5768   Register Z = MI.getOperand(3).getReg();
5769   LLT Ty = MRI.getType(Dst);
5770   LLT ShTy = MRI.getType(Z);
5771 
5772   const unsigned BW = Ty.getScalarSizeInBits();
5773   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5774 
5775   Register ShX, ShY;
5776   Register ShAmt, InvShAmt;
5777 
5778   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5779   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5780     // fshl: X << C | Y >> (BW - C)
5781     // fshr: X << (BW - C) | Y >> C
5782     // where C = Z % BW is not zero
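         // e.g. with BW = 8 and C = 3: fshl -> (X << 3) | (Y >> 5) and
         // fshr -> (X << 5) | (Y >> 3).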
5783     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5784     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5785     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5786     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5787     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5788   } else {
5789     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5790     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5791     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5792     if (isPowerOf2_32(BW)) {
5793       // Z % BW -> Z & (BW - 1)
5794       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5795       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5796       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5797       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5798     } else {
5799       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5800       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5801       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5802     }
5803 
5804     auto One = MIRBuilder.buildConstant(ShTy, 1);
5805     if (IsFSHL) {
5806       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5807       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5808       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5809     } else {
5810       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5811       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5812       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5813     }
5814   }
5815 
5816   MIRBuilder.buildOr(Dst, ShX, ShY);
5817   MI.eraseFromParent();
5818   return Legalized;
5819 }
5820 
5821 LegalizerHelper::LegalizeResult
5822 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5823   // These operations approximately do the following (while avoiding undefined
5824   // shifts by BW):
5825   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5826   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5827   Register Dst = MI.getOperand(0).getReg();
5828   LLT Ty = MRI.getType(Dst);
5829   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5830 
5831   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5832   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5833 
5834   // TODO: Use smarter heuristic that accounts for vector legalization.
5835   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5836     return lowerFunnelShiftAsShifts(MI);
5837 
5838   // This only works for powers of 2; fall back to shifts if it fails.
5839   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5840   if (Result == UnableToLegalize)
5841     return lowerFunnelShiftAsShifts(MI);
5842   return Result;
5843 }
5844 
5845 LegalizerHelper::LegalizeResult
5846 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5847   Register Dst = MI.getOperand(0).getReg();
5848   Register Src = MI.getOperand(1).getReg();
5849   Register Amt = MI.getOperand(2).getReg();
5850   LLT AmtTy = MRI.getType(Amt);
5851   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5852   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5853   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5854   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5855   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5856   MI.eraseFromParent();
5857   return Legalized;
5858 }
5859 
5860 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5861   Register Dst = MI.getOperand(0).getReg();
5862   Register Src = MI.getOperand(1).getReg();
5863   Register Amt = MI.getOperand(2).getReg();
5864   LLT DstTy = MRI.getType(Dst);
5865   LLT SrcTy = MRI.getType(Src);
5866   LLT AmtTy = MRI.getType(Amt);
5867 
5868   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5869   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5870 
5871   MIRBuilder.setInstrAndDebugLoc(MI);
5872 
5873   // If a rotate in the other direction is supported, use it.
5874   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5875   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5876       isPowerOf2_32(EltSizeInBits))
5877     return lowerRotateWithReverseRotate(MI);
5878 
5879   // If a funnel shift is supported, use it.
5880   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5881   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5882   bool IsFShLegal = false;
5883   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
5884       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
5885     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
5886                                 Register R3) {
5887       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
5888       MI.eraseFromParent();
5889       return Legalized;
5890     };
5891     // Use the same-direction funnel shift if legal; else negate Amt and use the reverse one.
5892     if (IsFShLegal) {
5893       return buildFunnelShift(FShOpc, Dst, Src, Amt);
5894     } else if (isPowerOf2_32(EltSizeInBits)) {
5895       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
5896       return buildFunnelShift(RevFsh, Dst, Src, Amt);
5897     }
5898   }
5899 
5900   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5901   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
5902   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
5903   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
5904   Register ShVal;
5905   Register RevShiftVal;
5906   if (isPowerOf2_32(EltSizeInBits)) {
5907     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
5908     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
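         // Note that -c & (w - 1) == (w - c) & (w - 1), and when c & (w - 1) == 0
         // both shift amounts are 0, so the OR returns x without any
         // undefined shift by w.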
5909     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5910     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
5911     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5912     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
5913     RevShiftVal =
5914         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
5915   } else {
5916     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
5917     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
5918     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
5919     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
5920     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5921     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
5922     auto One = MIRBuilder.buildConstant(AmtTy, 1);
5923     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
5924     RevShiftVal =
5925         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
5926   }
5927   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
5928   MI.eraseFromParent();
5929   return Legalized;
5930 }
5931 
5932 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
5933 // representation.
5934 LegalizerHelper::LegalizeResult
5935 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
5936   Register Dst = MI.getOperand(0).getReg();
5937   Register Src = MI.getOperand(1).getReg();
5938   const LLT S64 = LLT::scalar(64);
5939   const LLT S32 = LLT::scalar(32);
5940   const LLT S1 = LLT::scalar(1);
5941 
5942   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
5943 
5944   // unsigned cul2f(ulong u) {
5945   //   uint lz = clz(u);
5946   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
5947   //   u = (u << lz) & 0x7fffffffffffffffUL;
5948   //   ulong t = u & 0xffffffffffUL;
5949   //   uint v = (e << 23) | (uint)(u >> 40);
5950   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
5951   //   return as_float(v + r);
5952   // }
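       // e.g. u = 3: lz = 62, e = 127 + 63 - 62 = 128, the masked shift gives
       // u = 0x4000000000000000, t = 0 (so r = 0), and
       // v = (128 << 23) | 0x400000 = 0x40400000 == as_uint(3.0f).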
5953 
5954   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
5955   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
5956 
5957   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
5958 
5959   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
5960   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
5961 
5962   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
5963   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
5964 
5965   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
5966   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
5967 
5968   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
5969 
5970   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
5971   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
5972 
5973   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
5974   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
5975   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
5976 
5977   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
5978   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
5979   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
5980   auto One = MIRBuilder.buildConstant(S32, 1);
5981 
5982   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
5983   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
5984   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
5985   MIRBuilder.buildAdd(Dst, V, R);
5986 
5987   MI.eraseFromParent();
5988   return Legalized;
5989 }
5990 
5991 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5992   Register Dst = MI.getOperand(0).getReg();
5993   Register Src = MI.getOperand(1).getReg();
5994   LLT DstTy = MRI.getType(Dst);
5995   LLT SrcTy = MRI.getType(Src);
5996 
5997   if (SrcTy == LLT::scalar(1)) {
5998     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
5999     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6000     MIRBuilder.buildSelect(Dst, Src, True, False);
6001     MI.eraseFromParent();
6002     return Legalized;
6003   }
6004 
6005   if (SrcTy != LLT::scalar(64))
6006     return UnableToLegalize;
6007 
6008   if (DstTy == LLT::scalar(32)) {
6009     // TODO: SelectionDAG has several alternative expansions to port which may
6010     // be more reasonable depending on the available instructions. If a target
6011     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6012     // intermediate type, this is probably worse.
6013     return lowerU64ToF32BitOps(MI);
6014   }
6015 
6016   return UnableToLegalize;
6017 }
6018 
6019 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6020   Register Dst = MI.getOperand(0).getReg();
6021   Register Src = MI.getOperand(1).getReg();
6022   LLT DstTy = MRI.getType(Dst);
6023   LLT SrcTy = MRI.getType(Src);
6024 
6025   const LLT S64 = LLT::scalar(64);
6026   const LLT S32 = LLT::scalar(32);
6027   const LLT S1 = LLT::scalar(1);
6028 
6029   if (SrcTy == S1) {
6030     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6031     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6032     MIRBuilder.buildSelect(Dst, Src, True, False);
6033     MI.eraseFromParent();
6034     return Legalized;
6035   }
6036 
6037   if (SrcTy != S64)
6038     return UnableToLegalize;
6039 
6040   if (DstTy == S32) {
6041     // signed cl2f(long l) {
6042     //   long s = l >> 63;
6043     //   float r = cul2f((l + s) ^ s);
6044     //   return s ? -r : r;
6045     // }
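         // e.g. l = -3: s = -1, (l + s) ^ s = (-4) ^ -1 = 3, r = 3.0f, and
         // since s != 0 the select below produces -3.0f.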
6046     Register L = Src;
6047     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6048     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6049 
6050     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6051     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6052     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6053 
6054     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6055     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6056                                             MIRBuilder.buildConstant(S64, 0));
6057     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6058     MI.eraseFromParent();
6059     return Legalized;
6060   }
6061 
6062   return UnableToLegalize;
6063 }
6064 
6065 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6066   Register Dst = MI.getOperand(0).getReg();
6067   Register Src = MI.getOperand(1).getReg();
6068   LLT DstTy = MRI.getType(Dst);
6069   LLT SrcTy = MRI.getType(Src);
6070   const LLT S64 = LLT::scalar(64);
6071   const LLT S32 = LLT::scalar(32);
6072 
6073   if (SrcTy != S64 && SrcTy != S32)
6074     return UnableToLegalize;
6075   if (DstTy != S32 && DstTy != S64)
6076     return UnableToLegalize;
6077 
6078   // FPTOSI gives the same result as FPTOUI for values in the signed range.
6079   // FPTOUI additionally has to handle fp values that convert to unsigned
6080   // integers >= 2^31 for an i32 result or 2^63 for an i64 result; call it 2^Exp.
6081 
6082   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6083   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6084                                                 : APFloat::IEEEdouble(),
6085                     APInt::getZero(SrcTy.getSizeInBits()));
6086   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6087 
6088   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6089 
6090   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6091   // For fp values greater than or equal to the Threshold (2^Exp), we use
6092   // FPTOSI on (Value - 2^Exp) and add 2^Exp back by setting the result's MSB.
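       // e.g. for f64 -> i32 with Value = 3e9: the compare below is false, so
       // the result is FPTOSI(3e9 - 2^31) ^ 0x80000000
       //             = 852516352 + 2^31 = 3000000000.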
6093   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6094   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6095   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6096   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6097 
6098   const LLT S1 = LLT::scalar(1);
6099 
6100   MachineInstrBuilder FCMP =
6101       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6102   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6103 
6104   MI.eraseFromParent();
6105   return Legalized;
6106 }
6107 
6108 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6109   Register Dst = MI.getOperand(0).getReg();
6110   Register Src = MI.getOperand(1).getReg();
6111   LLT DstTy = MRI.getType(Dst);
6112   LLT SrcTy = MRI.getType(Src);
6113   const LLT S64 = LLT::scalar(64);
6114   const LLT S32 = LLT::scalar(32);
6115 
6116   // FIXME: Only f32 to i64 conversions are supported.
6117   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6118     return UnableToLegalize;
6119 
6120   // Expand f32 -> i64 conversion
6121   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6122   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
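       // e.g. Src = -5.5f (sign 1, biased exponent 129, mantissa 0x300000):
       // Exponent = 129 - 127 = 2, R = 0x300000 | 0x800000 = 0xB00000; since
       // 2 < 23 the Srl path applies, R >> (23 - 2) = 5, and with Sign = -1
       // the final (5 ^ -1) - (-1) = -5, i.e. truncation toward zero.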
6123 
6124   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6125 
6126   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6127   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6128 
6129   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6130   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6131 
6132   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6133                                            APInt::getSignMask(SrcEltBits));
6134   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6135   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6136   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6137   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6138 
6139   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6140   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6141   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6142 
6143   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6144   R = MIRBuilder.buildZExt(DstTy, R);
6145 
6146   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6147   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6148   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6149   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6150 
6151   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6152   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6153 
6154   const LLT S1 = LLT::scalar(1);
6155   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6156                                     S1, Exponent, ExponentLoBit);
6157 
6158   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6159 
6160   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6161   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6162 
6163   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6164 
6165   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6166                                           S1, Exponent, ZeroSrcTy);
6167 
6168   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6169   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6170 
6171   MI.eraseFromParent();
6172   return Legalized;
6173 }
6174 
6175 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6176 LegalizerHelper::LegalizeResult
6177 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6178   Register Dst = MI.getOperand(0).getReg();
6179   Register Src = MI.getOperand(1).getReg();
6180 
6181   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6182     return UnableToLegalize;
6183 
6184   const unsigned ExpMask = 0x7ff;
6185   const unsigned ExpBiasf64 = 1023;
6186   const unsigned ExpBiasf16 = 15;
6187   const LLT S32 = LLT::scalar(32);
6188   const LLT S1 = LLT::scalar(1);
6189 
6190   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6191   Register U = Unmerge.getReg(0);
6192   Register UH = Unmerge.getReg(1);
6193 
6194   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6195   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6196 
6197   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6198   // add the f16 bias (15) to get the biased exponent for the f16 format.
6199   E = MIRBuilder.buildAdd(
6200     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6201 
6202   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6203   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6204 
6205   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6206                                        MIRBuilder.buildConstant(S32, 0x1ff));
6207   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6208 
6209   auto Zero = MIRBuilder.buildConstant(S32, 0);
6210   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6211   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6212   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6213 
6214   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6215   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6216   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6217   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6218 
6219   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6220   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6221 
6222   // N = M | (E << 12);
6223   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6224   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6225 
6226   // B = clamp(1-E, 0, 13);
6227   auto One = MIRBuilder.buildConstant(S32, 1);
6228   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6229   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6230   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6231 
6232   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6233                                        MIRBuilder.buildConstant(S32, 0x1000));
6234 
6235   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6236   auto D0 = MIRBuilder.buildShl(S32, D, B);
6237 
6238   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6239                                              D0, SigSetHigh);
6240   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6241   D = MIRBuilder.buildOr(S32, D, D1);
6242 
6243   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6244   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6245 
6246   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6247   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6248 
6249   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6250                                        MIRBuilder.buildConstant(S32, 3));
6251   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6252 
6253   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6254                                        MIRBuilder.buildConstant(S32, 5));
6255   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6256 
6257   V1 = MIRBuilder.buildOr(S32, V0, V1);
6258   V = MIRBuilder.buildAdd(S32, V, V1);
6259 
6260   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6261                                        E, MIRBuilder.buildConstant(S32, 30));
6262   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6263                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6264 
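       // E == 1039 (0x7ff - 1023 + 15) means the f64 exponent field was all
       // ones, i.e. the input is Inf or NaN, so substitute the Inf/NaN
       // pattern I built above.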
6265   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6266                                          E, MIRBuilder.buildConstant(S32, 1039));
6267   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6268 
6269   // Extract the sign bit.
6270   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6271   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6272 
6273   // Insert the sign bit
6274   // Insert the sign bit.
6275 
6276   MIRBuilder.buildTrunc(Dst, V);
6277   MI.eraseFromParent();
6278   return Legalized;
6279 }
6280 
6281 LegalizerHelper::LegalizeResult
6282 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6283   Register Dst = MI.getOperand(0).getReg();
6284   Register Src = MI.getOperand(1).getReg();
6285 
6286   LLT DstTy = MRI.getType(Dst);
6287   LLT SrcTy = MRI.getType(Src);
6288   const LLT S64 = LLT::scalar(64);
6289   const LLT S16 = LLT::scalar(16);
6290 
6291   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6292     return lowerFPTRUNC_F64_TO_F16(MI);
6293 
6294   return UnableToLegalize;
6295 }
6296 
6297 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6298 // TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
6299 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6300   Register Dst = MI.getOperand(0).getReg();
6301   Register Src0 = MI.getOperand(1).getReg();
6302   Register Src1 = MI.getOperand(2).getReg();
6303   LLT Ty = MRI.getType(Dst);
6304 
6305   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6306   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6307   MI.eraseFromParent();
6308   return Legalized;
6309 }
6310 
6311 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6312   switch (Opc) {
6313   case TargetOpcode::G_SMIN:
6314     return CmpInst::ICMP_SLT;
6315   case TargetOpcode::G_SMAX:
6316     return CmpInst::ICMP_SGT;
6317   case TargetOpcode::G_UMIN:
6318     return CmpInst::ICMP_ULT;
6319   case TargetOpcode::G_UMAX:
6320     return CmpInst::ICMP_UGT;
6321   default:
6322     llvm_unreachable("not in integer min/max");
6323   }
6324 }
6325 
6326 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6327   Register Dst = MI.getOperand(0).getReg();
6328   Register Src0 = MI.getOperand(1).getReg();
6329   Register Src1 = MI.getOperand(2).getReg();
6330 
6331   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6332   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6333 
6334   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6335   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6336 
6337   MI.eraseFromParent();
6338   return Legalized;
6339 }
6340 
6341 LegalizerHelper::LegalizeResult
6342 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6343   Register Dst = MI.getOperand(0).getReg();
6344   Register Src0 = MI.getOperand(1).getReg();
6345   Register Src1 = MI.getOperand(2).getReg();
6346 
6347   const LLT Src0Ty = MRI.getType(Src0);
6348   const LLT Src1Ty = MRI.getType(Src1);
6349 
6350   const int Src0Size = Src0Ty.getScalarSizeInBits();
6351   const int Src1Size = Src1Ty.getScalarSizeInBits();
6352 
6353   auto SignBitMask = MIRBuilder.buildConstant(
6354     Src0Ty, APInt::getSignMask(Src0Size));
6355 
6356   auto NotSignBitMask = MIRBuilder.buildConstant(
6357     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6358 
6359   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6360   Register And1;
6361   if (Src0Ty == Src1Ty) {
6362     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6363   } else if (Src0Size > Src1Size) {
6364     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6365     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6366     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6367     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6368   } else {
6369     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6370     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6371     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6372     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6373   }
6374 
6375   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6376   // constants are a nan and -0.0, but the final result should preserve
6377   // everything.
6378   unsigned Flags = MI.getFlags();
6379   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6380 
6381   MI.eraseFromParent();
6382   return Legalized;
6383 }
6384 
6385 LegalizerHelper::LegalizeResult
6386 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6387   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6388     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6389 
6390   Register Dst = MI.getOperand(0).getReg();
6391   Register Src0 = MI.getOperand(1).getReg();
6392   Register Src1 = MI.getOperand(2).getReg();
6393   LLT Ty = MRI.getType(Dst);
6394 
6395   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6396     // Insert canonicalizations if we may need to quiet signaling NaNs to get
6397     // correct behavior.
6398 
6399     // Note this must be done here, and not as an optimization combine: in the
6400     // absence of a dedicated quiet-sNaN instruction we are relying on the
6401     // general-purpose G_FCANONICALIZE.
6402     if (!isKnownNeverSNaN(Src0, MRI))
6403       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6404 
6405     if (!isKnownNeverSNaN(Src1, MRI))
6406       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6407   }
6408 
6409   // The inputs are now known nan-free or have been quieted above, so it is
6410   // safe to replace this with the *_IEEE variant.
6411   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6412   MI.eraseFromParent();
6413   return Legalized;
6414 }
6415 
6416 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6417   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6418   Register DstReg = MI.getOperand(0).getReg();
6419   LLT Ty = MRI.getType(DstReg);
6420   unsigned Flags = MI.getFlags();
6421 
6422   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6423                                   Flags);
6424   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6425   MI.eraseFromParent();
6426   return Legalized;
6427 }
6428 
6429 LegalizerHelper::LegalizeResult
6430 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6431   Register DstReg = MI.getOperand(0).getReg();
6432   Register X = MI.getOperand(1).getReg();
6433   const unsigned Flags = MI.getFlags();
6434   const LLT Ty = MRI.getType(DstReg);
6435   const LLT CondTy = Ty.changeElementSize(1);
6436 
6437   // round(x) =>
6438   //  t = trunc(x);
6439   //  d = fabs(x - t);
6440   //  o = copysign(1.0f, x);
6441   //  return t + (d >= 0.5 ? o : 0.0);
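       // e.g. round(2.5): t = 2.0, d = 0.5, o = 1.0 -> 3.0, and
       // round(-2.5): t = -2.0, d = 0.5, o = -1.0 -> -3.0; ties round away
       // from zero, as G_INTRINSIC_ROUND requires.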
6442 
6443   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6444 
6445   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6446   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6447   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6448   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6449   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6450   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6451 
6452   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6453                                   Flags);
6454   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6455 
6456   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6457 
6458   MI.eraseFromParent();
6459   return Legalized;
6460 }
6461 
6462 LegalizerHelper::LegalizeResult
6463 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6464   Register DstReg = MI.getOperand(0).getReg();
6465   Register SrcReg = MI.getOperand(1).getReg();
6466   unsigned Flags = MI.getFlags();
6467   LLT Ty = MRI.getType(DstReg);
6468   const LLT CondTy = Ty.changeElementSize(1);
6469 
6470   // result = trunc(src);
6471   // if (src < 0.0 && src != result)
6472   //   result += -1.0.
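       // The conjunction below has type s1 and G_SITOFP of an s1 "true" is
       // -1.0, so no explicit select is needed:
       // e.g. floor(-1.5) = -1.0 + -1.0 = -2.0.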
6473 
6474   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6475   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6476 
6477   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6478                                   SrcReg, Zero, Flags);
6479   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6480                                       SrcReg, Trunc, Flags);
6481   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6482   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6483 
6484   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6485   MI.eraseFromParent();
6486   return Legalized;
6487 }
6488 
6489 LegalizerHelper::LegalizeResult
6490 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6491   const unsigned NumOps = MI.getNumOperands();
6492   Register DstReg = MI.getOperand(0).getReg();
6493   Register Src0Reg = MI.getOperand(1).getReg();
6494   LLT DstTy = MRI.getType(DstReg);
6495   LLT SrcTy = MRI.getType(Src0Reg);
6496   unsigned PartSize = SrcTy.getSizeInBits();
6497 
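       // Merge by zero-extending each part to the full width, shifting it
       // into position, and ORing it in: e.g. merging s16 parts a and b into
       // an s32 computes zext(a) | (zext(b) << 16).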
6498   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6499   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6500 
6501   for (unsigned I = 2; I != NumOps; ++I) {
6502     const unsigned Offset = (I - 1) * PartSize;
6503 
6504     Register SrcReg = MI.getOperand(I).getReg();
6505     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6506 
6507     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6508       MRI.createGenericVirtualRegister(WideTy);
6509 
6510     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6511     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6512     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6513     ResultReg = NextResult;
6514   }
6515 
6516   if (DstTy.isPointer()) {
6517     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6518           DstTy.getAddressSpace())) {
6519       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6520       return UnableToLegalize;
6521     }
6522 
6523     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6524   }
6525 
6526   MI.eraseFromParent();
6527   return Legalized;
6528 }
6529 
6530 LegalizerHelper::LegalizeResult
6531 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6532   const unsigned NumDst = MI.getNumOperands() - 1;
6533   Register SrcReg = MI.getOperand(NumDst).getReg();
6534   Register Dst0Reg = MI.getOperand(0).getReg();
6535   LLT DstTy = MRI.getType(Dst0Reg);
6536   if (DstTy.isPointer())
6537     return UnableToLegalize; // TODO
6538 
6539   SrcReg = coerceToScalar(SrcReg);
6540   if (!SrcReg)
6541     return UnableToLegalize;
6542 
6543   // Expand scalarizing unmerge as bitcast to integer and shift.
6544   LLT IntTy = MRI.getType(SrcReg);
6545 
6546   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6547 
6548   const unsigned DstSize = DstTy.getSizeInBits();
6549   unsigned Offset = DstSize;
6550   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6551     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6552     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6553     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6554   }
6555 
6556   MI.eraseFromParent();
6557   return Legalized;
6558 }
6559 
6560 /// Lower a vector extract or insert by writing the vector to a stack temporary
6561 /// and reloading the element or vector.
6562 ///
6563 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6564 ///  =>
6565 ///  %stack_temp = G_FRAME_INDEX
6566 ///  G_STORE %vec, %stack_temp
6567 ///  %idx = clamp(%idx, %vec.getNumElements())
6568 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6569 ///  %dst = G_LOAD %element_ptr
6570 LegalizerHelper::LegalizeResult
6571 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6572   Register DstReg = MI.getOperand(0).getReg();
6573   Register SrcVec = MI.getOperand(1).getReg();
6574   Register InsertVal;
6575   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6576     InsertVal = MI.getOperand(2).getReg();
6577 
6578   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6579 
6580   LLT VecTy = MRI.getType(SrcVec);
6581   LLT EltTy = VecTy.getElementType();
6582   unsigned NumElts = VecTy.getNumElements();
6583 
6584   int64_t IdxVal;
6585   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal < NumElts) {
6586     SmallVector<Register, 8> SrcRegs;
6587     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6588 
6589     if (InsertVal) {
6590       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6591       MIRBuilder.buildMerge(DstReg, SrcRegs);
6592     } else {
6593       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6594     }
6595 
6596     MI.eraseFromParent();
6597     return Legalized;
6598   }
6599 
6600   if (!EltTy.isByteSized()) { // Not implemented.
6601     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6602     return UnableToLegalize;
6603   }
6604 
6605   unsigned EltBytes = EltTy.getSizeInBytes();
6606   Align VecAlign = getStackTemporaryAlignment(VecTy);
6607   Align EltAlign;
6608 
6609   MachinePointerInfo PtrInfo;
6610   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6611                                         VecAlign, PtrInfo);
6612   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6613 
6614   // Get the pointer to the element, and be sure not to hit undefined behavior
6615   // if the index is out of bounds.
6616   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6617 
6618   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6619     int64_t Offset = IdxVal * EltBytes;
6620     PtrInfo = PtrInfo.getWithOffset(Offset);
6621     EltAlign = commonAlignment(VecAlign, Offset);
6622   } else {
6623     // We lose information with a variable offset.
6624     EltAlign = getStackTemporaryAlignment(EltTy);
6625     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6626   }
6627 
6628   if (InsertVal) {
6629     // Write the inserted element
6630     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6631 
6632     // Reload the whole vector.
6633     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6634   } else {
6635     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6636   }
6637 
6638   MI.eraseFromParent();
6639   return Legalized;
6640 }
6641 
6642 LegalizerHelper::LegalizeResult
6643 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6644   Register DstReg = MI.getOperand(0).getReg();
6645   Register Src0Reg = MI.getOperand(1).getReg();
6646   Register Src1Reg = MI.getOperand(2).getReg();
6647   LLT Src0Ty = MRI.getType(Src0Reg);
6648   LLT DstTy = MRI.getType(DstReg);
6649   LLT IdxTy = LLT::scalar(32);
6650 
6651   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6652 
6653   if (DstTy.isScalar()) {
6654     if (Src0Ty.isVector())
6655       return UnableToLegalize;
6656 
6657     // This is just a SELECT.
6658     assert(Mask.size() == 1 && "Expected a single mask element");
6659     Register Val;
6660     if (Mask[0] < 0 || Mask[0] > 1)
6661       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6662     else
6663       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6664     MIRBuilder.buildCopy(DstReg, Val);
6665     MI.eraseFromParent();
6666     return Legalized;
6667   }
6668 
6669   Register Undef;
6670   SmallVector<Register, 32> BuildVec;
6671   LLT EltTy = DstTy.getElementType();
6672 
6673   for (int Idx : Mask) {
6674     if (Idx < 0) {
6675       if (!Undef.isValid())
6676         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6677       BuildVec.push_back(Undef);
6678       continue;
6679     }
6680 
6681     if (Src0Ty.isScalar()) {
6682       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6683     } else {
6684       int NumElts = Src0Ty.getNumElements();
6685       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6686       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6687       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6688       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6689       BuildVec.push_back(Extract.getReg(0));
6690     }
6691   }
6692 
6693   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6694   MI.eraseFromParent();
6695   return Legalized;
6696 }
6697 
6698 LegalizerHelper::LegalizeResult
6699 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6700   const auto &MF = *MI.getMF();
6701   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6702   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6703     return UnableToLegalize;
6704 
6705   Register Dst = MI.getOperand(0).getReg();
6706   Register AllocSize = MI.getOperand(1).getReg();
6707   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6708 
6709   LLT PtrTy = MRI.getType(Dst);
6710   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6711 
6712   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6713   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6714   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6715 
6716   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6717   // have to generate an extra instruction to negate the alloc and then use
6718   // G_PTR_ADD to add the negative offset.
6719   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
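       // Negating the alignment yields a mask of the low bits, e.g. -16 ==
       // ~15, so the AND below rounds the new SP down to the requested
       // boundary (safe because the stack grows down).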
6720   if (Alignment > Align(1)) {
6721     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6722     AlignMask.negate();
6723     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6724     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6725   }
6726 
6727   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6728   MIRBuilder.buildCopy(SPReg, SPTmp);
6729   MIRBuilder.buildCopy(Dst, SPTmp);
6730 
6731   MI.eraseFromParent();
6732   return Legalized;
6733 }
6734 
6735 LegalizerHelper::LegalizeResult
6736 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6737   Register Dst = MI.getOperand(0).getReg();
6738   Register Src = MI.getOperand(1).getReg();
6739   unsigned Offset = MI.getOperand(2).getImm();
6740 
6741   LLT DstTy = MRI.getType(Dst);
6742   LLT SrcTy = MRI.getType(Src);
6743 
6744   // Extract sub-vector or one element
6745   if (SrcTy.isVector()) {
6746     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6747     unsigned DstSize = DstTy.getSizeInBits();
6748 
6749     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6750         (Offset + DstSize <= SrcTy.getSizeInBits())) {
6751       // Unmerge and allow access to each Src element for the artifact combiner.
6752       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);
6753 
6754       // Take the element(s) we need to extract and copy them (merging if several).
6755       SmallVector<Register, 8> SubVectorElts;
6756       for (unsigned Idx = Offset / SrcEltSize;
6757            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6758         SubVectorElts.push_back(Unmerge.getReg(Idx));
6759       }
6760       if (SubVectorElts.size() == 1)
6761         MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
6762       else
6763         MIRBuilder.buildMerge(Dst, SubVectorElts);
6764 
6765       MI.eraseFromParent();
6766       return Legalized;
6767     }
6768   }
6769 
6770   if (DstTy.isScalar() &&
6771       (SrcTy.isScalar() ||
6772        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6773     LLT SrcIntTy = SrcTy;
6774     if (!SrcTy.isScalar()) {
6775       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6776       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6777     }
6778 
6779     if (Offset == 0)
6780       MIRBuilder.buildTrunc(Dst, Src);
6781     else {
6782       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6783       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6784       MIRBuilder.buildTrunc(Dst, Shr);
6785     }
6786 
6787     MI.eraseFromParent();
6788     return Legalized;
6789   }
6790 
6791   return UnableToLegalize;
6792 }
6793 
6794 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6795   Register Dst = MI.getOperand(0).getReg();
6796   Register Src = MI.getOperand(1).getReg();
6797   Register InsertSrc = MI.getOperand(2).getReg();
6798   uint64_t Offset = MI.getOperand(3).getImm();
6799 
6800   LLT DstTy = MRI.getType(Src);
6801   LLT InsertTy = MRI.getType(InsertSrc);
6802 
6803   // Insert sub-vector or one element
6804   if (DstTy.isVector() && !InsertTy.isPointer()) {
6805     LLT EltTy = DstTy.getElementType();
6806     unsigned EltSize = EltTy.getSizeInBits();
6807     unsigned InsertSize = InsertTy.getSizeInBits();
6808 
6809     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6810         (Offset + InsertSize <= DstTy.getSizeInBits())) {
6811       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6812       SmallVector<Register, 8> DstElts;
6813       unsigned Idx = 0;
6814       // Elements from Src before the insert's start Offset.
6815       for (; Idx < Offset / EltSize; ++Idx) {
6816         DstElts.push_back(UnmergeSrc.getReg(Idx));
6817       }
6818 
6819       // Replace elements in Src with elements from InsertSrc
6820       if (InsertTy.getSizeInBits() > EltSize) {
6821         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6822         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6823              ++Idx, ++i) {
6824           DstElts.push_back(UnmergeInsertSrc.getReg(i));
6825         }
6826       } else {
6827         DstElts.push_back(InsertSrc);
6828         ++Idx;
6829       }
6830 
6831       // Remaining elements from Src after insert
6832       for (; Idx < DstTy.getNumElements(); ++Idx) {
6833         DstElts.push_back(UnmergeSrc.getReg(Idx));
6834       }
6835 
6836       MIRBuilder.buildMerge(Dst, DstElts);
6837       MI.eraseFromParent();
6838       return Legalized;
6839     }
6840   }
6841 
6842   if (InsertTy.isVector() ||
6843       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6844     return UnableToLegalize;
6845 
6846   const DataLayout &DL = MIRBuilder.getDataLayout();
6847   if ((DstTy.isPointer() &&
6848        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6849       (InsertTy.isPointer() &&
6850        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6851     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6852     return UnableToLegalize;
6853   }
6854 
6855   LLT IntDstTy = DstTy;
6856 
6857   if (!DstTy.isScalar()) {
6858     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6859     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6860   }
6861 
6862   if (!InsertTy.isScalar()) {
6863     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6864     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6865   }
6866 
6867   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6868   if (Offset != 0) {
6869     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6870     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6871   }
6872 
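       // Build a mask covering everything *except* the field being inserted,
       // i.e. the wrapped range [Offset + InsertSize, Offset), so the AND
       // below clears only the destination bits that ExtInsSrc supplies.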
6873   APInt MaskVal = APInt::getBitsSetWithWrap(
6874       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6875 
6876   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6877   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6878   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6879 
6880   MIRBuilder.buildCast(Dst, Or);
6881   MI.eraseFromParent();
6882   return Legalized;
6883 }
6884 
6885 LegalizerHelper::LegalizeResult
6886 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6887   Register Dst0 = MI.getOperand(0).getReg();
6888   Register Dst1 = MI.getOperand(1).getReg();
6889   Register LHS = MI.getOperand(2).getReg();
6890   Register RHS = MI.getOperand(3).getReg();
6891   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6892 
6893   LLT Ty = MRI.getType(Dst0);
6894   LLT BoolTy = MRI.getType(Dst1);
6895 
6896   if (IsAdd)
6897     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6898   else
6899     MIRBuilder.buildSub(Dst0, LHS, RHS);
6900 
6901   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6902 
6903   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6904 
6905   // For an addition, the result should be less than one of the operands (LHS)
6906   // if and only if the other operand (RHS) is negative; when that equivalence
6907   // fails, the addition overflowed.
6908   // For a subtraction, the result should be less than one of the operands
6909   // (LHS) if and only if the other operand (RHS) is (non-zero) positive;
6910   // again, any mismatch indicates overflow.
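       // e.g. for i8: 100 + 50 wraps to -106, so (result < LHS) holds while
       // (RHS < 0) does not, and the XOR below reports overflow; for
       // 100 + (-50) = 50 both hold and no overflow is reported.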
6911   auto ResultLowerThanLHS =
6912       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6913   auto ConditionRHS = MIRBuilder.buildICmp(
6914       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6915 
6916   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6917   MI.eraseFromParent();
6918   return Legalized;
6919 }
6920 
6921 LegalizerHelper::LegalizeResult
6922 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6923   Register Res = MI.getOperand(0).getReg();
6924   Register LHS = MI.getOperand(1).getReg();
6925   Register RHS = MI.getOperand(2).getReg();
6926   LLT Ty = MRI.getType(Res);
6927   bool IsSigned;
6928   bool IsAdd;
6929   unsigned BaseOp;
6930   switch (MI.getOpcode()) {
6931   default:
6932     llvm_unreachable("unexpected addsat/subsat opcode");
6933   case TargetOpcode::G_UADDSAT:
6934     IsSigned = false;
6935     IsAdd = true;
6936     BaseOp = TargetOpcode::G_ADD;
6937     break;
6938   case TargetOpcode::G_SADDSAT:
6939     IsSigned = true;
6940     IsAdd = true;
6941     BaseOp = TargetOpcode::G_ADD;
6942     break;
6943   case TargetOpcode::G_USUBSAT:
6944     IsSigned = false;
6945     IsAdd = false;
6946     BaseOp = TargetOpcode::G_SUB;
6947     break;
6948   case TargetOpcode::G_SSUBSAT:
6949     IsSigned = true;
6950     IsAdd = false;
6951     BaseOp = TargetOpcode::G_SUB;
6952     break;
6953   }
6954 
6955   if (IsSigned) {
6956     // sadd.sat(a, b) ->
6957     //   hi = 0x7fffffff - smax(a, 0)
6958     //   lo = 0x80000000 - smin(a, 0)
6959     //   a + smin(smax(lo, b), hi)
6960     // ssub.sat(a, b) ->
6961     //   lo = smax(a, -1) - 0x7fffffff
6962     //   hi = smin(a, -1) - 0x80000000
6963     //   a - smin(smax(lo, b), hi)
6964     // TODO: AMDGPU can use a "median of 3" instruction here:
6965     //   a +/- med3(lo, b, hi)
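         // Worked example (illustrative s8 case, MAX = 127, MIN = -128): for
         // sadd.sat with a = 100, hi = 127 - smax(100, 0) = 27 and
         // lo = -128 - smin(100, 0) = -128, so b is clamped to [-128, 27].
         // With b = 50 the clamp yields 27 and a + 27 = 127, the saturated
         // result.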
6966     uint64_t NumBits = Ty.getScalarSizeInBits();
6967     auto MaxVal =
6968         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
6969     auto MinVal =
6970         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
6971     MachineInstrBuilder Hi, Lo;
6972     if (IsAdd) {
6973       auto Zero = MIRBuilder.buildConstant(Ty, 0);
6974       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
6975       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
6976     } else {
6977       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
6978       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
6979                                MaxVal);
6980       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
6981                                MinVal);
6982     }
6983     auto RHSClamped =
6984         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
6985     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
6986   } else {
6987     // uadd.sat(a, b) -> a + umin(~a, b)
6988     // usub.sat(a, b) -> a - umin(a, b)
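         // Worked example (illustrative unsigned s8 values): for uadd.sat with
         // a = 200, ~a = 55, so at most 55 can be added and a + umin(55, b)
         // saturates at 255. For usub.sat with a = 50 and b = 100,
         // umin(a, b) = 50 and a - 50 = 0, the unsigned floor.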
6989     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
6990     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
6991     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
6992   }
6993 
6994   MI.eraseFromParent();
6995   return Legalized;
6996 }
6997 
6998 LegalizerHelper::LegalizeResult
6999 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7000   Register Res = MI.getOperand(0).getReg();
7001   Register LHS = MI.getOperand(1).getReg();
7002   Register RHS = MI.getOperand(2).getReg();
7003   LLT Ty = MRI.getType(Res);
7004   LLT BoolTy = Ty.changeElementSize(1);
7005   bool IsSigned;
7006   bool IsAdd;
7007   unsigned OverflowOp;
7008   switch (MI.getOpcode()) {
7009   default:
7010     llvm_unreachable("unexpected addsat/subsat opcode");
7011   case TargetOpcode::G_UADDSAT:
7012     IsSigned = false;
7013     IsAdd = true;
7014     OverflowOp = TargetOpcode::G_UADDO;
7015     break;
7016   case TargetOpcode::G_SADDSAT:
7017     IsSigned = true;
7018     IsAdd = true;
7019     OverflowOp = TargetOpcode::G_SADDO;
7020     break;
7021   case TargetOpcode::G_USUBSAT:
7022     IsSigned = false;
7023     IsAdd = false;
7024     OverflowOp = TargetOpcode::G_USUBO;
7025     break;
7026   case TargetOpcode::G_SSUBSAT:
7027     IsSigned = true;
7028     IsAdd = false;
7029     OverflowOp = TargetOpcode::G_SSUBO;
7030     break;
7031   }
7032 
7033   auto OverflowRes =
7034       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7035   Register Tmp = OverflowRes.getReg(0);
7036   Register Ov = OverflowRes.getReg(1);
7037   MachineInstrBuilder Clamp;
7038   if (IsSigned) {
7039     // sadd.sat(a, b) ->
7040     //   {tmp, ov} = saddo(a, b)
7041     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
7042     // ssub.sat(a, b) ->
7043     //   {tmp, ov} = ssubo(a, b)
7044     //   ov ? (tmp >>s 31) + 0x80000000 : tmp
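         // Why the clamp is correct (a sketch, using illustrative s32 values):
         // when signed overflow occurs the temporary has the inverted sign, so
         // if tmp is negative, (tmp >>s 31) = -1 and -1 + 0x80000000 = MAX;
         // if tmp is non-negative, the shift gives 0 and 0 + 0x80000000 = MIN.
         // The select at the end picks this clamp only when ov is set.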
7045     uint64_t NumBits = Ty.getScalarSizeInBits();
7046     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7047     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7048     auto MinVal =
7049         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7050     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7051   } else {
7052     // uadd.sat(a, b) ->
7053     //   {tmp, ov} = uaddo(a, b)
7054     //   ov ? 0xffffffff : tmp
7055     // usub.sat(a, b) ->
7056     //   {tmp, ov} = usubo(a, b)
7057     //   ov ? 0 : tmp
7058     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7059   }
7060   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7061 
7062   MI.eraseFromParent();
7063   return Legalized;
7064 }
7065 
7066 LegalizerHelper::LegalizeResult
7067 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7068   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7069           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7070          "Expected shlsat opcode!");
7071   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7072   Register Res = MI.getOperand(0).getReg();
7073   Register LHS = MI.getOperand(1).getReg();
7074   Register RHS = MI.getOperand(2).getReg();
7075   LLT Ty = MRI.getType(Res);
7076   LLT BoolTy = Ty.changeElementSize(1);
7077 
7078   unsigned BW = Ty.getScalarSizeInBits();
7079   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7080   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7081                        : MIRBuilder.buildLShr(Ty, Result, RHS);
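       // Shifting back detects lost bits: if the round trip does not
       // reproduce LHS, the shift overflowed and we saturate. Worked example
       // (illustrative s8 values): LHS = 64, RHS = 2 gives Result = 0 after
       // truncation; shifting back yields 0 != 64, so the select below picks
       // SatVal (0x7f here, since LHS is non-negative).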
7082 
7083   MachineInstrBuilder SatVal;
7084   if (IsSigned) {
7085     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7086     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7087     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7088                                     MIRBuilder.buildConstant(Ty, 0));
7089     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7090   } else {
7091     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7092   }
7093   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7094   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7095 
7096   MI.eraseFromParent();
7097   return Legalized;
7098 }
7099 
7100 LegalizerHelper::LegalizeResult
7101 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7102   Register Dst = MI.getOperand(0).getReg();
7103   Register Src = MI.getOperand(1).getReg();
7104   const LLT Ty = MRI.getType(Src);
7105   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7106   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
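       // Worked example (illustrative s32 value 0xaabbccdd): the code below
       // first builds 0xdd0000aa from the outermost bytes; the loop (i = 1)
       // then ORs in 0x00cc0000 and 0x0000bb00, giving 0xddccbbaa.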
7107 
7108   // Swap most and least significant byte, set remaining bytes in Res to zero.
7109   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7110   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7111   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7112   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7113 
7114   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7115   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7116     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7117     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7118     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7119     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7120     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7121     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7122     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7123     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7124     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7125     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7126     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7127     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7128   }
7129   Res.getInstr()->getOperand(0).setReg(Dst);
7130 
7131   MI.eraseFromParent();
7132   return Legalized;
7133 }
7134 
7135 // { (Src & Mask) >> N } | { (Src << N) & Mask }
7136 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7137                                  MachineInstrBuilder Src, APInt Mask) {
7138   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7139   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7140   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7141   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7142   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7143   return B.buildOr(Dst, LHS, RHS);
7144 }
7145 
7146 LegalizerHelper::LegalizeResult
7147 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7148   Register Dst = MI.getOperand(0).getReg();
7149   Register Src = MI.getOperand(1).getReg();
7150   const LLT Ty = MRI.getType(Src);
7151   unsigned Size = Ty.getSizeInBits();
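       // Worked example (illustrative s8 value 0xb2 = 0b10110010; G_BSWAP is
       // the identity on a single byte): Swap4 yields 0x2b, Swap2 yields
       // 0x8e, and Swap1 yields 0x4d = 0b01001101, the reversed bits.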
7152 
7153   MachineInstrBuilder BSWAP =
7154       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7155 
7156   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7157   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7158   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7159   MachineInstrBuilder Swap4 =
7160       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7161 
7162   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7163   //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
7164   // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7165   MachineInstrBuilder Swap2 =
7166       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7167 
7168   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7169   //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
7170   // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7171   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7172 
7173   MI.eraseFromParent();
7174   return Legalized;
7175 }
7176 
7177 LegalizerHelper::LegalizeResult
7178 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7179   MachineFunction &MF = MIRBuilder.getMF();
7180 
7181   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7182   int NameOpIdx = IsRead ? 1 : 0;
7183   int ValRegIndex = IsRead ? 0 : 1;
7184 
7185   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7186   const LLT Ty = MRI.getType(ValReg);
7187   const MDString *RegStr = cast<MDString>(
7188     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7189 
7190   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7191   if (!PhysReg.isValid())
7192     return UnableToLegalize;
7193 
7194   if (IsRead)
7195     MIRBuilder.buildCopy(ValReg, PhysReg);
7196   else
7197     MIRBuilder.buildCopy(PhysReg, ValReg);
7198 
7199   MI.eraseFromParent();
7200   return Legalized;
7201 }
7202 
7203 LegalizerHelper::LegalizeResult
7204 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7205   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7206   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7207   Register Result = MI.getOperand(0).getReg();
7208   LLT OrigTy = MRI.getType(Result);
7209   auto SizeInBits = OrigTy.getScalarSizeInBits();
7210   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
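       // A sketch of the emitted generic MIR for an s32 G_SMULH (illustrative
       // register names):
       //   %l:_(s64) = G_SEXT %lhs(s32)
       //   %r:_(s64) = G_SEXT %rhs(s32)
       //   %m:_(s64) = G_MUL %l, %r
       //   %c:_(s64) = G_CONSTANT i64 32
       //   %h:_(s64) = G_ASHR %m, %c
       //   %res:_(s32) = G_TRUNC %h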
7211 
7212   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7213   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7214   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7215   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7216 
7217   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7218   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7219   MIRBuilder.buildTrunc(Result, Shifted);
7220 
7221   MI.eraseFromParent();
7222   return Legalized;
7223 }
7224 
7225 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7226   // Implement vector G_SELECT in terms of XOR, AND, OR.
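       // The expansion computes res = (op1 & mask) | (op2 & ~mask) lane-wise,
       // which assumes each mask lane is all-ones or all-zeros: a -1 lane
       // picks the op1 lane and a 0 lane picks the op2 lane (an illustrative
       // description of the code below, not an additional check).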
7227   Register DstReg = MI.getOperand(0).getReg();
7228   Register MaskReg = MI.getOperand(1).getReg();
7229   Register Op1Reg = MI.getOperand(2).getReg();
7230   Register Op2Reg = MI.getOperand(3).getReg();
7231   LLT DstTy = MRI.getType(DstReg);
7232   LLT MaskTy = MRI.getType(MaskReg);
7233   LLT Op1Ty = MRI.getType(Op1Reg);
7234   if (!DstTy.isVector())
7235     return UnableToLegalize;
7236 
7237   // Vector selects can have a scalar predicate. If so, splat it into a vector
7238   // and return, so that later legalization attempts can try again.
7239   if (MaskTy.isScalar()) {
7240     Register MaskElt = MaskReg;
7241     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7242       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7243     // Generate a vector splat idiom to be pattern matched later.
7244     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7245     Observer.changingInstr(MI);
7246     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7247     Observer.changedInstr(MI);
7248     return Legalized;
7249   }
7250 
7251   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7252     return UnableToLegalize;
7253   }
7254 
7255   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7256   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7257   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7258   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7259   MI.eraseFromParent();
7260   return Legalized;
7261 }
7262 
7263 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7264   // Split DIVREM into individual instructions.
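       // A sketch of the split (illustrative s32 registers):
       //   %div:_(s32), %rem:_(s32) = G_SDIVREM %a, %b
       // becomes
       //   %div:_(s32) = G_SDIV %a, %b
       //   %rem:_(s32) = G_SREM %a, %b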
7265   unsigned Opcode = MI.getOpcode();
7266 
7267   MIRBuilder.buildInstr(
7268       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7269                                         : TargetOpcode::G_UDIV,
7270       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7271   MIRBuilder.buildInstr(
7272       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7273                                         : TargetOpcode::G_UREM,
7274       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7275   MI.eraseFromParent();
7276   return Legalized;
7277 }
7278 
7279 LegalizerHelper::LegalizeResult
7280 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7281   // Expand %res = G_ABS %a into:
7282   // %v1 = G_ASHR %a, scalar_size-1
7283   // %v2 = G_ADD %a, %v1
7284   // %res = G_XOR %v2, %v1
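       // Worked example (illustrative s32 value a = -5): %v1 = -5 >>s 31 = -1,
       // %v2 = -5 + (-1) = -6, and %res = -6 ^ -1 = 5. For non-negative a,
       // %v1 is 0 and the add and xor leave the value unchanged.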
7285   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7286   Register OpReg = MI.getOperand(1).getReg();
7287   auto ShiftAmt =
7288       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7289   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7290   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7291   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7292   MI.eraseFromParent();
7293   return Legalized;
7294 }
7295 
7296 LegalizerHelper::LegalizeResult
7297 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7298   // Expand %res = G_ABS %a into:
7299   // %v1 = G_CONSTANT 0
7300   // %v2 = G_SUB %v1, %a
7301   // %res = G_SMAX %a, %v2
7302   Register SrcReg = MI.getOperand(1).getReg();
7303   LLT Ty = MRI.getType(SrcReg);
7304   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7305   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7306   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7307   MI.eraseFromParent();
7308   return Legalized;
7309 }
7310 
7311 LegalizerHelper::LegalizeResult
7312 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7313   Register SrcReg = MI.getOperand(1).getReg();
7314   LLT SrcTy = MRI.getType(SrcReg);
7315   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7316 
7317   // The source could be a scalar if the IR type was <1 x sN>.
7318   if (SrcTy.isScalar()) {
7319     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7320       return UnableToLegalize; // FIXME: handle extension.
7321     // This can be just a plain copy.
7322     Observer.changingInstr(MI);
7323     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7324     Observer.changedInstr(MI);
7325     return Legalized;
7326   }
7327   return UnableToLegalize;;
7328 }
7329 
7330 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7331   // On Darwin, -Os means optimize for size without hurting performance, so
7332   // only really optimize for size when -Oz (MinSize) is used.
7333   if (MF.getTarget().getTargetTriple().isOSDarwin())
7334     return MF.getFunction().hasMinSize();
7335   return MF.getFunction().hasOptSize();
7336 }
7337 
7338 // Returns a list of types to use for memory op lowering in MemOps. A partial
7339 // port of findOptimalMemOpLowering in TargetLowering.
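     // For example (a sketch, assuming a target whose preferred type is s64 and
     // which rejects overlapping accesses): a 13-byte operation produces
     // MemOps = [s64, s32, s8]. If unaligned overlapping accesses are allowed
     // and fast, the loop instead produces [s64, s64], with the second access
     // overlapping the first by three bytes.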
7340 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7341                                           unsigned Limit, const MemOp &Op,
7342                                           unsigned DstAS, unsigned SrcAS,
7343                                           const AttributeList &FuncAttributes,
7344                                           const TargetLowering &TLI) {
7345   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7346     return false;
7347 
7348   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7349 
7350   if (Ty == LLT()) {
7351     // Use the largest scalar type whose alignment constraints are satisfied.
7352     // We only need to check DstAlign here as SrcAlign is always greater than
7353     // or equal to DstAlign (or zero).
7354     Ty = LLT::scalar(64);
7355     if (Op.isFixedDstAlign())
7356       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7357              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7358         Ty = LLT::scalar(Ty.getSizeInBits() / 2);
7359     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7360     // FIXME: check for the largest legal type we can load/store to.
7361   }
7362 
7363   unsigned NumMemOps = 0;
7364   uint64_t Size = Op.size();
7365   while (Size) {
7366     unsigned TySize = Ty.getSizeInBytes();
7367     while (TySize > Size) {
7368       // For now, only use non-vector load / store's for the left-over pieces.
7369       LLT NewTy = Ty;
7370       // FIXME: check for mem op safety and legality of the types. Not all of
7371       // SDAGisms map cleanly to GISel concepts.
7372       if (NewTy.isVector())
7373         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7374       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7375       unsigned NewTySize = NewTy.getSizeInBytes();
7376       assert(NewTySize > 0 && "Could not find appropriate type");
7377 
7378       // If the new LLT cannot cover all of the remaining bits, then consider
7379       // issuing an unaligned and overlapping load / store (or a pair of them).
7380       bool Fast;
7381       // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
7382       MVT VT = getMVTForLLT(Ty);
7383       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7384           TLI.allowsMisalignedMemoryAccesses(
7385               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7386               MachineMemOperand::MONone, &Fast) &&
7387           Fast)
7388         TySize = Size;
7389       else {
7390         Ty = NewTy;
7391         TySize = NewTySize;
7392       }
7393     }
7394 
7395     if (++NumMemOps > Limit)
7396       return false;
7397 
7398     MemOps.push_back(Ty);
7399     Size -= TySize;
7400   }
7401 
7402   return true;
7403 }
7404 
7405 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7406   if (Ty.isVector())
7407     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7408                                 Ty.getNumElements());
7409   return IntegerType::get(C, Ty.getSizeInBits());
7410 }
7411 
7412 // Get a vectorized representation of the memset value operand, GISel edition.
7413 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7414   MachineRegisterInfo &MRI = *MIB.getMRI();
7415   unsigned NumBits = Ty.getScalarSizeInBits();
7416   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7417   if (!Ty.isVector() && ValVRegAndVal) {
7418     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7419     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7420     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7421   }
7422 
7423   // Extend the byte value to the larger type, and then multiply by a magic
7424   // value 0x010101... in order to replicate it across every byte.
7425   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
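       // Worked example (illustrative s32 case): zext(0xab) * 0x01010101 =
       // 0xabababab, i.e. the byte replicated across the full width.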
7426   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7427     return MIB.buildConstant(Ty, 0).getReg(0);
7428   }
7429 
7430   LLT ExtType = Ty.getScalarType();
7431   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7432   if (NumBits > 8) {
7433     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7434     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7435     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7436   }
7437 
7438   // For vector types create a G_BUILD_VECTOR.
7439   if (Ty.isVector())
7440     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7441 
7442   return Val;
7443 }
7444 
7445 LegalizerHelper::LegalizeResult
7446 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7447                              uint64_t KnownLen, Align Alignment,
7448                              bool IsVolatile) {
7449   auto &MF = *MI.getParent()->getParent();
7450   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7451   auto &DL = MF.getDataLayout();
7452   LLVMContext &C = MF.getFunction().getContext();
7453 
7454   assert(KnownLen != 0 && "Have a zero length memset!");
7455 
7456   bool DstAlignCanChange = false;
7457   MachineFrameInfo &MFI = MF.getFrameInfo();
7458   bool OptSize = shouldLowerMemFuncForSize(MF);
7459 
7460   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7461   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7462     DstAlignCanChange = true;
7463 
7464   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7465   std::vector<LLT> MemOps;
7466 
7467   const auto &DstMMO = **MI.memoperands_begin();
7468   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7469 
7470   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7471   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7472 
7473   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7474                                      MemOp::Set(KnownLen, DstAlignCanChange,
7475                                                 Alignment,
7476                                                 /*IsZeroMemset=*/IsZeroVal,
7477                                                 /*IsVolatile=*/IsVolatile),
7478                                      DstPtrInfo.getAddrSpace(), ~0u,
7479                                      MF.getFunction().getAttributes(), TLI))
7480     return UnableToLegalize;
7481 
7482   if (DstAlignCanChange) {
7483     // Get an estimate of the type from the LLT.
7484     Type *IRTy = getTypeForLLT(MemOps[0], C);
7485     Align NewAlign = DL.getABITypeAlign(IRTy);
7486     if (NewAlign > Alignment) {
7487       Alignment = NewAlign;
7488       unsigned FI = FIDef->getOperand(1).getIndex();
7489       // Give the stack frame object a larger alignment if needed.
7490       if (MFI.getObjectAlign(FI) < Alignment)
7491         MFI.setObjectAlignment(FI, Alignment);
7492     }
7493   }
7494 
7495   MachineIRBuilder MIB(MI);
7496   // Find the largest store and generate the bit pattern for it.
7497   LLT LargestTy = MemOps[0];
7498   for (unsigned i = 1; i < MemOps.size(); i++)
7499     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7500       LargestTy = MemOps[i];
7501 
7502   // The memset stored value is always defined as an s8, so in order to make it
7503   // work with larger store types we need to repeat the bit pattern across the
7504   // wider type.
7505   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7506 
7507   if (!MemSetValue)
7508     return UnableToLegalize;
7509 
7510   // Generate the stores. For each store type in the list, we generate the
7511   // matching store of that type to the destination address.
7512   LLT PtrTy = MRI.getType(Dst);
7513   unsigned DstOff = 0;
7514   unsigned Size = KnownLen;
7515   for (unsigned I = 0; I < MemOps.size(); I++) {
7516     LLT Ty = MemOps[I];
7517     unsigned TySize = Ty.getSizeInBytes();
7518     if (TySize > Size) {
7519       // Issuing an unaligned store that overlaps with the previous store.
7520       // Adjust the offset accordingly.
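           // Worked example (illustrative): with KnownLen = 7 and
           // MemOps = [s32, s32], the second store would start at offset 4 and
           // run one byte past the end, so DstOff becomes 4 - (4 - 3) = 3 and
           // the store covers bytes [3, 7), overlapping the first store.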
7521       assert(I == MemOps.size() - 1 && I != 0);
7522       DstOff -= TySize - Size;
7523     }
7524 
7525     // If this store is smaller than the largest store, see whether we can get
7526     // the smaller value for free with a truncate.
7527     Register Value = MemSetValue;
7528     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7529       MVT VT = getMVTForLLT(Ty);
7530       MVT LargestVT = getMVTForLLT(LargestTy);
7531       if (!LargestTy.isVector() && !Ty.isVector() &&
7532           TLI.isTruncateFree(LargestVT, VT))
7533         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7534       else
7535         Value = getMemsetValue(Val, Ty, MIB);
7536       if (!Value)
7537         return UnableToLegalize;
7538     }
7539 
7540     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7541 
7542     Register Ptr = Dst;
7543     if (DstOff != 0) {
7544       auto Offset =
7545           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7546       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7547     }
7548 
7549     MIB.buildStore(Value, Ptr, *StoreMMO);
7550     DstOff += Ty.getSizeInBytes();
7551     Size -= TySize;
7552   }
7553 
7554   MI.eraseFromParent();
7555   return Legalized;
7556 }
7557 
7558 LegalizerHelper::LegalizeResult
7559 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7560   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7561 
7562   Register Dst = MI.getOperand(0).getReg();
7563   Register Src = MI.getOperand(1).getReg();
7564   Register Len = MI.getOperand(2).getReg();
7565 
7566   const auto *MMOIt = MI.memoperands_begin();
7567   const MachineMemOperand *MemOp = *MMOIt;
7568   bool IsVolatile = MemOp->isVolatile();
7569 
7570   // See if this is a constant length copy
7571   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7572   // FIXME: support dynamically sized G_MEMCPY_INLINE
7573   assert(LenVRegAndVal.hasValue() &&
7574          "inline memcpy with dynamic size is not yet supported");
7575   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7576   if (KnownLen == 0) {
7577     MI.eraseFromParent();
7578     return Legalized;
7579   }
7580 
7581   const auto &DstMMO = **MI.memoperands_begin();
7582   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7583   Align DstAlign = DstMMO.getBaseAlign();
7584   Align SrcAlign = SrcMMO.getBaseAlign();
7585 
7586   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7587                            IsVolatile);
7588 }
7589 
7590 LegalizerHelper::LegalizeResult
7591 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7592                                    uint64_t KnownLen, Align DstAlign,
7593                                    Align SrcAlign, bool IsVolatile) {
7594   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7595   return lowerMemcpy(MI, Dst, Src, KnownLen,
7596                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7597                      IsVolatile);
7598 }
7599 
7600 LegalizerHelper::LegalizeResult
7601 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7602                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7603                              Align SrcAlign, bool IsVolatile) {
7604   auto &MF = *MI.getParent()->getParent();
7605   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7606   auto &DL = MF.getDataLayout();
7607   LLVMContext &C = MF.getFunction().getContext();
7608 
7609   assert(KnownLen != 0 && "Have a zero length memcpy!");
7610 
7611   bool DstAlignCanChange = false;
7612   MachineFrameInfo &MFI = MF.getFrameInfo();
7613   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7614 
7615   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7616   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7617     DstAlignCanChange = true;
7618 
7619   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7620   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7621   // if the memcpy is in a tail call position.
7622 
7623   std::vector<LLT> MemOps;
7624 
7625   const auto &DstMMO = **MI.memoperands_begin();
7626   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7627   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7628   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7629 
7630   if (!findGISelOptimalMemOpLowering(
7631           MemOps, Limit,
7632           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7633                       IsVolatile),
7634           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7635           MF.getFunction().getAttributes(), TLI))
7636     return UnableToLegalize;
7637 
7638   if (DstAlignCanChange) {
7639     // Get an estimate of the type from the LLT.
7640     Type *IRTy = getTypeForLLT(MemOps[0], C);
7641     Align NewAlign = DL.getABITypeAlign(IRTy);
7642 
7643     // Don't promote to an alignment that would require dynamic stack
7644     // realignment.
7645     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7646     if (!TRI->hasStackRealignment(MF))
7647       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7648         NewAlign = NewAlign / 2;
7649 
7650     if (NewAlign > Alignment) {
7651       Alignment = NewAlign;
7652       unsigned FI = FIDef->getOperand(1).getIndex();
7653       // Give the stack frame object a larger alignment if needed.
7654       if (MFI.getObjectAlign(FI) < Alignment)
7655         MFI.setObjectAlignment(FI, Alignment);
7656     }
7657   }
7658 
7659   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7660 
7661   MachineIRBuilder MIB(MI);
7662   // Now we need to emit a load / store pair for each of the types we've
7663   // collected. I.e. for each type, generate a load of that type width from
7664   // the source pointer, then a corresponding store of the loaded value to
7665   // the dest buffer. This can result in a sequence of loads and stores of
7666   // mixed types, depending on what the target specifies as good types to use.
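       // A sketch of one emitted piece, an 8-byte copy at offset 8
       // (illustrative register names and MMOs):
       //   %off:_(s64) = G_CONSTANT i64 8
       //   %sp:_(p0) = G_PTR_ADD %src, %off
       //   %v:_(s64) = G_LOAD %sp :: (load (s64))
       //   %dp:_(p0) = G_PTR_ADD %dst, %off
       //   G_STORE %v, %dp :: (store (s64))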
7667   unsigned CurrOffset = 0;
7668   unsigned Size = KnownLen;
7669   for (auto CopyTy : MemOps) {
7670     // Issuing an unaligned load / store pair that overlaps with the previous
7671     // pair. Adjust the offset accordingly.
7672     if (CopyTy.getSizeInBytes() > Size)
7673       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7674 
7675     // Construct MMOs for the accesses.
7676     auto *LoadMMO =
7677         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7678     auto *StoreMMO =
7679         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7680 
7681     // Create the load.
7682     Register LoadPtr = Src;
7683     Register Offset;
7684     if (CurrOffset != 0) {
7685       LLT SrcTy = MRI.getType(Src);
7686       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
7687                    .getReg(0);
7688       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7689     }
7690     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7691 
7692     // Create the store.
7693     Register StorePtr = Dst;
7694     if (CurrOffset != 0) {
7695       LLT DstTy = MRI.getType(Dst);
7696       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7697     }
7698     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7699     CurrOffset += CopyTy.getSizeInBytes();
7700     Size -= CopyTy.getSizeInBytes();
7701   }
7702 
7703   MI.eraseFromParent();
7704   return Legalized;
7705 }
7706 
7707 LegalizerHelper::LegalizeResult
7708 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7709                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7710                               bool IsVolatile) {
7711   auto &MF = *MI.getParent()->getParent();
7712   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7713   auto &DL = MF.getDataLayout();
7714   LLVMContext &C = MF.getFunction().getContext();
7715 
7716   assert(KnownLen != 0 && "Have a zero length memmove!");
7717 
7718   bool DstAlignCanChange = false;
7719   MachineFrameInfo &MFI = MF.getFrameInfo();
7720   bool OptSize = shouldLowerMemFuncForSize(MF);
7721   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7722 
7723   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7724   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7725     DstAlignCanChange = true;
7726 
7727   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7728   std::vector<LLT> MemOps;
7729 
7730   const auto &DstMMO = **MI.memoperands_begin();
7731   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7732   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7733   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7734 
7735   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7736   // to a bug in its findOptimalMemOpLowering implementation. For now do the
7737   // same thing here.
7738   if (!findGISelOptimalMemOpLowering(
7739           MemOps, Limit,
7740           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7741                       /*IsVolatile*/ true),
7742           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7743           MF.getFunction().getAttributes(), TLI))
7744     return UnableToLegalize;
7745 
7746   if (DstAlignCanChange) {
7747     // Get an estimate of the type from the LLT.
7748     Type *IRTy = getTypeForLLT(MemOps[0], C);
7749     Align NewAlign = DL.getABITypeAlign(IRTy);
7750 
7751     // Don't promote to an alignment that would require dynamic stack
7752     // realignment.
7753     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7754     if (!TRI->hasStackRealignment(MF))
7755       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7756         NewAlign = NewAlign / 2;
7757 
7758     if (NewAlign > Alignment) {
7759       Alignment = NewAlign;
7760       unsigned FI = FIDef->getOperand(1).getIndex();
7761       // Give the stack frame object a larger alignment if needed.
7762       if (MFI.getObjectAlign(FI) < Alignment)
7763         MFI.setObjectAlignment(FI, Alignment);
7764     }
7765   }
7766 
7767   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7768 
7769   MachineIRBuilder MIB(MI);
7770   // Memmove requires that we perform the loads first before issuing the stores.
7771   // Apart from that, this loop is pretty much doing the same thing as the
7772   // memcpy codegen function.
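       // An illustrative overlap case motivating the ordering: for a copy of
       // [p, p+16) to [p+1, p+17), storing the first piece before loading the
       // second would clobber source bytes that have not yet been read.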
7773   unsigned CurrOffset = 0;
7774   SmallVector<Register, 16> LoadVals;
7775   for (auto CopyTy : MemOps) {
7776     // Construct MMO for the load.
7777     auto *LoadMMO =
7778         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7779 
7780     // Create the load.
7781     Register LoadPtr = Src;
7782     if (CurrOffset != 0) {
7783       LLT SrcTy = MRI.getType(Src);
7784       auto Offset =
7785           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
7786       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7787     }
7788     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7789     CurrOffset += CopyTy.getSizeInBytes();
7790   }
7791 
7792   CurrOffset = 0;
7793   for (unsigned I = 0; I < MemOps.size(); ++I) {
7794     LLT CopyTy = MemOps[I];
7795     // Now store the values loaded.
7796     auto *StoreMMO =
7797         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7798 
7799     Register StorePtr = Dst;
7800     if (CurrOffset != 0) {
7801       LLT DstTy = MRI.getType(Dst);
7802       auto Offset =
7803           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
7804       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7805     }
7806     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7807     CurrOffset += CopyTy.getSizeInBytes();
7808   }
7809   MI.eraseFromParent();
7810   return Legalized;
7811 }
7812 
7813 LegalizerHelper::LegalizeResult
7814 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7815   const unsigned Opc = MI.getOpcode();
7816   // This combine is fairly complex so it's not written with a separate
7817   // matcher function.
7818   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7819           Opc == TargetOpcode::G_MEMSET || Opc == TargetOpcode::G_MEMCPY_INLINE) &&
7820          "Expected memcpy like instruction");
7821 
7822   auto MMOIt = MI.memoperands_begin();
7823   const MachineMemOperand *MemOp = *MMOIt;
7824 
7825   Align DstAlign = MemOp->getBaseAlign();
7826   Align SrcAlign;
7827   Register Dst = MI.getOperand(0).getReg();
7828   Register Src = MI.getOperand(1).getReg();
7829   Register Len = MI.getOperand(2).getReg();
7830 
7831   if (Opc != TargetOpcode::G_MEMSET) {
7832     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7833     MemOp = *(++MMOIt);
7834     SrcAlign = MemOp->getBaseAlign();
7835   }
7836 
7837   // See if this is a constant length copy
7838   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7839   if (!LenVRegAndVal)
7840     return UnableToLegalize;
7841   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7842 
7843   if (KnownLen == 0) {
7844     MI.eraseFromParent();
7845     return Legalized;
7846   }
7847 
7848   bool IsVolatile = MemOp->isVolatile();
7849   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7850     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7851                              IsVolatile);
7852 
7853   // Don't try to optimize volatile.
7854   if (IsVolatile)
7855     return UnableToLegalize;
7856 
7857   if (MaxLen && KnownLen > MaxLen)
7858     return UnableToLegalize;
7859 
7860   if (Opc == TargetOpcode::G_MEMCPY) {
7861     auto &MF = *MI.getParent()->getParent();
7862     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7863     bool OptSize = shouldLowerMemFuncForSize(MF);
7864     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7865     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7866                        IsVolatile);
7867   }
7868   if (Opc == TargetOpcode::G_MEMMOVE)
7869     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7870   if (Opc == TargetOpcode::G_MEMSET)
7871     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7872   return UnableToLegalize;
7873 }
7874