//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In that case, we try to split the constant operand of the mov
//    instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
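//
//    A sketch of the ADD case (the constant 0x123456 and the virtual register
//    numbers are illustrative, not taken from real compiler output):
//
//      %1:gpr32 = MOVi32imm 1193046    ; 0x123456 needs MOVZ + MOVK
//      %2:gpr32 = ADDWrr %0, %1
//      ==>
//      %3:gpr32sp = ADDWri %0, 0x123, 12    ; adds (0x123 << 12)
//      %2:gpr32sp = ADDWri %3, 0x456, 0     ; adds 0x456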
//
// 4. Remove a redundant ORRWrs that was generated by zero-extension.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If the 32-bit form of an AArch64 instruction defines the source operand
//    of the ORRWrs, we can remove the ORRWrs because that instruction already
//    sets the upper 32 bits of the source operand to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//     ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//     ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR only so that it can be
//    copied into a destination FPR, we can copy the value directly between
//    the FPRs, eliminating the use of the integer unit. When we match an
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we replace the COPY & INSvi[X]gpr instructions with an
//    INSvi[X]lane.
//
// 7. If an MI implicitly sets the high 64 bits to zero, remove the `mov 0`
//    for the high 64 bits. For example,
//
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %2:fpr64 = MOVID 0
//   %4:fpr128 = IMPLICIT_DEF
//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
//   %6:fpr128 = IMPLICIT_DEF
//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
//   ==>
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %6:fpr128 = IMPLICIT_DEF
//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//
// 8. Remove redundant CSELs that select between identical registers by
//    replacing them with unconditional moves.
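//
//    A sketch (illustrative register numbers):
//
//      %3:gpr32 = CSELWr %1:gpr32, %1:gpr32, 0, implicit $nzcv
//      ==>
//      %3:gpr32 = ORRWrs $wzr, %1:gpr32, 0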
//
// 9. Replace a UBFMXri with a UBFMWri if the instruction is equivalent to a
//    32-bit LSR or LSL alias of UBFM.
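//
//    A sketch, for a UBFMXri that acts as a 32-bit LSR by 4 (illustrative
//    register numbers):
//
//      %1:gpr64 = UBFMXri %0:gpr64, 4, 31
//      ==>
//      %2:gpr32 = COPY %0.sub_32
//      %3:gpr32 = UBFMWri %2, 4, 31
//      %1:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32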
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitCSEL(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitUBFMXri(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Suppose we have the
  // constant 0b00000000001000000000010000000000, which does not consist of
  // consecutive ones. We can split it into the two bitmask immediates
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing with these two bitmask immediates reproduces the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask that is filled with ones from the position of the lowest
  // set bit to the position of the highest set bit.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask that is filled with ones outside the positions of the
  // lowest and highest set bits.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. That makes only two AND instructions instead of
  // multiple `mov` + `and` instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If the 32-bit form of an AArch64 instruction defines the source operand
  // of the zero-extend, we do not need the zero-extend. Check that SrcMI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitCSEL(MachineInstr &MI) {
  // Replace CSEL with MOV when both inputs are the same register.
  if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
    return false;

  auto ZeroReg =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::XZR : AArch64::WZR;
  auto OrOpcode =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::ORRXrs : AArch64::ORRWrs;

  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(OrOpcode))
      .addReg(MI.getOperand(0).getReg(), RegState::Define)
      .addReg(ZeroReg)
      .addReg(MI.getOperand(1).getReg())
      .addImm(0);

  MI.eraseFromParent();
  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We assume the first operand of INSERT_SUBREG is irrelevant because a COPY
  // would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If the 32-bit form of an AArch64 instruction defines the source operand
  // of the zero-extend, we do not need the zero-extend. Check that SrcMI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // If the immediate can be composed by a single instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. That makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition codes are used only for Equal and Not Equal checks. The
  // split preserves the final result value, so the N and Z flags match those
  // of a single-instruction compare, but C and V may differ.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the condition code uses last, since scanning the succeeding
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks whether the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is inside a loop and, if so, whether MI is
  // loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is a SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because doing so
  // results in more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // negative, since it was sign-extended when assigned to the 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should set flags only on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine the register classes for the destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want
  // to reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. To preserve SSA form,
  // keep MI's original def until MI is deleted; this is needed only if we
  // created a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check whether this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check whether it is an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 implicitly zero the top 64 bits of the
// corresponding 128-bit register.
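//
// For example, FCVTNv4i16 writes a 64-bit D register, and a write to a D
// register zeroes bits [127:64] of the enclosing Q register.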
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check whether the MI defining the low 64 bits implicitly zeroes the high
  // 64 bits. We expect the case below.
  //
  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //  %6:fpr128 = IMPLICIT_DEF
  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits. We expect the
  // cases below.
  //
  //  %2:fpr64 = MOVID 0
  //  %4:fpr128 = IMPLICIT_DEF
  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //  %5:fpr128 = MOVIv2d_ns 0
  //  %6:fpr64 = COPY %5.dsub:fpr128
  //  %8:fpr128 = IMPLICIT_DEF
  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr implicitly sets the high 64 bits to zero, similar to ORR for
  // GPRs. If the source instruction already zeroes the high 64 bits, the
  // FMOVDr is redundant.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Forward the source register and erase the redundant FMOVDr.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
  // Check whether the instruction is equivalent to a 32-bit LSR or LSL alias
  // of UBFM, and if so, replace the UBFMXri instruction with its 32-bit
  // variant, UBFMWri.
  int64_t Immr = MI.getOperand(2).getImm();
  int64_t Imms = MI.getOperand(3).getImm();

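  // A note on the two checks below: with Imms == 31 and Immr <= 31, UBFMXri
  // extracts bits [31:Immr], i.e. a 32-bit LSR by Immr zero-extended to 64
  // bits. With Immr == Imms + 33 (Imms = 31 - shift, Immr = 64 - shift),
  // UBFMXri places the low (32 - shift) source bits at bit position `shift`,
  // i.e. a 32-bit LSL by `shift` zero-extended to 64 bits; subtracting 32
  // from Immr then yields the matching UBFMWri encoding.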
  bool IsLSR = Imms == 31 && Immr <= Imms;
  bool IsLSL = Immr == Imms + 33;
  if (!IsLSR && !IsLSL)
    return false;

  if (IsLSL)
    Immr -= 32;

  const TargetRegisterClass *DstRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF());
  const TargetRegisterClass *DstRC32 =
      TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
  assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                    "sub_32 subregister class");

  const TargetRegisterClass *SrcRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF());
  const TargetRegisterClass *SrcRC32 =
      TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
  assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
                    "subregister class");

  Register DstReg64 = MI.getOperand(0).getReg();
  Register DstReg32 = MRI->createVirtualRegister(DstRC32);
  Register SrcReg64 = MI.getOperand(1).getReg();
  Register SrcReg32 = MRI->createVirtualRegister(SrcRC32);

  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::COPY),
          SrcReg32)
      .addReg(SrcReg64, 0, AArch64::sub_32);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::UBFMWri),
          DstReg32)
      .addReg(SrcReg32)
      .addImm(Immr)
      .addImm(Imms);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AArch64::SUBREG_TO_REG), DstReg64)
      .addImm(0)
      .addReg(DstReg32)
      .addImm(AArch64::sub_32);
  MI.eraseFromParent();
  return true;
}

// Across a basic block we might have an i32 extract from a value whose low 32
// bits pass through unchanged (for example an sxtw). We can replace the COPY
// with a new version that skips the sxtw.
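//
// A sketch (illustrative register numbers):
//
//   %1:gpr64 = SBFMXri %0:gpr64, 0, 31   ; sxtw
//   %2:gpr32 = COPY %1.sub_32
//   ==>
//   %2:gpr32 = COPY %0.sub_32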
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI)
    return false;

  // Look for SXTW(X) and return X.
  auto getSXTWSrcReg = [](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SBFMXri ||
        SrcMI->getOperand(2).getImm() != 0 ||
        SrcMI->getOperand(3).getImm() != 31)
      return AArch64::NoRegister;
    return SrcMI->getOperand(1).getReg();
  };
  // Look for SUBREG_TO_REG(ORRWrr(WZR, COPY(X.sub_32))) and return X.
  auto getUXTWSrcReg = [&](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SUBREG_TO_REG ||
        SrcMI->getOperand(3).getImm() != AArch64::sub_32 ||
        !MRI->hasOneNonDBGUse(SrcMI->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Orr = MRI->getUniqueVRegDef(SrcMI->getOperand(2).getReg());
    if (!Orr || Orr->getOpcode() != AArch64::ORRWrr ||
        Orr->getOperand(1).getReg() != AArch64::WZR ||
        !MRI->hasOneNonDBGUse(Orr->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Cpy = MRI->getUniqueVRegDef(Orr->getOperand(2).getReg());
    if (!Cpy || Cpy->getOpcode() != AArch64::COPY ||
        Cpy->getOperand(1).getSubReg() != AArch64::sub_32)
      return AArch64::NoRegister;
    DeadInstrs.insert(Orr);
    return Cpy->getOperand(1).getReg();
  };

  Register SrcReg = getSXTWSrcReg(SrcMI);
  if (!SrcReg)
    SrcReg = getUXTWSrcReg(SrcMI);
  if (!SrcReg)
    return false;

  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << "        to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::CSELWr:
      case AArch64::CSELXr:
        Changed |= visitCSEL(MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::UBFMXri:
        Changed |= visitUBFMXri(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}