//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the below peephole optimizations on the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. In this case, we could try to split the constant operand of the mov
// instruction into two immediates which can be directly encoded into
// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
// multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove redundant ORRWrs which are generated by a zero-extend.
//
//      %3:gpr32 = ORRWrs $wzr, %2, 0
//      %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
// If AArch64's 32-bit form of instruction defines the source operand of
// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
// operand are set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
// In cases where a source FPR is copied to a GPR in order to be copied
// to a destination FPR, we can directly copy the values between the FPRs,
// eliminating the use of the Integer unit. When we match a pattern of
// INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
// source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
// instructions.
//
// 7. If an MI implicitly sets zero for the high 64 bits, remove the `mov 0`
//    for the high 64 bits. For example,
//
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %2:fpr64 = MOVID 0
//      %4:fpr128 = IMPLICIT_DEF
//      %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
//      %6:fpr128 = IMPLICIT_DEF
//      %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//      %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
//    ==>
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %6:fpr128 = IMPLICIT_DEF
//      %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//
// 8. Remove redundant CSELs that select between identical registers, by
//    replacing them with unconditional moves.
//
// 9. Replace UBFMXri with UBFMWri if the instruction is equivalent to a 32 bit
//    LSR or LSL alias of UBFM.
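//
//    For example (a sketch of the rewrite performed by visitUBFMXri below,
//    with illustrative register names):
//
//      %dst:gpr64 = UBFMXri %src:gpr64, 3, 31
//    ==>
//      %src32:gpr32 = COPY %src.sub_32
//      %dst32:gpr32 = UBFMWri %src32:gpr32, 3, 31
//      %dst:gpr64 = SUBREG_TO_REG 0, %dst32:gpr32, %subreg.sub_32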
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitCSEL(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitUBFMXri(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
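  // (AArch64_IMM::expandMOVImm models the instruction sequence the MOV pseudo
  // would otherwise be expanded into, so Insn.size() gives the cost of
  // materializing the constant.)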
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing with these two bitmask immediates reproduces the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  //   def : Pat<(i64 (zext GPR32:$src)),
  //             (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the source MI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitCSEL(MachineInstr &MI) {
  // Replace CSEL with MOV when both inputs are the same register.
  if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
    return false;

  auto ZeroReg =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::XZR : AArch64::WZR;
  auto OrOpcode =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::ORRXrs : AArch64::ORRWrs;

  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(OrOpcode))
      .addReg(MI.getOperand(0).getReg(), RegState::Define)
      .addReg(ZeroReg)
      .addReg(MI.getOperand(1).getReg())
      .addImm(0);

  MI.eraseFromParent();
  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  //   From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  //   To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the source MI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
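  //
  // For example, if the inserted 32-bit value was defined by a 32-bit
  // arithmetic instruction such as ADDWrr, its upper 32 bits are already
  // known to be zero, so the INSERT_SUBREG can become a SUBREG_TO_REG.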
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
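  // (The *Wri/*Xri forms take a GPR32sp/GPR64sp operand, where register
  // number 31 encodes WSP/SP rather than WZR/XZR, so a zero register cannot
  // appear as the source of the rewritten instructions.)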
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code is only used for Equal and Not Equal comparisons.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the condition code uses last, since scanning the following
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is inside a loop and whether MI is loop
  // invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
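  // (A 32-bit MOVi32imm feeding a 64-bit instruction is typically wrapped in
  // a SUBREG_TO_REG, so look through it to reach the underlying MOV.)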
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32 bit form of the instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign extended when we assigned it to the
  // 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag setting operations that should only set flags on the
  // second instruction.
  //   NewTmpReg = Opcode.first SrcReg Imm0
  //   NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new ones.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want
  // to reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until MI is deleted, but only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //   %intermediate1:gpr64 = COPY %src:fpr128
  //   %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //   %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //   %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //   src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check whether the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We expect the case below.
  //
  //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //   %6:fpr128 = IMPLICIT_DEF
  //   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect one of the cases below.
  //
  //   %2:fpr64 = MOVID 0
  //   %4:fpr128 = IMPLICIT_DEF
  //   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //   %5:fpr128 = MOVIv2d_ns 0
  //   %6:fpr64 = COPY %5.dsub:fpr128
  //   %8:fpr128 = IMPLICIT_DEF
  //   %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //   %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for
  // GPRs.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
  // Check if the instruction is equivalent to a 32 bit LSR or LSL alias of
  // UBFM, and replace the UBFMXri instruction with its 32 bit variant,
  // UBFMWri.
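  //
  // For example, `%dst:gpr64 = UBFMXri %src, 3, 31` only uses bits [31:3] of
  // %src and is therefore equivalent to a 32-bit `lsr #3`; likewise a UBFMXri
  // with Immr == Imms + 33 corresponds to a 32-bit `lsl`.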
  int64_t Immr = MI.getOperand(2).getImm();
  int64_t Imms = MI.getOperand(3).getImm();

  bool IsLSR = Imms == 31 && Immr <= Imms;
  bool IsLSL = Immr == Imms + 33;
  if (!IsLSR && !IsLSL)
    return false;

  if (IsLSL) {
    Immr -= 32;
  }

  const TargetRegisterClass *DstRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF());
  const TargetRegisterClass *DstRC32 =
      TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
  assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                    "sub_32 subregister class");

  const TargetRegisterClass *SrcRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF());
  const TargetRegisterClass *SrcRC32 =
      TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
  assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
                    "subregister class");

  Register DstReg64 = MI.getOperand(0).getReg();
  Register DstReg32 = MRI->createVirtualRegister(DstRC32);
  Register SrcReg64 = MI.getOperand(1).getReg();
  Register SrcReg32 = MRI->createVirtualRegister(SrcRC32);

  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::COPY),
          SrcReg32)
      .addReg(SrcReg64, 0, AArch64::sub_32);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::UBFMWri),
          DstReg32)
      .addReg(SrcReg32)
      .addImm(Immr)
      .addImm(Imms);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AArch64::SUBREG_TO_REG), DstReg64)
      .addImm(0)
      .addReg(DstReg32)
      .addImm(AArch64::sub_32);
  MI.eraseFromParent();
  return true;
}

// Across a basic block we might have an i32 extract from a value whose
// operation only affects the upper bits (for example a sxtw). We can replace
// the COPY with a new version that skips the sxtw.
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI)
    return false;

  // Look for SXTW(X) and return Reg.
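  // (`sxtw xD, wS` is the alias of SBFMXri xD, xS, 0, 31, which leaves the
  // low 32 bits of the source unchanged.)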
  auto getSXTWSrcReg = [](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SBFMXri ||
        SrcMI->getOperand(2).getImm() != 0 ||
        SrcMI->getOperand(3).getImm() != 31)
      return AArch64::NoRegister;
    return SrcMI->getOperand(1).getReg();
  };
  // Look for SUBREG_TO_REG(ORRWrr(WZR, COPY(X.sub_32)))
  auto getUXTWSrcReg = [&](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SUBREG_TO_REG ||
        SrcMI->getOperand(3).getImm() != AArch64::sub_32 ||
        !MRI->hasOneNonDBGUse(SrcMI->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Orr = MRI->getUniqueVRegDef(SrcMI->getOperand(2).getReg());
    if (!Orr || Orr->getOpcode() != AArch64::ORRWrr ||
        Orr->getOperand(1).getReg() != AArch64::WZR ||
        !MRI->hasOneNonDBGUse(Orr->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Cpy = MRI->getUniqueVRegDef(Orr->getOperand(2).getReg());
    if (!Cpy || Cpy->getOpcode() != AArch64::COPY ||
        Cpy->getOperand(1).getSubReg() != AArch64::sub_32)
      return AArch64::NoRegister;
    DeadInstrs.insert(Orr);
    return Cpy->getOperand(1).getReg();
  };

  Register SrcReg = getSXTWSrcReg(SrcMI);
  if (!SrcReg)
    SrcReg = getUXTWSrcReg(SrcMI);
  if (!SrcReg)
    return false;

  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << " to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << " Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::CSELWr:
      case AArch64::CSELXr:
        Changed |= visitCSEL(MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::UBFMXri:
        Changed |= visitUBFMXri(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}