//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
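/// Variable-size pseudos (inline asm, STACKMAP/PATCHPOINT/STATEPOINT, XRay
/// sleds, SPACE and bundles) are handled case by case below; everything else
/// defaults to the fixed 4-byte AArch64 instruction width.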
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(MF))
      return NumBytes;

    const auto &STI = MF->getSubtarget<AArch64Subtarget>();
    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // Size should be preferably set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // Specific cases handle instructions of variable sizes.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");
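  // Strategy: if X16 is free, emit a plain B and rely on the linker to insert
  // a range-extension thunk; otherwise, in cold sections, materialize the
  // destination with ADRP+ADD+BR in a scavenged GPR; as a last resort, spill
  // X16 around the branch and reload it in RestoreBB.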

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
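  //
  // On success, MBP describes the branch as a comparison of LHS (the tested
  // register) against the constant 0, with TrueDest the branch target and
  // FalseDest the fall-through block.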

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  };

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}
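
// The Cond vector built by parseCondBranch() and consumed here (and by
// insertSelect() below) uses the following encoding: a single condition-code
// immediate for Bcc, {-1, opcode, reg} for cbz/cbnz, and {-1, opcode, reg,
// bit} for tbz/tbnz.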
void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinv, csinc and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
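// As a rough example of the immediate handling below: a MOVi64imm whose value
// expandMOVImm() lowers to one or two instructions (e.g. a single MOVZ, or
// MOVZ+MOVK) is considered as cheap as a move, while one needing three or four
// chunks is not.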
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILEcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  unsigned PredOpcode = Pred->getOpcode();
  auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
  if (!NewOp)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  PTest->eraseFromParent();
  if (*NewOp != PredOpcode) {
    Pred->setDesc(get(*NewOp));
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is only truly a compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);
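
  // Three rewrites are attempted below: (1) if NZCV is dead, convert the
  // compare to its non-flag-setting form (or erase it entirely when it also
  // writes the zero register); (2) fold away a PTEST whose flags are already
  // produced by the tested predicate; (3) for comparisons against 0 or 1,
  // substitute or remove the compare using an earlier flag-setting
  // instruction.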

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx =
      CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
        CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

/// \returns The condition code operand index for \p Instr if it is a branch
/// or select and -1 otherwise.
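/// (For Bcc the condition code sits two operands before the implicit NZCV
/// use; for the csel/fcsel family it sits immediately before it.)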
static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return -1;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
    assert(Idx >= 2);
    return Idx - 2;
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
    assert(Idx >= 1);
    return Idx - 1;
  }
  }
}

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using those flags in \p CCUseInstrs if provided.
1740 std::optional<UsedNZCV> 1741 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1742 const TargetRegisterInfo &TRI, 1743 SmallVectorImpl<MachineInstr *> *CCUseInstrs) { 1744 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1745 if (MI.getParent() != CmpParent) 1746 return std::nullopt; 1747 1748 if (areCFlagsAliveInSuccessors(CmpParent)) 1749 return std::nullopt; 1750 1751 UsedNZCV NZCVUsedAfterCmp; 1752 for (MachineInstr &Instr : instructionsWithoutDebug( 1753 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1754 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1755 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1756 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1757 return std::nullopt; 1758 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1759 if (CCUseInstrs) 1760 CCUseInstrs->push_back(&Instr); 1761 } 1762 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1763 break; 1764 } 1765 return NZCVUsedAfterCmp; 1766 } 1767 1768 static bool isADDSRegImm(unsigned Opcode) { 1769 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1770 } 1771 1772 static bool isSUBSRegImm(unsigned Opcode) { 1773 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1774 } 1775 1776 /// Check if CmpInstr can be substituted by MI. 1777 /// 1778 /// CmpInstr can be substituted: 1779 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1780 /// - and, MI and CmpInstr are from the same MachineBB 1781 /// - and, condition flags are not alive in successors of the CmpInstr parent 1782 /// - and, if MI opcode is the S form there must be no defs of flags between 1783 /// MI and CmpInstr 1784 /// or if MI opcode is not the S form there must be neither defs of flags 1785 /// nor uses of flags between MI and CmpInstr. 1786 /// - and, if C/V flags are not used after CmpInstr 1787 /// or if N flag is used but MI produces poison value if signed overflow 1788 /// occurs. 1789 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1790 const TargetRegisterInfo &TRI) { 1791 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction 1792 // that may or may not set flags. 1793 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1794 1795 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1796 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1797 return false; 1798 1799 assert((CmpInstr.getOperand(2).isImm() && 1800 CmpInstr.getOperand(2).getImm() == 0) && 1801 "Caller guarantees that CmpInstr compares with constant 0"); 1802 1803 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); 1804 if (!NZVCUsed || NZVCUsed->C) 1805 return false; 1806 1807 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either 1808 // '%vreg = add ...' or '%vreg = sub ...'. 1809 // Condition flag V is used to indicate signed overflow. 1810 // 1) MI and CmpInstr set N and V to the same value. 1811 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when 1812 // signed overflow occurs, so CmpInstr could still be simplified away. 1813 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) 1814 return false; 1815 1816 AccessKind AccessToCheck = AK_Write; 1817 if (sForm(MI) != MI.getOpcode()) 1818 AccessToCheck = AK_All; 1819 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1820 } 1821 1822 /// Substitute an instruction comparing to zero with another instruction 1823 /// which produces needed condition flags. 
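/// For example,
/// \code
///   sub w8, w9, w10
///   cmp w8, #0
///   b.eq
/// \endcode
/// is turned into
/// \code
///   subs w8, w9, w10
///   b.eq
/// \endcode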
1824 /// 1825 /// Return true on success. 1826 bool AArch64InstrInfo::substituteCmpToZero( 1827 MachineInstr &CmpInstr, unsigned SrcReg, 1828 const MachineRegisterInfo &MRI) const { 1829 // Get the unique definition of SrcReg. 1830 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1831 if (!MI) 1832 return false; 1833 1834 const TargetRegisterInfo &TRI = getRegisterInfo(); 1835 1836 unsigned NewOpc = sForm(*MI); 1837 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1838 return false; 1839 1840 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1841 return false; 1842 1843 // Update the instruction to set NZCV. 1844 MI->setDesc(get(NewOpc)); 1845 CmpInstr.eraseFromParent(); 1846 bool succeeded = UpdateOperandRegClass(*MI); 1847 (void)succeeded; 1848 assert(succeeded && "Some operands reg class are incompatible!"); 1849 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1850 return true; 1851 } 1852 1853 /// \returns True if \p CmpInstr can be removed. 1854 /// 1855 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1856 /// codes used in \p CCUseInstrs must be inverted. 1857 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1858 int CmpValue, const TargetRegisterInfo &TRI, 1859 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1860 bool &IsInvertCC) { 1861 assert((CmpValue == 0 || CmpValue == 1) && 1862 "Only comparisons to 0 or 1 considered for removal!"); 1863 1864 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1865 unsigned MIOpc = MI.getOpcode(); 1866 if (MIOpc == AArch64::CSINCWr) { 1867 if (MI.getOperand(1).getReg() != AArch64::WZR || 1868 MI.getOperand(2).getReg() != AArch64::WZR) 1869 return false; 1870 } else if (MIOpc == AArch64::CSINCXr) { 1871 if (MI.getOperand(1).getReg() != AArch64::XZR || 1872 MI.getOperand(2).getReg() != AArch64::XZR) 1873 return false; 1874 } else { 1875 return false; 1876 } 1877 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1878 if (MICC == AArch64CC::Invalid) 1879 return false; 1880 1881 // NZCV needs to be defined 1882 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1) 1883 return false; 1884 1885 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1886 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1887 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1888 if (CmpValue && !IsSubsRegImm) 1889 return false; 1890 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1891 return false; 1892 1893 // MI conditions allowed: eq, ne, mi, pl 1894 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1895 if (MIUsedNZCV.C || MIUsedNZCV.V) 1896 return false; 1897 1898 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1899 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1900 // Condition flags are not used in CmpInstr basic block successors and only 1901 // Z or N flags allowed to be used after CmpInstr within its basic block 1902 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1903 return false; 1904 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1905 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1906 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1907 return false; 1908 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1909 if (MIUsedNZCV.N && !CmpValue) 1910 return false; 1911 1912 // There must be no defs of flags between MI and CmpInstr 1913 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1914 return false; 1915 1916 // Condition code is inverted in the following cases: 1917 // 1. 
MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1918 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1919 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1920 (!CmpValue && MICC == AArch64CC::NE); 1921 return true; 1922 } 1923 1924 /// Remove comparison in csinc-cmp sequence 1925 /// 1926 /// Examples: 1927 /// 1. \code 1928 /// csinc w9, wzr, wzr, ne 1929 /// cmp w9, #0 1930 /// b.eq 1931 /// \endcode 1932 /// to 1933 /// \code 1934 /// csinc w9, wzr, wzr, ne 1935 /// b.ne 1936 /// \endcode 1937 /// 1938 /// 2. \code 1939 /// csinc x2, xzr, xzr, mi 1940 /// cmp x2, #1 1941 /// b.pl 1942 /// \endcode 1943 /// to 1944 /// \code 1945 /// csinc x2, xzr, xzr, mi 1946 /// b.pl 1947 /// \endcode 1948 /// 1949 /// \param CmpInstr comparison instruction 1950 /// \return True when comparison removed 1951 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1952 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1953 const MachineRegisterInfo &MRI) const { 1954 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1955 if (!MI) 1956 return false; 1957 const TargetRegisterInfo &TRI = getRegisterInfo(); 1958 SmallVector<MachineInstr *, 4> CCUseInstrs; 1959 bool IsInvertCC = false; 1960 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1961 IsInvertCC)) 1962 return false; 1963 // Make transformation 1964 CmpInstr.eraseFromParent(); 1965 if (IsInvertCC) { 1966 // Invert condition codes in CmpInstr CC users 1967 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1968 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1969 assert(Idx >= 0 && "Unexpected instruction using CC."); 1970 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1971 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1972 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1973 CCOperand.setImm(CCUse); 1974 } 1975 } 1976 return true; 1977 } 1978 1979 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1980 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1981 MI.getOpcode() != AArch64::CATCHRET) 1982 return false; 1983 1984 MachineBasicBlock &MBB = *MI.getParent(); 1985 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1986 auto TRI = Subtarget.getRegisterInfo(); 1987 DebugLoc DL = MI.getDebugLoc(); 1988 1989 if (MI.getOpcode() == AArch64::CATCHRET) { 1990 // Skip to the first instruction before the epilog. 
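    // CATCHRET is lowered to an ADRP+ADD pair that materializes the address
    // of the funclet return target block in X0, inserted ahead of any trailing
    // FrameDestroy (epilogue) instructions, which the loop below skips over.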
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    TargetMBB->setMachineBlockAddressTaken();
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or that
      // are multiples of 8 larger than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
2070 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 2071 } 2072 MBB.erase(MI); 2073 return true; 2074 } 2075 2076 const GlobalValue *GV = 2077 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 2078 const TargetMachine &TM = MBB.getParent()->getTarget(); 2079 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 2080 const unsigned char MO_NC = AArch64II::MO_NC; 2081 2082 if ((OpFlags & AArch64II::MO_GOT) != 0) { 2083 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 2084 .addGlobalAddress(GV, 0, OpFlags); 2085 if (Subtarget.isTargetILP32()) { 2086 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2087 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2088 .addDef(Reg32, RegState::Dead) 2089 .addUse(Reg, RegState::Kill) 2090 .addImm(0) 2091 .addMemOperand(*MI.memoperands_begin()) 2092 .addDef(Reg, RegState::Implicit); 2093 } else { 2094 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2095 .addReg(Reg, RegState::Kill) 2096 .addImm(0) 2097 .addMemOperand(*MI.memoperands_begin()); 2098 } 2099 } else if (TM.getCodeModel() == CodeModel::Large) { 2100 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2101 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2102 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2103 .addImm(0); 2104 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2105 .addReg(Reg, RegState::Kill) 2106 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2107 .addImm(16); 2108 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2109 .addReg(Reg, RegState::Kill) 2110 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2111 .addImm(32); 2112 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2113 .addReg(Reg, RegState::Kill) 2114 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2115 .addImm(48); 2116 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2117 .addReg(Reg, RegState::Kill) 2118 .addImm(0) 2119 .addMemOperand(*MI.memoperands_begin()); 2120 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2121 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2122 .addGlobalAddress(GV, 0, OpFlags); 2123 } else { 2124 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2125 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2126 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2127 if (Subtarget.isTargetILP32()) { 2128 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2129 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2130 .addDef(Reg32, RegState::Dead) 2131 .addUse(Reg, RegState::Kill) 2132 .addGlobalAddress(GV, 0, LoFlags) 2133 .addMemOperand(*MI.memoperands_begin()) 2134 .addDef(Reg, RegState::Implicit); 2135 } else { 2136 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2137 .addReg(Reg, RegState::Kill) 2138 .addGlobalAddress(GV, 0, LoFlags) 2139 .addMemOperand(*MI.memoperands_begin()); 2140 } 2141 } 2142 2143 MBB.erase(MI); 2144 2145 return true; 2146 } 2147 2148 // Return true if this instruction simply sets its single destination register 2149 // to zero. This is equivalent to a register rename of the zero-register. 
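// For example, `movz w0, #0`, a COPY from WZR, and `and w0, wzr, #imm` all
// qualify.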
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames an FP or vector register
// without modifying bits.
2204 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2205 switch (MI.getOpcode()) { 2206 default: 2207 break; 2208 case TargetOpcode::COPY: { 2209 Register DstReg = MI.getOperand(0).getReg(); 2210 return AArch64::FPR128RegClass.contains(DstReg); 2211 } 2212 case AArch64::ORRv16i8: 2213 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2214 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2215 "invalid ORRv16i8 operands"); 2216 return true; 2217 } 2218 break; 2219 } 2220 return false; 2221 } 2222 2223 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2224 int &FrameIndex) const { 2225 switch (MI.getOpcode()) { 2226 default: 2227 break; 2228 case AArch64::LDRWui: 2229 case AArch64::LDRXui: 2230 case AArch64::LDRBui: 2231 case AArch64::LDRHui: 2232 case AArch64::LDRSui: 2233 case AArch64::LDRDui: 2234 case AArch64::LDRQui: 2235 case AArch64::LDR_PXI: 2236 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2237 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2238 FrameIndex = MI.getOperand(1).getIndex(); 2239 return MI.getOperand(0).getReg(); 2240 } 2241 break; 2242 } 2243 2244 return 0; 2245 } 2246 2247 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2248 int &FrameIndex) const { 2249 switch (MI.getOpcode()) { 2250 default: 2251 break; 2252 case AArch64::STRWui: 2253 case AArch64::STRXui: 2254 case AArch64::STRBui: 2255 case AArch64::STRHui: 2256 case AArch64::STRSui: 2257 case AArch64::STRDui: 2258 case AArch64::STRQui: 2259 case AArch64::STR_PXI: 2260 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2261 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2262 FrameIndex = MI.getOperand(1).getIndex(); 2263 return MI.getOperand(0).getReg(); 2264 } 2265 break; 2266 } 2267 return 0; 2268 } 2269 2270 /// Check all MachineMemOperands for a hint to suppress pairing. 2271 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2272 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2273 return MMO->getFlags() & MOSuppressPair; 2274 }); 2275 } 2276 2277 /// Set a flag on the first MachineMemOperand to suppress pairing. 2278 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2279 if (MI.memoperands_empty()) 2280 return; 2281 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2282 } 2283 2284 /// Check all MachineMemOperands for a hint that the load/store is strided. 
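/// (The MOStridedAccess flag is attached by target passes; e.g., the Falkor
/// hardware-prefetch fix pass is one known setter.)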
2285 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2286 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2287 return MMO->getFlags() & MOStridedAccess; 2288 }); 2289 } 2290 2291 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2292 switch (Opc) { 2293 default: 2294 return false; 2295 case AArch64::STURSi: 2296 case AArch64::STRSpre: 2297 case AArch64::STURDi: 2298 case AArch64::STRDpre: 2299 case AArch64::STURQi: 2300 case AArch64::STRQpre: 2301 case AArch64::STURBBi: 2302 case AArch64::STURHHi: 2303 case AArch64::STURWi: 2304 case AArch64::STRWpre: 2305 case AArch64::STURXi: 2306 case AArch64::STRXpre: 2307 case AArch64::LDURSi: 2308 case AArch64::LDRSpre: 2309 case AArch64::LDURDi: 2310 case AArch64::LDRDpre: 2311 case AArch64::LDURQi: 2312 case AArch64::LDRQpre: 2313 case AArch64::LDURWi: 2314 case AArch64::LDRWpre: 2315 case AArch64::LDURXi: 2316 case AArch64::LDRXpre: 2317 case AArch64::LDRSWpre: 2318 case AArch64::LDURSWi: 2319 case AArch64::LDURHHi: 2320 case AArch64::LDURBBi: 2321 case AArch64::LDURSBWi: 2322 case AArch64::LDURSHWi: 2323 return true; 2324 } 2325 } 2326 2327 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2328 switch (Opc) { 2329 default: return {}; 2330 case AArch64::PRFMui: return AArch64::PRFUMi; 2331 case AArch64::LDRXui: return AArch64::LDURXi; 2332 case AArch64::LDRWui: return AArch64::LDURWi; 2333 case AArch64::LDRBui: return AArch64::LDURBi; 2334 case AArch64::LDRHui: return AArch64::LDURHi; 2335 case AArch64::LDRSui: return AArch64::LDURSi; 2336 case AArch64::LDRDui: return AArch64::LDURDi; 2337 case AArch64::LDRQui: return AArch64::LDURQi; 2338 case AArch64::LDRBBui: return AArch64::LDURBBi; 2339 case AArch64::LDRHHui: return AArch64::LDURHHi; 2340 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2341 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2342 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2343 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2344 case AArch64::LDRSWui: return AArch64::LDURSWi; 2345 case AArch64::STRXui: return AArch64::STURXi; 2346 case AArch64::STRWui: return AArch64::STURWi; 2347 case AArch64::STRBui: return AArch64::STURBi; 2348 case AArch64::STRHui: return AArch64::STURHi; 2349 case AArch64::STRSui: return AArch64::STURSi; 2350 case AArch64::STRDui: return AArch64::STURDi; 2351 case AArch64::STRQui: return AArch64::STURQi; 2352 case AArch64::STRBBui: return AArch64::STURBBi; 2353 case AArch64::STRHHui: return AArch64::STURHHi; 2354 } 2355 } 2356 2357 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2358 switch (Opc) { 2359 default: 2360 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx"); 2361 case AArch64::ADDG: 2362 case AArch64::LDAPURBi: 2363 case AArch64::LDAPURHi: 2364 case AArch64::LDAPURi: 2365 case AArch64::LDAPURSBWi: 2366 case AArch64::LDAPURSBXi: 2367 case AArch64::LDAPURSHWi: 2368 case AArch64::LDAPURSHXi: 2369 case AArch64::LDAPURSWi: 2370 case AArch64::LDAPURXi: 2371 case AArch64::LDR_PPXI: 2372 case AArch64::LDR_PXI: 2373 case AArch64::LDR_ZXI: 2374 case AArch64::LDR_ZZXI: 2375 case AArch64::LDR_ZZZXI: 2376 case AArch64::LDR_ZZZZXI: 2377 case AArch64::LDRBBui: 2378 case AArch64::LDRBui: 2379 case AArch64::LDRDui: 2380 case AArch64::LDRHHui: 2381 case AArch64::LDRHui: 2382 case AArch64::LDRQui: 2383 case AArch64::LDRSBWui: 2384 case AArch64::LDRSBXui: 2385 case AArch64::LDRSHWui: 2386 case AArch64::LDRSHXui: 2387 case AArch64::LDRSui: 2388 case AArch64::LDRSWui: 2389 case AArch64::LDRWui: 2390 case 
AArch64::LDRXui: 2391 case AArch64::LDURBBi: 2392 case AArch64::LDURBi: 2393 case AArch64::LDURDi: 2394 case AArch64::LDURHHi: 2395 case AArch64::LDURHi: 2396 case AArch64::LDURQi: 2397 case AArch64::LDURSBWi: 2398 case AArch64::LDURSBXi: 2399 case AArch64::LDURSHWi: 2400 case AArch64::LDURSHXi: 2401 case AArch64::LDURSi: 2402 case AArch64::LDURSWi: 2403 case AArch64::LDURWi: 2404 case AArch64::LDURXi: 2405 case AArch64::PRFMui: 2406 case AArch64::PRFUMi: 2407 case AArch64::ST2Gi: 2408 case AArch64::STGi: 2409 case AArch64::STLURBi: 2410 case AArch64::STLURHi: 2411 case AArch64::STLURWi: 2412 case AArch64::STLURXi: 2413 case AArch64::StoreSwiftAsyncContext: 2414 case AArch64::STR_PPXI: 2415 case AArch64::STR_PXI: 2416 case AArch64::STR_ZXI: 2417 case AArch64::STR_ZZXI: 2418 case AArch64::STR_ZZZXI: 2419 case AArch64::STR_ZZZZXI: 2420 case AArch64::STRBBui: 2421 case AArch64::STRBui: 2422 case AArch64::STRDui: 2423 case AArch64::STRHHui: 2424 case AArch64::STRHui: 2425 case AArch64::STRQui: 2426 case AArch64::STRSui: 2427 case AArch64::STRWui: 2428 case AArch64::STRXui: 2429 case AArch64::STURBBi: 2430 case AArch64::STURBi: 2431 case AArch64::STURDi: 2432 case AArch64::STURHHi: 2433 case AArch64::STURHi: 2434 case AArch64::STURQi: 2435 case AArch64::STURSi: 2436 case AArch64::STURWi: 2437 case AArch64::STURXi: 2438 case AArch64::STZ2Gi: 2439 case AArch64::STZGi: 2440 case AArch64::TAGPstack: 2441 return 2; 2442 case AArch64::LD1B_D_IMM: 2443 case AArch64::LD1B_H_IMM: 2444 case AArch64::LD1B_IMM: 2445 case AArch64::LD1B_S_IMM: 2446 case AArch64::LD1D_IMM: 2447 case AArch64::LD1H_D_IMM: 2448 case AArch64::LD1H_IMM: 2449 case AArch64::LD1H_S_IMM: 2450 case AArch64::LD1RB_D_IMM: 2451 case AArch64::LD1RB_H_IMM: 2452 case AArch64::LD1RB_IMM: 2453 case AArch64::LD1RB_S_IMM: 2454 case AArch64::LD1RD_IMM: 2455 case AArch64::LD1RH_D_IMM: 2456 case AArch64::LD1RH_IMM: 2457 case AArch64::LD1RH_S_IMM: 2458 case AArch64::LD1RSB_D_IMM: 2459 case AArch64::LD1RSB_H_IMM: 2460 case AArch64::LD1RSB_S_IMM: 2461 case AArch64::LD1RSH_D_IMM: 2462 case AArch64::LD1RSH_S_IMM: 2463 case AArch64::LD1RSW_IMM: 2464 case AArch64::LD1RW_D_IMM: 2465 case AArch64::LD1RW_IMM: 2466 case AArch64::LD1SB_D_IMM: 2467 case AArch64::LD1SB_H_IMM: 2468 case AArch64::LD1SB_S_IMM: 2469 case AArch64::LD1SH_D_IMM: 2470 case AArch64::LD1SH_S_IMM: 2471 case AArch64::LD1SW_D_IMM: 2472 case AArch64::LD1W_D_IMM: 2473 case AArch64::LD1W_IMM: 2474 case AArch64::LD2B_IMM: 2475 case AArch64::LD2D_IMM: 2476 case AArch64::LD2H_IMM: 2477 case AArch64::LD2W_IMM: 2478 case AArch64::LD3B_IMM: 2479 case AArch64::LD3D_IMM: 2480 case AArch64::LD3H_IMM: 2481 case AArch64::LD3W_IMM: 2482 case AArch64::LD4B_IMM: 2483 case AArch64::LD4D_IMM: 2484 case AArch64::LD4H_IMM: 2485 case AArch64::LD4W_IMM: 2486 case AArch64::LDG: 2487 case AArch64::LDNF1B_D_IMM: 2488 case AArch64::LDNF1B_H_IMM: 2489 case AArch64::LDNF1B_IMM: 2490 case AArch64::LDNF1B_S_IMM: 2491 case AArch64::LDNF1D_IMM: 2492 case AArch64::LDNF1H_D_IMM: 2493 case AArch64::LDNF1H_IMM: 2494 case AArch64::LDNF1H_S_IMM: 2495 case AArch64::LDNF1SB_D_IMM: 2496 case AArch64::LDNF1SB_H_IMM: 2497 case AArch64::LDNF1SB_S_IMM: 2498 case AArch64::LDNF1SH_D_IMM: 2499 case AArch64::LDNF1SH_S_IMM: 2500 case AArch64::LDNF1SW_D_IMM: 2501 case AArch64::LDNF1W_D_IMM: 2502 case AArch64::LDNF1W_IMM: 2503 case AArch64::LDNPDi: 2504 case AArch64::LDNPQi: 2505 case AArch64::LDNPSi: 2506 case AArch64::LDNPWi: 2507 case AArch64::LDNPXi: 2508 case AArch64::LDNT1B_ZRI: 2509 case AArch64::LDNT1D_ZRI: 2510 case 
AArch64::LDNT1H_ZRI: 2511 case AArch64::LDNT1W_ZRI: 2512 case AArch64::LDPDi: 2513 case AArch64::LDPQi: 2514 case AArch64::LDPSi: 2515 case AArch64::LDPWi: 2516 case AArch64::LDPXi: 2517 case AArch64::LDRBBpost: 2518 case AArch64::LDRBBpre: 2519 case AArch64::LDRBpost: 2520 case AArch64::LDRBpre: 2521 case AArch64::LDRDpost: 2522 case AArch64::LDRDpre: 2523 case AArch64::LDRHHpost: 2524 case AArch64::LDRHHpre: 2525 case AArch64::LDRHpost: 2526 case AArch64::LDRHpre: 2527 case AArch64::LDRQpost: 2528 case AArch64::LDRQpre: 2529 case AArch64::LDRSpost: 2530 case AArch64::LDRSpre: 2531 case AArch64::LDRWpost: 2532 case AArch64::LDRWpre: 2533 case AArch64::LDRXpost: 2534 case AArch64::LDRXpre: 2535 case AArch64::ST1B_D_IMM: 2536 case AArch64::ST1B_H_IMM: 2537 case AArch64::ST1B_IMM: 2538 case AArch64::ST1B_S_IMM: 2539 case AArch64::ST1D_IMM: 2540 case AArch64::ST1H_D_IMM: 2541 case AArch64::ST1H_IMM: 2542 case AArch64::ST1H_S_IMM: 2543 case AArch64::ST1W_D_IMM: 2544 case AArch64::ST1W_IMM: 2545 case AArch64::ST2B_IMM: 2546 case AArch64::ST2D_IMM: 2547 case AArch64::ST2H_IMM: 2548 case AArch64::ST2W_IMM: 2549 case AArch64::ST3B_IMM: 2550 case AArch64::ST3D_IMM: 2551 case AArch64::ST3H_IMM: 2552 case AArch64::ST3W_IMM: 2553 case AArch64::ST4B_IMM: 2554 case AArch64::ST4D_IMM: 2555 case AArch64::ST4H_IMM: 2556 case AArch64::ST4W_IMM: 2557 case AArch64::STGPi: 2558 case AArch64::STGPreIndex: 2559 case AArch64::STZGPreIndex: 2560 case AArch64::ST2GPreIndex: 2561 case AArch64::STZ2GPreIndex: 2562 case AArch64::STGPostIndex: 2563 case AArch64::STZGPostIndex: 2564 case AArch64::ST2GPostIndex: 2565 case AArch64::STZ2GPostIndex: 2566 case AArch64::STNPDi: 2567 case AArch64::STNPQi: 2568 case AArch64::STNPSi: 2569 case AArch64::STNPWi: 2570 case AArch64::STNPXi: 2571 case AArch64::STNT1B_ZRI: 2572 case AArch64::STNT1D_ZRI: 2573 case AArch64::STNT1H_ZRI: 2574 case AArch64::STNT1W_ZRI: 2575 case AArch64::STPDi: 2576 case AArch64::STPQi: 2577 case AArch64::STPSi: 2578 case AArch64::STPWi: 2579 case AArch64::STPXi: 2580 case AArch64::STRBBpost: 2581 case AArch64::STRBBpre: 2582 case AArch64::STRBpost: 2583 case AArch64::STRBpre: 2584 case AArch64::STRDpost: 2585 case AArch64::STRDpre: 2586 case AArch64::STRHHpost: 2587 case AArch64::STRHHpre: 2588 case AArch64::STRHpost: 2589 case AArch64::STRHpre: 2590 case AArch64::STRQpost: 2591 case AArch64::STRQpre: 2592 case AArch64::STRSpost: 2593 case AArch64::STRSpre: 2594 case AArch64::STRWpost: 2595 case AArch64::STRWpre: 2596 case AArch64::STRXpost: 2597 case AArch64::STRXpre: 2598 return 3; 2599 case AArch64::LDPDpost: 2600 case AArch64::LDPDpre: 2601 case AArch64::LDPQpost: 2602 case AArch64::LDPQpre: 2603 case AArch64::LDPSpost: 2604 case AArch64::LDPSpre: 2605 case AArch64::LDPWpost: 2606 case AArch64::LDPWpre: 2607 case AArch64::LDPXpost: 2608 case AArch64::LDPXpre: 2609 case AArch64::STGPpre: 2610 case AArch64::STGPpost: 2611 case AArch64::STPDpost: 2612 case AArch64::STPDpre: 2613 case AArch64::STPQpost: 2614 case AArch64::STPQpre: 2615 case AArch64::STPSpost: 2616 case AArch64::STPSpre: 2617 case AArch64::STPWpost: 2618 case AArch64::STPWpre: 2619 case AArch64::STPXpost: 2620 case AArch64::STPXpre: 2621 return 4; 2622 } 2623 } 2624 2625 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2626 switch (MI.getOpcode()) { 2627 default: 2628 return false; 2629 // Scaled instructions. 
2630 case AArch64::STRSui: 2631 case AArch64::STRDui: 2632 case AArch64::STRQui: 2633 case AArch64::STRXui: 2634 case AArch64::STRWui: 2635 case AArch64::LDRSui: 2636 case AArch64::LDRDui: 2637 case AArch64::LDRQui: 2638 case AArch64::LDRXui: 2639 case AArch64::LDRWui: 2640 case AArch64::LDRSWui: 2641 // Unscaled instructions. 2642 case AArch64::STURSi: 2643 case AArch64::STRSpre: 2644 case AArch64::STURDi: 2645 case AArch64::STRDpre: 2646 case AArch64::STURQi: 2647 case AArch64::STRQpre: 2648 case AArch64::STURWi: 2649 case AArch64::STRWpre: 2650 case AArch64::STURXi: 2651 case AArch64::STRXpre: 2652 case AArch64::LDURSi: 2653 case AArch64::LDRSpre: 2654 case AArch64::LDURDi: 2655 case AArch64::LDRDpre: 2656 case AArch64::LDURQi: 2657 case AArch64::LDRQpre: 2658 case AArch64::LDURWi: 2659 case AArch64::LDRWpre: 2660 case AArch64::LDURXi: 2661 case AArch64::LDRXpre: 2662 case AArch64::LDURSWi: 2663 case AArch64::LDRSWpre: 2664 return true; 2665 } 2666 } 2667 2668 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { 2669 switch (MI.getOpcode()) { 2670 default: 2671 assert((!MI.isCall() || !MI.isReturn()) && 2672 "Unexpected instruction - was a new tail call opcode introduced?"); 2673 return false; 2674 case AArch64::TCRETURNdi: 2675 case AArch64::TCRETURNri: 2676 case AArch64::TCRETURNrix16x17: 2677 case AArch64::TCRETURNrix17: 2678 case AArch64::TCRETURNrinotx16: 2679 case AArch64::TCRETURNriALL: 2680 case AArch64::AUTH_TCRETURN: 2681 case AArch64::AUTH_TCRETURN_BTI: 2682 return true; 2683 } 2684 } 2685 2686 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2687 switch (Opc) { 2688 default: 2689 llvm_unreachable("Opcode has no flag setting equivalent!"); 2690 // 32-bit cases: 2691 case AArch64::ADDWri: 2692 return AArch64::ADDSWri; 2693 case AArch64::ADDWrr: 2694 return AArch64::ADDSWrr; 2695 case AArch64::ADDWrs: 2696 return AArch64::ADDSWrs; 2697 case AArch64::ADDWrx: 2698 return AArch64::ADDSWrx; 2699 case AArch64::ANDWri: 2700 return AArch64::ANDSWri; 2701 case AArch64::ANDWrr: 2702 return AArch64::ANDSWrr; 2703 case AArch64::ANDWrs: 2704 return AArch64::ANDSWrs; 2705 case AArch64::BICWrr: 2706 return AArch64::BICSWrr; 2707 case AArch64::BICWrs: 2708 return AArch64::BICSWrs; 2709 case AArch64::SUBWri: 2710 return AArch64::SUBSWri; 2711 case AArch64::SUBWrr: 2712 return AArch64::SUBSWrr; 2713 case AArch64::SUBWrs: 2714 return AArch64::SUBSWrs; 2715 case AArch64::SUBWrx: 2716 return AArch64::SUBSWrx; 2717 // 64-bit cases: 2718 case AArch64::ADDXri: 2719 return AArch64::ADDSXri; 2720 case AArch64::ADDXrr: 2721 return AArch64::ADDSXrr; 2722 case AArch64::ADDXrs: 2723 return AArch64::ADDSXrs; 2724 case AArch64::ADDXrx: 2725 return AArch64::ADDSXrx; 2726 case AArch64::ANDXri: 2727 return AArch64::ANDSXri; 2728 case AArch64::ANDXrr: 2729 return AArch64::ANDSXrr; 2730 case AArch64::ANDXrs: 2731 return AArch64::ANDSXrs; 2732 case AArch64::BICXrr: 2733 return AArch64::BICSXrr; 2734 case AArch64::BICXrs: 2735 return AArch64::BICSXrs; 2736 case AArch64::SUBXri: 2737 return AArch64::SUBSXri; 2738 case AArch64::SUBXrr: 2739 return AArch64::SUBSXrr; 2740 case AArch64::SUBXrs: 2741 return AArch64::SUBSXrs; 2742 case AArch64::SUBXrx: 2743 return AArch64::SUBSXrx; 2744 // SVE instructions: 2745 case AArch64::AND_PPzPP: 2746 return AArch64::ANDS_PPzPP; 2747 case AArch64::BIC_PPzPP: 2748 return AArch64::BICS_PPzPP; 2749 case AArch64::EOR_PPzPP: 2750 return AArch64::EORS_PPzPP; 2751 case AArch64::NAND_PPzPP: 2752 return AArch64::NANDS_PPzPP; 2753 case AArch64::NOR_PPzPP: 
2754 return AArch64::NORS_PPzPP; 2755 case AArch64::ORN_PPzPP: 2756 return AArch64::ORNS_PPzPP; 2757 case AArch64::ORR_PPzPP: 2758 return AArch64::ORRS_PPzPP; 2759 case AArch64::BRKA_PPzP: 2760 return AArch64::BRKAS_PPzP; 2761 case AArch64::BRKPA_PPzPP: 2762 return AArch64::BRKPAS_PPzPP; 2763 case AArch64::BRKB_PPzP: 2764 return AArch64::BRKBS_PPzP; 2765 case AArch64::BRKPB_PPzPP: 2766 return AArch64::BRKPBS_PPzPP; 2767 case AArch64::BRKN_PPzP: 2768 return AArch64::BRKNS_PPzP; 2769 case AArch64::RDFFR_PPz: 2770 return AArch64::RDFFRS_PPz; 2771 case AArch64::PTRUE_B: 2772 return AArch64::PTRUES_B; 2773 } 2774 } 2775 2776 // Is this a candidate for ld/st merging or pairing? For example, we don't 2777 // touch volatiles or load/stores that have a hint to avoid pair formation. 2778 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2779 2780 bool IsPreLdSt = isPreLdSt(MI); 2781 2782 // If this is a volatile load/store, don't mess with it. 2783 if (MI.hasOrderedMemoryRef()) 2784 return false; 2785 2786 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2787 // For Pre-inc LD/ST, the operand is shifted by one. 2788 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2789 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2790 "Expected a reg or frame index operand."); 2791 2792 // For Pre-indexed addressing quadword instructions, the third operand is the 2793 // immediate value. 2794 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2795 2796 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2797 return false; 2798 2799 // Can't merge/pair if the instruction modifies the base register. 2800 // e.g., ldr x0, [x0] 2801 // This case will never occur with an FI base. 2802 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or 2803 // STR<S,D,Q,W,X>pre, it can be merged. 2804 // For example: 2805 // ldr q0, [x11, #32]! 2806 // ldr q1, [x11, #16] 2807 // to 2808 // ldp q0, q1, [x11, #32]! 2809 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2810 Register BaseReg = MI.getOperand(1).getReg(); 2811 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2812 if (MI.modifiesRegister(BaseReg, TRI)) 2813 return false; 2814 } 2815 2816 // Check if this load/store has a hint to avoid pair formation. 2817 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2818 if (isLdStPairSuppressed(MI)) 2819 return false; 2820 2821 // Do not pair any callee-save store/reload instructions in the 2822 // prologue/epilogue if the CFI information encoded the operations as separate 2823 // instructions, as that will cause the size of the actual prologue to mismatch 2824 // with the prologue size recorded in the Windows CFI. 2825 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2826 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2827 MI.getMF()->getFunction().needsUnwindTableEntry(); 2828 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2829 MI.getFlag(MachineInstr::FrameDestroy))) 2830 return false; 2831 2832 // On some CPUs quad load/store pairs are slower than two single load/stores. 
2833 if (Subtarget.isPaired128Slow()) { 2834 switch (MI.getOpcode()) { 2835 default: 2836 break; 2837 case AArch64::LDURQi: 2838 case AArch64::STURQi: 2839 case AArch64::LDRQui: 2840 case AArch64::STRQui: 2841 return false; 2842 } 2843 } 2844 2845 return true; 2846 } 2847 2848 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2849 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2850 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, 2851 const TargetRegisterInfo *TRI) const { 2852 if (!LdSt.mayLoadOrStore()) 2853 return false; 2854 2855 const MachineOperand *BaseOp; 2856 TypeSize WidthN(0, false); 2857 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2858 WidthN, TRI)) 2859 return false; 2860 // The maximum vscale is 16 under AArch64, return the maximal extent for the 2861 // vector. 2862 Width = LocationSize::precise(WidthN); 2863 BaseOps.push_back(BaseOp); 2864 return true; 2865 } 2866 2867 std::optional<ExtAddrMode> 2868 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2869 const TargetRegisterInfo *TRI) const { 2870 const MachineOperand *Base; // Filled with the base operand of MI. 2871 int64_t Offset; // Filled with the offset of MI. 2872 bool OffsetIsScalable; 2873 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2874 return std::nullopt; 2875 2876 if (!Base->isReg()) 2877 return std::nullopt; 2878 ExtAddrMode AM; 2879 AM.BaseReg = Base->getReg(); 2880 AM.Displacement = Offset; 2881 AM.ScaledReg = 0; 2882 AM.Scale = 0; 2883 return AM; 2884 } 2885 2886 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, 2887 Register Reg, 2888 const MachineInstr &AddrI, 2889 ExtAddrMode &AM) const { 2890 // Filter out instructions into which we cannot fold. 
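  // (This answers whether the address computed by AddrI, available in Reg, can
  // be folded into MemI's addressing mode; e.g. `add x8, x9, #16` followed by
  // `ldr x0, [x8]` can become `ldr x0, [x9, #16]`. The rewrite itself is done
  // later by emitLdStWithAddr().)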
2891 unsigned NumBytes; 2892 int64_t OffsetScale = 1; 2893 switch (MemI.getOpcode()) { 2894 default: 2895 return false; 2896 2897 case AArch64::LDURQi: 2898 case AArch64::STURQi: 2899 NumBytes = 16; 2900 break; 2901 2902 case AArch64::LDURDi: 2903 case AArch64::STURDi: 2904 case AArch64::LDURXi: 2905 case AArch64::STURXi: 2906 NumBytes = 8; 2907 break; 2908 2909 case AArch64::LDURWi: 2910 case AArch64::LDURSWi: 2911 case AArch64::STURWi: 2912 NumBytes = 4; 2913 break; 2914 2915 case AArch64::LDURHi: 2916 case AArch64::STURHi: 2917 case AArch64::LDURHHi: 2918 case AArch64::STURHHi: 2919 case AArch64::LDURSHXi: 2920 case AArch64::LDURSHWi: 2921 NumBytes = 2; 2922 break; 2923 2924 case AArch64::LDRBroX: 2925 case AArch64::LDRBBroX: 2926 case AArch64::LDRSBXroX: 2927 case AArch64::LDRSBWroX: 2928 case AArch64::STRBroX: 2929 case AArch64::STRBBroX: 2930 case AArch64::LDURBi: 2931 case AArch64::LDURBBi: 2932 case AArch64::LDURSBXi: 2933 case AArch64::LDURSBWi: 2934 case AArch64::STURBi: 2935 case AArch64::STURBBi: 2936 case AArch64::LDRBui: 2937 case AArch64::LDRBBui: 2938 case AArch64::LDRSBXui: 2939 case AArch64::LDRSBWui: 2940 case AArch64::STRBui: 2941 case AArch64::STRBBui: 2942 NumBytes = 1; 2943 break; 2944 2945 case AArch64::LDRQroX: 2946 case AArch64::STRQroX: 2947 case AArch64::LDRQui: 2948 case AArch64::STRQui: 2949 NumBytes = 16; 2950 OffsetScale = 16; 2951 break; 2952 2953 case AArch64::LDRDroX: 2954 case AArch64::STRDroX: 2955 case AArch64::LDRXroX: 2956 case AArch64::STRXroX: 2957 case AArch64::LDRDui: 2958 case AArch64::STRDui: 2959 case AArch64::LDRXui: 2960 case AArch64::STRXui: 2961 NumBytes = 8; 2962 OffsetScale = 8; 2963 break; 2964 2965 case AArch64::LDRWroX: 2966 case AArch64::LDRSWroX: 2967 case AArch64::STRWroX: 2968 case AArch64::LDRWui: 2969 case AArch64::LDRSWui: 2970 case AArch64::STRWui: 2971 NumBytes = 4; 2972 OffsetScale = 4; 2973 break; 2974 2975 case AArch64::LDRHroX: 2976 case AArch64::STRHroX: 2977 case AArch64::LDRHHroX: 2978 case AArch64::STRHHroX: 2979 case AArch64::LDRSHXroX: 2980 case AArch64::LDRSHWroX: 2981 case AArch64::LDRHui: 2982 case AArch64::STRHui: 2983 case AArch64::LDRHHui: 2984 case AArch64::STRHHui: 2985 case AArch64::LDRSHXui: 2986 case AArch64::LDRSHWui: 2987 NumBytes = 2; 2988 OffsetScale = 2; 2989 break; 2990 } 2991 2992 // Check the fold operand is not the loaded/stored value. 2993 const MachineOperand &BaseRegOp = MemI.getOperand(0); 2994 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg) 2995 return false; 2996 2997 // Handle memory instructions with a [Reg, Reg] addressing mode. 2998 if (MemI.getOperand(2).isReg()) { 2999 // Bail if the addressing mode already includes extension of the offset 3000 // register. 3001 if (MemI.getOperand(3).getImm()) 3002 return false; 3003 3004 // Check if we actually have a scaled offset. 3005 if (MemI.getOperand(4).getImm() == 0) 3006 OffsetScale = 1; 3007 3008 // If the address instructions is folded into the base register, then the 3009 // addressing mode must not have a scale. Then we can swap the base and the 3010 // scaled registers. 
3011 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1) 3012 return false; 3013 3014 switch (AddrI.getOpcode()) { 3015 default: 3016 return false; 3017 3018 case AArch64::SBFMXri: 3019 // sxtw Xa, Wm 3020 // ldr Xd, [Xn, Xa, lsl #N] 3021 // -> 3022 // ldr Xd, [Xn, Wm, sxtw #N] 3023 if (AddrI.getOperand(2).getImm() != 0 || 3024 AddrI.getOperand(3).getImm() != 31) 3025 return false; 3026 3027 AM.BaseReg = MemI.getOperand(1).getReg(); 3028 if (AM.BaseReg == Reg) 3029 AM.BaseReg = MemI.getOperand(2).getReg(); 3030 AM.ScaledReg = AddrI.getOperand(1).getReg(); 3031 AM.Scale = OffsetScale; 3032 AM.Displacement = 0; 3033 AM.Form = ExtAddrMode::Formula::SExtScaledReg; 3034 return true; 3035 3036 case TargetOpcode::SUBREG_TO_REG: { 3037 // mov Wa, Wm 3038 // ldr Xd, [Xn, Xa, lsl #N] 3039 // -> 3040 // ldr Xd, [Xn, Wm, uxtw #N] 3041 3042 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG. 3043 if (AddrI.getOperand(1).getImm() != 0 || 3044 AddrI.getOperand(3).getImm() != AArch64::sub_32) 3045 return false; 3046 3047 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo(); 3048 Register OffsetReg = AddrI.getOperand(2).getReg(); 3049 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg)) 3050 return false; 3051 3052 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg); 3053 if (DefMI.getOpcode() != AArch64::ORRWrs || 3054 DefMI.getOperand(1).getReg() != AArch64::WZR || 3055 DefMI.getOperand(3).getImm() != 0) 3056 return false; 3057 3058 AM.BaseReg = MemI.getOperand(1).getReg(); 3059 if (AM.BaseReg == Reg) 3060 AM.BaseReg = MemI.getOperand(2).getReg(); 3061 AM.ScaledReg = DefMI.getOperand(2).getReg(); 3062 AM.Scale = OffsetScale; 3063 AM.Displacement = 0; 3064 AM.Form = ExtAddrMode::Formula::ZExtScaledReg; 3065 return true; 3066 } 3067 } 3068 } 3069 3070 // Handle memory instructions with a [Reg, #Imm] addressing mode. 3071 3072 // Check we are not breaking a potential conversion to an LDP. 3073 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset, 3074 int64_t NewOffset) -> bool { 3075 int64_t MinOffset, MaxOffset; 3076 switch (NumBytes) { 3077 default: 3078 return true; 3079 case 4: 3080 MinOffset = -256; 3081 MaxOffset = 252; 3082 break; 3083 case 8: 3084 MinOffset = -512; 3085 MaxOffset = 504; 3086 break; 3087 case 16: 3088 MinOffset = -1024; 3089 MaxOffset = 1008; 3090 break; 3091 } 3092 return OldOffset < MinOffset || OldOffset > MaxOffset || 3093 (NewOffset >= MinOffset && NewOffset <= MaxOffset); 3094 }; 3095 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool { 3096 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale; 3097 int64_t NewOffset = OldOffset + Disp; 3098 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0)) 3099 return false; 3100 // If the old offset would fit into an LDP, but the new offset wouldn't, 3101 // bail out. 
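    // For example, for an 8-byte access an old offset of 8 (pairable) must not
    // be rewritten to 1024, which LDR can encode but LDP cannot.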
3102 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset)) 3103 return false; 3104 AM.BaseReg = AddrI.getOperand(1).getReg(); 3105 AM.ScaledReg = 0; 3106 AM.Scale = 0; 3107 AM.Displacement = NewOffset; 3108 AM.Form = ExtAddrMode::Formula::Basic; 3109 return true; 3110 }; 3111 3112 auto canFoldAddRegIntoAddrMode = 3113 [&](int64_t Scale, 3114 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool { 3115 if (MemI.getOperand(2).getImm() != 0) 3116 return false; 3117 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale)) 3118 return false; 3119 AM.BaseReg = AddrI.getOperand(1).getReg(); 3120 AM.ScaledReg = AddrI.getOperand(2).getReg(); 3121 AM.Scale = Scale; 3122 AM.Displacement = 0; 3123 AM.Form = Form; 3124 return true; 3125 }; 3126 3127 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) { 3128 unsigned Opcode = MemI.getOpcode(); 3129 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) && 3130 Subtarget.isSTRQroSlow(); 3131 }; 3132 3133 int64_t Disp = 0; 3134 const bool OptSize = MemI.getMF()->getFunction().hasOptSize(); 3135 switch (AddrI.getOpcode()) { 3136 default: 3137 return false; 3138 3139 case AArch64::ADDXri: 3140 // add Xa, Xn, #N 3141 // ldr Xd, [Xa, #M] 3142 // -> 3143 // ldr Xd, [Xn, #N'+M] 3144 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 3145 return canFoldAddSubImmIntoAddrMode(Disp); 3146 3147 case AArch64::SUBXri: 3148 // sub Xa, Xn, #N 3149 // ldr Xd, [Xa, #M] 3150 // -> 3151 // ldr Xd, [Xn, #N'+M] 3152 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 3153 return canFoldAddSubImmIntoAddrMode(-Disp); 3154 3155 case AArch64::ADDXrs: { 3156 // add Xa, Xn, Xm, lsl #N 3157 // ldr Xd, [Xa] 3158 // -> 3159 // ldr Xd, [Xn, Xm, lsl #N] 3160 3161 // Don't fold the add if the result would be slower, unless optimising for 3162 // size. 3163 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3164 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL) 3165 return false; 3166 Shift = AArch64_AM::getShiftValue(Shift); 3167 if (!OptSize) { 3168 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14()) 3169 return false; 3170 if (avoidSlowSTRQ(MemI)) 3171 return false; 3172 } 3173 return canFoldAddRegIntoAddrMode(1ULL << Shift); 3174 } 3175 3176 case AArch64::ADDXrr: 3177 // add Xa, Xn, Xm 3178 // ldr Xd, [Xa] 3179 // -> 3180 // ldr Xd, [Xn, Xm, lsl #0] 3181 3182 // Don't fold the add if the result would be slower, unless optimising for 3183 // size. 3184 if (!OptSize && avoidSlowSTRQ(MemI)) 3185 return false; 3186 return canFoldAddRegIntoAddrMode(1); 3187 3188 case AArch64::ADDXrx: 3189 // add Xa, Xn, Wm, {s,u}xtw #N 3190 // ldr Xd, [Xa] 3191 // -> 3192 // ldr Xd, [Xn, Wm, {s,u}xtw #N] 3193 3194 // Don't fold the add if the result would be slower, unless optimising for 3195 // size. 3196 if (!OptSize && avoidSlowSTRQ(MemI)) 3197 return false; 3198 3199 // Can fold only sign-/zero-extend of a word. 3200 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3201 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm); 3202 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW) 3203 return false; 3204 3205 return canFoldAddRegIntoAddrMode( 3206 1ULL << AArch64_AM::getArithShiftValue(Imm), 3207 (Extend == AArch64_AM::SXTW) ? 
ExtAddrMode::Formula::SExtScaledReg 3208 : ExtAddrMode::Formula::ZExtScaledReg); 3209 } 3210 } 3211 3212 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, 3213 // return the opcode of an instruction performing the same operation, but using 3214 // the [Reg, Reg] addressing mode. 3215 static unsigned regOffsetOpcode(unsigned Opcode) { 3216 switch (Opcode) { 3217 default: 3218 llvm_unreachable("Address folding not implemented for instruction"); 3219 3220 case AArch64::LDURQi: 3221 case AArch64::LDRQui: 3222 return AArch64::LDRQroX; 3223 case AArch64::STURQi: 3224 case AArch64::STRQui: 3225 return AArch64::STRQroX; 3226 case AArch64::LDURDi: 3227 case AArch64::LDRDui: 3228 return AArch64::LDRDroX; 3229 case AArch64::STURDi: 3230 case AArch64::STRDui: 3231 return AArch64::STRDroX; 3232 case AArch64::LDURXi: 3233 case AArch64::LDRXui: 3234 return AArch64::LDRXroX; 3235 case AArch64::STURXi: 3236 case AArch64::STRXui: 3237 return AArch64::STRXroX; 3238 case AArch64::LDURWi: 3239 case AArch64::LDRWui: 3240 return AArch64::LDRWroX; 3241 case AArch64::LDURSWi: 3242 case AArch64::LDRSWui: 3243 return AArch64::LDRSWroX; 3244 case AArch64::STURWi: 3245 case AArch64::STRWui: 3246 return AArch64::STRWroX; 3247 case AArch64::LDURHi: 3248 case AArch64::LDRHui: 3249 return AArch64::LDRHroX; 3250 case AArch64::STURHi: 3251 case AArch64::STRHui: 3252 return AArch64::STRHroX; 3253 case AArch64::LDURHHi: 3254 case AArch64::LDRHHui: 3255 return AArch64::LDRHHroX; 3256 case AArch64::STURHHi: 3257 case AArch64::STRHHui: 3258 return AArch64::STRHHroX; 3259 case AArch64::LDURSHXi: 3260 case AArch64::LDRSHXui: 3261 return AArch64::LDRSHXroX; 3262 case AArch64::LDURSHWi: 3263 case AArch64::LDRSHWui: 3264 return AArch64::LDRSHWroX; 3265 case AArch64::LDURBi: 3266 case AArch64::LDRBui: 3267 return AArch64::LDRBroX; 3268 case AArch64::LDURBBi: 3269 case AArch64::LDRBBui: 3270 return AArch64::LDRBBroX; 3271 case AArch64::LDURSBXi: 3272 case AArch64::LDRSBXui: 3273 return AArch64::LDRSBXroX; 3274 case AArch64::LDURSBWi: 3275 case AArch64::LDRSBWui: 3276 return AArch64::LDRSBWroX; 3277 case AArch64::STURBi: 3278 case AArch64::STRBui: 3279 return AArch64::STRBroX; 3280 case AArch64::STURBBi: 3281 case AArch64::STRBBui: 3282 return AArch64::STRBBroX; 3283 } 3284 } 3285 3286 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3287 // the opcode of an instruction performing the same operation, but using the 3288 // [Reg, #Imm] addressing mode with scaled offset. 
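// For example, LDURXi maps to LDRXui with Scale set to 8, while opcodes that
// are already scaled are returned unchanged.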
3289 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) { 3290 switch (Opcode) { 3291 default: 3292 llvm_unreachable("Address folding not implemented for instruction"); 3293 3294 case AArch64::LDURQi: 3295 Scale = 16; 3296 return AArch64::LDRQui; 3297 case AArch64::STURQi: 3298 Scale = 16; 3299 return AArch64::STRQui; 3300 case AArch64::LDURDi: 3301 Scale = 8; 3302 return AArch64::LDRDui; 3303 case AArch64::STURDi: 3304 Scale = 8; 3305 return AArch64::STRDui; 3306 case AArch64::LDURXi: 3307 Scale = 8; 3308 return AArch64::LDRXui; 3309 case AArch64::STURXi: 3310 Scale = 8; 3311 return AArch64::STRXui; 3312 case AArch64::LDURWi: 3313 Scale = 4; 3314 return AArch64::LDRWui; 3315 case AArch64::LDURSWi: 3316 Scale = 4; 3317 return AArch64::LDRSWui; 3318 case AArch64::STURWi: 3319 Scale = 4; 3320 return AArch64::STRWui; 3321 case AArch64::LDURHi: 3322 Scale = 2; 3323 return AArch64::LDRHui; 3324 case AArch64::STURHi: 3325 Scale = 2; 3326 return AArch64::STRHui; 3327 case AArch64::LDURHHi: 3328 Scale = 2; 3329 return AArch64::LDRHHui; 3330 case AArch64::STURHHi: 3331 Scale = 2; 3332 return AArch64::STRHHui; 3333 case AArch64::LDURSHXi: 3334 Scale = 2; 3335 return AArch64::LDRSHXui; 3336 case AArch64::LDURSHWi: 3337 Scale = 2; 3338 return AArch64::LDRSHWui; 3339 case AArch64::LDURBi: 3340 Scale = 1; 3341 return AArch64::LDRBui; 3342 case AArch64::LDURBBi: 3343 Scale = 1; 3344 return AArch64::LDRBBui; 3345 case AArch64::LDURSBXi: 3346 Scale = 1; 3347 return AArch64::LDRSBXui; 3348 case AArch64::LDURSBWi: 3349 Scale = 1; 3350 return AArch64::LDRSBWui; 3351 case AArch64::STURBi: 3352 Scale = 1; 3353 return AArch64::STRBui; 3354 case AArch64::STURBBi: 3355 Scale = 1; 3356 return AArch64::STRBBui; 3357 case AArch64::LDRQui: 3358 case AArch64::STRQui: 3359 Scale = 16; 3360 return Opcode; 3361 case AArch64::LDRDui: 3362 case AArch64::STRDui: 3363 case AArch64::LDRXui: 3364 case AArch64::STRXui: 3365 Scale = 8; 3366 return Opcode; 3367 case AArch64::LDRWui: 3368 case AArch64::LDRSWui: 3369 case AArch64::STRWui: 3370 Scale = 4; 3371 return Opcode; 3372 case AArch64::LDRHui: 3373 case AArch64::STRHui: 3374 case AArch64::LDRHHui: 3375 case AArch64::STRHHui: 3376 case AArch64::LDRSHXui: 3377 case AArch64::LDRSHWui: 3378 Scale = 2; 3379 return Opcode; 3380 case AArch64::LDRBui: 3381 case AArch64::LDRBBui: 3382 case AArch64::LDRSBXui: 3383 case AArch64::LDRSBWui: 3384 case AArch64::STRBui: 3385 case AArch64::STRBBui: 3386 Scale = 1; 3387 return Opcode; 3388 } 3389 } 3390 3391 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3392 // the opcode of an instruction performing the same operation, but using the 3393 // [Reg, #Imm] addressing mode with unscaled offset. 
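// For example, LDRXui maps to LDURXi, while already-unscaled opcodes are
// returned unchanged.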
3394 unsigned unscaledOffsetOpcode(unsigned Opcode) { 3395 switch (Opcode) { 3396 default: 3397 llvm_unreachable("Address folding not implemented for instruction"); 3398 3399 case AArch64::LDURQi: 3400 case AArch64::STURQi: 3401 case AArch64::LDURDi: 3402 case AArch64::STURDi: 3403 case AArch64::LDURXi: 3404 case AArch64::STURXi: 3405 case AArch64::LDURWi: 3406 case AArch64::LDURSWi: 3407 case AArch64::STURWi: 3408 case AArch64::LDURHi: 3409 case AArch64::STURHi: 3410 case AArch64::LDURHHi: 3411 case AArch64::STURHHi: 3412 case AArch64::LDURSHXi: 3413 case AArch64::LDURSHWi: 3414 case AArch64::LDURBi: 3415 case AArch64::STURBi: 3416 case AArch64::LDURBBi: 3417 case AArch64::STURBBi: 3418 case AArch64::LDURSBWi: 3419 case AArch64::LDURSBXi: 3420 return Opcode; 3421 case AArch64::LDRQui: 3422 return AArch64::LDURQi; 3423 case AArch64::STRQui: 3424 return AArch64::STURQi; 3425 case AArch64::LDRDui: 3426 return AArch64::LDURDi; 3427 case AArch64::STRDui: 3428 return AArch64::STURDi; 3429 case AArch64::LDRXui: 3430 return AArch64::LDURXi; 3431 case AArch64::STRXui: 3432 return AArch64::STURXi; 3433 case AArch64::LDRWui: 3434 return AArch64::LDURWi; 3435 case AArch64::LDRSWui: 3436 return AArch64::LDURSWi; 3437 case AArch64::STRWui: 3438 return AArch64::STURWi; 3439 case AArch64::LDRHui: 3440 return AArch64::LDURHi; 3441 case AArch64::STRHui: 3442 return AArch64::STURHi; 3443 case AArch64::LDRHHui: 3444 return AArch64::LDURHHi; 3445 case AArch64::STRHHui: 3446 return AArch64::STURHHi; 3447 case AArch64::LDRSHXui: 3448 return AArch64::LDURSHXi; 3449 case AArch64::LDRSHWui: 3450 return AArch64::LDURSHWi; 3451 case AArch64::LDRBBui: 3452 return AArch64::LDURBBi; 3453 case AArch64::LDRBui: 3454 return AArch64::LDURBi; 3455 case AArch64::STRBBui: 3456 return AArch64::STURBBi; 3457 case AArch64::STRBui: 3458 return AArch64::STURBi; 3459 case AArch64::LDRSBWui: 3460 return AArch64::LDURSBWi; 3461 case AArch64::LDRSBXui: 3462 return AArch64::LDURSBXi; 3463 } 3464 } 3465 3466 // Given the opcode of a memory load/store instruction, return the opcode of an 3467 // instruction performing the same operation, but using 3468 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the 3469 // offset register. 
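// For example, LDRXui, LDURXi and LDRXroX all map to LDRXroW.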
3470 static unsigned offsetExtendOpcode(unsigned Opcode) { 3471 switch (Opcode) { 3472 default: 3473 llvm_unreachable("Address folding not implemented for instruction"); 3474 3475 case AArch64::LDRQroX: 3476 case AArch64::LDURQi: 3477 case AArch64::LDRQui: 3478 return AArch64::LDRQroW; 3479 case AArch64::STRQroX: 3480 case AArch64::STURQi: 3481 case AArch64::STRQui: 3482 return AArch64::STRQroW; 3483 case AArch64::LDRDroX: 3484 case AArch64::LDURDi: 3485 case AArch64::LDRDui: 3486 return AArch64::LDRDroW; 3487 case AArch64::STRDroX: 3488 case AArch64::STURDi: 3489 case AArch64::STRDui: 3490 return AArch64::STRDroW; 3491 case AArch64::LDRXroX: 3492 case AArch64::LDURXi: 3493 case AArch64::LDRXui: 3494 return AArch64::LDRXroW; 3495 case AArch64::STRXroX: 3496 case AArch64::STURXi: 3497 case AArch64::STRXui: 3498 return AArch64::STRXroW; 3499 case AArch64::LDRWroX: 3500 case AArch64::LDURWi: 3501 case AArch64::LDRWui: 3502 return AArch64::LDRWroW; 3503 case AArch64::LDRSWroX: 3504 case AArch64::LDURSWi: 3505 case AArch64::LDRSWui: 3506 return AArch64::LDRSWroW; 3507 case AArch64::STRWroX: 3508 case AArch64::STURWi: 3509 case AArch64::STRWui: 3510 return AArch64::STRWroW; 3511 case AArch64::LDRHroX: 3512 case AArch64::LDURHi: 3513 case AArch64::LDRHui: 3514 return AArch64::LDRHroW; 3515 case AArch64::STRHroX: 3516 case AArch64::STURHi: 3517 case AArch64::STRHui: 3518 return AArch64::STRHroW; 3519 case AArch64::LDRHHroX: 3520 case AArch64::LDURHHi: 3521 case AArch64::LDRHHui: 3522 return AArch64::LDRHHroW; 3523 case AArch64::STRHHroX: 3524 case AArch64::STURHHi: 3525 case AArch64::STRHHui: 3526 return AArch64::STRHHroW; 3527 case AArch64::LDRSHXroX: 3528 case AArch64::LDURSHXi: 3529 case AArch64::LDRSHXui: 3530 return AArch64::LDRSHXroW; 3531 case AArch64::LDRSHWroX: 3532 case AArch64::LDURSHWi: 3533 case AArch64::LDRSHWui: 3534 return AArch64::LDRSHWroW; 3535 case AArch64::LDRBroX: 3536 case AArch64::LDURBi: 3537 case AArch64::LDRBui: 3538 return AArch64::LDRBroW; 3539 case AArch64::LDRBBroX: 3540 case AArch64::LDURBBi: 3541 case AArch64::LDRBBui: 3542 return AArch64::LDRBBroW; 3543 case AArch64::LDRSBXroX: 3544 case AArch64::LDURSBXi: 3545 case AArch64::LDRSBXui: 3546 return AArch64::LDRSBXroW; 3547 case AArch64::LDRSBWroX: 3548 case AArch64::LDURSBWi: 3549 case AArch64::LDRSBWui: 3550 return AArch64::LDRSBWroW; 3551 case AArch64::STRBroX: 3552 case AArch64::STURBi: 3553 case AArch64::STRBui: 3554 return AArch64::STRBroW; 3555 case AArch64::STRBBroX: 3556 case AArch64::STURBBi: 3557 case AArch64::STRBBui: 3558 return AArch64::STRBBroW; 3559 } 3560 } 3561 3562 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, 3563 const ExtAddrMode &AM) const { 3564 3565 const DebugLoc &DL = MemI.getDebugLoc(); 3566 MachineBasicBlock &MBB = *MemI.getParent(); 3567 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo(); 3568 3569 if (AM.Form == ExtAddrMode::Formula::Basic) { 3570 if (AM.ScaledReg) { 3571 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`. 3572 unsigned Opcode = regOffsetOpcode(MemI.getOpcode()); 3573 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3574 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3575 .addReg(MemI.getOperand(0).getReg(), 3576 MemI.mayLoad() ? 
RegState::Define : 0) 3577 .addReg(AM.BaseReg) 3578 .addReg(AM.ScaledReg) 3579 .addImm(0) 3580 .addImm(AM.Scale > 1) 3581 .setMemRefs(MemI.memoperands()) 3582 .setMIFlags(MemI.getFlags()); 3583 return B.getInstr(); 3584 } 3585 3586 assert(AM.ScaledReg == 0 && AM.Scale == 0 && 3587 "Addressing mode not supported for folding"); 3588 3589 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`. 3590 unsigned Scale = 1; 3591 unsigned Opcode = MemI.getOpcode(); 3592 if (isInt<9>(AM.Displacement)) 3593 Opcode = unscaledOffsetOpcode(Opcode); 3594 else 3595 Opcode = scaledOffsetOpcode(Opcode, Scale); 3596 3597 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3598 .addReg(MemI.getOperand(0).getReg(), 3599 MemI.mayLoad() ? RegState::Define : 0) 3600 .addReg(AM.BaseReg) 3601 .addImm(AM.Displacement / Scale) 3602 .setMemRefs(MemI.memoperands()) 3603 .setMIFlags(MemI.getFlags()); 3604 return B.getInstr(); 3605 } 3606 3607 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg || 3608 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) { 3609 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`. 3610 assert(AM.ScaledReg && !AM.Displacement && 3611 "Address offset can be a register or an immediate, but not both"); 3612 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode()); 3613 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3614 // Make sure the offset register is in the correct register class. 3615 Register OffsetReg = AM.ScaledReg; 3616 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg); 3617 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) { 3618 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3619 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg) 3620 .addReg(AM.ScaledReg, 0, AArch64::sub_32); 3621 } 3622 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3623 .addReg(MemI.getOperand(0).getReg(), 3624 MemI.mayLoad() ? RegState::Define : 0) 3625 .addReg(AM.BaseReg) 3626 .addReg(OffsetReg) 3627 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg) 3628 .addImm(AM.Scale != 1) 3629 .setMemRefs(MemI.memoperands()) 3630 .setMIFlags(MemI.getFlags()); 3631 3632 return B.getInstr(); 3633 } 3634 3635 llvm_unreachable( 3636 "Function must not be called with an addressing mode it can't handle"); 3637 } 3638 3639 /// Return true if the opcode is a post-index ld/st instruction, which really 3640 /// loads from base+0. 
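/// For example, "ldr x0, [x1], #8" (LDRXpost) accesses memory at [x1] and only
/// then increments x1 by 8; getMemOperandWithOffsetWidth below relies on this
/// to report an offset of 0 for these opcodes.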
3641 static bool isPostIndexLdStOpcode(unsigned Opcode) { 3642 switch (Opcode) { 3643 default: 3644 return false; 3645 case AArch64::LD1Fourv16b_POST: 3646 case AArch64::LD1Fourv1d_POST: 3647 case AArch64::LD1Fourv2d_POST: 3648 case AArch64::LD1Fourv2s_POST: 3649 case AArch64::LD1Fourv4h_POST: 3650 case AArch64::LD1Fourv4s_POST: 3651 case AArch64::LD1Fourv8b_POST: 3652 case AArch64::LD1Fourv8h_POST: 3653 case AArch64::LD1Onev16b_POST: 3654 case AArch64::LD1Onev1d_POST: 3655 case AArch64::LD1Onev2d_POST: 3656 case AArch64::LD1Onev2s_POST: 3657 case AArch64::LD1Onev4h_POST: 3658 case AArch64::LD1Onev4s_POST: 3659 case AArch64::LD1Onev8b_POST: 3660 case AArch64::LD1Onev8h_POST: 3661 case AArch64::LD1Rv16b_POST: 3662 case AArch64::LD1Rv1d_POST: 3663 case AArch64::LD1Rv2d_POST: 3664 case AArch64::LD1Rv2s_POST: 3665 case AArch64::LD1Rv4h_POST: 3666 case AArch64::LD1Rv4s_POST: 3667 case AArch64::LD1Rv8b_POST: 3668 case AArch64::LD1Rv8h_POST: 3669 case AArch64::LD1Threev16b_POST: 3670 case AArch64::LD1Threev1d_POST: 3671 case AArch64::LD1Threev2d_POST: 3672 case AArch64::LD1Threev2s_POST: 3673 case AArch64::LD1Threev4h_POST: 3674 case AArch64::LD1Threev4s_POST: 3675 case AArch64::LD1Threev8b_POST: 3676 case AArch64::LD1Threev8h_POST: 3677 case AArch64::LD1Twov16b_POST: 3678 case AArch64::LD1Twov1d_POST: 3679 case AArch64::LD1Twov2d_POST: 3680 case AArch64::LD1Twov2s_POST: 3681 case AArch64::LD1Twov4h_POST: 3682 case AArch64::LD1Twov4s_POST: 3683 case AArch64::LD1Twov8b_POST: 3684 case AArch64::LD1Twov8h_POST: 3685 case AArch64::LD1i16_POST: 3686 case AArch64::LD1i32_POST: 3687 case AArch64::LD1i64_POST: 3688 case AArch64::LD1i8_POST: 3689 case AArch64::LD2Rv16b_POST: 3690 case AArch64::LD2Rv1d_POST: 3691 case AArch64::LD2Rv2d_POST: 3692 case AArch64::LD2Rv2s_POST: 3693 case AArch64::LD2Rv4h_POST: 3694 case AArch64::LD2Rv4s_POST: 3695 case AArch64::LD2Rv8b_POST: 3696 case AArch64::LD2Rv8h_POST: 3697 case AArch64::LD2Twov16b_POST: 3698 case AArch64::LD2Twov2d_POST: 3699 case AArch64::LD2Twov2s_POST: 3700 case AArch64::LD2Twov4h_POST: 3701 case AArch64::LD2Twov4s_POST: 3702 case AArch64::LD2Twov8b_POST: 3703 case AArch64::LD2Twov8h_POST: 3704 case AArch64::LD2i16_POST: 3705 case AArch64::LD2i32_POST: 3706 case AArch64::LD2i64_POST: 3707 case AArch64::LD2i8_POST: 3708 case AArch64::LD3Rv16b_POST: 3709 case AArch64::LD3Rv1d_POST: 3710 case AArch64::LD3Rv2d_POST: 3711 case AArch64::LD3Rv2s_POST: 3712 case AArch64::LD3Rv4h_POST: 3713 case AArch64::LD3Rv4s_POST: 3714 case AArch64::LD3Rv8b_POST: 3715 case AArch64::LD3Rv8h_POST: 3716 case AArch64::LD3Threev16b_POST: 3717 case AArch64::LD3Threev2d_POST: 3718 case AArch64::LD3Threev2s_POST: 3719 case AArch64::LD3Threev4h_POST: 3720 case AArch64::LD3Threev4s_POST: 3721 case AArch64::LD3Threev8b_POST: 3722 case AArch64::LD3Threev8h_POST: 3723 case AArch64::LD3i16_POST: 3724 case AArch64::LD3i32_POST: 3725 case AArch64::LD3i64_POST: 3726 case AArch64::LD3i8_POST: 3727 case AArch64::LD4Fourv16b_POST: 3728 case AArch64::LD4Fourv2d_POST: 3729 case AArch64::LD4Fourv2s_POST: 3730 case AArch64::LD4Fourv4h_POST: 3731 case AArch64::LD4Fourv4s_POST: 3732 case AArch64::LD4Fourv8b_POST: 3733 case AArch64::LD4Fourv8h_POST: 3734 case AArch64::LD4Rv16b_POST: 3735 case AArch64::LD4Rv1d_POST: 3736 case AArch64::LD4Rv2d_POST: 3737 case AArch64::LD4Rv2s_POST: 3738 case AArch64::LD4Rv4h_POST: 3739 case AArch64::LD4Rv4s_POST: 3740 case AArch64::LD4Rv8b_POST: 3741 case AArch64::LD4Rv8h_POST: 3742 case AArch64::LD4i16_POST: 3743 case AArch64::LD4i32_POST: 3744 case 
AArch64::LD4i64_POST: 3745 case AArch64::LD4i8_POST: 3746 case AArch64::LDAPRWpost: 3747 case AArch64::LDAPRXpost: 3748 case AArch64::LDIAPPWpost: 3749 case AArch64::LDIAPPXpost: 3750 case AArch64::LDPDpost: 3751 case AArch64::LDPQpost: 3752 case AArch64::LDPSWpost: 3753 case AArch64::LDPSpost: 3754 case AArch64::LDPWpost: 3755 case AArch64::LDPXpost: 3756 case AArch64::LDRBBpost: 3757 case AArch64::LDRBpost: 3758 case AArch64::LDRDpost: 3759 case AArch64::LDRHHpost: 3760 case AArch64::LDRHpost: 3761 case AArch64::LDRQpost: 3762 case AArch64::LDRSBWpost: 3763 case AArch64::LDRSBXpost: 3764 case AArch64::LDRSHWpost: 3765 case AArch64::LDRSHXpost: 3766 case AArch64::LDRSWpost: 3767 case AArch64::LDRSpost: 3768 case AArch64::LDRWpost: 3769 case AArch64::LDRXpost: 3770 case AArch64::ST1Fourv16b_POST: 3771 case AArch64::ST1Fourv1d_POST: 3772 case AArch64::ST1Fourv2d_POST: 3773 case AArch64::ST1Fourv2s_POST: 3774 case AArch64::ST1Fourv4h_POST: 3775 case AArch64::ST1Fourv4s_POST: 3776 case AArch64::ST1Fourv8b_POST: 3777 case AArch64::ST1Fourv8h_POST: 3778 case AArch64::ST1Onev16b_POST: 3779 case AArch64::ST1Onev1d_POST: 3780 case AArch64::ST1Onev2d_POST: 3781 case AArch64::ST1Onev2s_POST: 3782 case AArch64::ST1Onev4h_POST: 3783 case AArch64::ST1Onev4s_POST: 3784 case AArch64::ST1Onev8b_POST: 3785 case AArch64::ST1Onev8h_POST: 3786 case AArch64::ST1Threev16b_POST: 3787 case AArch64::ST1Threev1d_POST: 3788 case AArch64::ST1Threev2d_POST: 3789 case AArch64::ST1Threev2s_POST: 3790 case AArch64::ST1Threev4h_POST: 3791 case AArch64::ST1Threev4s_POST: 3792 case AArch64::ST1Threev8b_POST: 3793 case AArch64::ST1Threev8h_POST: 3794 case AArch64::ST1Twov16b_POST: 3795 case AArch64::ST1Twov1d_POST: 3796 case AArch64::ST1Twov2d_POST: 3797 case AArch64::ST1Twov2s_POST: 3798 case AArch64::ST1Twov4h_POST: 3799 case AArch64::ST1Twov4s_POST: 3800 case AArch64::ST1Twov8b_POST: 3801 case AArch64::ST1Twov8h_POST: 3802 case AArch64::ST1i16_POST: 3803 case AArch64::ST1i32_POST: 3804 case AArch64::ST1i64_POST: 3805 case AArch64::ST1i8_POST: 3806 case AArch64::ST2GPostIndex: 3807 case AArch64::ST2Twov16b_POST: 3808 case AArch64::ST2Twov2d_POST: 3809 case AArch64::ST2Twov2s_POST: 3810 case AArch64::ST2Twov4h_POST: 3811 case AArch64::ST2Twov4s_POST: 3812 case AArch64::ST2Twov8b_POST: 3813 case AArch64::ST2Twov8h_POST: 3814 case AArch64::ST2i16_POST: 3815 case AArch64::ST2i32_POST: 3816 case AArch64::ST2i64_POST: 3817 case AArch64::ST2i8_POST: 3818 case AArch64::ST3Threev16b_POST: 3819 case AArch64::ST3Threev2d_POST: 3820 case AArch64::ST3Threev2s_POST: 3821 case AArch64::ST3Threev4h_POST: 3822 case AArch64::ST3Threev4s_POST: 3823 case AArch64::ST3Threev8b_POST: 3824 case AArch64::ST3Threev8h_POST: 3825 case AArch64::ST3i16_POST: 3826 case AArch64::ST3i32_POST: 3827 case AArch64::ST3i64_POST: 3828 case AArch64::ST3i8_POST: 3829 case AArch64::ST4Fourv16b_POST: 3830 case AArch64::ST4Fourv2d_POST: 3831 case AArch64::ST4Fourv2s_POST: 3832 case AArch64::ST4Fourv4h_POST: 3833 case AArch64::ST4Fourv4s_POST: 3834 case AArch64::ST4Fourv8b_POST: 3835 case AArch64::ST4Fourv8h_POST: 3836 case AArch64::ST4i16_POST: 3837 case AArch64::ST4i32_POST: 3838 case AArch64::ST4i64_POST: 3839 case AArch64::ST4i8_POST: 3840 case AArch64::STGPostIndex: 3841 case AArch64::STGPpost: 3842 case AArch64::STPDpost: 3843 case AArch64::STPQpost: 3844 case AArch64::STPSpost: 3845 case AArch64::STPWpost: 3846 case AArch64::STPXpost: 3847 case AArch64::STRBBpost: 3848 case AArch64::STRBpost: 3849 case AArch64::STRDpost: 3850 case AArch64::STRHHpost: 3851 case 
AArch64::STRHpost:
3852 case AArch64::STRQpost:
3853 case AArch64::STRSpost:
3854 case AArch64::STRWpost:
3855 case AArch64::STRXpost:
3856 case AArch64::STZ2GPostIndex:
3857 case AArch64::STZGPostIndex:
3858 return true;
3859 }
3860 }
3861
3862 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3863 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3864 bool &OffsetIsScalable, TypeSize &Width,
3865 const TargetRegisterInfo *TRI) const {
3866 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3867 // Handle only loads/stores with base register followed by immediate offset.
3868 if (LdSt.getNumExplicitOperands() == 3) {
3869 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3870 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3871 !LdSt.getOperand(2).isImm())
3872 return false;
3873 } else if (LdSt.getNumExplicitOperands() == 4) {
3874 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3875 if (!LdSt.getOperand(1).isReg() ||
3876 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3877 !LdSt.getOperand(3).isImm())
3878 return false;
3879 } else
3880 return false;
3881
3882 // Get the scaling factor for the instruction and set the width of the
3883 // memory access.
3884 TypeSize Scale(0U, false);
3885 int64_t Dummy1, Dummy2;
3886
3887 // If this returns false, then it's an instruction we don't want to handle.
3888 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3889 return false;
3890
3891 // Compute the offset as the immediate operand multiplied by the scaling
3892 // factor; unscaled instructions have a scaling factor of 1. Post-index
3893 // instructions are a special case and report an offset of 0.
3894 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
3895 BaseOp = &LdSt.getOperand(2);
3896 Offset = 0;
3897 } else if (LdSt.getNumExplicitOperands() == 3) {
3898 BaseOp = &LdSt.getOperand(1);
3899 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3900 } else {
3901 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3902 BaseOp = &LdSt.getOperand(2);
3903 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3904 }
3905 OffsetIsScalable = Scale.isScalable();
3906
3907 return BaseOp->isReg() || BaseOp->isFI();
3908 }
3909
3910 MachineOperand &
3911 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3912 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3913 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3914 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3915 return OfsOp;
3916 }
3917
3918 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3919 TypeSize &Width, int64_t &MinOffset,
3920 int64_t &MaxOffset) {
3921 switch (Opcode) {
3922 // Not a memory operation or something we want to handle.
3923 default: 3924 Scale = TypeSize::getFixed(0); 3925 Width = TypeSize::getFixed(0); 3926 MinOffset = MaxOffset = 0; 3927 return false; 3928 // LDR / STR 3929 case AArch64::LDRQui: 3930 case AArch64::STRQui: 3931 Scale = TypeSize::getFixed(16); 3932 Width = TypeSize::getFixed(16); 3933 MinOffset = 0; 3934 MaxOffset = 4095; 3935 break; 3936 case AArch64::LDRXui: 3937 case AArch64::LDRDui: 3938 case AArch64::STRXui: 3939 case AArch64::STRDui: 3940 case AArch64::PRFMui: 3941 Scale = TypeSize::getFixed(8); 3942 Width = TypeSize::getFixed(8); 3943 MinOffset = 0; 3944 MaxOffset = 4095; 3945 break; 3946 case AArch64::LDRWui: 3947 case AArch64::LDRSui: 3948 case AArch64::LDRSWui: 3949 case AArch64::STRWui: 3950 case AArch64::STRSui: 3951 Scale = TypeSize::getFixed(4); 3952 Width = TypeSize::getFixed(4); 3953 MinOffset = 0; 3954 MaxOffset = 4095; 3955 break; 3956 case AArch64::LDRHui: 3957 case AArch64::LDRHHui: 3958 case AArch64::LDRSHWui: 3959 case AArch64::LDRSHXui: 3960 case AArch64::STRHui: 3961 case AArch64::STRHHui: 3962 Scale = TypeSize::getFixed(2); 3963 Width = TypeSize::getFixed(2); 3964 MinOffset = 0; 3965 MaxOffset = 4095; 3966 break; 3967 case AArch64::LDRBui: 3968 case AArch64::LDRBBui: 3969 case AArch64::LDRSBWui: 3970 case AArch64::LDRSBXui: 3971 case AArch64::STRBui: 3972 case AArch64::STRBBui: 3973 Scale = TypeSize::getFixed(1); 3974 Width = TypeSize::getFixed(1); 3975 MinOffset = 0; 3976 MaxOffset = 4095; 3977 break; 3978 // post/pre inc 3979 case AArch64::STRQpre: 3980 case AArch64::LDRQpost: 3981 Scale = TypeSize::getFixed(1); 3982 Width = TypeSize::getFixed(16); 3983 MinOffset = -256; 3984 MaxOffset = 255; 3985 break; 3986 case AArch64::LDRDpost: 3987 case AArch64::LDRDpre: 3988 case AArch64::LDRXpost: 3989 case AArch64::LDRXpre: 3990 case AArch64::STRDpost: 3991 case AArch64::STRDpre: 3992 case AArch64::STRXpost: 3993 case AArch64::STRXpre: 3994 Scale = TypeSize::getFixed(1); 3995 Width = TypeSize::getFixed(8); 3996 MinOffset = -256; 3997 MaxOffset = 255; 3998 break; 3999 case AArch64::STRWpost: 4000 case AArch64::STRWpre: 4001 case AArch64::LDRWpost: 4002 case AArch64::LDRWpre: 4003 case AArch64::STRSpost: 4004 case AArch64::STRSpre: 4005 case AArch64::LDRSpost: 4006 case AArch64::LDRSpre: 4007 Scale = TypeSize::getFixed(1); 4008 Width = TypeSize::getFixed(4); 4009 MinOffset = -256; 4010 MaxOffset = 255; 4011 break; 4012 case AArch64::LDRHpost: 4013 case AArch64::LDRHpre: 4014 case AArch64::STRHpost: 4015 case AArch64::STRHpre: 4016 case AArch64::LDRHHpost: 4017 case AArch64::LDRHHpre: 4018 case AArch64::STRHHpost: 4019 case AArch64::STRHHpre: 4020 Scale = TypeSize::getFixed(1); 4021 Width = TypeSize::getFixed(2); 4022 MinOffset = -256; 4023 MaxOffset = 255; 4024 break; 4025 case AArch64::LDRBpost: 4026 case AArch64::LDRBpre: 4027 case AArch64::STRBpost: 4028 case AArch64::STRBpre: 4029 case AArch64::LDRBBpost: 4030 case AArch64::LDRBBpre: 4031 case AArch64::STRBBpost: 4032 case AArch64::STRBBpre: 4033 Scale = TypeSize::getFixed(1); 4034 Width = TypeSize::getFixed(1); 4035 MinOffset = -256; 4036 MaxOffset = 255; 4037 break; 4038 // Unscaled 4039 case AArch64::LDURQi: 4040 case AArch64::STURQi: 4041 Scale = TypeSize::getFixed(1); 4042 Width = TypeSize::getFixed(16); 4043 MinOffset = -256; 4044 MaxOffset = 255; 4045 break; 4046 case AArch64::LDURXi: 4047 case AArch64::LDURDi: 4048 case AArch64::LDAPURXi: 4049 case AArch64::STURXi: 4050 case AArch64::STURDi: 4051 case AArch64::STLURXi: 4052 case AArch64::PRFUMi: 4053 Scale = TypeSize::getFixed(1); 4054 Width = 
TypeSize::getFixed(8); 4055 MinOffset = -256; 4056 MaxOffset = 255; 4057 break; 4058 case AArch64::LDURWi: 4059 case AArch64::LDURSi: 4060 case AArch64::LDURSWi: 4061 case AArch64::LDAPURi: 4062 case AArch64::LDAPURSWi: 4063 case AArch64::STURWi: 4064 case AArch64::STURSi: 4065 case AArch64::STLURWi: 4066 Scale = TypeSize::getFixed(1); 4067 Width = TypeSize::getFixed(4); 4068 MinOffset = -256; 4069 MaxOffset = 255; 4070 break; 4071 case AArch64::LDURHi: 4072 case AArch64::LDURHHi: 4073 case AArch64::LDURSHXi: 4074 case AArch64::LDURSHWi: 4075 case AArch64::LDAPURHi: 4076 case AArch64::LDAPURSHWi: 4077 case AArch64::LDAPURSHXi: 4078 case AArch64::STURHi: 4079 case AArch64::STURHHi: 4080 case AArch64::STLURHi: 4081 Scale = TypeSize::getFixed(1); 4082 Width = TypeSize::getFixed(2); 4083 MinOffset = -256; 4084 MaxOffset = 255; 4085 break; 4086 case AArch64::LDURBi: 4087 case AArch64::LDURBBi: 4088 case AArch64::LDURSBXi: 4089 case AArch64::LDURSBWi: 4090 case AArch64::LDAPURBi: 4091 case AArch64::LDAPURSBWi: 4092 case AArch64::LDAPURSBXi: 4093 case AArch64::STURBi: 4094 case AArch64::STURBBi: 4095 case AArch64::STLURBi: 4096 Scale = TypeSize::getFixed(1); 4097 Width = TypeSize::getFixed(1); 4098 MinOffset = -256; 4099 MaxOffset = 255; 4100 break; 4101 // LDP / STP (including pre/post inc) 4102 case AArch64::LDPQi: 4103 case AArch64::LDNPQi: 4104 case AArch64::STPQi: 4105 case AArch64::STNPQi: 4106 case AArch64::LDPQpost: 4107 case AArch64::LDPQpre: 4108 case AArch64::STPQpost: 4109 case AArch64::STPQpre: 4110 Scale = TypeSize::getFixed(16); 4111 Width = TypeSize::getFixed(16 * 2); 4112 MinOffset = -64; 4113 MaxOffset = 63; 4114 break; 4115 case AArch64::LDPXi: 4116 case AArch64::LDPDi: 4117 case AArch64::LDNPXi: 4118 case AArch64::LDNPDi: 4119 case AArch64::STPXi: 4120 case AArch64::STPDi: 4121 case AArch64::STNPXi: 4122 case AArch64::STNPDi: 4123 case AArch64::LDPDpost: 4124 case AArch64::LDPDpre: 4125 case AArch64::LDPXpost: 4126 case AArch64::LDPXpre: 4127 case AArch64::STPDpost: 4128 case AArch64::STPDpre: 4129 case AArch64::STPXpost: 4130 case AArch64::STPXpre: 4131 Scale = TypeSize::getFixed(8); 4132 Width = TypeSize::getFixed(8 * 2); 4133 MinOffset = -64; 4134 MaxOffset = 63; 4135 break; 4136 case AArch64::LDPWi: 4137 case AArch64::LDPSi: 4138 case AArch64::LDNPWi: 4139 case AArch64::LDNPSi: 4140 case AArch64::STPWi: 4141 case AArch64::STPSi: 4142 case AArch64::STNPWi: 4143 case AArch64::STNPSi: 4144 case AArch64::LDPSpost: 4145 case AArch64::LDPSpre: 4146 case AArch64::LDPWpost: 4147 case AArch64::LDPWpre: 4148 case AArch64::STPSpost: 4149 case AArch64::STPSpre: 4150 case AArch64::STPWpost: 4151 case AArch64::STPWpre: 4152 Scale = TypeSize::getFixed(4); 4153 Width = TypeSize::getFixed(4 * 2); 4154 MinOffset = -64; 4155 MaxOffset = 63; 4156 break; 4157 case AArch64::StoreSwiftAsyncContext: 4158 // Store is an STRXui, but there might be an ADDXri in the expansion too. 4159 Scale = TypeSize::getFixed(1); 4160 Width = TypeSize::getFixed(8); 4161 MinOffset = 0; 4162 MaxOffset = 4095; 4163 break; 4164 case AArch64::ADDG: 4165 Scale = TypeSize::getFixed(16); 4166 Width = TypeSize::getFixed(0); 4167 MinOffset = 0; 4168 MaxOffset = 63; 4169 break; 4170 case AArch64::TAGPstack: 4171 Scale = TypeSize::getFixed(16); 4172 Width = TypeSize::getFixed(0); 4173 // TAGP with a negative offset turns into SUBP, which has a maximum offset 4174 // of 63 (not 64!). 
4175 MinOffset = -63; 4176 MaxOffset = 63; 4177 break; 4178 case AArch64::LDG: 4179 case AArch64::STGi: 4180 case AArch64::STGPreIndex: 4181 case AArch64::STGPostIndex: 4182 case AArch64::STZGi: 4183 case AArch64::STZGPreIndex: 4184 case AArch64::STZGPostIndex: 4185 Scale = TypeSize::getFixed(16); 4186 Width = TypeSize::getFixed(16); 4187 MinOffset = -256; 4188 MaxOffset = 255; 4189 break; 4190 // SVE 4191 case AArch64::STR_ZZZZXI: 4192 case AArch64::LDR_ZZZZXI: 4193 Scale = TypeSize::getScalable(16); 4194 Width = TypeSize::getScalable(16 * 4); 4195 MinOffset = -256; 4196 MaxOffset = 252; 4197 break; 4198 case AArch64::STR_ZZZXI: 4199 case AArch64::LDR_ZZZXI: 4200 Scale = TypeSize::getScalable(16); 4201 Width = TypeSize::getScalable(16 * 3); 4202 MinOffset = -256; 4203 MaxOffset = 253; 4204 break; 4205 case AArch64::STR_ZZXI: 4206 case AArch64::LDR_ZZXI: 4207 Scale = TypeSize::getScalable(16); 4208 Width = TypeSize::getScalable(16 * 2); 4209 MinOffset = -256; 4210 MaxOffset = 254; 4211 break; 4212 case AArch64::LDR_PXI: 4213 case AArch64::STR_PXI: 4214 Scale = TypeSize::getScalable(2); 4215 Width = TypeSize::getScalable(2); 4216 MinOffset = -256; 4217 MaxOffset = 255; 4218 break; 4219 case AArch64::LDR_PPXI: 4220 case AArch64::STR_PPXI: 4221 Scale = TypeSize::getScalable(2); 4222 Width = TypeSize::getScalable(2 * 2); 4223 MinOffset = -256; 4224 MaxOffset = 254; 4225 break; 4226 case AArch64::LDR_ZXI: 4227 case AArch64::STR_ZXI: 4228 Scale = TypeSize::getScalable(16); 4229 Width = TypeSize::getScalable(16); 4230 MinOffset = -256; 4231 MaxOffset = 255; 4232 break; 4233 case AArch64::LD1B_IMM: 4234 case AArch64::LD1H_IMM: 4235 case AArch64::LD1W_IMM: 4236 case AArch64::LD1D_IMM: 4237 case AArch64::LDNT1B_ZRI: 4238 case AArch64::LDNT1H_ZRI: 4239 case AArch64::LDNT1W_ZRI: 4240 case AArch64::LDNT1D_ZRI: 4241 case AArch64::ST1B_IMM: 4242 case AArch64::ST1H_IMM: 4243 case AArch64::ST1W_IMM: 4244 case AArch64::ST1D_IMM: 4245 case AArch64::STNT1B_ZRI: 4246 case AArch64::STNT1H_ZRI: 4247 case AArch64::STNT1W_ZRI: 4248 case AArch64::STNT1D_ZRI: 4249 case AArch64::LDNF1B_IMM: 4250 case AArch64::LDNF1H_IMM: 4251 case AArch64::LDNF1W_IMM: 4252 case AArch64::LDNF1D_IMM: 4253 // A full vectors worth of data 4254 // Width = mbytes * elements 4255 Scale = TypeSize::getScalable(16); 4256 Width = TypeSize::getScalable(16); 4257 MinOffset = -8; 4258 MaxOffset = 7; 4259 break; 4260 case AArch64::LD2B_IMM: 4261 case AArch64::LD2H_IMM: 4262 case AArch64::LD2W_IMM: 4263 case AArch64::LD2D_IMM: 4264 case AArch64::ST2B_IMM: 4265 case AArch64::ST2H_IMM: 4266 case AArch64::ST2W_IMM: 4267 case AArch64::ST2D_IMM: 4268 Scale = TypeSize::getScalable(32); 4269 Width = TypeSize::getScalable(16 * 2); 4270 MinOffset = -8; 4271 MaxOffset = 7; 4272 break; 4273 case AArch64::LD3B_IMM: 4274 case AArch64::LD3H_IMM: 4275 case AArch64::LD3W_IMM: 4276 case AArch64::LD3D_IMM: 4277 case AArch64::ST3B_IMM: 4278 case AArch64::ST3H_IMM: 4279 case AArch64::ST3W_IMM: 4280 case AArch64::ST3D_IMM: 4281 Scale = TypeSize::getScalable(48); 4282 Width = TypeSize::getScalable(16 * 3); 4283 MinOffset = -8; 4284 MaxOffset = 7; 4285 break; 4286 case AArch64::LD4B_IMM: 4287 case AArch64::LD4H_IMM: 4288 case AArch64::LD4W_IMM: 4289 case AArch64::LD4D_IMM: 4290 case AArch64::ST4B_IMM: 4291 case AArch64::ST4H_IMM: 4292 case AArch64::ST4W_IMM: 4293 case AArch64::ST4D_IMM: 4294 Scale = TypeSize::getScalable(64); 4295 Width = TypeSize::getScalable(16 * 4); 4296 MinOffset = -8; 4297 MaxOffset = 7; 4298 break; 4299 case AArch64::LD1B_H_IMM: 4300 case 
AArch64::LD1SB_H_IMM: 4301 case AArch64::LD1H_S_IMM: 4302 case AArch64::LD1SH_S_IMM: 4303 case AArch64::LD1W_D_IMM: 4304 case AArch64::LD1SW_D_IMM: 4305 case AArch64::ST1B_H_IMM: 4306 case AArch64::ST1H_S_IMM: 4307 case AArch64::ST1W_D_IMM: 4308 case AArch64::LDNF1B_H_IMM: 4309 case AArch64::LDNF1SB_H_IMM: 4310 case AArch64::LDNF1H_S_IMM: 4311 case AArch64::LDNF1SH_S_IMM: 4312 case AArch64::LDNF1W_D_IMM: 4313 case AArch64::LDNF1SW_D_IMM: 4314 // A half vector worth of data 4315 // Width = mbytes * elements 4316 Scale = TypeSize::getScalable(8); 4317 Width = TypeSize::getScalable(8); 4318 MinOffset = -8; 4319 MaxOffset = 7; 4320 break; 4321 case AArch64::LD1B_S_IMM: 4322 case AArch64::LD1SB_S_IMM: 4323 case AArch64::LD1H_D_IMM: 4324 case AArch64::LD1SH_D_IMM: 4325 case AArch64::ST1B_S_IMM: 4326 case AArch64::ST1H_D_IMM: 4327 case AArch64::LDNF1B_S_IMM: 4328 case AArch64::LDNF1SB_S_IMM: 4329 case AArch64::LDNF1H_D_IMM: 4330 case AArch64::LDNF1SH_D_IMM: 4331 // A quarter vector worth of data 4332 // Width = mbytes * elements 4333 Scale = TypeSize::getScalable(4); 4334 Width = TypeSize::getScalable(4); 4335 MinOffset = -8; 4336 MaxOffset = 7; 4337 break; 4338 case AArch64::LD1B_D_IMM: 4339 case AArch64::LD1SB_D_IMM: 4340 case AArch64::ST1B_D_IMM: 4341 case AArch64::LDNF1B_D_IMM: 4342 case AArch64::LDNF1SB_D_IMM: 4343 // A eighth vector worth of data 4344 // Width = mbytes * elements 4345 Scale = TypeSize::getScalable(2); 4346 Width = TypeSize::getScalable(2); 4347 MinOffset = -8; 4348 MaxOffset = 7; 4349 break; 4350 case AArch64::ST2Gi: 4351 case AArch64::ST2GPreIndex: 4352 case AArch64::ST2GPostIndex: 4353 case AArch64::STZ2Gi: 4354 case AArch64::STZ2GPreIndex: 4355 case AArch64::STZ2GPostIndex: 4356 Scale = TypeSize::getFixed(16); 4357 Width = TypeSize::getFixed(32); 4358 MinOffset = -256; 4359 MaxOffset = 255; 4360 break; 4361 case AArch64::STGPi: 4362 case AArch64::STGPpost: 4363 case AArch64::STGPpre: 4364 Scale = TypeSize::getFixed(16); 4365 Width = TypeSize::getFixed(16); 4366 MinOffset = -64; 4367 MaxOffset = 63; 4368 break; 4369 case AArch64::LD1RB_IMM: 4370 case AArch64::LD1RB_H_IMM: 4371 case AArch64::LD1RB_S_IMM: 4372 case AArch64::LD1RB_D_IMM: 4373 case AArch64::LD1RSB_H_IMM: 4374 case AArch64::LD1RSB_S_IMM: 4375 case AArch64::LD1RSB_D_IMM: 4376 Scale = TypeSize::getFixed(1); 4377 Width = TypeSize::getFixed(1); 4378 MinOffset = 0; 4379 MaxOffset = 63; 4380 break; 4381 case AArch64::LD1RH_IMM: 4382 case AArch64::LD1RH_S_IMM: 4383 case AArch64::LD1RH_D_IMM: 4384 case AArch64::LD1RSH_S_IMM: 4385 case AArch64::LD1RSH_D_IMM: 4386 Scale = TypeSize::getFixed(2); 4387 Width = TypeSize::getFixed(2); 4388 MinOffset = 0; 4389 MaxOffset = 63; 4390 break; 4391 case AArch64::LD1RW_IMM: 4392 case AArch64::LD1RW_D_IMM: 4393 case AArch64::LD1RSW_IMM: 4394 Scale = TypeSize::getFixed(4); 4395 Width = TypeSize::getFixed(4); 4396 MinOffset = 0; 4397 MaxOffset = 63; 4398 break; 4399 case AArch64::LD1RD_IMM: 4400 Scale = TypeSize::getFixed(8); 4401 Width = TypeSize::getFixed(8); 4402 MinOffset = 0; 4403 MaxOffset = 63; 4404 break; 4405 } 4406 4407 return true; 4408 } 4409 4410 // Scaling factor for unscaled load or store. 
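// Covers both the scaled and the unscaled forms (e.g. LDRXui and LDURXi both
// return 8); scaleOffset below uses this to convert an unscaled byte offset
// into an element offset when checking whether two accesses can be paired.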
4411 int AArch64InstrInfo::getMemScale(unsigned Opc) { 4412 switch (Opc) { 4413 default: 4414 llvm_unreachable("Opcode has unknown scale!"); 4415 case AArch64::LDRBBui: 4416 case AArch64::LDURBBi: 4417 case AArch64::LDRSBWui: 4418 case AArch64::LDURSBWi: 4419 case AArch64::STRBBui: 4420 case AArch64::STURBBi: 4421 return 1; 4422 case AArch64::LDRHHui: 4423 case AArch64::LDURHHi: 4424 case AArch64::LDRSHWui: 4425 case AArch64::LDURSHWi: 4426 case AArch64::STRHHui: 4427 case AArch64::STURHHi: 4428 return 2; 4429 case AArch64::LDRSui: 4430 case AArch64::LDURSi: 4431 case AArch64::LDRSpre: 4432 case AArch64::LDRSWui: 4433 case AArch64::LDURSWi: 4434 case AArch64::LDRSWpre: 4435 case AArch64::LDRWpre: 4436 case AArch64::LDRWui: 4437 case AArch64::LDURWi: 4438 case AArch64::STRSui: 4439 case AArch64::STURSi: 4440 case AArch64::STRSpre: 4441 case AArch64::STRWui: 4442 case AArch64::STURWi: 4443 case AArch64::STRWpre: 4444 case AArch64::LDPSi: 4445 case AArch64::LDPSWi: 4446 case AArch64::LDPWi: 4447 case AArch64::STPSi: 4448 case AArch64::STPWi: 4449 return 4; 4450 case AArch64::LDRDui: 4451 case AArch64::LDURDi: 4452 case AArch64::LDRDpre: 4453 case AArch64::LDRXui: 4454 case AArch64::LDURXi: 4455 case AArch64::LDRXpre: 4456 case AArch64::STRDui: 4457 case AArch64::STURDi: 4458 case AArch64::STRDpre: 4459 case AArch64::STRXui: 4460 case AArch64::STURXi: 4461 case AArch64::STRXpre: 4462 case AArch64::LDPDi: 4463 case AArch64::LDPXi: 4464 case AArch64::STPDi: 4465 case AArch64::STPXi: 4466 return 8; 4467 case AArch64::LDRQui: 4468 case AArch64::LDURQi: 4469 case AArch64::STRQui: 4470 case AArch64::STURQi: 4471 case AArch64::STRQpre: 4472 case AArch64::LDPQi: 4473 case AArch64::LDRQpre: 4474 case AArch64::STPQi: 4475 case AArch64::STGi: 4476 case AArch64::STZGi: 4477 case AArch64::ST2Gi: 4478 case AArch64::STZ2Gi: 4479 case AArch64::STGPi: 4480 return 16; 4481 } 4482 } 4483 4484 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 4485 switch (MI.getOpcode()) { 4486 default: 4487 return false; 4488 case AArch64::LDRWpre: 4489 case AArch64::LDRXpre: 4490 case AArch64::LDRSWpre: 4491 case AArch64::LDRSpre: 4492 case AArch64::LDRDpre: 4493 case AArch64::LDRQpre: 4494 return true; 4495 } 4496 } 4497 4498 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 4499 switch (MI.getOpcode()) { 4500 default: 4501 return false; 4502 case AArch64::STRWpre: 4503 case AArch64::STRXpre: 4504 case AArch64::STRSpre: 4505 case AArch64::STRDpre: 4506 case AArch64::STRQpre: 4507 return true; 4508 } 4509 } 4510 4511 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 4512 return isPreLd(MI) || isPreSt(MI); 4513 } 4514 4515 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 4516 switch (MI.getOpcode()) { 4517 default: 4518 return false; 4519 case AArch64::LDPSi: 4520 case AArch64::LDPSWi: 4521 case AArch64::LDPDi: 4522 case AArch64::LDPQi: 4523 case AArch64::LDPWi: 4524 case AArch64::LDPXi: 4525 case AArch64::STPSi: 4526 case AArch64::STPDi: 4527 case AArch64::STPQi: 4528 case AArch64::STPWi: 4529 case AArch64::STPXi: 4530 case AArch64::STGPi: 4531 return true; 4532 } 4533 } 4534 4535 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 4536 assert(MI.mayLoadOrStore() && "Load or store instruction expected"); 4537 unsigned Idx = 4538 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 4539 : 1; 4540 return MI.getOperand(Idx); 4541 } 4542 4543 const MachineOperand & 4544 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 4545 assert(MI.mayLoadOrStore() && "Load or store instruction expected"); 4546 unsigned Idx = 4547 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 4548 : 2; 4549 return MI.getOperand(Idx); 4550 } 4551 4552 const MachineOperand & 4553 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { 4554 switch (MI.getOpcode()) { 4555 default: 4556 llvm_unreachable("Unexpected opcode"); 4557 case AArch64::LDRBroX: 4558 case AArch64::LDRBBroX: 4559 case AArch64::LDRSBXroX: 4560 case AArch64::LDRSBWroX: 4561 case AArch64::LDRHroX: 4562 case AArch64::LDRHHroX: 4563 case AArch64::LDRSHXroX: 4564 case AArch64::LDRSHWroX: 4565 case AArch64::LDRWroX: 4566 case AArch64::LDRSroX: 4567 case AArch64::LDRSWroX: 4568 case AArch64::LDRDroX: 4569 case AArch64::LDRXroX: 4570 case AArch64::LDRQroX: 4571 return MI.getOperand(4); 4572 } 4573 } 4574 4575 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 4576 Register Reg) { 4577 if (MI.getParent() == nullptr) 4578 return nullptr; 4579 const MachineFunction *MF = MI.getParent()->getParent(); 4580 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 4581 } 4582 4583 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 4584 auto IsHFPR = [&](const MachineOperand &Op) { 4585 if (!Op.isReg()) 4586 return false; 4587 auto Reg = Op.getReg(); 4588 if (Reg.isPhysical()) 4589 return AArch64::FPR16RegClass.contains(Reg); 4590 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4591 return TRC == &AArch64::FPR16RegClass || 4592 TRC == &AArch64::FPR16_loRegClass; 4593 }; 4594 return llvm::any_of(MI.operands(), IsHFPR); 4595 } 4596 4597 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 4598 auto IsQFPR = [&](const MachineOperand &Op) { 4599 if (!Op.isReg()) 4600 return false; 4601 auto Reg = Op.getReg(); 4602 if (Reg.isPhysical()) 4603 return AArch64::FPR128RegClass.contains(Reg); 4604 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4605 return TRC == &AArch64::FPR128RegClass || 4606 TRC == &AArch64::FPR128_loRegClass; 4607 }; 4608 return llvm::any_of(MI.operands(), IsQFPR); 4609 } 4610 4611 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) { 4612 switch (MI.getOpcode()) { 4613 case AArch64::BRK: 4614 case AArch64::HLT: 4615 case AArch64::PACIASP: 4616 case AArch64::PACIBSP: 4617 // Implicit BTI behavior. 4618 return true; 4619 case AArch64::PAUTH_PROLOGUE: 4620 // PAUTH_PROLOGUE expands to PACI(A|B)SP. 4621 return true; 4622 case AArch64::HINT: { 4623 unsigned Imm = MI.getOperand(0).getImm(); 4624 // Explicit BTI instruction. 4625 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 4626 return true; 4627 // PACI(A|B)SP instructions. 
4628 if (Imm == 25 || Imm == 27) 4629 return true; 4630 return false; 4631 } 4632 default: 4633 return false; 4634 } 4635 } 4636 4637 bool AArch64InstrInfo::isFpOrNEON(Register Reg) { 4638 if (Reg == 0) 4639 return false; 4640 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON"); 4641 return AArch64::FPR128RegClass.contains(Reg) || 4642 AArch64::FPR64RegClass.contains(Reg) || 4643 AArch64::FPR32RegClass.contains(Reg) || 4644 AArch64::FPR16RegClass.contains(Reg) || 4645 AArch64::FPR8RegClass.contains(Reg); 4646 } 4647 4648 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 4649 auto IsFPR = [&](const MachineOperand &Op) { 4650 if (!Op.isReg()) 4651 return false; 4652 auto Reg = Op.getReg(); 4653 if (Reg.isPhysical()) 4654 return isFpOrNEON(Reg); 4655 4656 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4657 return TRC == &AArch64::FPR128RegClass || 4658 TRC == &AArch64::FPR128_loRegClass || 4659 TRC == &AArch64::FPR64RegClass || 4660 TRC == &AArch64::FPR64_loRegClass || 4661 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 4662 TRC == &AArch64::FPR8RegClass; 4663 }; 4664 return llvm::any_of(MI.operands(), IsFPR); 4665 } 4666 4667 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 4668 // scaled. 4669 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 4670 int Scale = AArch64InstrInfo::getMemScale(Opc); 4671 4672 // If the byte-offset isn't a multiple of the stride, we can't scale this 4673 // offset. 4674 if (Offset % Scale != 0) 4675 return false; 4676 4677 // Convert the byte-offset used by unscaled into an "element" offset used 4678 // by the scaled pair load/store instructions. 4679 Offset /= Scale; 4680 return true; 4681 } 4682 4683 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 4684 if (FirstOpc == SecondOpc) 4685 return true; 4686 // We can also pair sign-ext and zero-ext instructions. 4687 switch (FirstOpc) { 4688 default: 4689 return false; 4690 case AArch64::STRSui: 4691 case AArch64::STURSi: 4692 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi; 4693 case AArch64::STRDui: 4694 case AArch64::STURDi: 4695 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi; 4696 case AArch64::STRQui: 4697 case AArch64::STURQi: 4698 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi; 4699 case AArch64::STRWui: 4700 case AArch64::STURWi: 4701 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi; 4702 case AArch64::STRXui: 4703 case AArch64::STURXi: 4704 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi; 4705 case AArch64::LDRSui: 4706 case AArch64::LDURSi: 4707 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi; 4708 case AArch64::LDRDui: 4709 case AArch64::LDURDi: 4710 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi; 4711 case AArch64::LDRQui: 4712 case AArch64::LDURQi: 4713 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi; 4714 case AArch64::LDRWui: 4715 case AArch64::LDURWi: 4716 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 4717 case AArch64::LDRSWui: 4718 case AArch64::LDURSWi: 4719 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 4720 case AArch64::LDRXui: 4721 case AArch64::LDURXi: 4722 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi; 4723 } 4724 // These instructions can't be paired based on their opcodes. 
4725 return false; 4726 } 4727 4728 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 4729 int64_t Offset1, unsigned Opcode1, int FI2, 4730 int64_t Offset2, unsigned Opcode2) { 4731 // Accesses through fixed stack object frame indices may access a different 4732 // fixed stack slot. Check that the object offsets + offsets match. 4733 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 4734 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 4735 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 4736 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 4737 // Convert to scaled object offsets. 4738 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 4739 if (ObjectOffset1 % Scale1 != 0) 4740 return false; 4741 ObjectOffset1 /= Scale1; 4742 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 4743 if (ObjectOffset2 % Scale2 != 0) 4744 return false; 4745 ObjectOffset2 /= Scale2; 4746 ObjectOffset1 += Offset1; 4747 ObjectOffset2 += Offset2; 4748 return ObjectOffset1 + 1 == ObjectOffset2; 4749 } 4750 4751 return FI1 == FI2; 4752 } 4753 4754 /// Detect opportunities for ldp/stp formation. 4755 /// 4756 /// Only called for LdSt for which getMemOperandWithOffset returns true. 4757 bool AArch64InstrInfo::shouldClusterMemOps( 4758 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1, 4759 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, 4760 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize, 4761 unsigned NumBytes) const { 4762 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 4763 const MachineOperand &BaseOp1 = *BaseOps1.front(); 4764 const MachineOperand &BaseOp2 = *BaseOps2.front(); 4765 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 4766 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 4767 if (BaseOp1.getType() != BaseOp2.getType()) 4768 return false; 4769 4770 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 4771 "Only base registers and frame indices are supported."); 4772 4773 // Check for both base regs and base FI. 4774 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 4775 return false; 4776 4777 // Only cluster up to a single pair. 4778 if (ClusterSize > 2) 4779 return false; 4780 4781 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 4782 return false; 4783 4784 // Can we pair these instructions based on their opcodes? 4785 unsigned FirstOpc = FirstLdSt.getOpcode(); 4786 unsigned SecondOpc = SecondLdSt.getOpcode(); 4787 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 4788 return false; 4789 4790 // Can't merge volatiles or load/stores that have a hint to avoid pair 4791 // formation, for example. 4792 if (!isCandidateToMergeOrPair(FirstLdSt) || 4793 !isCandidateToMergeOrPair(SecondLdSt)) 4794 return false; 4795 4796 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 4797 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 4798 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 4799 return false; 4800 4801 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 4802 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 4803 return false; 4804 4805 // Pairwise instructions have a 7-bit signed offset field. 4806 if (Offset1 > 63 || Offset1 < -64) 4807 return false; 4808 4809 // The caller should already have ordered First/SecondLdSt by offset. 
4810 // Note: except for non-equal frame index bases 4811 if (BaseOp1.isFI()) { 4812 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 4813 "Caller should have ordered offsets."); 4814 4815 const MachineFrameInfo &MFI = 4816 FirstLdSt.getParent()->getParent()->getFrameInfo(); 4817 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 4818 BaseOp2.getIndex(), Offset2, SecondOpc); 4819 } 4820 4821 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 4822 4823 return Offset1 + 1 == Offset2; 4824 } 4825 4826 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 4827 MCRegister Reg, unsigned SubIdx, 4828 unsigned State, 4829 const TargetRegisterInfo *TRI) { 4830 if (!SubIdx) 4831 return MIB.addReg(Reg, State); 4832 4833 if (Reg.isPhysical()) 4834 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 4835 return MIB.addReg(Reg, State, SubIdx); 4836 } 4837 4838 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 4839 unsigned NumRegs) { 4840 // We really want the positive remainder mod 32 here, that happens to be 4841 // easily obtainable with a mask. 4842 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 4843 } 4844 4845 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 4846 MachineBasicBlock::iterator I, 4847 const DebugLoc &DL, MCRegister DestReg, 4848 MCRegister SrcReg, bool KillSrc, 4849 unsigned Opcode, 4850 ArrayRef<unsigned> Indices) const { 4851 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 4852 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4853 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4854 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4855 unsigned NumRegs = Indices.size(); 4856 4857 int SubReg = 0, End = NumRegs, Incr = 1; 4858 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 4859 SubReg = NumRegs - 1; 4860 End = -1; 4861 Incr = -1; 4862 } 4863 4864 for (; SubReg != End; SubReg += Incr) { 4865 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4866 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4867 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 4868 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4869 } 4870 } 4871 4872 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 4873 MachineBasicBlock::iterator I, 4874 const DebugLoc &DL, MCRegister DestReg, 4875 MCRegister SrcReg, bool KillSrc, 4876 unsigned Opcode, unsigned ZeroReg, 4877 llvm::ArrayRef<unsigned> Indices) const { 4878 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4879 unsigned NumRegs = Indices.size(); 4880 4881 #ifndef NDEBUG 4882 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4883 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4884 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 4885 "GPR reg sequences should not be able to overlap"); 4886 #endif 4887 4888 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 4889 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4890 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4891 MIB.addReg(ZeroReg); 4892 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4893 MIB.addImm(0); 4894 } 4895 } 4896 4897 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 4898 MachineBasicBlock::iterator I, 4899 const DebugLoc &DL, MCRegister DestReg, 4900 MCRegister SrcReg, bool KillSrc, 4901 bool RenamableDest, 4902 bool RenamableSrc) const { 4903 if 
(AArch64::GPR32spRegClass.contains(DestReg) && 4904 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 4905 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4906 4907 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 4908 // If either operand is WSP, expand to ADD #0. 4909 if (Subtarget.hasZeroCycleRegMove()) { 4910 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 4911 MCRegister DestRegX = TRI->getMatchingSuperReg( 4912 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4913 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4914 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4915 // This instruction is reading and writing X registers. This may upset 4916 // the register scavenger and machine verifier, so we need to indicate 4917 // that we are reading an undefined value from SrcRegX, but a proper 4918 // value from SrcReg. 4919 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 4920 .addReg(SrcRegX, RegState::Undef) 4921 .addImm(0) 4922 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 4923 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4924 } else { 4925 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 4926 .addReg(SrcReg, getKillRegState(KillSrc)) 4927 .addImm(0) 4928 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4929 } 4930 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 4931 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 4932 .addImm(0) 4933 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4934 } else { 4935 if (Subtarget.hasZeroCycleRegMove()) { 4936 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 4937 MCRegister DestRegX = TRI->getMatchingSuperReg( 4938 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4939 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4940 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4941 // This instruction is reading and writing X registers. This may upset 4942 // the register scavenger and machine verifier, so we need to indicate 4943 // that we are reading an undefined value from SrcRegX, but a proper 4944 // value from SrcReg. 4945 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 4946 .addReg(AArch64::XZR) 4947 .addReg(SrcRegX, RegState::Undef) 4948 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4949 } else { 4950 // Otherwise, expand to ORR WZR. 4951 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 4952 .addReg(AArch64::WZR) 4953 .addReg(SrcReg, getKillRegState(KillSrc)); 4954 } 4955 } 4956 return; 4957 } 4958 4959 // Copy a Predicate register by ORRing with itself. 4960 if (AArch64::PPRRegClass.contains(DestReg) && 4961 AArch64::PPRRegClass.contains(SrcReg)) { 4962 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4963 "Unexpected SVE register."); 4964 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 4965 .addReg(SrcReg) // Pg 4966 .addReg(SrcReg) 4967 .addReg(SrcReg, getKillRegState(KillSrc)); 4968 return; 4969 } 4970 4971 // Copy a predicate-as-counter register by ORRing with itself as if it 4972 // were a regular predicate (mask) register. 4973 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); 4974 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); 4975 if (DestIsPNR || SrcIsPNR) { 4976 auto ToPPR = [](MCRegister R) -> MCRegister { 4977 return (R - AArch64::PN0) + AArch64::P0; 4978 }; 4979 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg; 4980 MCRegister PPRDestReg = DestIsPNR ? 
ToPPR(DestReg) : DestReg; 4981 4982 if (PPRSrcReg != PPRDestReg) { 4983 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) 4984 .addReg(PPRSrcReg) // Pg 4985 .addReg(PPRSrcReg) 4986 .addReg(PPRSrcReg, getKillRegState(KillSrc)); 4987 if (DestIsPNR) 4988 NewMI.addDef(DestReg, RegState::Implicit); 4989 } 4990 return; 4991 } 4992 4993 // Copy a Z register by ORRing with itself. 4994 if (AArch64::ZPRRegClass.contains(DestReg) && 4995 AArch64::ZPRRegClass.contains(SrcReg)) { 4996 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4997 "Unexpected SVE register."); 4998 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 4999 .addReg(SrcReg) 5000 .addReg(SrcReg, getKillRegState(KillSrc)); 5001 return; 5002 } 5003 5004 // Copy a Z register pair by copying the individual sub-registers. 5005 if ((AArch64::ZPR2RegClass.contains(DestReg) || 5006 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && 5007 (AArch64::ZPR2RegClass.contains(SrcReg) || 5008 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { 5009 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5010 "Unexpected SVE register."); 5011 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 5012 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 5013 Indices); 5014 return; 5015 } 5016 5017 // Copy a Z register triple by copying the individual sub-registers. 5018 if (AArch64::ZPR3RegClass.contains(DestReg) && 5019 AArch64::ZPR3RegClass.contains(SrcReg)) { 5020 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5021 "Unexpected SVE register."); 5022 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 5023 AArch64::zsub2}; 5024 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 5025 Indices); 5026 return; 5027 } 5028 5029 // Copy a Z register quad by copying the individual sub-registers. 5030 if ((AArch64::ZPR4RegClass.contains(DestReg) || 5031 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && 5032 (AArch64::ZPR4RegClass.contains(SrcReg) || 5033 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { 5034 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5035 "Unexpected SVE register."); 5036 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 5037 AArch64::zsub2, AArch64::zsub3}; 5038 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 5039 Indices); 5040 return; 5041 } 5042 5043 if (AArch64::GPR64spRegClass.contains(DestReg) && 5044 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 5045 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 5046 // If either operand is SP, expand to ADD #0. 5047 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 5048 .addReg(SrcReg, getKillRegState(KillSrc)) 5049 .addImm(0) 5050 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 5051 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 5052 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 5053 .addImm(0) 5054 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 5055 } else { 5056 // Otherwise, expand to ORR XZR. 5057 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 5058 .addReg(AArch64::XZR) 5059 .addReg(SrcReg, getKillRegState(KillSrc)); 5060 } 5061 return; 5062 } 5063 5064 // Copy a DDDD register quad by copying the individual sub-registers. 
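// copyPhysRegTuple emits one ORRv8i8 move per D sub-register and chooses the
// copy direction so that an overlapping source tuple is not clobbered (see
// forwardCopyWillClobberTuple above).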
5065 if (AArch64::DDDDRegClass.contains(DestReg) && 5066 AArch64::DDDDRegClass.contains(SrcReg)) { 5067 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 5068 AArch64::dsub2, AArch64::dsub3}; 5069 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 5070 Indices); 5071 return; 5072 } 5073 5074 // Copy a DDD register triple by copying the individual sub-registers. 5075 if (AArch64::DDDRegClass.contains(DestReg) && 5076 AArch64::DDDRegClass.contains(SrcReg)) { 5077 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 5078 AArch64::dsub2}; 5079 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 5080 Indices); 5081 return; 5082 } 5083 5084 // Copy a DD register pair by copying the individual sub-registers. 5085 if (AArch64::DDRegClass.contains(DestReg) && 5086 AArch64::DDRegClass.contains(SrcReg)) { 5087 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 5088 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 5089 Indices); 5090 return; 5091 } 5092 5093 // Copy a QQQQ register quad by copying the individual sub-registers. 5094 if (AArch64::QQQQRegClass.contains(DestReg) && 5095 AArch64::QQQQRegClass.contains(SrcReg)) { 5096 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 5097 AArch64::qsub2, AArch64::qsub3}; 5098 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 5099 Indices); 5100 return; 5101 } 5102 5103 // Copy a QQQ register triple by copying the individual sub-registers. 5104 if (AArch64::QQQRegClass.contains(DestReg) && 5105 AArch64::QQQRegClass.contains(SrcReg)) { 5106 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 5107 AArch64::qsub2}; 5108 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 5109 Indices); 5110 return; 5111 } 5112 5113 // Copy a QQ register pair by copying the individual sub-registers. 
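// The Q-register tuples use ORRv16i8 (a full 128-bit move per sub-register),
// whereas the D-register tuples above use ORRv8i8.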
5114 if (AArch64::QQRegClass.contains(DestReg) && 5115 AArch64::QQRegClass.contains(SrcReg)) { 5116 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 5117 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 5118 Indices); 5119 return; 5120 } 5121 5122 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 5123 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 5124 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 5125 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 5126 AArch64::XZR, Indices); 5127 return; 5128 } 5129 5130 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 5131 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 5132 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 5133 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 5134 AArch64::WZR, Indices); 5135 return; 5136 } 5137 5138 if (AArch64::FPR128RegClass.contains(DestReg) && 5139 AArch64::FPR128RegClass.contains(SrcReg)) { 5140 if (Subtarget.isSVEorStreamingSVEAvailable() && 5141 !Subtarget.isNeonAvailable()) 5142 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 5143 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 5144 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 5145 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 5146 else if (Subtarget.isNeonAvailable()) 5147 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 5148 .addReg(SrcReg) 5149 .addReg(SrcReg, getKillRegState(KillSrc)); 5150 else { 5151 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 5152 .addReg(AArch64::SP, RegState::Define) 5153 .addReg(SrcReg, getKillRegState(KillSrc)) 5154 .addReg(AArch64::SP) 5155 .addImm(-16); 5156 BuildMI(MBB, I, DL, get(AArch64::LDRQpost)) 5157 .addReg(AArch64::SP, RegState::Define) 5158 .addReg(DestReg, RegState::Define) 5159 .addReg(AArch64::SP) 5160 .addImm(16); 5161 } 5162 return; 5163 } 5164 5165 if (AArch64::FPR64RegClass.contains(DestReg) && 5166 AArch64::FPR64RegClass.contains(SrcReg)) { 5167 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 5168 .addReg(SrcReg, getKillRegState(KillSrc)); 5169 return; 5170 } 5171 5172 if (AArch64::FPR32RegClass.contains(DestReg) && 5173 AArch64::FPR32RegClass.contains(SrcReg)) { 5174 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 5175 .addReg(SrcReg, getKillRegState(KillSrc)); 5176 return; 5177 } 5178 5179 if (AArch64::FPR16RegClass.contains(DestReg) && 5180 AArch64::FPR16RegClass.contains(SrcReg)) { 5181 DestReg = 5182 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 5183 SrcReg = 5184 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 5185 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 5186 .addReg(SrcReg, getKillRegState(KillSrc)); 5187 return; 5188 } 5189 5190 if (AArch64::FPR8RegClass.contains(DestReg) && 5191 AArch64::FPR8RegClass.contains(SrcReg)) { 5192 DestReg = 5193 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 5194 SrcReg = 5195 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 5196 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 5197 .addReg(SrcReg, getKillRegState(KillSrc)); 5198 return; 5199 } 5200 5201 // Copies between GPR64 and FPR64. 
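// These are bit-pattern moves: FMOVXDr (X to D) and FMOVDXr (D to X) transfer
// the raw 64-bit value between the general-purpose and FP/SIMD register files
// without any numeric conversion.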
5202 if (AArch64::FPR64RegClass.contains(DestReg) && 5203 AArch64::GPR64RegClass.contains(SrcReg)) { 5204 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 5205 .addReg(SrcReg, getKillRegState(KillSrc)); 5206 return; 5207 } 5208 if (AArch64::GPR64RegClass.contains(DestReg) && 5209 AArch64::FPR64RegClass.contains(SrcReg)) { 5210 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 5211 .addReg(SrcReg, getKillRegState(KillSrc)); 5212 return; 5213 } 5214 // Copies between GPR32 and FPR32. 5215 if (AArch64::FPR32RegClass.contains(DestReg) && 5216 AArch64::GPR32RegClass.contains(SrcReg)) { 5217 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 5218 .addReg(SrcReg, getKillRegState(KillSrc)); 5219 return; 5220 } 5221 if (AArch64::GPR32RegClass.contains(DestReg) && 5222 AArch64::FPR32RegClass.contains(SrcReg)) { 5223 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 5224 .addReg(SrcReg, getKillRegState(KillSrc)); 5225 return; 5226 } 5227 5228 if (DestReg == AArch64::NZCV) { 5229 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 5230 BuildMI(MBB, I, DL, get(AArch64::MSR)) 5231 .addImm(AArch64SysReg::NZCV) 5232 .addReg(SrcReg, getKillRegState(KillSrc)) 5233 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 5234 return; 5235 } 5236 5237 if (SrcReg == AArch64::NZCV) { 5238 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 5239 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 5240 .addImm(AArch64SysReg::NZCV) 5241 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 5242 return; 5243 } 5244 5245 #ifndef NDEBUG 5246 const TargetRegisterInfo &TRI = getRegisterInfo(); 5247 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 5248 << TRI.getRegAsmName(SrcReg) << "\n"; 5249 #endif 5250 llvm_unreachable("unimplemented reg-to-reg copy"); 5251 } 5252 5253 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 5254 MachineBasicBlock &MBB, 5255 MachineBasicBlock::iterator InsertBefore, 5256 const MCInstrDesc &MCID, 5257 Register SrcReg, bool IsKill, 5258 unsigned SubIdx0, unsigned SubIdx1, int FI, 5259 MachineMemOperand *MMO) { 5260 Register SrcReg0 = SrcReg; 5261 Register SrcReg1 = SrcReg; 5262 if (SrcReg.isPhysical()) { 5263 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 5264 SubIdx0 = 0; 5265 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 5266 SubIdx1 = 0; 5267 } 5268 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 5269 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 5270 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 5271 .addFrameIndex(FI) 5272 .addImm(0) 5273 .addMemOperand(MMO); 5274 } 5275 5276 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 5277 MachineBasicBlock::iterator MBBI, 5278 Register SrcReg, bool isKill, int FI, 5279 const TargetRegisterClass *RC, 5280 const TargetRegisterInfo *TRI, 5281 Register VReg, 5282 MachineInstr::MIFlag Flags) const { 5283 MachineFunction &MF = *MBB.getParent(); 5284 MachineFrameInfo &MFI = MF.getFrameInfo(); 5285 5286 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 5287 MachineMemOperand *MMO = 5288 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 5289 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 5290 unsigned Opc = 0; 5291 bool Offset = true; 5292 MCRegister PNRReg = MCRegister::NoRegister; 5293 unsigned StackID = TargetStackID::Default; 5294 switch (TRI->getSpillSize(*RC)) { 5295 case 1: 5296 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 5297 Opc = AArch64::STRBui; 5298 break; 5299 case 2: { 5300 if 
(AArch64::FPR16RegClass.hasSubClassEq(RC)) 5301 Opc = AArch64::STRHui; 5302 else if (AArch64::PNRRegClass.hasSubClassEq(RC) || 5303 AArch64::PPRRegClass.hasSubClassEq(RC)) { 5304 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5305 "Unexpected register store without SVE store instructions"); 5306 Opc = AArch64::STR_PXI; 5307 StackID = TargetStackID::ScalableVector; 5308 } 5309 break; 5310 } 5311 case 4: 5312 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 5313 Opc = AArch64::STRWui; 5314 if (SrcReg.isVirtual()) 5315 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 5316 else 5317 assert(SrcReg != AArch64::WSP); 5318 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 5319 Opc = AArch64::STRSui; 5320 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 5321 Opc = AArch64::STR_PPXI; 5322 StackID = TargetStackID::ScalableVector; 5323 } 5324 break; 5325 case 8: 5326 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 5327 Opc = AArch64::STRXui; 5328 if (SrcReg.isVirtual()) 5329 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 5330 else 5331 assert(SrcReg != AArch64::SP); 5332 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 5333 Opc = AArch64::STRDui; 5334 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 5335 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 5336 get(AArch64::STPWi), SrcReg, isKill, 5337 AArch64::sube32, AArch64::subo32, FI, MMO); 5338 return; 5339 } 5340 break; 5341 case 16: 5342 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 5343 Opc = AArch64::STRQui; 5344 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 5345 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5346 Opc = AArch64::ST1Twov1d; 5347 Offset = false; 5348 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 5349 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 5350 get(AArch64::STPXi), SrcReg, isKill, 5351 AArch64::sube64, AArch64::subo64, FI, MMO); 5352 return; 5353 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 5354 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5355 "Unexpected register store without SVE store instructions"); 5356 Opc = AArch64::STR_ZXI; 5357 StackID = TargetStackID::ScalableVector; 5358 } 5359 break; 5360 case 24: 5361 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 5362 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5363 Opc = AArch64::ST1Threev1d; 5364 Offset = false; 5365 } 5366 break; 5367 case 32: 5368 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 5369 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5370 Opc = AArch64::ST1Fourv1d; 5371 Offset = false; 5372 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 5373 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5374 Opc = AArch64::ST1Twov2d; 5375 Offset = false; 5376 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 5377 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5378 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5379 "Unexpected register store without SVE store instructions"); 5380 Opc = AArch64::STR_ZZXI; 5381 StackID = TargetStackID::ScalableVector; 5382 } 5383 break; 5384 case 48: 5385 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 5386 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5387 Opc = AArch64::ST1Threev2d; 5388 Offset = false; 5389 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 5390 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5391 "Unexpected register store without SVE store 
instructions"); 5392 Opc = AArch64::STR_ZZZXI; 5393 StackID = TargetStackID::ScalableVector; 5394 } 5395 break; 5396 case 64: 5397 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 5398 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 5399 Opc = AArch64::ST1Fourv2d; 5400 Offset = false; 5401 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 5402 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5403 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5404 "Unexpected register store without SVE store instructions"); 5405 Opc = AArch64::STR_ZZZZXI; 5406 StackID = TargetStackID::ScalableVector; 5407 } 5408 break; 5409 } 5410 assert(Opc && "Unknown register class"); 5411 MFI.setStackID(FI, StackID); 5412 5413 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 5414 .addReg(SrcReg, getKillRegState(isKill)) 5415 .addFrameIndex(FI); 5416 5417 if (Offset) 5418 MI.addImm(0); 5419 if (PNRReg.isValid()) 5420 MI.addDef(PNRReg, RegState::Implicit); 5421 MI.addMemOperand(MMO); 5422 } 5423 5424 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 5425 MachineBasicBlock &MBB, 5426 MachineBasicBlock::iterator InsertBefore, 5427 const MCInstrDesc &MCID, 5428 Register DestReg, unsigned SubIdx0, 5429 unsigned SubIdx1, int FI, 5430 MachineMemOperand *MMO) { 5431 Register DestReg0 = DestReg; 5432 Register DestReg1 = DestReg; 5433 bool IsUndef = true; 5434 if (DestReg.isPhysical()) { 5435 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 5436 SubIdx0 = 0; 5437 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 5438 SubIdx1 = 0; 5439 IsUndef = false; 5440 } 5441 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 5442 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 5443 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 5444 .addFrameIndex(FI) 5445 .addImm(0) 5446 .addMemOperand(MMO); 5447 } 5448 5449 void AArch64InstrInfo::loadRegFromStackSlot( 5450 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 5451 int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, 5452 Register VReg, MachineInstr::MIFlag Flags) const { 5453 MachineFunction &MF = *MBB.getParent(); 5454 MachineFrameInfo &MFI = MF.getFrameInfo(); 5455 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 5456 MachineMemOperand *MMO = 5457 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 5458 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 5459 5460 unsigned Opc = 0; 5461 bool Offset = true; 5462 unsigned StackID = TargetStackID::Default; 5463 Register PNRReg = MCRegister::NoRegister; 5464 switch (TRI->getSpillSize(*RC)) { 5465 case 1: 5466 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 5467 Opc = AArch64::LDRBui; 5468 break; 5469 case 2: { 5470 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC); 5471 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 5472 Opc = AArch64::LDRHui; 5473 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { 5474 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5475 "Unexpected register load without SVE load instructions"); 5476 if (IsPNR) 5477 PNRReg = DestReg; 5478 Opc = AArch64::LDR_PXI; 5479 StackID = TargetStackID::ScalableVector; 5480 } 5481 break; 5482 } 5483 case 4: 5484 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 5485 Opc = AArch64::LDRWui; 5486 if (DestReg.isVirtual()) 5487 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 5488 else 5489 assert(DestReg != AArch64::WSP); 5490 } else if 
(AArch64::FPR32RegClass.hasSubClassEq(RC)) 5491 Opc = AArch64::LDRSui; 5492 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 5493 Opc = AArch64::LDR_PPXI; 5494 StackID = TargetStackID::ScalableVector; 5495 } 5496 break; 5497 case 8: 5498 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 5499 Opc = AArch64::LDRXui; 5500 if (DestReg.isVirtual()) 5501 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 5502 else 5503 assert(DestReg != AArch64::SP); 5504 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 5505 Opc = AArch64::LDRDui; 5506 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 5507 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5508 get(AArch64::LDPWi), DestReg, AArch64::sube32, 5509 AArch64::subo32, FI, MMO); 5510 return; 5511 } 5512 break; 5513 case 16: 5514 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 5515 Opc = AArch64::LDRQui; 5516 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 5517 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5518 Opc = AArch64::LD1Twov1d; 5519 Offset = false; 5520 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 5521 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5522 get(AArch64::LDPXi), DestReg, AArch64::sube64, 5523 AArch64::subo64, FI, MMO); 5524 return; 5525 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 5526 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5527 "Unexpected register load without SVE load instructions"); 5528 Opc = AArch64::LDR_ZXI; 5529 StackID = TargetStackID::ScalableVector; 5530 } 5531 break; 5532 case 24: 5533 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 5534 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5535 Opc = AArch64::LD1Threev1d; 5536 Offset = false; 5537 } 5538 break; 5539 case 32: 5540 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 5541 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5542 Opc = AArch64::LD1Fourv1d; 5543 Offset = false; 5544 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 5545 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5546 Opc = AArch64::LD1Twov2d; 5547 Offset = false; 5548 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 5549 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5550 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5551 "Unexpected register load without SVE load instructions"); 5552 Opc = AArch64::LDR_ZZXI; 5553 StackID = TargetStackID::ScalableVector; 5554 } 5555 break; 5556 case 48: 5557 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 5558 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5559 Opc = AArch64::LD1Threev2d; 5560 Offset = false; 5561 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 5562 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5563 "Unexpected register load without SVE load instructions"); 5564 Opc = AArch64::LDR_ZZZXI; 5565 StackID = TargetStackID::ScalableVector; 5566 } 5567 break; 5568 case 64: 5569 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 5570 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5571 Opc = AArch64::LD1Fourv2d; 5572 Offset = false; 5573 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 5574 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5575 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5576 "Unexpected register load without SVE load instructions"); 5577 Opc = AArch64::LDR_ZZZZXI; 5578 StackID = TargetStackID::ScalableVector; 5579 } 5580 break; 5581 } 5582 5583 
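// At this point the switch above has chosen, purely from the spill size and
// register class, the load opcode (Opc), whether it takes an immediate
// offset (Offset), and whether the slot must live on the scalable-vector
// stack (StackID). For example, an 8-byte GPR64 reload selects LDRXui on
// the default stack, while a 16-byte ZPR reload selects LDR_ZXI with a
// ScalableVector stack ID.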
assert(Opc && "Unknown register class"); 5584 MFI.setStackID(FI, StackID); 5585 5586 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 5587 .addReg(DestReg, getDefRegState(true)) 5588 .addFrameIndex(FI); 5589 if (Offset) 5590 MI.addImm(0); 5591 if (PNRReg.isValid() && !PNRReg.isVirtual()) 5592 MI.addDef(PNRReg, RegState::Implicit); 5593 MI.addMemOperand(MMO); 5594 } 5595 5596 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 5597 const MachineInstr &UseMI, 5598 const TargetRegisterInfo *TRI) { 5599 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 5600 UseMI.getIterator()), 5601 [TRI](const MachineInstr &I) { 5602 return I.modifiesRegister(AArch64::NZCV, TRI) || 5603 I.readsRegister(AArch64::NZCV, TRI); 5604 }); 5605 } 5606 5607 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5608 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 5609 // The smallest scalable element supported by scaled SVE addressing 5610 // modes is a predicate, which is 2 scalable bytes in size. So the scalable 5611 // byte offset must always be a multiple of 2. 5612 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5613 5614 // VGSized offsets are divided by '2', because the VG register is 5615 // the number of 64bit granules as opposed to 128bit vector chunks, 5616 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 5617 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 5618 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 5619 ByteSized = Offset.getFixed(); 5620 VGSized = Offset.getScalable() / 2; 5621 } 5622 5623 /// Returns the offset in parts to which this frame offset can be 5624 /// decomposed for the purpose of describing a frame offset. 5625 /// For non-scalable offsets this is simply its byte size. 5626 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5627 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 5628 int64_t &NumDataVectors) { 5629 // The smallest scalable element supported by scaled SVE addressing 5630 // modes is a predicate, which is 2 scalable bytes in size. So the scalable 5631 // byte offset must always be a multiple of 2. 5632 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5633 5634 NumBytes = Offset.getFixed(); 5635 NumDataVectors = 0; 5636 NumPredicateVectors = Offset.getScalable() / 2; 5637 // This method is used to get the offsets to adjust the frame offset. 5638 // If the function requires ADDPL to be used and needs more than two ADDPL 5639 // instructions, part of the offset is folded into NumDataVectors so that it 5640 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 5641 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 5642 NumPredicateVectors > 62) { 5643 NumDataVectors = NumPredicateVectors / 8; 5644 NumPredicateVectors -= NumDataVectors * 8; 5645 } 5646 } 5647 5648 // Convenience function to create a DWARF expression for 5649 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 5650 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 5651 int NumVGScaledBytes, unsigned VG, 5652 llvm::raw_string_ostream &Comment) { 5653 uint8_t buffer[16]; 5654 5655 if (NumBytes) { 5656 Expr.push_back(dwarf::DW_OP_consts); 5657 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 5658 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5659 Comment << (NumBytes < 0 ?
" - " : " + ") << std::abs(NumBytes); 5660 } 5661 5662 if (NumVGScaledBytes) { 5663 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 5664 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 5665 5666 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 5667 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 5668 Expr.push_back(0); 5669 5670 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 5671 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5672 5673 Comment << (NumVGScaledBytes < 0 ? " - " : " + ") 5674 << std::abs(NumVGScaledBytes) << " * VG"; 5675 } 5676 } 5677 5678 // Creates an MCCFIInstruction: 5679 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 5680 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 5681 unsigned Reg, 5682 const StackOffset &Offset) { 5683 int64_t NumBytes, NumVGScaledBytes; 5684 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 5685 NumVGScaledBytes); 5686 std::string CommentBuffer; 5687 llvm::raw_string_ostream Comment(CommentBuffer); 5688 5689 if (Reg == AArch64::SP) 5690 Comment << "sp"; 5691 else if (Reg == AArch64::FP) 5692 Comment << "fp"; 5693 else 5694 Comment << printReg(Reg, &TRI); 5695 5696 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 5697 SmallString<64> Expr; 5698 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5699 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 5700 Expr.push_back(0); 5701 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 5702 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5703 5704 // Wrap this into DW_CFA_def_cfa. 5705 SmallString<64> DefCfaExpr; 5706 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 5707 uint8_t buffer[16]; 5708 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 5709 DefCfaExpr.append(Expr.str()); 5710 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 5711 Comment.str()); 5712 } 5713 5714 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 5715 unsigned FrameReg, unsigned Reg, 5716 const StackOffset &Offset, 5717 bool LastAdjustmentWasScalable) { 5718 if (Offset.getScalable()) 5719 return createDefCFAExpression(TRI, Reg, Offset); 5720 5721 if (FrameReg == Reg && !LastAdjustmentWasScalable) 5722 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 5723 5724 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5725 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 5726 } 5727 5728 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 5729 unsigned Reg, 5730 const StackOffset &OffsetFromDefCFA) { 5731 int64_t NumBytes, NumVGScaledBytes; 5732 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5733 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 5734 5735 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5736 5737 // Non-scalable offsets can use DW_CFA_offset directly. 
5738 if (!NumVGScaledBytes) 5739 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 5740 5741 std::string CommentBuffer; 5742 llvm::raw_string_ostream Comment(CommentBuffer); 5743 Comment << printReg(Reg, &TRI) << " @ cfa"; 5744 5745 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 5746 SmallString<64> OffsetExpr; 5747 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 5748 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5749 5750 // Wrap this into DW_CFA_expression 5751 SmallString<64> CfaExpr; 5752 CfaExpr.push_back(dwarf::DW_CFA_expression); 5753 uint8_t buffer[16]; 5754 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 5755 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 5756 CfaExpr.append(OffsetExpr.str()); 5757 5758 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 5759 Comment.str()); 5760 } 5761 5762 // Helper function to emit a frame offset adjustment from a given 5763 // pointer (SrcReg), stored into DestReg. This function is explicit 5764 // in that it requires the opcode. 5765 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 5766 MachineBasicBlock::iterator MBBI, 5767 const DebugLoc &DL, unsigned DestReg, 5768 unsigned SrcReg, int64_t Offset, unsigned Opc, 5769 const TargetInstrInfo *TII, 5770 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 5771 bool *HasWinCFI, bool EmitCFAOffset, 5772 StackOffset CFAOffset, unsigned FrameReg) { 5773 int Sign = 1; 5774 unsigned MaxEncoding, ShiftSize; 5775 switch (Opc) { 5776 case AArch64::ADDXri: 5777 case AArch64::ADDSXri: 5778 case AArch64::SUBXri: 5779 case AArch64::SUBSXri: 5780 MaxEncoding = 0xfff; 5781 ShiftSize = 12; 5782 break; 5783 case AArch64::ADDVL_XXI: 5784 case AArch64::ADDPL_XXI: 5785 case AArch64::ADDSVL_XXI: 5786 case AArch64::ADDSPL_XXI: 5787 MaxEncoding = 31; 5788 ShiftSize = 0; 5789 if (Offset < 0) { 5790 MaxEncoding = 32; 5791 Sign = -1; 5792 Offset = -Offset; 5793 } 5794 break; 5795 default: 5796 llvm_unreachable("Unsupported opcode"); 5797 } 5798 5799 // `Offset` can be in bytes or in "scalable bytes". 5800 int VScale = 1; 5801 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 5802 VScale = 16; 5803 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 5804 VScale = 2; 5805 5806 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 5807 // scratch register. If DestReg is a virtual register, use it as the 5808 // scratch register; otherwise, create a new virtual register (to be 5809 // replaced by the scavenger at the end of PEI). That case can be optimized 5810 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 5811 // register can be loaded with offset%8 and the add/sub can use an extending 5812 // instruction with LSL#3. 5813 // Currently the function handles any offsets but generates a poor sequence 5814 // of code. 
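// Illustrative example of the chunked encoding below: the ADD/SUB immediate
// field holds 12 bits, optionally shifted left by 12, so adding 0x456789
// bytes to SP is split into two instructions:
//   add sp, sp, #0x456, lsl #12
//   add sp, sp, #0x789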
5815 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 5816 5817 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 5818 Register TmpReg = DestReg; 5819 if (TmpReg == AArch64::XZR) 5820 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 5821 &AArch64::GPR64RegClass); 5822 do { 5823 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 5824 unsigned LocalShiftSize = 0; 5825 if (ThisVal > MaxEncoding) { 5826 ThisVal = ThisVal >> ShiftSize; 5827 LocalShiftSize = ShiftSize; 5828 } 5829 assert((ThisVal >> ShiftSize) <= MaxEncoding && 5830 "Encoding cannot handle value that big"); 5831 5832 Offset -= ThisVal << LocalShiftSize; 5833 if (Offset == 0) 5834 TmpReg = DestReg; 5835 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 5836 .addReg(SrcReg) 5837 .addImm(Sign * (int)ThisVal); 5838 if (ShiftSize) 5839 MBI = MBI.addImm( 5840 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 5841 MBI = MBI.setMIFlag(Flag); 5842 5843 auto Change = 5844 VScale == 1 5845 ? StackOffset::getFixed(ThisVal << LocalShiftSize) 5846 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 5847 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 5848 CFAOffset += Change; 5849 else 5850 CFAOffset -= Change; 5851 if (EmitCFAOffset && DestReg == TmpReg) { 5852 MachineFunction &MF = *MBB.getParent(); 5853 const TargetSubtargetInfo &STI = MF.getSubtarget(); 5854 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 5855 5856 unsigned CFIIndex = MF.addFrameInst( 5857 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 5858 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 5859 .addCFIIndex(CFIIndex) 5860 .setMIFlags(Flag); 5861 } 5862 5863 if (NeedsWinCFI) { 5864 assert(Sign == 1 && "SEH directives should always have a positive sign"); 5865 int Imm = (int)(ThisVal << LocalShiftSize); 5866 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 5867 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 5868 if (HasWinCFI) 5869 *HasWinCFI = true; 5870 if (Imm == 0) 5871 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 5872 else 5873 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 5874 .addImm(Imm) 5875 .setMIFlag(Flag); 5876 assert(Offset == 0 && "Expected remaining offset to be zero to " 5877 "emit a single SEH directive"); 5878 } else if (DestReg == AArch64::SP) { 5879 if (HasWinCFI) 5880 *HasWinCFI = true; 5881 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 5882 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 5883 .addImm(Imm) 5884 .setMIFlag(Flag); 5885 } 5886 } 5887 5888 SrcReg = TmpReg; 5889 } while (Offset); 5890 } 5891 5892 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 5893 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 5894 unsigned DestReg, unsigned SrcReg, 5895 StackOffset Offset, const TargetInstrInfo *TII, 5896 MachineInstr::MIFlag Flag, bool SetNZCV, 5897 bool NeedsWinCFI, bool *HasWinCFI, 5898 bool EmitCFAOffset, StackOffset CFAOffset, 5899 unsigned FrameReg) { 5900 // If a function is marked as arm_locally_streaming, then the runtime value of 5901 // vscale in the prologue/epilogue is different from the runtime value of vscale 5902 // in the function's body. To avoid having to consider multiple vscales, 5903 // we can use `addsvl` to allocate any scalable stack-slots, which under 5904 // most circumstances will be only locals, not callee-save slots.
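// For instance (illustrative), an offset of 16 fixed bytes plus 16 scalable
// bytes decomposes below into Bytes = 16 and NumDataVectors = 1 (the eight
// predicate-sized units fold into one full data vector), so the emitted
// sequence is an ADDXri of #16 followed by a single ADDVL (or ADDSVL) of #1.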
const Function &F = MBB.getParent()->getFunction(); 5906 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 5907 5908 int64_t Bytes, NumPredicateVectors, NumDataVectors; 5909 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5910 Offset, Bytes, NumPredicateVectors, NumDataVectors); 5911 5912 // First emit non-scalable frame offsets, or a simple 'mov'. 5913 if (Bytes || (!Offset && SrcReg != DestReg)) { 5914 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 5915 "SP increment/decrement not 8-byte aligned"); 5916 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 5917 if (Bytes < 0) { 5918 Bytes = -Bytes; 5919 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 5920 } 5921 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 5922 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 5923 FrameReg); 5924 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 5925 ? StackOffset::getFixed(-Bytes) 5926 : StackOffset::getFixed(Bytes); 5927 SrcReg = DestReg; 5928 FrameReg = DestReg; 5929 } 5930 5931 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 5932 "SetNZCV not supported with SVE vectors"); 5933 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 5934 "WinCFI not supported with SVE vectors"); 5935 5936 if (NumDataVectors) { 5937 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 5938 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, 5939 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5940 CFAOffset, FrameReg); 5941 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); 5942 SrcReg = DestReg; 5943 } 5944 5945 if (NumPredicateVectors) { 5946 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 5947 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 5948 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, 5949 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5950 CFAOffset, FrameReg); 5951 } 5952 } 5953 5954 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 5955 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 5956 MachineBasicBlock::iterator InsertPt, int FrameIndex, 5957 LiveIntervals *LIS, VirtRegMap *VRM) const { 5958 // This is a bit of a hack. Consider this instruction: 5959 // 5960 // %0 = COPY %sp; GPR64all:%0 5961 // 5962 // We explicitly chose GPR64all for the virtual register so such a copy might 5963 // be eliminated by RegisterCoalescer. However, that may not be possible, and 5964 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 5965 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 5966 // 5967 // To prevent that, we are going to constrain the %0 register class here. 5968 if (MI.isFullCopy()) { 5969 Register DstReg = MI.getOperand(0).getReg(); 5970 Register SrcReg = MI.getOperand(1).getReg(); 5971 if (SrcReg == AArch64::SP && DstReg.isVirtual()) { 5972 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 5973 return nullptr; 5974 } 5975 if (DstReg == AArch64::SP && SrcReg.isVirtual()) { 5976 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 5977 return nullptr; 5978 } 5979 // Nothing can be folded with a copy from/to NZCV. 5980 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) 5981 return nullptr; 5982 } 5983 5984 // Handle the case where a copy is being spilled or filled but the source 5985 // and destination register classes don't match.
For example: 5986 // 5987 // %0 = COPY %xzr; GPR64common:%0 5988 // 5989 // In this case we can still safely fold away the COPY and generate the 5990 // following spill code: 5991 // 5992 // STRXui %xzr, %stack.0 5993 // 5994 // This also eliminates spilled cross register class COPYs (e.g. between x and 5995 // d regs) of the same size. For example: 5996 // 5997 // %0 = COPY %1; GPR64:%0, FPR64:%1 5998 // 5999 // will be filled as 6000 // 6001 // LDRDui %0, fi<#0> 6002 // 6003 // instead of 6004 // 6005 // LDRXui %Temp, fi<#0> 6006 // %0 = FMOV %Temp 6007 // 6008 if (MI.isCopy() && Ops.size() == 1 && 6009 // Make sure we're only folding the explicit COPY defs/uses. 6010 (Ops[0] == 0 || Ops[0] == 1)) { 6011 bool IsSpill = Ops[0] == 0; 6012 bool IsFill = !IsSpill; 6013 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 6014 const MachineRegisterInfo &MRI = MF.getRegInfo(); 6015 MachineBasicBlock &MBB = *MI.getParent(); 6016 const MachineOperand &DstMO = MI.getOperand(0); 6017 const MachineOperand &SrcMO = MI.getOperand(1); 6018 Register DstReg = DstMO.getReg(); 6019 Register SrcReg = SrcMO.getReg(); 6020 // This is slightly expensive to compute for physical regs since 6021 // getMinimalPhysRegClass is slow. 6022 auto getRegClass = [&](unsigned Reg) { 6023 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 6024 : TRI.getMinimalPhysRegClass(Reg); 6025 }; 6026 6027 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 6028 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 6029 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 6030 "Mismatched register size in non subreg COPY"); 6031 if (IsSpill) 6032 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 6033 getRegClass(SrcReg), &TRI, Register()); 6034 else 6035 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 6036 getRegClass(DstReg), &TRI, Register()); 6037 return &*--InsertPt; 6038 } 6039 6040 // Handle cases like spilling def of: 6041 // 6042 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 6043 // 6044 // where the physical register source can be widened and stored to the full 6045 // virtual reg destination stack slot, in this case producing: 6046 // 6047 // STRXui %xzr, %stack.0 6048 // 6049 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR && 6050 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) { 6051 assert(SrcMO.getSubReg() == 0 && 6052 "Unexpected subreg on physical register"); 6053 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), 6054 FrameIndex, &AArch64::GPR64RegClass, &TRI, 6055 Register()); 6056 return &*--InsertPt; 6057 } 6058 6059 // Handle cases like filling use of: 6060 // 6061 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 6062 // 6063 // where we can load the full virtual reg source stack slot, into the subreg 6064 // destination, in this case producing: 6065 // 6066 // LDRWui %0:sub_32<def,read-undef>, %stack.0 6067 // 6068 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 6069 const TargetRegisterClass *FillRC; 6070 switch (DstMO.getSubReg()) { 6071 default: 6072 FillRC = nullptr; 6073 break; 6074 case AArch64::sub_32: 6075 FillRC = &AArch64::GPR32RegClass; 6076 break; 6077 case AArch64::ssub: 6078 FillRC = &AArch64::FPR32RegClass; 6079 break; 6080 case AArch64::dsub: 6081 FillRC = &AArch64::FPR64RegClass; 6082 break; 6083 } 6084 6085 if (FillRC) { 6086 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 6087 TRI.getRegSizeInBits(*FillRC) && 6088 "Mismatched regclass size on folded subreg COPY"); 6089 
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 6090 Register()); 6091 MachineInstr &LoadMI = *--InsertPt; 6092 MachineOperand &LoadDst = LoadMI.getOperand(0); 6093 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 6094 LoadDst.setSubReg(DstMO.getSubReg()); 6095 LoadDst.setIsUndef(); 6096 return &LoadMI; 6097 } 6098 } 6099 } 6100 6101 // Cannot fold. 6102 return nullptr; 6103 } 6104 6105 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 6106 StackOffset &SOffset, 6107 bool *OutUseUnscaledOp, 6108 unsigned *OutUnscaledOp, 6109 int64_t *EmittableOffset) { 6110 // Set output values in case of early exit. 6111 if (EmittableOffset) 6112 *EmittableOffset = 0; 6113 if (OutUseUnscaledOp) 6114 *OutUseUnscaledOp = false; 6115 if (OutUnscaledOp) 6116 *OutUnscaledOp = 0; 6117 6118 // Exit early for structured vector spills/fills as they can't take an 6119 // immediate offset. 6120 switch (MI.getOpcode()) { 6121 default: 6122 break; 6123 case AArch64::LD1Rv1d: 6124 case AArch64::LD1Rv2s: 6125 case AArch64::LD1Rv2d: 6126 case AArch64::LD1Rv4h: 6127 case AArch64::LD1Rv4s: 6128 case AArch64::LD1Rv8b: 6129 case AArch64::LD1Rv8h: 6130 case AArch64::LD1Rv16b: 6131 case AArch64::LD1Twov2d: 6132 case AArch64::LD1Threev2d: 6133 case AArch64::LD1Fourv2d: 6134 case AArch64::LD1Twov1d: 6135 case AArch64::LD1Threev1d: 6136 case AArch64::LD1Fourv1d: 6137 case AArch64::ST1Twov2d: 6138 case AArch64::ST1Threev2d: 6139 case AArch64::ST1Fourv2d: 6140 case AArch64::ST1Twov1d: 6141 case AArch64::ST1Threev1d: 6142 case AArch64::ST1Fourv1d: 6143 case AArch64::ST1i8: 6144 case AArch64::ST1i16: 6145 case AArch64::ST1i32: 6146 case AArch64::ST1i64: 6147 case AArch64::IRG: 6148 case AArch64::IRGstack: 6149 case AArch64::STGloop: 6150 case AArch64::STZGloop: 6151 return AArch64FrameOffsetCannotUpdate; 6152 } 6153 6154 // Get the min/max offset and the scale. 6155 TypeSize ScaleValue(0U, false), Width(0U, false); 6156 int64_t MinOff, MaxOff; 6157 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 6158 MaxOff)) 6159 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 6160 6161 // Construct the complete offset. 6162 bool IsMulVL = ScaleValue.isScalable(); 6163 unsigned Scale = ScaleValue.getKnownMinValue(); 6164 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 6165 6166 const MachineOperand &ImmOpnd = 6167 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 6168 Offset += ImmOpnd.getImm() * Scale; 6169 6170 // If the offset doesn't match the scale, we rewrite the instruction to 6171 // use the unscaled instruction instead. Likewise, if we have a negative 6172 // offset and there is an unscaled op to use. 
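// For example (illustrative), an LDRXui scales its immediate by 8, so a
// folded-in byte offset of 12 cannot stay in the scaled form; the unscaled
// counterpart LDURXi accepts any byte offset in [-256, 255], so the access
// is retargeted to that opcode with the byte offset carried directly in the
// immediate.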
6173 std::optional<unsigned> UnscaledOp = 6174 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 6175 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 6176 if (useUnscaledOp && 6177 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 6178 MaxOff)) 6179 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 6180 6181 Scale = ScaleValue.getKnownMinValue(); 6182 assert(IsMulVL == ScaleValue.isScalable() && 6183 "Unscaled opcode has different value for scalable"); 6184 6185 int64_t Remainder = Offset % Scale; 6186 assert(!(Remainder && useUnscaledOp) && 6187 "Cannot have remainder when using unscaled op"); 6188 6189 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 6190 int64_t NewOffset = Offset / Scale; 6191 if (MinOff <= NewOffset && NewOffset <= MaxOff) 6192 Offset = Remainder; 6193 else { 6194 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 6195 Offset = Offset - (NewOffset * Scale); 6196 } 6197 6198 if (EmittableOffset) 6199 *EmittableOffset = NewOffset; 6200 if (OutUseUnscaledOp) 6201 *OutUseUnscaledOp = useUnscaledOp; 6202 if (OutUnscaledOp && UnscaledOp) 6203 *OutUnscaledOp = *UnscaledOp; 6204 6205 if (IsMulVL) 6206 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 6207 else 6208 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 6209 return AArch64FrameOffsetCanUpdate | 6210 (SOffset ? 0 : AArch64FrameOffsetIsLegal); 6211 } 6212 6213 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 6214 unsigned FrameReg, StackOffset &Offset, 6215 const AArch64InstrInfo *TII) { 6216 unsigned Opcode = MI.getOpcode(); 6217 unsigned ImmIdx = FrameRegIdx + 1; 6218 6219 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 6220 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 6221 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 6222 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 6223 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 6224 MI.eraseFromParent(); 6225 Offset = StackOffset(); 6226 return true; 6227 } 6228 6229 int64_t NewOffset; 6230 unsigned UnscaledOp; 6231 bool UseUnscaledOp; 6232 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 6233 &UnscaledOp, &NewOffset); 6234 if (Status & AArch64FrameOffsetCanUpdate) { 6235 if (Status & AArch64FrameOffsetIsLegal) 6236 // Replace the FrameIndex with FrameReg. 6237 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 6238 if (UseUnscaledOp) 6239 MI.setDesc(TII->get(UnscaledOp)); 6240 6241 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 6242 return !Offset; 6243 } 6244 6245 return false; 6246 } 6247 6248 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB, 6249 MachineBasicBlock::iterator MI) const { 6250 DebugLoc DL; 6251 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0); 6252 } 6253 6254 MCInst AArch64InstrInfo::getNop() const { 6255 return MCInstBuilder(AArch64::HINT).addImm(0); 6256 } 6257 6258 // AArch64 supports MachineCombiner. 6259 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 6260 6261 // True when Opc sets flag 6262 static bool isCombineInstrSettingFlag(unsigned Opc) { 6263 switch (Opc) { 6264 case AArch64::ADDSWrr: 6265 case AArch64::ADDSWri: 6266 case AArch64::ADDSXrr: 6267 case AArch64::ADDSXri: 6268 case AArch64::SUBSWrr: 6269 case AArch64::SUBSXrr: 6270 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
6271 case AArch64::SUBSWri: 6272 case AArch64::SUBSXri: 6273 return true; 6274 default: 6275 break; 6276 } 6277 return false; 6278 } 6279 6280 // 32b Opcodes that can be combined with a MUL 6281 static bool isCombineInstrCandidate32(unsigned Opc) { 6282 switch (Opc) { 6283 case AArch64::ADDWrr: 6284 case AArch64::ADDWri: 6285 case AArch64::SUBWrr: 6286 case AArch64::ADDSWrr: 6287 case AArch64::ADDSWri: 6288 case AArch64::SUBSWrr: 6289 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 6290 case AArch64::SUBWri: 6291 case AArch64::SUBSWri: 6292 return true; 6293 default: 6294 break; 6295 } 6296 return false; 6297 } 6298 6299 // 64b Opcodes that can be combined with a MUL 6300 static bool isCombineInstrCandidate64(unsigned Opc) { 6301 switch (Opc) { 6302 case AArch64::ADDXrr: 6303 case AArch64::ADDXri: 6304 case AArch64::SUBXrr: 6305 case AArch64::ADDSXrr: 6306 case AArch64::ADDSXri: 6307 case AArch64::SUBSXrr: 6308 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 6309 case AArch64::SUBXri: 6310 case AArch64::SUBSXri: 6311 case AArch64::ADDv8i8: 6312 case AArch64::ADDv16i8: 6313 case AArch64::ADDv4i16: 6314 case AArch64::ADDv8i16: 6315 case AArch64::ADDv2i32: 6316 case AArch64::ADDv4i32: 6317 case AArch64::SUBv8i8: 6318 case AArch64::SUBv16i8: 6319 case AArch64::SUBv4i16: 6320 case AArch64::SUBv8i16: 6321 case AArch64::SUBv2i32: 6322 case AArch64::SUBv4i32: 6323 return true; 6324 default: 6325 break; 6326 } 6327 return false; 6328 } 6329 6330 // FP Opcodes that can be combined with a FMUL. 6331 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 6332 switch (Inst.getOpcode()) { 6333 default: 6334 break; 6335 case AArch64::FADDHrr: 6336 case AArch64::FADDSrr: 6337 case AArch64::FADDDrr: 6338 case AArch64::FADDv4f16: 6339 case AArch64::FADDv8f16: 6340 case AArch64::FADDv2f32: 6341 case AArch64::FADDv2f64: 6342 case AArch64::FADDv4f32: 6343 case AArch64::FSUBHrr: 6344 case AArch64::FSUBSrr: 6345 case AArch64::FSUBDrr: 6346 case AArch64::FSUBv4f16: 6347 case AArch64::FSUBv8f16: 6348 case AArch64::FSUBv2f32: 6349 case AArch64::FSUBv2f64: 6350 case AArch64::FSUBv4f32: 6351 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 6352 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 6353 // the target options or if FADD/FSUB has the contract fast-math flag. 6354 return Options.UnsafeFPMath || 6355 Options.AllowFPOpFusion == FPOpFusion::Fast || 6356 Inst.getFlag(MachineInstr::FmContract); 6357 return true; 6358 } 6359 return false; 6360 } 6361 6362 // Opcodes that can be combined with a MUL 6363 static bool isCombineInstrCandidate(unsigned Opc) { 6364 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 6365 } 6366 6367 // 6368 // Utility routine that checks if \param MO is defined by an 6369 // \param CombineOpc instruction in the basic block \param MBB 6370 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 6371 unsigned CombineOpc, unsigned ZeroReg = 0, 6372 bool CheckZeroReg = false) { 6373 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6374 MachineInstr *MI = nullptr; 6375 6376 if (MO.isReg() && MO.getReg().isVirtual()) 6377 MI = MRI.getUniqueVRegDef(MO.getReg()); 6378 // And it needs to be in the trace (otherwise, it won't have a depth). 6379 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 6380 return false; 6381 // Must only used by the user we combine with. 
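// That is, the multiply result must have a single non-debug use: if it were
// needed elsewhere, the MUL could not be removed after being folded into an
// MADD/MSUB, so the combine would not pay off.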
6382 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 6383 return false; 6384 6385 if (CheckZeroReg) { 6386 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 6387 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 6388 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); 6389 // The third input reg must be zero. 6390 if (MI->getOperand(3).getReg() != ZeroReg) 6391 return false; 6392 } 6393 6394 if (isCombineInstrSettingFlag(CombineOpc) && 6395 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1) 6396 return false; 6397 6398 return true; 6399 } 6400 6401 // 6402 // Is \param MO defined by an integer multiply and can be combined? 6403 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 6404 unsigned MulOpc, unsigned ZeroReg) { 6405 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 6406 } 6407 6408 // 6409 // Is \param MO defined by a floating-point multiply and can be combined? 6410 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 6411 unsigned MulOpc) { 6412 return canCombine(MBB, MO, MulOpc); 6413 } 6414 6415 // TODO: There are many more machine instruction opcodes to match: 6416 // 1. Other data types (integer, vectors) 6417 // 2. Other math / logic operations (xor, or) 6418 // 3. Other forms of the same operation (intrinsics and other variants) 6419 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 6420 bool Invert) const { 6421 if (Invert) 6422 return false; 6423 switch (Inst.getOpcode()) { 6424 // == Floating-point types == 6425 // -- Floating-point instructions -- 6426 case AArch64::FADDHrr: 6427 case AArch64::FADDSrr: 6428 case AArch64::FADDDrr: 6429 case AArch64::FMULHrr: 6430 case AArch64::FMULSrr: 6431 case AArch64::FMULDrr: 6432 case AArch64::FMULX16: 6433 case AArch64::FMULX32: 6434 case AArch64::FMULX64: 6435 // -- Advanced SIMD instructions -- 6436 case AArch64::FADDv4f16: 6437 case AArch64::FADDv8f16: 6438 case AArch64::FADDv2f32: 6439 case AArch64::FADDv4f32: 6440 case AArch64::FADDv2f64: 6441 case AArch64::FMULv4f16: 6442 case AArch64::FMULv8f16: 6443 case AArch64::FMULv2f32: 6444 case AArch64::FMULv4f32: 6445 case AArch64::FMULv2f64: 6446 case AArch64::FMULXv4f16: 6447 case AArch64::FMULXv8f16: 6448 case AArch64::FMULXv2f32: 6449 case AArch64::FMULXv4f32: 6450 case AArch64::FMULXv2f64: 6451 // -- SVE instructions -- 6452 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 6453 // in the SVE instruction set (though there are predicated ones). 6454 case AArch64::FADD_ZZZ_H: 6455 case AArch64::FADD_ZZZ_S: 6456 case AArch64::FADD_ZZZ_D: 6457 case AArch64::FMUL_ZZZ_H: 6458 case AArch64::FMUL_ZZZ_S: 6459 case AArch64::FMUL_ZZZ_D: 6460 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 6461 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 6462 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 6463 6464 // == Integer types == 6465 // -- Base instructions -- 6466 // Opcodes MULWrr and MULXrr don't exist because 6467 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 6468 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 6469 // The machine-combiner does not support three-source-operands machine 6470 // instruction. So we cannot reassociate MULs. 
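// For the opcodes listed below, the machine combiner uses this hook to
// rebalance expression trees, e.g. rewriting ((a + b) + c) + d as
// (a + b) + (c + d) so the two halves can execute in parallel. The FP cases
// above additionally require unsafe-fp-math or the reassoc and nsz flags,
// since reassociation can change rounding and signed-zero behaviour.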
6471 case AArch64::ADDWrr: 6472 case AArch64::ADDXrr: 6473 case AArch64::ANDWrr: 6474 case AArch64::ANDXrr: 6475 case AArch64::ORRWrr: 6476 case AArch64::ORRXrr: 6477 case AArch64::EORWrr: 6478 case AArch64::EORXrr: 6479 case AArch64::EONWrr: 6480 case AArch64::EONXrr: 6481 // -- Advanced SIMD instructions -- 6482 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 6483 // in the Advanced SIMD instruction set. 6484 case AArch64::ADDv8i8: 6485 case AArch64::ADDv16i8: 6486 case AArch64::ADDv4i16: 6487 case AArch64::ADDv8i16: 6488 case AArch64::ADDv2i32: 6489 case AArch64::ADDv4i32: 6490 case AArch64::ADDv1i64: 6491 case AArch64::ADDv2i64: 6492 case AArch64::MULv8i8: 6493 case AArch64::MULv16i8: 6494 case AArch64::MULv4i16: 6495 case AArch64::MULv8i16: 6496 case AArch64::MULv2i32: 6497 case AArch64::MULv4i32: 6498 case AArch64::ANDv8i8: 6499 case AArch64::ANDv16i8: 6500 case AArch64::ORRv8i8: 6501 case AArch64::ORRv16i8: 6502 case AArch64::EORv8i8: 6503 case AArch64::EORv16i8: 6504 // -- SVE instructions -- 6505 case AArch64::ADD_ZZZ_B: 6506 case AArch64::ADD_ZZZ_H: 6507 case AArch64::ADD_ZZZ_S: 6508 case AArch64::ADD_ZZZ_D: 6509 case AArch64::MUL_ZZZ_B: 6510 case AArch64::MUL_ZZZ_H: 6511 case AArch64::MUL_ZZZ_S: 6512 case AArch64::MUL_ZZZ_D: 6513 case AArch64::AND_ZZZ: 6514 case AArch64::ORR_ZZZ: 6515 case AArch64::EOR_ZZZ: 6516 return true; 6517 6518 default: 6519 return false; 6520 } 6521 } 6522 6523 /// Find instructions that can be turned into madd. 6524 static bool getMaddPatterns(MachineInstr &Root, 6525 SmallVectorImpl<unsigned> &Patterns) { 6526 unsigned Opc = Root.getOpcode(); 6527 MachineBasicBlock &MBB = *Root.getParent(); 6528 bool Found = false; 6529 6530 if (!isCombineInstrCandidate(Opc)) 6531 return false; 6532 if (isCombineInstrSettingFlag(Opc)) { 6533 int Cmp_NZCV = 6534 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true); 6535 // When NZCV is live bail out. 6536 if (Cmp_NZCV == -1) 6537 return false; 6538 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 6539 // When opcode can't change bail out. 6540 // CHECKME: do we miss any cases for opcode conversion? 
6541 if (NewOpc == Opc) 6542 return false; 6543 Opc = NewOpc; 6544 } 6545 6546 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 6547 unsigned Pattern) { 6548 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 6549 Patterns.push_back(Pattern); 6550 Found = true; 6551 } 6552 }; 6553 6554 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) { 6555 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 6556 Patterns.push_back(Pattern); 6557 Found = true; 6558 } 6559 }; 6560 6561 typedef AArch64MachineCombinerPattern MCP; 6562 6563 switch (Opc) { 6564 default: 6565 break; 6566 case AArch64::ADDWrr: 6567 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6568 "ADDWrr does not have register operands"); 6569 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 6570 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 6571 break; 6572 case AArch64::ADDXrr: 6573 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 6574 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 6575 break; 6576 case AArch64::SUBWrr: 6577 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 6578 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 6579 break; 6580 case AArch64::SUBXrr: 6581 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 6582 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 6583 break; 6584 case AArch64::ADDWri: 6585 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 6586 break; 6587 case AArch64::ADDXri: 6588 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 6589 break; 6590 case AArch64::SUBWri: 6591 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 6592 break; 6593 case AArch64::SUBXri: 6594 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 6595 break; 6596 case AArch64::ADDv8i8: 6597 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 6598 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 6599 break; 6600 case AArch64::ADDv16i8: 6601 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 6602 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 6603 break; 6604 case AArch64::ADDv4i16: 6605 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 6606 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 6607 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 6608 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 6609 break; 6610 case AArch64::ADDv8i16: 6611 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 6612 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 6613 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 6614 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 6615 break; 6616 case AArch64::ADDv2i32: 6617 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 6618 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 6619 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 6620 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 6621 break; 6622 case AArch64::ADDv4i32: 6623 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 6624 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 6625 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 6626 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 6627 break; 6628 case AArch64::SUBv8i8: 6629 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 6630 
setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 6631 break; 6632 case AArch64::SUBv16i8: 6633 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 6634 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 6635 break; 6636 case AArch64::SUBv4i16: 6637 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 6638 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 6639 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 6640 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 6641 break; 6642 case AArch64::SUBv8i16: 6643 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 6644 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 6645 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 6646 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 6647 break; 6648 case AArch64::SUBv2i32: 6649 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 6650 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 6651 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 6652 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 6653 break; 6654 case AArch64::SUBv4i32: 6655 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 6656 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 6657 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 6658 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 6659 break; 6660 } 6661 return Found; 6662 } 6663 /// Floating-Point Support 6664 6665 /// Find instructions that can be turned into madd. 6666 static bool getFMAPatterns(MachineInstr &Root, 6667 SmallVectorImpl<unsigned> &Patterns) { 6668 6669 if (!isCombineInstrCandidateFP(Root)) 6670 return false; 6671 6672 MachineBasicBlock &MBB = *Root.getParent(); 6673 bool Found = false; 6674 6675 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool { 6676 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 6677 Patterns.push_back(Pattern); 6678 return true; 6679 } 6680 return false; 6681 }; 6682 6683 typedef AArch64MachineCombinerPattern MCP; 6684 6685 switch (Root.getOpcode()) { 6686 default: 6687 assert(false && "Unsupported FP instruction in combiner\n"); 6688 break; 6689 case AArch64::FADDHrr: 6690 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6691 "FADDHrr does not have register operands"); 6692 6693 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 6694 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 6695 break; 6696 case AArch64::FADDSrr: 6697 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6698 "FADDSrr does not have register operands"); 6699 6700 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 6701 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 6702 6703 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 6704 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 6705 break; 6706 case AArch64::FADDDrr: 6707 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 6708 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 6709 6710 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 6711 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 6712 break; 6713 case AArch64::FADDv4f16: 6714 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 6715 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 6716 6717 Found |= Match(AArch64::FMULv4i16_indexed, 2, 
MCP::FMLAv4i16_indexed_OP2) || 6718 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 6719 break; 6720 case AArch64::FADDv8f16: 6721 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 6722 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 6723 6724 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 6725 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 6726 break; 6727 case AArch64::FADDv2f32: 6728 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 6729 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 6730 6731 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 6732 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 6733 break; 6734 case AArch64::FADDv2f64: 6735 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 6736 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 6737 6738 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 6739 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 6740 break; 6741 case AArch64::FADDv4f32: 6742 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 6743 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 6744 6745 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 6746 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 6747 break; 6748 case AArch64::FSUBHrr: 6749 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 6750 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 6751 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 6752 break; 6753 case AArch64::FSUBSrr: 6754 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 6755 6756 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 6757 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 6758 6759 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 6760 break; 6761 case AArch64::FSUBDrr: 6762 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 6763 6764 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 6765 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 6766 6767 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 6768 break; 6769 case AArch64::FSUBv4f16: 6770 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 6771 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 6772 6773 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 6774 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 6775 break; 6776 case AArch64::FSUBv8f16: 6777 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 6778 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 6779 6780 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 6781 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 6782 break; 6783 case AArch64::FSUBv2f32: 6784 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 6785 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 6786 6787 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 6788 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 6789 break; 6790 case AArch64::FSUBv2f64: 6791 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 6792 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 6793 6794 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 6795 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 6796 break; 6797 case AArch64::FSUBv4f32: 6798 Found |= 
Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 6799 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 6800 6801 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 6802 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 6803 break; 6804 } 6805 return Found; 6806 } 6807 6808 static bool getFMULPatterns(MachineInstr &Root, 6809 SmallVectorImpl<unsigned> &Patterns) { 6810 MachineBasicBlock &MBB = *Root.getParent(); 6811 bool Found = false; 6812 6813 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool { 6814 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6815 MachineOperand &MO = Root.getOperand(Operand); 6816 MachineInstr *MI = nullptr; 6817 if (MO.isReg() && MO.getReg().isVirtual()) 6818 MI = MRI.getUniqueVRegDef(MO.getReg()); 6819 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 6820 if (MI && MI->getOpcode() == TargetOpcode::COPY && 6821 MI->getOperand(1).getReg().isVirtual()) 6822 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 6823 if (MI && MI->getOpcode() == Opcode) { 6824 Patterns.push_back(Pattern); 6825 return true; 6826 } 6827 return false; 6828 }; 6829 6830 typedef AArch64MachineCombinerPattern MCP; 6831 6832 switch (Root.getOpcode()) { 6833 default: 6834 return false; 6835 case AArch64::FMULv2f32: 6836 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 6837 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 6838 break; 6839 case AArch64::FMULv2f64: 6840 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 6841 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 6842 break; 6843 case AArch64::FMULv4f16: 6844 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 6845 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 6846 break; 6847 case AArch64::FMULv4f32: 6848 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 6849 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 6850 break; 6851 case AArch64::FMULv8f16: 6852 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 6853 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 6854 break; 6855 } 6856 6857 return Found; 6858 } 6859 6860 static bool getFNEGPatterns(MachineInstr &Root, 6861 SmallVectorImpl<unsigned> &Patterns) { 6862 unsigned Opc = Root.getOpcode(); 6863 MachineBasicBlock &MBB = *Root.getParent(); 6864 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6865 6866 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool { 6867 MachineOperand &MO = Root.getOperand(1); 6868 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 6869 if (MI != nullptr && (MI->getOpcode() == Opcode) && 6870 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 6871 Root.getFlag(MachineInstr::MIFlag::FmContract) && 6872 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 6873 MI->getFlag(MachineInstr::MIFlag::FmContract) && 6874 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 6875 Patterns.push_back(Pattern); 6876 return true; 6877 } 6878 return false; 6879 }; 6880 6881 switch (Opc) { 6882 default: 6883 break; 6884 case AArch64::FNEGDr: 6885 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD); 6886 case AArch64::FNEGSr: 6887 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD); 6888 } 6889 6890 return false; 6891 } 6892 6893 /// Return true when a code sequence can improve throughput. It 6894 /// should be called only for instructions in loops. 
6895 /// \param Pattern - combiner pattern 6896 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { 6897 switch (Pattern) { 6898 default: 6899 break; 6900 case AArch64MachineCombinerPattern::FMULADDH_OP1: 6901 case AArch64MachineCombinerPattern::FMULADDH_OP2: 6902 case AArch64MachineCombinerPattern::FMULSUBH_OP1: 6903 case AArch64MachineCombinerPattern::FMULSUBH_OP2: 6904 case AArch64MachineCombinerPattern::FMULADDS_OP1: 6905 case AArch64MachineCombinerPattern::FMULADDS_OP2: 6906 case AArch64MachineCombinerPattern::FMULSUBS_OP1: 6907 case AArch64MachineCombinerPattern::FMULSUBS_OP2: 6908 case AArch64MachineCombinerPattern::FMULADDD_OP1: 6909 case AArch64MachineCombinerPattern::FMULADDD_OP2: 6910 case AArch64MachineCombinerPattern::FMULSUBD_OP1: 6911 case AArch64MachineCombinerPattern::FMULSUBD_OP2: 6912 case AArch64MachineCombinerPattern::FNMULSUBH_OP1: 6913 case AArch64MachineCombinerPattern::FNMULSUBS_OP1: 6914 case AArch64MachineCombinerPattern::FNMULSUBD_OP1: 6915 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6916 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6917 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6918 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6919 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6920 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6921 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6922 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6923 case AArch64MachineCombinerPattern::FMLAv4f16_OP2: 6924 case AArch64MachineCombinerPattern::FMLAv4f16_OP1: 6925 case AArch64MachineCombinerPattern::FMLAv8f16_OP1: 6926 case AArch64MachineCombinerPattern::FMLAv8f16_OP2: 6927 case AArch64MachineCombinerPattern::FMLAv2f32_OP2: 6928 case AArch64MachineCombinerPattern::FMLAv2f32_OP1: 6929 case AArch64MachineCombinerPattern::FMLAv2f64_OP1: 6930 case AArch64MachineCombinerPattern::FMLAv2f64_OP2: 6931 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6932 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6933 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6934 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6935 case AArch64MachineCombinerPattern::FMLAv4f32_OP1: 6936 case AArch64MachineCombinerPattern::FMLAv4f32_OP2: 6937 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6938 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6939 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: 6940 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6941 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: 6942 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6943 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6944 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6945 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6946 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6947 case AArch64MachineCombinerPattern::FMLSv4f16_OP1: 6948 case AArch64MachineCombinerPattern::FMLSv4f16_OP2: 6949 case AArch64MachineCombinerPattern::FMLSv8f16_OP1: 6950 case AArch64MachineCombinerPattern::FMLSv8f16_OP2: 6951 case AArch64MachineCombinerPattern::FMLSv2f32_OP2: 6952 case AArch64MachineCombinerPattern::FMLSv2f64_OP2: 6953 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6954 case AArch64MachineCombinerPattern::FMLSv4f32_OP2: 6955 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1: 6956 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: 6957 
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1: 6958 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: 6959 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1: 6960 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: 6961 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1: 6962 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: 6963 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1: 6964 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: 6965 case AArch64MachineCombinerPattern::MULADDv8i8_OP1: 6966 case AArch64MachineCombinerPattern::MULADDv8i8_OP2: 6967 case AArch64MachineCombinerPattern::MULADDv16i8_OP1: 6968 case AArch64MachineCombinerPattern::MULADDv16i8_OP2: 6969 case AArch64MachineCombinerPattern::MULADDv4i16_OP1: 6970 case AArch64MachineCombinerPattern::MULADDv4i16_OP2: 6971 case AArch64MachineCombinerPattern::MULADDv8i16_OP1: 6972 case AArch64MachineCombinerPattern::MULADDv8i16_OP2: 6973 case AArch64MachineCombinerPattern::MULADDv2i32_OP1: 6974 case AArch64MachineCombinerPattern::MULADDv2i32_OP2: 6975 case AArch64MachineCombinerPattern::MULADDv4i32_OP1: 6976 case AArch64MachineCombinerPattern::MULADDv4i32_OP2: 6977 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1: 6978 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2: 6979 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1: 6980 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2: 6981 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1: 6982 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2: 6983 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1: 6984 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2: 6985 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1: 6986 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2: 6987 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1: 6988 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2: 6989 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6990 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6991 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6992 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6993 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6994 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6995 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6996 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6997 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6998 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6999 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 7000 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 7001 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 7002 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 7003 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 7004 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 7005 return true; 7006 } // end switch (Pattern) 7007 return false; 7008 } 7009 7010 /// Find other MI combine patterns. 7011 static bool getMiscPatterns(MachineInstr &Root, 7012 SmallVectorImpl<unsigned> &Patterns) { 7013 // A - (B + C) ==> (A - B) - C or (A - C) - B 7014 unsigned Opc = Root.getOpcode(); 7015 MachineBasicBlock &MBB = *Root.getParent(); 7016 7017 switch (Opc) { 7018 case AArch64::SUBWrr: 7019 case AArch64::SUBSWrr: 7020 case AArch64::SUBXrr: 7021 case AArch64::SUBSXrr: 7022 // Found candidate root. 
    break;
  default:
    return false;
  }

  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
          -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}

CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
  switch (Pattern) {
  case AArch64MachineCombinerPattern::SUBADD_OP1:
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    return CombinerObjective::MustReduceDepth;
  default:
    return TargetInstrInfo::getCombinerObjective(Pattern);
  }
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.

bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind the kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

static MachineInstr *
genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
               const TargetInstrInfo *TII, MachineInstr &Root,
               SmallVectorImpl<MachineInstr *> &InsInstrs) {
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(SrcReg2, getKillRegState(Src2IsKill));
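  // Illustrative effect of the FNMADD built above (register names are
  // arbitrary):
  //   fmadd d1, d2, d3, d4      // d1 = d2 * d3 + d4
  //   fneg  d0, d1              // d0 = -d1
  //   ==> fnmadd d0, d2, d3, d4 // d0 = -(d2 * d3) - d4
  // The combine is only formed when both instructions carry the contract and
  // nsz fast-math flags, as checked in getFNEGPatterns above.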
InsInstrs.push_back(MIB); 7201 7202 return MAD; 7203 } 7204 7205 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 7206 static MachineInstr * 7207 genIndexedMultiply(MachineInstr &Root, 7208 SmallVectorImpl<MachineInstr *> &InsInstrs, 7209 unsigned IdxDupOp, unsigned MulOpc, 7210 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 7211 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 7212 "Invalid index of FMUL operand"); 7213 7214 MachineFunction &MF = *Root.getMF(); 7215 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 7216 7217 MachineInstr *Dup = 7218 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 7219 7220 if (Dup->getOpcode() == TargetOpcode::COPY) 7221 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 7222 7223 Register DupSrcReg = Dup->getOperand(1).getReg(); 7224 MRI.clearKillFlags(DupSrcReg); 7225 MRI.constrainRegClass(DupSrcReg, RC); 7226 7227 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 7228 7229 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 7230 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 7231 7232 Register ResultReg = Root.getOperand(0).getReg(); 7233 7234 MachineInstrBuilder MIB; 7235 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 7236 .add(MulOp) 7237 .addReg(DupSrcReg) 7238 .addImm(DupSrcLane); 7239 7240 InsInstrs.push_back(MIB); 7241 return &Root; 7242 } 7243 7244 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 7245 /// instructions. 7246 /// 7247 /// \see genFusedMultiply 7248 static MachineInstr *genFusedMultiplyAcc( 7249 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 7250 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 7251 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 7252 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 7253 FMAInstKind::Accumulator); 7254 } 7255 7256 /// genNeg - Helper to generate an intermediate negation of the second operand 7257 /// of Root 7258 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 7259 const TargetInstrInfo *TII, MachineInstr &Root, 7260 SmallVectorImpl<MachineInstr *> &InsInstrs, 7261 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 7262 unsigned MnegOpc, const TargetRegisterClass *RC) { 7263 Register NewVR = MRI.createVirtualRegister(RC); 7264 MachineInstrBuilder MIB = 7265 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 7266 .add(Root.getOperand(2)); 7267 InsInstrs.push_back(MIB); 7268 7269 assert(InstrIdxForVirtReg.empty()); 7270 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7271 7272 return NewVR; 7273 } 7274 7275 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 7276 /// instructions with an additional negation of the accumulator 7277 static MachineInstr *genFusedMultiplyAccNeg( 7278 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 7279 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 7280 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 7281 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 7282 assert(IdxMulOpd == 1); 7283 7284 Register NewVR = 7285 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 7286 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 7287 FMAInstKind::Accumulator, &NewVR); 7288 } 7289 7290 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 7291 /// instructions. 
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused indexed multiply
/// accumulate instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
7336 /// \param RC Register class of operands 7337 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 7338 const TargetInstrInfo *TII, MachineInstr &Root, 7339 SmallVectorImpl<MachineInstr *> &InsInstrs, 7340 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 7341 const TargetRegisterClass *RC) { 7342 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 7343 7344 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 7345 Register ResultReg = Root.getOperand(0).getReg(); 7346 Register SrcReg0 = MUL->getOperand(1).getReg(); 7347 bool Src0IsKill = MUL->getOperand(1).isKill(); 7348 Register SrcReg1 = MUL->getOperand(2).getReg(); 7349 bool Src1IsKill = MUL->getOperand(2).isKill(); 7350 7351 if (ResultReg.isVirtual()) 7352 MRI.constrainRegClass(ResultReg, RC); 7353 if (SrcReg0.isVirtual()) 7354 MRI.constrainRegClass(SrcReg0, RC); 7355 if (SrcReg1.isVirtual()) 7356 MRI.constrainRegClass(SrcReg1, RC); 7357 if (Register::isVirtualRegister(VR)) 7358 MRI.constrainRegClass(VR, RC); 7359 7360 MachineInstrBuilder MIB = 7361 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 7362 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 7363 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 7364 .addReg(VR); 7365 // Insert the MADD 7366 InsInstrs.push_back(MIB); 7367 return MUL; 7368 } 7369 7370 /// Do the following transformation 7371 /// A - (B + C) ==> (A - B) - C 7372 /// A - (B + C) ==> (A - C) - B 7373 static void 7374 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 7375 const TargetInstrInfo *TII, MachineInstr &Root, 7376 SmallVectorImpl<MachineInstr *> &InsInstrs, 7377 SmallVectorImpl<MachineInstr *> &DelInstrs, 7378 unsigned IdxOpd1, 7379 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 7380 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 7381 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 7382 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 7383 7384 Register ResultReg = Root.getOperand(0).getReg(); 7385 Register RegA = Root.getOperand(1).getReg(); 7386 bool RegAIsKill = Root.getOperand(1).isKill(); 7387 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 7388 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 7389 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 7390 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 7391 Register NewVR = 7392 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg())); 7393 7394 unsigned Opcode = Root.getOpcode(); 7395 if (Opcode == AArch64::SUBSWrr) 7396 Opcode = AArch64::SUBWrr; 7397 else if (Opcode == AArch64::SUBSXrr) 7398 Opcode = AArch64::SUBXrr; 7399 else 7400 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 7401 "Unexpected instruction opcode."); 7402 7403 uint32_t Flags = Root.mergeFlagsWith(*AddMI); 7404 Flags &= ~MachineInstr::NoSWrap; 7405 Flags &= ~MachineInstr::NoUWrap; 7406 7407 MachineInstrBuilder MIB1 = 7408 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 7409 .addReg(RegA, getKillRegState(RegAIsKill)) 7410 .addReg(RegB, getKillRegState(RegBIsKill)) 7411 .setMIFlags(Flags); 7412 MachineInstrBuilder MIB2 = 7413 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 7414 .addReg(NewVR, getKillRegState(true)) 7415 .addReg(RegC, getKillRegState(RegCIsKill)) 7416 .setMIFlags(Flags); 7417 7418 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7419 InsInstrs.push_back(MIB1); 7420 InsInstrs.push_back(MIB2); 7421 DelInstrs.push_back(AddMI); 7422 DelInstrs.push_back(&Root); 7423 } 7424 7425 /// When getMachineCombinerPatterns() finds potential patterns, 7426 /// this function generates the instructions that could replace the 7427 /// original code sequence 7428 void AArch64InstrInfo::genAlternativeCodeSequence( 7429 MachineInstr &Root, unsigned Pattern, 7430 SmallVectorImpl<MachineInstr *> &InsInstrs, 7431 SmallVectorImpl<MachineInstr *> &DelInstrs, 7432 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 7433 MachineBasicBlock &MBB = *Root.getParent(); 7434 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7435 MachineFunction &MF = *MBB.getParent(); 7436 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 7437 7438 MachineInstr *MUL = nullptr; 7439 const TargetRegisterClass *RC; 7440 unsigned Opc; 7441 switch (Pattern) { 7442 default: 7443 // Reassociate instructions. 
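    // e.g. (illustrative) ((a + b) + c) + d is rebalanced to
    // (a + b) + (c + d) so that the two inner additions can execute in
    // parallel, shortening the dependency chain.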
7444 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 7445 DelInstrs, InstrIdxForVirtReg); 7446 return; 7447 case AArch64MachineCombinerPattern::SUBADD_OP1: 7448 // A - (B + C) 7449 // ==> (A - B) - C 7450 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 7451 InstrIdxForVirtReg); 7452 return; 7453 case AArch64MachineCombinerPattern::SUBADD_OP2: 7454 // A - (B + C) 7455 // ==> (A - C) - B 7456 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 7457 InstrIdxForVirtReg); 7458 return; 7459 case AArch64MachineCombinerPattern::MULADDW_OP1: 7460 case AArch64MachineCombinerPattern::MULADDX_OP1: 7461 // MUL I=A,B,0 7462 // ADD R,I,C 7463 // ==> MADD R,A,B,C 7464 // --- Create(MADD); 7465 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) { 7466 Opc = AArch64::MADDWrrr; 7467 RC = &AArch64::GPR32RegClass; 7468 } else { 7469 Opc = AArch64::MADDXrrr; 7470 RC = &AArch64::GPR64RegClass; 7471 } 7472 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7473 break; 7474 case AArch64MachineCombinerPattern::MULADDW_OP2: 7475 case AArch64MachineCombinerPattern::MULADDX_OP2: 7476 // MUL I=A,B,0 7477 // ADD R,C,I 7478 // ==> MADD R,A,B,C 7479 // --- Create(MADD); 7480 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) { 7481 Opc = AArch64::MADDWrrr; 7482 RC = &AArch64::GPR32RegClass; 7483 } else { 7484 Opc = AArch64::MADDXrrr; 7485 RC = &AArch64::GPR64RegClass; 7486 } 7487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7488 break; 7489 case AArch64MachineCombinerPattern::MULADDWI_OP1: 7490 case AArch64MachineCombinerPattern::MULADDXI_OP1: { 7491 // MUL I=A,B,0 7492 // ADD R,I,Imm 7493 // ==> MOV V, Imm 7494 // ==> MADD R,A,B,V 7495 // --- Create(MADD); 7496 const TargetRegisterClass *OrrRC; 7497 unsigned BitSize, OrrOpc, ZeroReg; 7498 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) { 7499 OrrOpc = AArch64::ORRWri; 7500 OrrRC = &AArch64::GPR32spRegClass; 7501 BitSize = 32; 7502 ZeroReg = AArch64::WZR; 7503 Opc = AArch64::MADDWrrr; 7504 RC = &AArch64::GPR32RegClass; 7505 } else { 7506 OrrOpc = AArch64::ORRXri; 7507 OrrRC = &AArch64::GPR64spRegClass; 7508 BitSize = 64; 7509 ZeroReg = AArch64::XZR; 7510 Opc = AArch64::MADDXrrr; 7511 RC = &AArch64::GPR64RegClass; 7512 } 7513 Register NewVR = MRI.createVirtualRegister(OrrRC); 7514 uint64_t Imm = Root.getOperand(2).getImm(); 7515 7516 if (Root.getOperand(3).isImm()) { 7517 unsigned Val = Root.getOperand(3).getImm(); 7518 Imm = Imm << Val; 7519 } 7520 uint64_t UImm = SignExtend64(Imm, BitSize); 7521 // The immediate can be composed via a single instruction. 7522 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7523 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7524 if (Insn.size() != 1) 7525 return; 7526 auto MovI = Insn.begin(); 7527 MachineInstrBuilder MIB1; 7528 // MOV is an alias for one of three instructions: movz, movn, and orr. 
7529 if (MovI->Opcode == OrrOpc) 7530 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7531 .addReg(ZeroReg) 7532 .addImm(MovI->Op2); 7533 else { 7534 if (BitSize == 32) 7535 assert((MovI->Opcode == AArch64::MOVNWi || 7536 MovI->Opcode == AArch64::MOVZWi) && 7537 "Expected opcode"); 7538 else 7539 assert((MovI->Opcode == AArch64::MOVNXi || 7540 MovI->Opcode == AArch64::MOVZXi) && 7541 "Expected opcode"); 7542 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7543 .addImm(MovI->Op1) 7544 .addImm(MovI->Op2); 7545 } 7546 InsInstrs.push_back(MIB1); 7547 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7548 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7549 break; 7550 } 7551 case AArch64MachineCombinerPattern::MULSUBW_OP1: 7552 case AArch64MachineCombinerPattern::MULSUBX_OP1: { 7553 // MUL I=A,B,0 7554 // SUB R,I, C 7555 // ==> SUB V, 0, C 7556 // ==> MADD R,A,B,V // = -C + A*B 7557 // --- Create(MADD); 7558 const TargetRegisterClass *SubRC; 7559 unsigned SubOpc, ZeroReg; 7560 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) { 7561 SubOpc = AArch64::SUBWrr; 7562 SubRC = &AArch64::GPR32spRegClass; 7563 ZeroReg = AArch64::WZR; 7564 Opc = AArch64::MADDWrrr; 7565 RC = &AArch64::GPR32RegClass; 7566 } else { 7567 SubOpc = AArch64::SUBXrr; 7568 SubRC = &AArch64::GPR64spRegClass; 7569 ZeroReg = AArch64::XZR; 7570 Opc = AArch64::MADDXrrr; 7571 RC = &AArch64::GPR64RegClass; 7572 } 7573 Register NewVR = MRI.createVirtualRegister(SubRC); 7574 // SUB NewVR, 0, C 7575 MachineInstrBuilder MIB1 = 7576 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 7577 .addReg(ZeroReg) 7578 .add(Root.getOperand(2)); 7579 InsInstrs.push_back(MIB1); 7580 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7581 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7582 break; 7583 } 7584 case AArch64MachineCombinerPattern::MULSUBW_OP2: 7585 case AArch64MachineCombinerPattern::MULSUBX_OP2: 7586 // MUL I=A,B,0 7587 // SUB R,C,I 7588 // ==> MSUB R,A,B,C (computes C - A*B) 7589 // --- Create(MSUB); 7590 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) { 7591 Opc = AArch64::MSUBWrrr; 7592 RC = &AArch64::GPR32RegClass; 7593 } else { 7594 Opc = AArch64::MSUBXrrr; 7595 RC = &AArch64::GPR64RegClass; 7596 } 7597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7598 break; 7599 case AArch64MachineCombinerPattern::MULSUBWI_OP1: 7600 case AArch64MachineCombinerPattern::MULSUBXI_OP1: { 7601 // MUL I=A,B,0 7602 // SUB R,I, Imm 7603 // ==> MOV V, -Imm 7604 // ==> MADD R,A,B,V // = -Imm + A*B 7605 // --- Create(MADD); 7606 const TargetRegisterClass *OrrRC; 7607 unsigned BitSize, OrrOpc, ZeroReg; 7608 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) { 7609 OrrOpc = AArch64::ORRWri; 7610 OrrRC = &AArch64::GPR32spRegClass; 7611 BitSize = 32; 7612 ZeroReg = AArch64::WZR; 7613 Opc = AArch64::MADDWrrr; 7614 RC = &AArch64::GPR32RegClass; 7615 } else { 7616 OrrOpc = AArch64::ORRXri; 7617 OrrRC = &AArch64::GPR64spRegClass; 7618 BitSize = 64; 7619 ZeroReg = AArch64::XZR; 7620 Opc = AArch64::MADDXrrr; 7621 RC = &AArch64::GPR64RegClass; 7622 } 7623 Register NewVR = MRI.createVirtualRegister(OrrRC); 7624 uint64_t Imm = Root.getOperand(2).getImm(); 7625 if (Root.getOperand(3).isImm()) { 7626 unsigned Val = Root.getOperand(3).getImm(); 7627 Imm = Imm << Val; 7628 } 7629 uint64_t UImm = SignExtend64(-Imm, BitSize); 7630 // The immediate can be composed via a single instruction. 
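    // e.g. (illustrative) for a root of the form "sub w0, w8, #3", where w8
    // is the MUL result, the value -3 is materialized as "movn w9, #2" and
    // the MADD built below then computes A * B + (-3).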
7631 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7632 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7633 if (Insn.size() != 1) 7634 return; 7635 auto MovI = Insn.begin(); 7636 MachineInstrBuilder MIB1; 7637 // MOV is an alias for one of three instructions: movz, movn, and orr. 7638 if (MovI->Opcode == OrrOpc) 7639 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7640 .addReg(ZeroReg) 7641 .addImm(MovI->Op2); 7642 else { 7643 if (BitSize == 32) 7644 assert((MovI->Opcode == AArch64::MOVNWi || 7645 MovI->Opcode == AArch64::MOVZWi) && 7646 "Expected opcode"); 7647 else 7648 assert((MovI->Opcode == AArch64::MOVNXi || 7649 MovI->Opcode == AArch64::MOVZXi) && 7650 "Expected opcode"); 7651 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7652 .addImm(MovI->Op1) 7653 .addImm(MovI->Op2); 7654 } 7655 InsInstrs.push_back(MIB1); 7656 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7657 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7658 break; 7659 } 7660 7661 case AArch64MachineCombinerPattern::MULADDv8i8_OP1: 7662 Opc = AArch64::MLAv8i8; 7663 RC = &AArch64::FPR64RegClass; 7664 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7665 break; 7666 case AArch64MachineCombinerPattern::MULADDv8i8_OP2: 7667 Opc = AArch64::MLAv8i8; 7668 RC = &AArch64::FPR64RegClass; 7669 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7670 break; 7671 case AArch64MachineCombinerPattern::MULADDv16i8_OP1: 7672 Opc = AArch64::MLAv16i8; 7673 RC = &AArch64::FPR128RegClass; 7674 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7675 break; 7676 case AArch64MachineCombinerPattern::MULADDv16i8_OP2: 7677 Opc = AArch64::MLAv16i8; 7678 RC = &AArch64::FPR128RegClass; 7679 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7680 break; 7681 case AArch64MachineCombinerPattern::MULADDv4i16_OP1: 7682 Opc = AArch64::MLAv4i16; 7683 RC = &AArch64::FPR64RegClass; 7684 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7685 break; 7686 case AArch64MachineCombinerPattern::MULADDv4i16_OP2: 7687 Opc = AArch64::MLAv4i16; 7688 RC = &AArch64::FPR64RegClass; 7689 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7690 break; 7691 case AArch64MachineCombinerPattern::MULADDv8i16_OP1: 7692 Opc = AArch64::MLAv8i16; 7693 RC = &AArch64::FPR128RegClass; 7694 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7695 break; 7696 case AArch64MachineCombinerPattern::MULADDv8i16_OP2: 7697 Opc = AArch64::MLAv8i16; 7698 RC = &AArch64::FPR128RegClass; 7699 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7700 break; 7701 case AArch64MachineCombinerPattern::MULADDv2i32_OP1: 7702 Opc = AArch64::MLAv2i32; 7703 RC = &AArch64::FPR64RegClass; 7704 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7705 break; 7706 case AArch64MachineCombinerPattern::MULADDv2i32_OP2: 7707 Opc = AArch64::MLAv2i32; 7708 RC = &AArch64::FPR64RegClass; 7709 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7710 break; 7711 case AArch64MachineCombinerPattern::MULADDv4i32_OP1: 7712 Opc = AArch64::MLAv4i32; 7713 RC = &AArch64::FPR128RegClass; 7714 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7715 break; 7716 case AArch64MachineCombinerPattern::MULADDv4i32_OP2: 7717 Opc = AArch64::MLAv4i32; 7718 RC = &AArch64::FPR128RegClass; 7719 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7720 break; 7721 7722 
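  // Vector integer multiply-subtract. For the *_OP1 patterns the multiply
  // feeds the first SUB operand, so the subtrahend is negated and folded into
  // an MLA; for the *_OP2 patterns an MLS applies directly. Illustrative
  // sequence (register names are arbitrary):
  //   mul v0.8b, v1.8b, v2.8b
  //   sub v3.8b, v0.8b, v4.8b
  //   ==> neg v3.8b, v4.8b
  //   ==> mla v3.8b, v1.8b, v2.8b   // v3 = v1 * v2 - v4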
case AArch64MachineCombinerPattern::MULSUBv8i8_OP1: 7723 Opc = AArch64::MLAv8i8; 7724 RC = &AArch64::FPR64RegClass; 7725 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7726 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 7727 RC); 7728 break; 7729 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2: 7730 Opc = AArch64::MLSv8i8; 7731 RC = &AArch64::FPR64RegClass; 7732 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7733 break; 7734 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1: 7735 Opc = AArch64::MLAv16i8; 7736 RC = &AArch64::FPR128RegClass; 7737 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7738 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 7739 RC); 7740 break; 7741 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2: 7742 Opc = AArch64::MLSv16i8; 7743 RC = &AArch64::FPR128RegClass; 7744 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7745 break; 7746 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1: 7747 Opc = AArch64::MLAv4i16; 7748 RC = &AArch64::FPR64RegClass; 7749 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7750 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7751 RC); 7752 break; 7753 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2: 7754 Opc = AArch64::MLSv4i16; 7755 RC = &AArch64::FPR64RegClass; 7756 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7757 break; 7758 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1: 7759 Opc = AArch64::MLAv8i16; 7760 RC = &AArch64::FPR128RegClass; 7761 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7762 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7763 RC); 7764 break; 7765 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2: 7766 Opc = AArch64::MLSv8i16; 7767 RC = &AArch64::FPR128RegClass; 7768 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7769 break; 7770 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1: 7771 Opc = AArch64::MLAv2i32; 7772 RC = &AArch64::FPR64RegClass; 7773 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7774 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7775 RC); 7776 break; 7777 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2: 7778 Opc = AArch64::MLSv2i32; 7779 RC = &AArch64::FPR64RegClass; 7780 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7781 break; 7782 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1: 7783 Opc = AArch64::MLAv4i32; 7784 RC = &AArch64::FPR128RegClass; 7785 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7786 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7787 RC); 7788 break; 7789 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2: 7790 Opc = AArch64::MLSv4i32; 7791 RC = &AArch64::FPR128RegClass; 7792 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7793 break; 7794 7795 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1: 7796 Opc = AArch64::MLAv4i16_indexed; 7797 RC = &AArch64::FPR64RegClass; 7798 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7799 break; 7800 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2: 7801 Opc = AArch64::MLAv4i16_indexed; 7802 RC = &AArch64::FPR64RegClass; 7803 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7804 break; 7805 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1: 7806 Opc = AArch64::MLAv8i16_indexed; 7807 RC = &AArch64::FPR128RegClass; 7808 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7809 break; 7810 case 
AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2: 7811 Opc = AArch64::MLAv8i16_indexed; 7812 RC = &AArch64::FPR128RegClass; 7813 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7814 break; 7815 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1: 7816 Opc = AArch64::MLAv2i32_indexed; 7817 RC = &AArch64::FPR64RegClass; 7818 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7819 break; 7820 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2: 7821 Opc = AArch64::MLAv2i32_indexed; 7822 RC = &AArch64::FPR64RegClass; 7823 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7824 break; 7825 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1: 7826 Opc = AArch64::MLAv4i32_indexed; 7827 RC = &AArch64::FPR128RegClass; 7828 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7829 break; 7830 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2: 7831 Opc = AArch64::MLAv4i32_indexed; 7832 RC = &AArch64::FPR128RegClass; 7833 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7834 break; 7835 7836 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 7837 Opc = AArch64::MLAv4i16_indexed; 7838 RC = &AArch64::FPR64RegClass; 7839 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7840 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7841 RC); 7842 break; 7843 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 7844 Opc = AArch64::MLSv4i16_indexed; 7845 RC = &AArch64::FPR64RegClass; 7846 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7847 break; 7848 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 7849 Opc = AArch64::MLAv8i16_indexed; 7850 RC = &AArch64::FPR128RegClass; 7851 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7852 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7853 RC); 7854 break; 7855 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 7856 Opc = AArch64::MLSv8i16_indexed; 7857 RC = &AArch64::FPR128RegClass; 7858 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7859 break; 7860 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 7861 Opc = AArch64::MLAv2i32_indexed; 7862 RC = &AArch64::FPR64RegClass; 7863 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7864 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7865 RC); 7866 break; 7867 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 7868 Opc = AArch64::MLSv2i32_indexed; 7869 RC = &AArch64::FPR64RegClass; 7870 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7871 break; 7872 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 7873 Opc = AArch64::MLAv4i32_indexed; 7874 RC = &AArch64::FPR128RegClass; 7875 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7876 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7877 RC); 7878 break; 7879 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 7880 Opc = AArch64::MLSv4i32_indexed; 7881 RC = &AArch64::FPR128RegClass; 7882 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7883 break; 7884 7885 // Floating Point Support 7886 case AArch64MachineCombinerPattern::FMULADDH_OP1: 7887 Opc = AArch64::FMADDHrrr; 7888 RC = &AArch64::FPR16RegClass; 7889 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7890 break; 7891 case AArch64MachineCombinerPattern::FMULADDS_OP1: 7892 Opc = AArch64::FMADDSrrr; 7893 RC = &AArch64::FPR32RegClass; 7894 MUL = genFusedMultiply(MF, MRI, TII, 
Root, InsInstrs, 1, Opc, RC); 7895 break; 7896 case AArch64MachineCombinerPattern::FMULADDD_OP1: 7897 Opc = AArch64::FMADDDrrr; 7898 RC = &AArch64::FPR64RegClass; 7899 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7900 break; 7901 7902 case AArch64MachineCombinerPattern::FMULADDH_OP2: 7903 Opc = AArch64::FMADDHrrr; 7904 RC = &AArch64::FPR16RegClass; 7905 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7906 break; 7907 case AArch64MachineCombinerPattern::FMULADDS_OP2: 7908 Opc = AArch64::FMADDSrrr; 7909 RC = &AArch64::FPR32RegClass; 7910 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7911 break; 7912 case AArch64MachineCombinerPattern::FMULADDD_OP2: 7913 Opc = AArch64::FMADDDrrr; 7914 RC = &AArch64::FPR64RegClass; 7915 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7916 break; 7917 7918 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1: 7919 Opc = AArch64::FMLAv1i32_indexed; 7920 RC = &AArch64::FPR32RegClass; 7921 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7922 FMAInstKind::Indexed); 7923 break; 7924 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2: 7925 Opc = AArch64::FMLAv1i32_indexed; 7926 RC = &AArch64::FPR32RegClass; 7927 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7928 FMAInstKind::Indexed); 7929 break; 7930 7931 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1: 7932 Opc = AArch64::FMLAv1i64_indexed; 7933 RC = &AArch64::FPR64RegClass; 7934 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7935 FMAInstKind::Indexed); 7936 break; 7937 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2: 7938 Opc = AArch64::FMLAv1i64_indexed; 7939 RC = &AArch64::FPR64RegClass; 7940 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7941 FMAInstKind::Indexed); 7942 break; 7943 7944 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1: 7945 RC = &AArch64::FPR64RegClass; 7946 Opc = AArch64::FMLAv4i16_indexed; 7947 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7948 FMAInstKind::Indexed); 7949 break; 7950 case AArch64MachineCombinerPattern::FMLAv4f16_OP1: 7951 RC = &AArch64::FPR64RegClass; 7952 Opc = AArch64::FMLAv4f16; 7953 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7954 FMAInstKind::Accumulator); 7955 break; 7956 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2: 7957 RC = &AArch64::FPR64RegClass; 7958 Opc = AArch64::FMLAv4i16_indexed; 7959 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7960 FMAInstKind::Indexed); 7961 break; 7962 case AArch64MachineCombinerPattern::FMLAv4f16_OP2: 7963 RC = &AArch64::FPR64RegClass; 7964 Opc = AArch64::FMLAv4f16; 7965 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7966 FMAInstKind::Accumulator); 7967 break; 7968 7969 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1: 7970 case AArch64MachineCombinerPattern::FMLAv2f32_OP1: 7971 RC = &AArch64::FPR64RegClass; 7972 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 7973 Opc = AArch64::FMLAv2i32_indexed; 7974 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7975 FMAInstKind::Indexed); 7976 } else { 7977 Opc = AArch64::FMLAv2f32; 7978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7979 FMAInstKind::Accumulator); 7980 } 7981 break; 7982 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2: 7983 case AArch64MachineCombinerPattern::FMLAv2f32_OP2: 7984 RC = &AArch64::FPR64RegClass; 7985 if 
(Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 7986 Opc = AArch64::FMLAv2i32_indexed; 7987 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7988 FMAInstKind::Indexed); 7989 } else { 7990 Opc = AArch64::FMLAv2f32; 7991 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7992 FMAInstKind::Accumulator); 7993 } 7994 break; 7995 7996 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1: 7997 RC = &AArch64::FPR128RegClass; 7998 Opc = AArch64::FMLAv8i16_indexed; 7999 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8000 FMAInstKind::Indexed); 8001 break; 8002 case AArch64MachineCombinerPattern::FMLAv8f16_OP1: 8003 RC = &AArch64::FPR128RegClass; 8004 Opc = AArch64::FMLAv8f16; 8005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8006 FMAInstKind::Accumulator); 8007 break; 8008 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2: 8009 RC = &AArch64::FPR128RegClass; 8010 Opc = AArch64::FMLAv8i16_indexed; 8011 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8012 FMAInstKind::Indexed); 8013 break; 8014 case AArch64MachineCombinerPattern::FMLAv8f16_OP2: 8015 RC = &AArch64::FPR128RegClass; 8016 Opc = AArch64::FMLAv8f16; 8017 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8018 FMAInstKind::Accumulator); 8019 break; 8020 8021 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1: 8022 case AArch64MachineCombinerPattern::FMLAv2f64_OP1: 8023 RC = &AArch64::FPR128RegClass; 8024 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 8025 Opc = AArch64::FMLAv2i64_indexed; 8026 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8027 FMAInstKind::Indexed); 8028 } else { 8029 Opc = AArch64::FMLAv2f64; 8030 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8031 FMAInstKind::Accumulator); 8032 } 8033 break; 8034 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2: 8035 case AArch64MachineCombinerPattern::FMLAv2f64_OP2: 8036 RC = &AArch64::FPR128RegClass; 8037 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 8038 Opc = AArch64::FMLAv2i64_indexed; 8039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8040 FMAInstKind::Indexed); 8041 } else { 8042 Opc = AArch64::FMLAv2f64; 8043 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8044 FMAInstKind::Accumulator); 8045 } 8046 break; 8047 8048 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1: 8049 case AArch64MachineCombinerPattern::FMLAv4f32_OP1: 8050 RC = &AArch64::FPR128RegClass; 8051 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 8052 Opc = AArch64::FMLAv4i32_indexed; 8053 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8054 FMAInstKind::Indexed); 8055 } else { 8056 Opc = AArch64::FMLAv4f32; 8057 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8058 FMAInstKind::Accumulator); 8059 } 8060 break; 8061 8062 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2: 8063 case AArch64MachineCombinerPattern::FMLAv4f32_OP2: 8064 RC = &AArch64::FPR128RegClass; 8065 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 8066 Opc = AArch64::FMLAv4i32_indexed; 8067 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8068 FMAInstKind::Indexed); 8069 } else { 8070 Opc = AArch64::FMLAv4f32; 8071 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8072 FMAInstKind::Accumulator); 8073 } 8074 break; 8075 8076 case 
AArch64MachineCombinerPattern::FMULSUBH_OP1: 8077 Opc = AArch64::FNMSUBHrrr; 8078 RC = &AArch64::FPR16RegClass; 8079 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8080 break; 8081 case AArch64MachineCombinerPattern::FMULSUBS_OP1: 8082 Opc = AArch64::FNMSUBSrrr; 8083 RC = &AArch64::FPR32RegClass; 8084 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8085 break; 8086 case AArch64MachineCombinerPattern::FMULSUBD_OP1: 8087 Opc = AArch64::FNMSUBDrrr; 8088 RC = &AArch64::FPR64RegClass; 8089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8090 break; 8091 8092 case AArch64MachineCombinerPattern::FNMULSUBH_OP1: 8093 Opc = AArch64::FNMADDHrrr; 8094 RC = &AArch64::FPR16RegClass; 8095 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8096 break; 8097 case AArch64MachineCombinerPattern::FNMULSUBS_OP1: 8098 Opc = AArch64::FNMADDSrrr; 8099 RC = &AArch64::FPR32RegClass; 8100 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8101 break; 8102 case AArch64MachineCombinerPattern::FNMULSUBD_OP1: 8103 Opc = AArch64::FNMADDDrrr; 8104 RC = &AArch64::FPR64RegClass; 8105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 8106 break; 8107 8108 case AArch64MachineCombinerPattern::FMULSUBH_OP2: 8109 Opc = AArch64::FMSUBHrrr; 8110 RC = &AArch64::FPR16RegClass; 8111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 8112 break; 8113 case AArch64MachineCombinerPattern::FMULSUBS_OP2: 8114 Opc = AArch64::FMSUBSrrr; 8115 RC = &AArch64::FPR32RegClass; 8116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 8117 break; 8118 case AArch64MachineCombinerPattern::FMULSUBD_OP2: 8119 Opc = AArch64::FMSUBDrrr; 8120 RC = &AArch64::FPR64RegClass; 8121 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 8122 break; 8123 8124 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2: 8125 Opc = AArch64::FMLSv1i32_indexed; 8126 RC = &AArch64::FPR32RegClass; 8127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8128 FMAInstKind::Indexed); 8129 break; 8130 8131 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2: 8132 Opc = AArch64::FMLSv1i64_indexed; 8133 RC = &AArch64::FPR64RegClass; 8134 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8135 FMAInstKind::Indexed); 8136 break; 8137 8138 case AArch64MachineCombinerPattern::FMLSv4f16_OP1: 8139 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 8140 RC = &AArch64::FPR64RegClass; 8141 Register NewVR = MRI.createVirtualRegister(RC); 8142 MachineInstrBuilder MIB1 = 8143 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 8144 .add(Root.getOperand(2)); 8145 InsInstrs.push_back(MIB1); 8146 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 8147 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) { 8148 Opc = AArch64::FMLAv4f16; 8149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8150 FMAInstKind::Accumulator, &NewVR); 8151 } else { 8152 Opc = AArch64::FMLAv4i16_indexed; 8153 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8154 FMAInstKind::Indexed, &NewVR); 8155 } 8156 break; 8157 } 8158 case AArch64MachineCombinerPattern::FMLSv4f16_OP2: 8159 RC = &AArch64::FPR64RegClass; 8160 Opc = AArch64::FMLSv4f16; 8161 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8162 FMAInstKind::Accumulator); 8163 break; 8164 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2: 8165 RC = &AArch64::FPR64RegClass; 8166 Opc = 
AArch64::FMLSv4i16_indexed; 8167 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8168 FMAInstKind::Indexed); 8169 break; 8170 8171 case AArch64MachineCombinerPattern::FMLSv2f32_OP2: 8172 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2: 8173 RC = &AArch64::FPR64RegClass; 8174 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 8175 Opc = AArch64::FMLSv2i32_indexed; 8176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8177 FMAInstKind::Indexed); 8178 } else { 8179 Opc = AArch64::FMLSv2f32; 8180 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8181 FMAInstKind::Accumulator); 8182 } 8183 break; 8184 8185 case AArch64MachineCombinerPattern::FMLSv8f16_OP1: 8186 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 8187 RC = &AArch64::FPR128RegClass; 8188 Register NewVR = MRI.createVirtualRegister(RC); 8189 MachineInstrBuilder MIB1 = 8190 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 8191 .add(Root.getOperand(2)); 8192 InsInstrs.push_back(MIB1); 8193 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 8194 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) { 8195 Opc = AArch64::FMLAv8f16; 8196 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8197 FMAInstKind::Accumulator, &NewVR); 8198 } else { 8199 Opc = AArch64::FMLAv8i16_indexed; 8200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8201 FMAInstKind::Indexed, &NewVR); 8202 } 8203 break; 8204 } 8205 case AArch64MachineCombinerPattern::FMLSv8f16_OP2: 8206 RC = &AArch64::FPR128RegClass; 8207 Opc = AArch64::FMLSv8f16; 8208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8209 FMAInstKind::Accumulator); 8210 break; 8211 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2: 8212 RC = &AArch64::FPR128RegClass; 8213 Opc = AArch64::FMLSv8i16_indexed; 8214 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8215 FMAInstKind::Indexed); 8216 break; 8217 8218 case AArch64MachineCombinerPattern::FMLSv2f64_OP2: 8219 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2: 8220 RC = &AArch64::FPR128RegClass; 8221 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 8222 Opc = AArch64::FMLSv2i64_indexed; 8223 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8224 FMAInstKind::Indexed); 8225 } else { 8226 Opc = AArch64::FMLSv2f64; 8227 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8228 FMAInstKind::Accumulator); 8229 } 8230 break; 8231 8232 case AArch64MachineCombinerPattern::FMLSv4f32_OP2: 8233 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2: 8234 RC = &AArch64::FPR128RegClass; 8235 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 8236 Opc = AArch64::FMLSv4i32_indexed; 8237 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8238 FMAInstKind::Indexed); 8239 } else { 8240 Opc = AArch64::FMLSv4f32; 8241 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 8242 FMAInstKind::Accumulator); 8243 } 8244 break; 8245 case AArch64MachineCombinerPattern::FMLSv2f32_OP1: 8246 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 8247 RC = &AArch64::FPR64RegClass; 8248 Register NewVR = MRI.createVirtualRegister(RC); 8249 MachineInstrBuilder MIB1 = 8250 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 8251 .add(Root.getOperand(2)); 8252 InsInstrs.push_back(MIB1); 8253 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 8254 if (Pattern == 
AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 8255 Opc = AArch64::FMLAv2i32_indexed; 8256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8257 FMAInstKind::Indexed, &NewVR); 8258 } else { 8259 Opc = AArch64::FMLAv2f32; 8260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8261 FMAInstKind::Accumulator, &NewVR); 8262 } 8263 break; 8264 } 8265 case AArch64MachineCombinerPattern::FMLSv4f32_OP1: 8266 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 8267 RC = &AArch64::FPR128RegClass; 8268 Register NewVR = MRI.createVirtualRegister(RC); 8269 MachineInstrBuilder MIB1 = 8270 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 8271 .add(Root.getOperand(2)); 8272 InsInstrs.push_back(MIB1); 8273 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 8274 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 8275 Opc = AArch64::FMLAv4i32_indexed; 8276 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8277 FMAInstKind::Indexed, &NewVR); 8278 } else { 8279 Opc = AArch64::FMLAv4f32; 8280 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8281 FMAInstKind::Accumulator, &NewVR); 8282 } 8283 break; 8284 } 8285 case AArch64MachineCombinerPattern::FMLSv2f64_OP1: 8286 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 8287 RC = &AArch64::FPR128RegClass; 8288 Register NewVR = MRI.createVirtualRegister(RC); 8289 MachineInstrBuilder MIB1 = 8290 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 8291 .add(Root.getOperand(2)); 8292 InsInstrs.push_back(MIB1); 8293 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 8294 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 8295 Opc = AArch64::FMLAv2i64_indexed; 8296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8297 FMAInstKind::Indexed, &NewVR); 8298 } else { 8299 Opc = AArch64::FMLAv2f64; 8300 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 8301 FMAInstKind::Accumulator, &NewVR); 8302 } 8303 break; 8304 } 8305 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1: 8306 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: { 8307 unsigned IdxDupOp = 8308 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 8309 : 2; 8310 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 8311 &AArch64::FPR128RegClass, MRI); 8312 break; 8313 } 8314 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1: 8315 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: { 8316 unsigned IdxDupOp = 8317 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 8318 : 2; 8319 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 8320 &AArch64::FPR128RegClass, MRI); 8321 break; 8322 } 8323 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1: 8324 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: { 8325 unsigned IdxDupOp = 8326 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 8327 : 2; 8328 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 8329 &AArch64::FPR128_loRegClass, MRI); 8330 break; 8331 } 8332 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1: 8333 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: { 8334 unsigned IdxDupOp = 8335 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 
1 8336 : 2; 8337 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 8338 &AArch64::FPR128RegClass, MRI); 8339 break; 8340 } 8341 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1: 8342 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: { 8343 unsigned IdxDupOp = 8344 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 8345 : 2; 8346 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 8347 &AArch64::FPR128_loRegClass, MRI); 8348 break; 8349 } 8350 case AArch64MachineCombinerPattern::FNMADD: { 8351 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 8352 break; 8353 } 8354 8355 } // end switch (Pattern) 8356 // Record MUL and ADD/SUB for deletion 8357 if (MUL) 8358 DelInstrs.push_back(MUL); 8359 DelInstrs.push_back(&Root); 8360 8361 // Set the flags on the inserted instructions to be the merged flags of the 8362 // instructions that we have combined. 8363 uint32_t Flags = Root.getFlags(); 8364 if (MUL) 8365 Flags = Root.mergeFlagsWith(*MUL); 8366 for (auto *MI : InsInstrs) 8367 MI->setFlags(Flags); 8368 } 8369 8370 /// Replace csincr-branch sequence by simple conditional branch 8371 /// 8372 /// Examples: 8373 /// 1. \code 8374 /// csinc w9, wzr, wzr, <condition code> 8375 /// tbnz w9, #0, 0x44 8376 /// \endcode 8377 /// to 8378 /// \code 8379 /// b.<inverted condition code> 8380 /// \endcode 8381 /// 8382 /// 2. \code 8383 /// csinc w9, wzr, wzr, <condition code> 8384 /// tbz w9, #0, 0x44 8385 /// \endcode 8386 /// to 8387 /// \code 8388 /// b.<condition code> 8389 /// \endcode 8390 /// 8391 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 8392 /// compare's constant operand is power of 2. 8393 /// 8394 /// Examples: 8395 /// \code 8396 /// and w8, w8, #0x400 8397 /// cbnz w8, L1 8398 /// \endcode 8399 /// to 8400 /// \code 8401 /// tbnz w8, #10, L1 8402 /// \endcode 8403 /// 8404 /// \param MI Conditional Branch 8405 /// \return True when the simple conditional branch is generated 8406 /// 8407 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 8408 bool IsNegativeBranch = false; 8409 bool IsTestAndBranch = false; 8410 unsigned TargetBBInMI = 0; 8411 switch (MI.getOpcode()) { 8412 default: 8413 llvm_unreachable("Unknown branch instruction?"); 8414 case AArch64::Bcc: 8415 return false; 8416 case AArch64::CBZW: 8417 case AArch64::CBZX: 8418 TargetBBInMI = 1; 8419 break; 8420 case AArch64::CBNZW: 8421 case AArch64::CBNZX: 8422 TargetBBInMI = 1; 8423 IsNegativeBranch = true; 8424 break; 8425 case AArch64::TBZW: 8426 case AArch64::TBZX: 8427 TargetBBInMI = 2; 8428 IsTestAndBranch = true; 8429 break; 8430 case AArch64::TBNZW: 8431 case AArch64::TBNZX: 8432 TargetBBInMI = 2; 8433 IsNegativeBranch = true; 8434 IsTestAndBranch = true; 8435 break; 8436 } 8437 // So we increment a zero register and test for bits other 8438 // than bit 0? Conservatively bail out in case the verifier 8439 // missed this case. 8440 if (IsTestAndBranch && MI.getOperand(1).getImm()) 8441 return false; 8442 8443 // Find Definition. 8444 assert(MI.getParent() && "Incomplete machine instruciton\n"); 8445 MachineBasicBlock *MBB = MI.getParent(); 8446 MachineFunction *MF = MBB->getParent(); 8447 MachineRegisterInfo *MRI = &MF->getRegInfo(); 8448 Register VReg = MI.getOperand(0).getReg(); 8449 if (!VReg.isVirtual()) 8450 return false; 8451 8452 MachineInstr *DefMI = MRI->getVRegDef(VReg); 8453 8454 // Look through COPY instructions to find definition. 
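// For illustration, a hedged sketch of the kind of MIR chain this walk handles
// (register names and block labels are hypothetical):
//   %1:gpr32 = ANDWri %0, <logical imm>
//   %2:gpr32 = COPY %1
//   CBNZW %2, %bb.target
// The COPY is looked through so that the ANDWri can be folded into a TBNZ below.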
8455 while (DefMI->isCopy()) { 8456 Register CopyVReg = DefMI->getOperand(1).getReg(); 8457 if (!MRI->hasOneNonDBGUse(CopyVReg)) 8458 return false; 8459 if (!MRI->hasOneDef(CopyVReg)) 8460 return false; 8461 DefMI = MRI->getVRegDef(CopyVReg); 8462 } 8463 8464 switch (DefMI->getOpcode()) { 8465 default: 8466 return false; 8467 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 8468 case AArch64::ANDWri: 8469 case AArch64::ANDXri: { 8470 if (IsTestAndBranch) 8471 return false; 8472 if (DefMI->getParent() != MBB) 8473 return false; 8474 if (!MRI->hasOneNonDBGUse(VReg)) 8475 return false; 8476 8477 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 8478 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 8479 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 8480 if (!isPowerOf2_64(Mask)) 8481 return false; 8482 8483 MachineOperand &MO = DefMI->getOperand(1); 8484 Register NewReg = MO.getReg(); 8485 if (!NewReg.isVirtual()) 8486 return false; 8487 8488 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 8489 8490 MachineBasicBlock &RefToMBB = *MBB; 8491 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 8492 DebugLoc DL = MI.getDebugLoc(); 8493 unsigned Imm = Log2_64(Mask); 8494 unsigned Opc = (Imm < 32) 8495 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 8496 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 8497 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 8498 .addReg(NewReg) 8499 .addImm(Imm) 8500 .addMBB(TBB); 8501 // Register lives on to the CBZ now. 8502 MO.setIsKill(false); 8503 8504 // For immediate smaller than 32, we need to use the 32-bit 8505 // variant (W) in all cases. Indeed the 64-bit variant does not 8506 // allow to encode them. 8507 // Therefore, if the input register is 64-bit, we need to take the 8508 // 32-bit sub-part. 8509 if (!Is32Bit && Imm < 32) 8510 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 8511 MI.eraseFromParent(); 8512 return true; 8513 } 8514 // Look for CSINC 8515 case AArch64::CSINCWr: 8516 case AArch64::CSINCXr: { 8517 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 8518 DefMI->getOperand(2).getReg() == AArch64::WZR) && 8519 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 8520 DefMI->getOperand(2).getReg() == AArch64::XZR)) 8521 return false; 8522 8523 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, 8524 true) != -1) 8525 return false; 8526 8527 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 8528 // Convert only when the condition code is not modified between 8529 // the CSINC and the branch. The CC may be used by other 8530 // instructions in between. 
8531 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 8532 return false; 8533 MachineBasicBlock &RefToMBB = *MBB; 8534 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 8535 DebugLoc DL = MI.getDebugLoc(); 8536 if (IsNegativeBranch) 8537 CC = AArch64CC::getInvertedCondCode(CC); 8538 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 8539 MI.eraseFromParent(); 8540 return true; 8541 } 8542 } 8543 } 8544 8545 std::pair<unsigned, unsigned> 8546 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 8547 const unsigned Mask = AArch64II::MO_FRAGMENT; 8548 return std::make_pair(TF & Mask, TF & ~Mask); 8549 } 8550 8551 ArrayRef<std::pair<unsigned, const char *>> 8552 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 8553 using namespace AArch64II; 8554 8555 static const std::pair<unsigned, const char *> TargetFlags[] = { 8556 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 8557 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 8558 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 8559 {MO_HI12, "aarch64-hi12"}}; 8560 return ArrayRef(TargetFlags); 8561 } 8562 8563 ArrayRef<std::pair<unsigned, const char *>> 8564 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 8565 using namespace AArch64II; 8566 8567 static const std::pair<unsigned, const char *> TargetFlags[] = { 8568 {MO_COFFSTUB, "aarch64-coffstub"}, 8569 {MO_GOT, "aarch64-got"}, 8570 {MO_NC, "aarch64-nc"}, 8571 {MO_S, "aarch64-s"}, 8572 {MO_TLS, "aarch64-tls"}, 8573 {MO_DLLIMPORT, "aarch64-dllimport"}, 8574 {MO_PREL, "aarch64-prel"}, 8575 {MO_TAGGED, "aarch64-tagged"}, 8576 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"}, 8577 }; 8578 return ArrayRef(TargetFlags); 8579 } 8580 8581 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 8582 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 8583 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 8584 {{MOSuppressPair, "aarch64-suppress-pair"}, 8585 {MOStridedAccess, "aarch64-strided-access"}}; 8586 return ArrayRef(TargetFlags); 8587 } 8588 8589 /// Constants defining how certain sequences should be outlined. 8590 /// This encompasses how an outlined function should be called, and what kind of 8591 /// frame should be emitted for that outlined function. 8592 /// 8593 /// \p MachineOutlinerDefault implies that the function should be called with 8594 /// a save and restore of LR to the stack. 8595 /// 8596 /// That is, 8597 /// 8598 /// I1 Save LR OUTLINED_FUNCTION: 8599 /// I2 --> BL OUTLINED_FUNCTION I1 8600 /// I3 Restore LR I2 8601 /// I3 8602 /// RET 8603 /// 8604 /// * Call construction overhead: 3 (save + BL + restore) 8605 /// * Frame construction overhead: 1 (ret) 8606 /// * Requires stack fixups? Yes 8607 /// 8608 /// \p MachineOutlinerTailCall implies that the function is being created from 8609 /// a sequence of instructions ending in a return. 8610 /// 8611 /// That is, 8612 /// 8613 /// I1 OUTLINED_FUNCTION: 8614 /// I2 --> B OUTLINED_FUNCTION I1 8615 /// RET I2 8616 /// RET 8617 /// 8618 /// * Call construction overhead: 1 (B) 8619 /// * Frame construction overhead: 0 (Return included in sequence) 8620 /// * Requires stack fixups? No 8621 /// 8622 /// \p MachineOutlinerNoLRSave implies that the function should be called using 8623 /// a BL instruction, but doesn't require LR to be saved and restored. This 8624 /// happens when LR is known to be dead. 
8625 /// 8626 /// That is, 8627 /// 8628 /// I1 OUTLINED_FUNCTION: 8629 /// I2 --> BL OUTLINED_FUNCTION I1 8630 /// I3 I2 8631 /// I3 8632 /// RET 8633 /// 8634 /// * Call construction overhead: 1 (BL) 8635 /// * Frame construction overhead: 1 (RET) 8636 /// * Requires stack fixups? No 8637 /// 8638 /// \p MachineOutlinerThunk implies that the function is being created from 8639 /// a sequence of instructions ending in a call. The outlined function is 8640 /// called with a BL instruction, and the outlined function tail-calls the 8641 /// original call destination. 8642 /// 8643 /// That is, 8644 /// 8645 /// I1 OUTLINED_FUNCTION: 8646 /// I2 --> BL OUTLINED_FUNCTION I1 8647 /// BL f I2 8648 /// B f 8649 /// * Call construction overhead: 1 (BL) 8650 /// * Frame construction overhead: 0 8651 /// * Requires stack fixups? No 8652 /// 8653 /// \p MachineOutlinerRegSave implies that the function should be called with a 8654 /// save and restore of LR to an available register. This allows us to avoid 8655 /// stack fixups. Note that this outlining variant is compatible with the 8656 /// NoLRSave case. 8657 /// 8658 /// That is, 8659 /// 8660 /// I1 Save LR OUTLINED_FUNCTION: 8661 /// I2 --> BL OUTLINED_FUNCTION I1 8662 /// I3 Restore LR I2 8663 /// I3 8664 /// RET 8665 /// 8666 /// * Call construction overhead: 3 (save + BL + restore) 8667 /// * Frame construction overhead: 1 (ret) 8668 /// * Requires stack fixups? No 8669 enum MachineOutlinerClass { 8670 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 8671 MachineOutlinerTailCall, /// Only emit a branch. 8672 MachineOutlinerNoLRSave, /// Emit a call and return. 8673 MachineOutlinerThunk, /// Emit a call and tail-call. 8674 MachineOutlinerRegSave /// Same as default, but save to a register. 8675 }; 8676 8677 enum MachineOutlinerMBBFlags { 8678 LRUnavailableSomewhere = 0x2, 8679 HasCalls = 0x4, 8680 UnsafeRegsDead = 0x8 8681 }; 8682 8683 Register 8684 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 8685 MachineFunction *MF = C.getMF(); 8686 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 8687 const AArch64RegisterInfo *ARI = 8688 static_cast<const AArch64RegisterInfo *>(&TRI); 8689 // Check if there is an available register across the sequence that we can 8690 // use. 8691 for (unsigned Reg : AArch64::GPR64RegClass) { 8692 if (!ARI->isReservedReg(*MF, Reg) && 8693 Reg != AArch64::LR && // LR is not reserved, but don't use it. 8694 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 8695 Reg != AArch64::X17 && // Ditto for X17. 
8696         C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8697         C.isAvailableInsideSeq(Reg, TRI))
8698       return Reg;
8699   }
8700   return Register();
8701 }
8702
8703 static bool
8704 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8705                                          const outliner::Candidate &b) {
8706   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8707   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8708
8709   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8710          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8711 }
8712
8713 static bool
8714 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8715                                        const outliner::Candidate &b) {
8716   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8717   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8718
8719   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8720 }
8721
8722 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8723                                                 const outliner::Candidate &b) {
8724   const AArch64Subtarget &SubtargetA =
8725       a.getMF()->getSubtarget<AArch64Subtarget>();
8726   const AArch64Subtarget &SubtargetB =
8727       b.getMF()->getSubtarget<AArch64Subtarget>();
8728   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8729 }
8730
8731 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
8732 AArch64InstrInfo::getOutliningCandidateInfo(
8733     const MachineModuleInfo &MMI,
8734     std::vector<outliner::Candidate> &RepeatedSequenceLocs,
8735     unsigned MinRepeats) const {
8736   unsigned SequenceSize = 0;
8737   for (auto &MI : RepeatedSequenceLocs[0])
8738     SequenceSize += getInstSizeInBytes(MI);
8739
8740   unsigned NumBytesToCreateFrame = 0;
8741
8742   // We only allow outlining for functions having exactly matching return
8743   // address signing attributes, i.e., all share the same value for the
8744   // attribute "sign-return-address" and all share the same type of key they
8745   // are signed with.
8746   // Additionally we require all functions to simultaneously either support
8747   // v8.3a features or not. Otherwise an outlined function could get signed
8748   // using dedicated v8.3 instructions and a call from a function that doesn't
8749   // support v8.3 instructions would therefore be invalid.
8750   if (std::adjacent_find(
8751           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8752           [](const outliner::Candidate &a, const outliner::Candidate &b) {
8753             // Return true if a and b are non-equal w.r.t. return address
8754             // signing or support of v8.3a features.
8755             if (outliningCandidatesSigningScopeConsensus(a, b) &&
8756                 outliningCandidatesSigningKeyConsensus(a, b) &&
8757                 outliningCandidatesV8_3OpsConsensus(a, b)) {
8758               return false;
8759             }
8760             return true;
8761           }) != RepeatedSequenceLocs.end()) {
8762     return std::nullopt;
8763   }
8764
8765   // Since at this point all candidates agree on their return address signing,
8766   // picking just one is fine. If the candidate functions potentially sign their
8767   // return addresses, the outlined function should do the same. Note that in
8768   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8769   // not certainly true that the outlined function will have to sign its return
8770   // address but this decision is made later, when the decision to outline
8771   // has already been made.
8772 // The same holds for the number of additional instructions we need: On 8773 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 8774 // necessary. However, at this point we don't know if the outlined function 8775 // will have a RET instruction so we assume the worst. 8776 const TargetRegisterInfo &TRI = getRegisterInfo(); 8777 // Performing a tail call may require extra checks when PAuth is enabled. 8778 // If PAuth is disabled, set it to zero for uniformity. 8779 unsigned NumBytesToCheckLRInTCEpilogue = 0; 8780 if (RepeatedSequenceLocs[0] 8781 .getMF() 8782 ->getInfo<AArch64FunctionInfo>() 8783 ->shouldSignReturnAddress(true)) { 8784 // One PAC and one AUT instructions 8785 NumBytesToCreateFrame += 8; 8786 8787 // PAuth is enabled - set extra tail call cost, if any. 8788 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod( 8789 *RepeatedSequenceLocs[0].getMF()); 8790 NumBytesToCheckLRInTCEpilogue = 8791 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod); 8792 // Checking the authenticated LR value may significantly impact 8793 // SequenceSize, so account for it for more precise results. 8794 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back())) 8795 SequenceSize += NumBytesToCheckLRInTCEpilogue; 8796 8797 // We have to check if sp modifying instructions would get outlined. 8798 // If so we only allow outlining if sp is unchanged overall, so matching 8799 // sub and add instructions are okay to outline, all other sp modifications 8800 // are not 8801 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 8802 int SPValue = 0; 8803 for (auto &MI : C) { 8804 if (MI.modifiesRegister(AArch64::SP, &TRI)) { 8805 switch (MI.getOpcode()) { 8806 case AArch64::ADDXri: 8807 case AArch64::ADDWri: 8808 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8809 assert(MI.getOperand(2).isImm() && 8810 "Expected operand to be immediate"); 8811 assert(MI.getOperand(1).isReg() && 8812 "Expected operand to be a register"); 8813 // Check if the add just increments sp. If so, we search for 8814 // matching sub instructions that decrement sp. If not, the 8815 // modification is illegal 8816 if (MI.getOperand(1).getReg() == AArch64::SP) 8817 SPValue += MI.getOperand(2).getImm(); 8818 else 8819 return true; 8820 break; 8821 case AArch64::SUBXri: 8822 case AArch64::SUBWri: 8823 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8824 assert(MI.getOperand(2).isImm() && 8825 "Expected operand to be immediate"); 8826 assert(MI.getOperand(1).isReg() && 8827 "Expected operand to be a register"); 8828 // Check if the sub just decrements sp. If so, we search for 8829 // matching add instructions that increment sp. If not, the 8830 // modification is illegal 8831 if (MI.getOperand(1).getReg() == AArch64::SP) 8832 SPValue -= MI.getOperand(2).getImm(); 8833 else 8834 return true; 8835 break; 8836 default: 8837 return true; 8838 } 8839 } 8840 } 8841 if (SPValue) 8842 return true; 8843 return false; 8844 }; 8845 // Remove candidates with illegal stack modifying instructions 8846 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 8847 8848 // If the sequence doesn't have enough candidates left, then we're done. 8849 if (RepeatedSequenceLocs.size() < MinRepeats) 8850 return std::nullopt; 8851 } 8852 8853 // Properties about candidate MBBs that hold for all of them. 8854 unsigned FlagsSetInAll = 0xF; 8855 8856 // Compute liveness information for each candidate, and set FlagsSetInAll. 
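// As a small worked example (hypothetical flag sets): if one candidate's MBB
// carries {HasCalls} and another's carries {HasCalls | LRUnavailableSomewhere},
// the bitwise AND below leaves FlagsSetInAll == HasCalls; only properties that
// hold for every candidate survive.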
8857   for (outliner::Candidate &C : RepeatedSequenceLocs)
8858     FlagsSetInAll &= C.Flags;
8859
8860   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8861
8862   // Helper lambda which sets call information for every candidate.
8863   auto SetCandidateCallInfo =
8864       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8865         for (outliner::Candidate &C : RepeatedSequenceLocs)
8866           C.setCallInfo(CallID, NumBytesForCall);
8867       };
8868
8869   unsigned FrameID = MachineOutlinerDefault;
8870   NumBytesToCreateFrame += 4;
8871
8872   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8873     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8874   });
8875
8876   // We check to see if CFI Instructions are present, and if they are
8877   // we find the number of CFI Instructions in the candidates.
8878   unsigned CFICount = 0;
8879   for (auto &I : RepeatedSequenceLocs[0]) {
8880     if (I.isCFIInstruction())
8881       CFICount++;
8882   }
8883
8884   // We compare the number of found CFI Instructions to the number of CFI
8885   // instructions in the parent function for each candidate. We must check this
8886   // since if we outline one of the CFI instructions in a function, we have to
8887   // outline them all for correctness. If we do not, the address offsets will be
8888   // incorrect between the two sections of the program.
8889   for (outliner::Candidate &C : RepeatedSequenceLocs) {
8890     std::vector<MCCFIInstruction> CFIInstructions =
8891         C.getMF()->getFrameInstructions();
8892
8893     if (CFICount > 0 && CFICount != CFIInstructions.size())
8894       return std::nullopt;
8895   }
8896
8897   // Returns true if an instruction is safe to fix up, false otherwise.
8898   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8899     if (MI.isCall())
8900       return true;
8901
8902     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8903         !MI.readsRegister(AArch64::SP, &TRI))
8904       return true;
8905
8906     // Any modification of SP will break our code to save/restore LR.
8907     // FIXME: We could handle some instructions which add a constant
8908     // offset to SP, with a bit more work.
8909     if (MI.modifiesRegister(AArch64::SP, &TRI))
8910       return false;
8911
8912     // At this point, we have a stack instruction that we might need to
8913     // fix up. We'll handle it if it's a load or store.
8914     if (MI.mayLoadOrStore()) {
8915       const MachineOperand *Base; // Filled with the base operand of MI.
8916       int64_t Offset;             // Filled with the offset of MI.
8917       bool OffsetIsScalable;
8918
8919       // Does it allow us to offset the base operand and is the base the
8920       // register SP?
8921       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8922           !Base->isReg() || Base->getReg() != AArch64::SP)
8923         return false;
8924
8925       // Fix-up code below assumes bytes.
8926       if (OffsetIsScalable)
8927         return false;
8928
8929       // Find the minimum/maximum offset for this instruction and check
8930       // if fixing it up would be in range.
8931       int64_t MinOffset,
8932           MaxOffset; // Unscaled offsets for the instruction.
8933       // The scale to multiply the offsets by.
8934       TypeSize Scale(0U, false), DummyWidth(0U, false);
8935       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8936
8937       Offset += 16; // Update the offset to what it would be if we outlined.
8938       if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8939           Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8940         return false;
8941
8942       // It's in range, so we can outline it.
8943 return true; 8944 } 8945 8946 // FIXME: Add handling for instructions like "add x0, sp, #8". 8947 8948 // We can't fix it up, so don't outline it. 8949 return false; 8950 }; 8951 8952 // True if it's possible to fix up each stack instruction in this sequence. 8953 // Important for frames/call variants that modify the stack. 8954 bool AllStackInstrsSafe = 8955 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup); 8956 8957 // If the last instruction in any candidate is a terminator, then we should 8958 // tail call all of the candidates. 8959 if (RepeatedSequenceLocs[0].back().isTerminator()) { 8960 FrameID = MachineOutlinerTailCall; 8961 NumBytesToCreateFrame = 0; 8962 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue; 8963 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall); 8964 } 8965 8966 else if (LastInstrOpcode == AArch64::BL || 8967 ((LastInstrOpcode == AArch64::BLR || 8968 LastInstrOpcode == AArch64::BLRNoIP) && 8969 !HasBTI)) { 8970 // FIXME: Do we need to check if the code after this uses the value of LR? 8971 FrameID = MachineOutlinerThunk; 8972 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue; 8973 SetCandidateCallInfo(MachineOutlinerThunk, 4); 8974 } 8975 8976 else { 8977 // We need to decide how to emit calls + frames. We can always emit the same 8978 // frame if we don't need to save to the stack. If we have to save to the 8979 // stack, then we need a different frame. 8980 unsigned NumBytesNoStackCalls = 0; 8981 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 8982 8983 // Check if we have to save LR. 8984 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8985 bool LRAvailable = 8986 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) 8987 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) 8988 : true; 8989 // If we have a noreturn caller, then we're going to be conservative and 8990 // say that we have to save LR. If we don't have a ret at the end of the 8991 // block, then we can't reason about liveness accurately. 8992 // 8993 // FIXME: We can probably do better than always disabling this in 8994 // noreturn functions by fixing up the liveness info. 8995 bool IsNoReturn = 8996 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 8997 8998 // Is LR available? If so, we don't need a save. 8999 if (LRAvailable && !IsNoReturn) { 9000 NumBytesNoStackCalls += 4; 9001 C.setCallInfo(MachineOutlinerNoLRSave, 4); 9002 CandidatesWithoutStackFixups.push_back(C); 9003 } 9004 9005 // Is an unused register available? If so, we won't modify the stack, so 9006 // we can outline with the same frame type as those that don't save LR. 9007 else if (findRegisterToSaveLRTo(C)) { 9008 NumBytesNoStackCalls += 12; 9009 C.setCallInfo(MachineOutlinerRegSave, 12); 9010 CandidatesWithoutStackFixups.push_back(C); 9011 } 9012 9013 // Is SP used in the sequence at all? If not, we don't have to modify 9014 // the stack, so we are guaranteed to get the same frame. 9015 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 9016 NumBytesNoStackCalls += 12; 9017 C.setCallInfo(MachineOutlinerDefault, 12); 9018 CandidatesWithoutStackFixups.push_back(C); 9019 } 9020 9021 // If we outline this, we need to modify the stack. Pretend we don't 9022 // outline this by saving all of its bytes. 9023 else { 9024 NumBytesNoStackCalls += SequenceSize; 9025 } 9026 } 9027 9028 // If there are no places where we have to save LR, then note that we 9029 // don't have to update the stack. 
Otherwise, give every candidate the 9030 // default call type, as long as it's safe to do so. 9031 if (!AllStackInstrsSafe || 9032 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 9033 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 9034 FrameID = MachineOutlinerNoLRSave; 9035 if (RepeatedSequenceLocs.size() < MinRepeats) 9036 return std::nullopt; 9037 } else { 9038 SetCandidateCallInfo(MachineOutlinerDefault, 12); 9039 9040 // Bugzilla ID: 46767 9041 // TODO: Check if fixing up the stack more than once is safe so we can 9042 // outline these. 9043 // 9044 // An outline resulting in a caller that requires stack fixups at the 9045 // callsite to a callee that also requires stack fixups can happen when 9046 // there are no available registers at the candidate callsite for a 9047 // candidate that itself also has calls. 9048 // 9049 // In other words if function_containing_sequence in the following pseudo 9050 // assembly requires that we save LR at the point of the call, but there 9051 // are no available registers: in this case we save using SP and as a 9052 // result the SP offsets requires stack fixups by multiples of 16. 9053 // 9054 // function_containing_sequence: 9055 // ... 9056 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 9057 // call OUTLINED_FUNCTION_N 9058 // restore LR from SP 9059 // ... 9060 // 9061 // OUTLINED_FUNCTION_N: 9062 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 9063 // ... 9064 // bl foo 9065 // restore LR from SP 9066 // ret 9067 // 9068 // Because the code to handle more than one stack fixup does not 9069 // currently have the proper checks for legality, these cases will assert 9070 // in the AArch64 MachineOutliner. This is because the code to do this 9071 // needs more hardening, testing, better checks that generated code is 9072 // legal, etc and because it is only verified to handle a single pass of 9073 // stack fixup. 9074 // 9075 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 9076 // these cases until they are known to be handled. Bugzilla 46767 is 9077 // referenced in comments at the assert site. 9078 // 9079 // To avoid asserting (or generating non-legal code on noassert builds) 9080 // we remove all candidates which would need more than one stack fixup by 9081 // pruning the cases where the candidate has calls while also having no 9082 // available LR and having no available general purpose registers to copy 9083 // LR to (ie one extra stack save/restore). 9084 // 9085 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 9086 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { 9087 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); }; 9088 return (llvm::any_of(C, IsCall)) && 9089 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || 9090 !findRegisterToSaveLRTo(C)); 9091 }); 9092 } 9093 } 9094 9095 // If we dropped all of the candidates, bail out here. 9096 if (RepeatedSequenceLocs.size() < MinRepeats) 9097 return std::nullopt; 9098 } 9099 9100 // Does every candidate's MBB contain a call? If so, then we might have a call 9101 // in the range. 9102 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 9103 // Check if the range contains a call. These require a save + restore of the 9104 // link register. 
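// Roughly (a sketch, not the exact emitted MIR): a candidate containing
//   bl helper
// clobbers LR at that call, so the outlined body has to bracket it with
//   str x30, [sp, #-16]!   ; save LR
//   ...
//   ldr x30, [sp], #16     ; restore LR
// which is where the extra 8 bytes added to NumBytesToCreateFrame below come
// from.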
9105 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 9106 bool ModStackToSaveLR = false; 9107 if (any_of(drop_end(FirstCand), 9108 [](const MachineInstr &MI) { return MI.isCall(); })) 9109 ModStackToSaveLR = true; 9110 9111 // Handle the last instruction separately. If this is a tail call, then the 9112 // last instruction is a call. We don't want to save + restore in this case. 9113 // However, it could be possible that the last instruction is a call without 9114 // it being valid to tail call this sequence. We should consider this as 9115 // well. 9116 else if (FrameID != MachineOutlinerThunk && 9117 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall()) 9118 ModStackToSaveLR = true; 9119 9120 if (ModStackToSaveLR) { 9121 // We can't fix up the stack. Bail out. 9122 if (!AllStackInstrsSafe) 9123 return std::nullopt; 9124 9125 // Save + restore LR. 9126 NumBytesToCreateFrame += 8; 9127 } 9128 } 9129 9130 // If we have CFI instructions, we can only outline if the outlined section 9131 // can be a tail call 9132 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 9133 return std::nullopt; 9134 9135 return std::make_unique<outliner::OutlinedFunction>( 9136 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); 9137 } 9138 9139 void AArch64InstrInfo::mergeOutliningCandidateAttributes( 9140 Function &F, std::vector<outliner::Candidate> &Candidates) const { 9141 // If a bunch of candidates reach this point they must agree on their return 9142 // address signing. It is therefore enough to just consider the signing 9143 // behaviour of one of them 9144 const auto &CFn = Candidates.front().getMF()->getFunction(); 9145 9146 if (CFn.hasFnAttribute("ptrauth-returns")) 9147 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns")); 9148 if (CFn.hasFnAttribute("ptrauth-auth-traps")) 9149 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps")); 9150 // Since all candidates belong to the same module, just copy the 9151 // function-level attributes of an arbitrary function. 9152 if (CFn.hasFnAttribute("sign-return-address")) 9153 F.addFnAttr(CFn.getFnAttribute("sign-return-address")); 9154 if (CFn.hasFnAttribute("sign-return-address-key")) 9155 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key")); 9156 9157 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); 9158 } 9159 9160 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 9161 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 9162 const Function &F = MF.getFunction(); 9163 9164 // Can F be deduplicated by the linker? If it can, don't outline from it. 9165 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 9166 return false; 9167 9168 // Don't outline from functions with section markings; the program could 9169 // expect that all the code is in the named section. 9170 // FIXME: Allow outlining from multiple functions with the same section 9171 // marking. 9172 if (F.hasSection()) 9173 return false; 9174 9175 // Outlining from functions with redzones is unsafe since the outliner may 9176 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 9177 // outline from it. 9178 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 9179 if (!AFI || AFI->hasRedZone().value_or(true)) 9180 return false; 9181 9182 // FIXME: Determine whether it is safe to outline from functions which contain 9183 // streaming-mode changes. 
We may need to ensure any smstart/smstop pairs are 9184 // outlined together and ensure it is safe to outline with async unwind info, 9185 // required for saving & restoring VG around calls. 9186 if (AFI->hasStreamingModeChanges()) 9187 return false; 9188 9189 // FIXME: Teach the outliner to generate/handle Windows unwind info. 9190 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 9191 return false; 9192 9193 // It's safe to outline from MF. 9194 return true; 9195 } 9196 9197 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 9198 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, 9199 unsigned &Flags) const { 9200 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 9201 "Must track liveness!"); 9202 SmallVector< 9203 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 9204 Ranges; 9205 // According to the AArch64 Procedure Call Standard, the following are 9206 // undefined on entry/exit from a function call: 9207 // 9208 // * Registers x16, x17, (and thus w16, w17) 9209 // * Condition codes (and thus the NZCV register) 9210 // 9211 // If any of these registers are used inside or live across an outlined 9212 // function, then they may be modified later, either by the compiler or 9213 // some other tool (like the linker). 9214 // 9215 // To avoid outlining in these situations, partition each block into ranges 9216 // where these registers are dead. We will only outline from those ranges. 9217 LiveRegUnits LRU(getRegisterInfo()); 9218 auto AreAllUnsafeRegsDead = [&LRU]() { 9219 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && 9220 LRU.available(AArch64::NZCV); 9221 }; 9222 9223 // We need to know if LR is live across an outlining boundary later on in 9224 // order to decide how we'll create the outlined call, frame, etc. 9225 // 9226 // It's pretty expensive to check this for *every candidate* within a block. 9227 // That's some potentially n^2 behaviour, since in the worst case, we'd need 9228 // to compute liveness from the end of the block for O(n) candidates within 9229 // the block. 9230 // 9231 // So, to improve the average case, let's keep track of liveness from the end 9232 // of the block to the beginning of *every outlinable range*. If we know that 9233 // LR is available in every range we could outline from, then we know that 9234 // we don't need to check liveness for any candidate within that range. 9235 bool LRAvailableEverywhere = true; 9236 // Compute liveness bottom-up. 9237 LRU.addLiveOuts(MBB); 9238 // Update flags that require info about the entire MBB. 9239 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) { 9240 if (MI.isCall() && !MI.isTerminator()) 9241 Flags |= MachineOutlinerMBBFlags::HasCalls; 9242 }; 9243 // Range: [RangeBegin, RangeEnd) 9244 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd; 9245 unsigned RangeLen; 9246 auto CreateNewRangeStartingAt = 9247 [&RangeBegin, &RangeEnd, 9248 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) { 9249 RangeBegin = NewBegin; 9250 RangeEnd = std::next(RangeBegin); 9251 RangeLen = 0; 9252 }; 9253 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { 9254 // At least one unsafe register is not dead. We do not want to outline at 9255 // this point. If it is long enough to outline from, save the range 9256 // [RangeBegin, RangeEnd). 9257 if (RangeLen > 1) 9258 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); 9259 }; 9260 // Find the first point where all unsafe registers are dead. 
9261 // FIND: <safe instr> <-- end of first potential range 9262 // SKIP: <unsafe def> 9263 // SKIP: ... everything between ... 9264 // SKIP: <unsafe use> 9265 auto FirstPossibleEndPt = MBB.instr_rbegin(); 9266 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { 9267 LRU.stepBackward(*FirstPossibleEndPt); 9268 // Update flags that impact how we outline across the entire block, 9269 // regardless of safety. 9270 UpdateWholeMBBFlags(*FirstPossibleEndPt); 9271 if (AreAllUnsafeRegsDead()) 9272 break; 9273 } 9274 // If we exhausted the entire block, we have no safe ranges to outline. 9275 if (FirstPossibleEndPt == MBB.instr_rend()) 9276 return Ranges; 9277 // Current range. 9278 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator()); 9279 // StartPt points to the first place where all unsafe registers 9280 // are dead (if there is any such point). Begin partitioning the MBB into 9281 // ranges. 9282 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { 9283 LRU.stepBackward(MI); 9284 UpdateWholeMBBFlags(MI); 9285 if (!AreAllUnsafeRegsDead()) { 9286 SaveRangeIfNonEmpty(); 9287 CreateNewRangeStartingAt(MI.getIterator()); 9288 continue; 9289 } 9290 LRAvailableEverywhere &= LRU.available(AArch64::LR); 9291 RangeBegin = MI.getIterator(); 9292 ++RangeLen; 9293 } 9294 // Above loop misses the last (or only) range. If we are still safe, then 9295 // let's save the range. 9296 if (AreAllUnsafeRegsDead()) 9297 SaveRangeIfNonEmpty(); 9298 if (Ranges.empty()) 9299 return Ranges; 9300 // We found the ranges bottom-up. Mapping expects the top-down. Reverse 9301 // the order. 9302 std::reverse(Ranges.begin(), Ranges.end()); 9303 // If there is at least one outlinable range where LR is unavailable 9304 // somewhere, remember that. 9305 if (!LRAvailableEverywhere) 9306 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 9307 return Ranges; 9308 } 9309 9310 outliner::InstrType 9311 AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI, 9312 MachineBasicBlock::iterator &MIT, 9313 unsigned Flags) const { 9314 MachineInstr &MI = *MIT; 9315 MachineBasicBlock *MBB = MI.getParent(); 9316 MachineFunction *MF = MBB->getParent(); 9317 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 9318 9319 // Don't outline anything used for return address signing. The outlined 9320 // function will get signed later if needed 9321 switch (MI.getOpcode()) { 9322 case AArch64::PACM: 9323 case AArch64::PACIASP: 9324 case AArch64::PACIBSP: 9325 case AArch64::PACIASPPC: 9326 case AArch64::PACIBSPPC: 9327 case AArch64::AUTIASP: 9328 case AArch64::AUTIBSP: 9329 case AArch64::AUTIASPPCi: 9330 case AArch64::AUTIASPPCr: 9331 case AArch64::AUTIBSPPCi: 9332 case AArch64::AUTIBSPPCr: 9333 case AArch64::RETAA: 9334 case AArch64::RETAB: 9335 case AArch64::RETAASPPCi: 9336 case AArch64::RETAASPPCr: 9337 case AArch64::RETABSPPCi: 9338 case AArch64::RETABSPPCr: 9339 case AArch64::EMITBKEY: 9340 case AArch64::PAUTH_PROLOGUE: 9341 case AArch64::PAUTH_EPILOGUE: 9342 return outliner::InstrType::Illegal; 9343 } 9344 9345 // Don't outline LOHs. 9346 if (FuncInfo->getLOHRelated().count(&MI)) 9347 return outliner::InstrType::Illegal; 9348 9349 // We can only outline these if we will tail call the outlined function, or 9350 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 9351 // in a tail call. 9352 // 9353 // FIXME: If the proper fixups for the offset are implemented, this should be 9354 // possible. 
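// (Whether a candidate's CFI instructions can actually be outlined as a tail
// call is enforced later, in getOutliningCandidateInfo, via the CFICount check
// against the parent function's frame instructions.)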
9355   if (MI.isCFIInstruction())
9356     return outliner::InstrType::Legal;
9357
9358   // Is this a terminator for a basic block?
9359   if (MI.isTerminator())
9360     // TargetInstrInfo::getOutliningType has already filtered out anything
9361     // that would break this, so we can allow it here.
9362     return outliner::InstrType::Legal;
9363
9364   // Make sure none of the operands are un-outlinable.
9365   for (const MachineOperand &MOP : MI.operands()) {
9366     // A check preventing CFI indices was here before, but only CFI
9367     // instructions should have those.
9368     assert(!MOP.isCFIIndex());
9369
9370     // If it uses LR or W30 explicitly, then don't touch it.
9371     if (MOP.isReg() && !MOP.isImplicit() &&
9372         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
9373       return outliner::InstrType::Illegal;
9374   }
9375
9376   // Special cases for instructions that can always be outlined, but will fail
9377   // the later tests, e.g. ADRPs, which are PC-relative but can always
9378   // be outlined because they don't require a *specific* value to be in LR.
9379   if (MI.getOpcode() == AArch64::ADRP)
9380     return outliner::InstrType::Legal;
9381
9382   // If MI is a call we might be able to outline it. We don't want to outline
9383   // any calls that rely on the position of items on the stack. When we outline
9384   // something containing a call, we have to emit a save and restore of LR in
9385   // the outlined function. Currently, this always happens by saving LR to the
9386   // stack. Thus, if we outline, say, half the parameters for a function call
9387   // plus the call, then we'll break the callee's expectations for the layout
9388   // of the stack.
9389   //
9390   // FIXME: Allow calls to functions which construct a stack frame, as long
9391   // as they don't access arguments on the stack.
9392   // FIXME: Figure out some way to analyze functions defined in other modules.
9393   // We should be able to compute the memory usage based on the IR calling
9394   // convention, even if we can't see the definition.
9395   if (MI.isCall()) {
9396     // Get the function associated with the call. Look at each operand and find
9397     // the one that represents the callee and get its name.
9398     const Function *Callee = nullptr;
9399     for (const MachineOperand &MOP : MI.operands()) {
9400       if (MOP.isGlobal()) {
9401         Callee = dyn_cast<Function>(MOP.getGlobal());
9402         break;
9403       }
9404     }
9405
9406     // Never outline calls to mcount. There isn't any rule that would require
9407     // this, but the Linux kernel's "ftrace" feature depends on it.
9408     if (Callee && Callee->getName() == "\01_mcount")
9409       return outliner::InstrType::Illegal;
9410
9411     // If we don't know anything about the callee, assume it depends on the
9412     // stack layout of the caller. In that case, it's only legal to outline
9413     // as a tail-call. Explicitly list the call instructions we know about so we
9414     // don't get unexpected results with call pseudo-instructions.
9415     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
9416     if (MI.getOpcode() == AArch64::BLR ||
9417         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
9418       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
9419
9420     if (!Callee)
9421       return UnknownCallOutlineType;
9422
9423     // We have a function we have information about. Check if it's something
9424     // we can safely outline.
9425     MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
9426
9427     // We don't know what's going on with the callee at all. Don't touch it.
9428 if (!CalleeMF) 9429 return UnknownCallOutlineType; 9430 9431 // Check if we know anything about the callee saves on the function. If we 9432 // don't, then don't touch it, since that implies that we haven't 9433 // computed anything about its stack frame yet. 9434 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 9435 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 9436 MFI.getNumObjects() > 0) 9437 return UnknownCallOutlineType; 9438 9439 // At this point, we can say that CalleeMF ought to not pass anything on the 9440 // stack. Therefore, we can outline it. 9441 return outliner::InstrType::Legal; 9442 } 9443 9444 // Don't touch the link register or W30. 9445 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 9446 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 9447 return outliner::InstrType::Illegal; 9448 9449 // Don't outline BTI instructions, because that will prevent the outlining 9450 // site from being indirectly callable. 9451 if (hasBTISemantics(MI)) 9452 return outliner::InstrType::Illegal; 9453 9454 return outliner::InstrType::Legal; 9455 } 9456 9457 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 9458 for (MachineInstr &MI : MBB) { 9459 const MachineOperand *Base; 9460 TypeSize Width(0, false); 9461 int64_t Offset; 9462 bool OffsetIsScalable; 9463 9464 // Is this a load or store with an immediate offset with SP as the base? 9465 if (!MI.mayLoadOrStore() || 9466 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 9467 &RI) || 9468 (Base->isReg() && Base->getReg() != AArch64::SP)) 9469 continue; 9470 9471 // It is, so we have to fix it up. 9472 TypeSize Scale(0U, false); 9473 int64_t Dummy1, Dummy2; 9474 9475 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 9476 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 9477 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 9478 assert(Scale != 0 && "Unexpected opcode!"); 9479 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 9480 9481 // We've pushed the return address to the stack, so add 16 to the offset. 9482 // This is safe, since we already checked if it would overflow when we 9483 // checked if this instruction was legal to outline. 9484 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue(); 9485 StackOffsetOperand.setImm(NewImm); 9486 } 9487 } 9488 9489 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 9490 const AArch64InstrInfo *TII, 9491 bool ShouldSignReturnAddr) { 9492 if (!ShouldSignReturnAddr) 9493 return; 9494 9495 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE)) 9496 .setMIFlag(MachineInstr::FrameSetup); 9497 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(), 9498 TII->get(AArch64::PAUTH_EPILOGUE)) 9499 .setMIFlag(MachineInstr::FrameDestroy); 9500 } 9501 9502 void AArch64InstrInfo::buildOutlinedFrame( 9503 MachineBasicBlock &MBB, MachineFunction &MF, 9504 const outliner::OutlinedFunction &OF) const { 9505 9506 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 9507 9508 if (OF.FrameConstructionID == MachineOutlinerTailCall) 9509 FI->setOutliningStyle("Tail Call"); 9510 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 9511 // For thunk outlining, rewrite the last instruction from a call to a 9512 // tail-call. 
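// Schematically (operand details elided): a trailing
//   BL @callee        becomes   TCRETURNdi @callee, 0
//   BLR/BLRNoIP $xN   becomes   TCRETURNriALL $xN, 0
// so the outlined thunk ends in a tail call rather than returning here.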
9513 MachineInstr *Call = &*--MBB.instr_end(); 9514 unsigned TailOpcode; 9515 if (Call->getOpcode() == AArch64::BL) { 9516 TailOpcode = AArch64::TCRETURNdi; 9517 } else { 9518 assert(Call->getOpcode() == AArch64::BLR || 9519 Call->getOpcode() == AArch64::BLRNoIP); 9520 TailOpcode = AArch64::TCRETURNriALL; 9521 } 9522 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 9523 .add(Call->getOperand(0)) 9524 .addImm(0); 9525 MBB.insert(MBB.end(), TC); 9526 Call->eraseFromParent(); 9527 9528 FI->setOutliningStyle("Thunk"); 9529 } 9530 9531 bool IsLeafFunction = true; 9532 9533 // Is there a call in the outlined range? 9534 auto IsNonTailCall = [](const MachineInstr &MI) { 9535 return MI.isCall() && !MI.isReturn(); 9536 }; 9537 9538 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 9539 // Fix up the instructions in the range, since we're going to modify the 9540 // stack. 9541 9542 // Bugzilla ID: 46767 9543 // TODO: Check if fixing up twice is safe so we can outline these. 9544 assert(OF.FrameConstructionID != MachineOutlinerDefault && 9545 "Can only fix up stack references once"); 9546 fixupPostOutline(MBB); 9547 9548 IsLeafFunction = false; 9549 9550 // LR has to be a live in so that we can save it. 9551 if (!MBB.isLiveIn(AArch64::LR)) 9552 MBB.addLiveIn(AArch64::LR); 9553 9554 MachineBasicBlock::iterator It = MBB.begin(); 9555 MachineBasicBlock::iterator Et = MBB.end(); 9556 9557 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9558 OF.FrameConstructionID == MachineOutlinerThunk) 9559 Et = std::prev(MBB.end()); 9560 9561 // Insert a save before the outlined region 9562 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9563 .addReg(AArch64::SP, RegState::Define) 9564 .addReg(AArch64::LR) 9565 .addReg(AArch64::SP) 9566 .addImm(-16); 9567 It = MBB.insert(It, STRXpre); 9568 9569 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 9570 const TargetSubtargetInfo &STI = MF.getSubtarget(); 9571 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 9572 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 9573 9574 // Add a CFI saying the stack was moved 16 B down. 9575 int64_t StackPosEntry = 9576 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 9577 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9578 .addCFIIndex(StackPosEntry) 9579 .setMIFlags(MachineInstr::FrameSetup); 9580 9581 // Add a CFI saying that the LR that we want to find is now 16 B higher 9582 // than before. 9583 int64_t LRPosEntry = MF.addFrameInst( 9584 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 9585 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9586 .addCFIIndex(LRPosEntry) 9587 .setMIFlags(MachineInstr::FrameSetup); 9588 } 9589 9590 // Insert a restore before the terminator for the function. 9591 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9592 .addReg(AArch64::SP, RegState::Define) 9593 .addReg(AArch64::LR, RegState::Define) 9594 .addReg(AArch64::SP) 9595 .addImm(16); 9596 Et = MBB.insert(Et, LDRXpost); 9597 } 9598 9599 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction); 9600 9601 // If this is a tail call outlined function, then there's already a return. 9602 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9603 OF.FrameConstructionID == MachineOutlinerThunk) { 9604 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9605 return; 9606 } 9607 9608 // It's not a tail call, so we have to insert the return ourselves. 
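// (The frame then ends in a plain "RET $lr"; if return-address signing was
// requested for the candidates, signOutlinedFunction below wraps the body in
// PAUTH_PROLOGUE / PAUTH_EPILOGUE pseudos rather than emitting PAC/AUT here.)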
9609 9610 // LR has to be a live in so that we can return to it. 9611 if (!MBB.isLiveIn(AArch64::LR)) 9612 MBB.addLiveIn(AArch64::LR); 9613 9614 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 9615 .addReg(AArch64::LR); 9616 MBB.insert(MBB.end(), ret); 9617 9618 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9619 9620 FI->setOutliningStyle("Function"); 9621 9622 // Did we have to modify the stack by saving the link register? 9623 if (OF.FrameConstructionID != MachineOutlinerDefault) 9624 return; 9625 9626 // We modified the stack. 9627 // Walk over the basic block and fix up all the stack accesses. 9628 fixupPostOutline(MBB); 9629 } 9630 9631 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 9632 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 9633 MachineFunction &MF, outliner::Candidate &C) const { 9634 9635 // Are we tail calling? 9636 if (C.CallConstructionID == MachineOutlinerTailCall) { 9637 // If yes, then we can just branch to the label. 9638 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 9639 .addGlobalAddress(M.getNamedValue(MF.getName())) 9640 .addImm(0)); 9641 return It; 9642 } 9643 9644 // Are we saving the link register? 9645 if (C.CallConstructionID == MachineOutlinerNoLRSave || 9646 C.CallConstructionID == MachineOutlinerThunk) { 9647 // No, so just insert the call. 9648 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9649 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9650 return It; 9651 } 9652 9653 // We want to return the spot where we inserted the call. 9654 MachineBasicBlock::iterator CallPt; 9655 9656 // Instructions for saving and restoring LR around the call instruction we're 9657 // going to insert. 9658 MachineInstr *Save; 9659 MachineInstr *Restore; 9660 // Can we save to a register? 9661 if (C.CallConstructionID == MachineOutlinerRegSave) { 9662 // FIXME: This logic should be sunk into a target-specific interface so that 9663 // we don't have to recompute the register. 9664 Register Reg = findRegisterToSaveLRTo(C); 9665 assert(Reg && "No callee-saved register available?"); 9666 9667 // LR has to be a live in so that we can save it. 9668 if (!MBB.isLiveIn(AArch64::LR)) 9669 MBB.addLiveIn(AArch64::LR); 9670 9671 // Save and restore LR from Reg. 9672 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 9673 .addReg(AArch64::XZR) 9674 .addReg(AArch64::LR) 9675 .addImm(0); 9676 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 9677 .addReg(AArch64::XZR) 9678 .addReg(Reg) 9679 .addImm(0); 9680 } else { 9681 // We have the default case. Save and restore from SP. 9682 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9683 .addReg(AArch64::SP, RegState::Define) 9684 .addReg(AArch64::LR) 9685 .addReg(AArch64::SP) 9686 .addImm(-16); 9687 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9688 .addReg(AArch64::SP, RegState::Define) 9689 .addReg(AArch64::LR, RegState::Define) 9690 .addReg(AArch64::SP) 9691 .addImm(16); 9692 } 9693 9694 It = MBB.insert(It, Save); 9695 It++; 9696 9697 // Insert the call. 
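// Taken together, the RegSave flavour produces a call sequence along these
// lines (x20 is only an illustrative choice of scratch register):
//   mov x20, lr            ; ORRXrs x20, xzr, lr, 0
//   bl  OUTLINED_FUNCTION
//   mov lr, x20
// while the default flavour brackets the BL with the STRXpre/LDRXpost pair
// built above.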
9698 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9699 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9700 CallPt = It; 9701 It++; 9702 9703 It = MBB.insert(It, Restore); 9704 return CallPt; 9705 } 9706 9707 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 9708 MachineFunction &MF) const { 9709 return MF.getFunction().hasMinSize(); 9710 } 9711 9712 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, 9713 MachineBasicBlock::iterator Iter, 9714 DebugLoc &DL, 9715 bool AllowSideEffects) const { 9716 const MachineFunction &MF = *MBB.getParent(); 9717 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); 9718 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); 9719 9720 if (TRI.isGeneralPurposeRegister(MF, Reg)) { 9721 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0); 9722 } else if (STI.isSVEorStreamingSVEAvailable()) { 9723 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg) 9724 .addImm(0) 9725 .addImm(0); 9726 } else if (STI.isNeonAvailable()) { 9727 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg) 9728 .addImm(0); 9729 } else { 9730 // This is a streaming-compatible function without SVE. We don't have full 9731 // Neon (just FPRs), so we can at most use the first 64-bit sub-register. 9732 // So given `movi v..` would be illegal use `fmov d..` instead. 9733 assert(STI.hasNEON() && "Expected to have NEON."); 9734 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub); 9735 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64); 9736 } 9737 } 9738 9739 std::optional<DestSourcePair> 9740 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 9741 9742 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 9743 // and zero immediate operands used as an alias for mov instruction. 9744 if (((MI.getOpcode() == AArch64::ORRWrs && 9745 MI.getOperand(1).getReg() == AArch64::WZR && 9746 MI.getOperand(3).getImm() == 0x0) || 9747 (MI.getOpcode() == AArch64::ORRWrr && 9748 MI.getOperand(1).getReg() == AArch64::WZR)) && 9749 // Check that the w->w move is not a zero-extending w->x mov. 9750 (!MI.getOperand(0).getReg().isVirtual() || 9751 MI.getOperand(0).getSubReg() == 0) && 9752 (!MI.getOperand(0).getReg().isPhysical() || 9753 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 + 9754 AArch64::X0, 9755 /*TRI=*/nullptr) == -1)) 9756 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9757 9758 if (MI.getOpcode() == AArch64::ORRXrs && 9759 MI.getOperand(1).getReg() == AArch64::XZR && 9760 MI.getOperand(3).getImm() == 0x0) 9761 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9762 9763 return std::nullopt; 9764 } 9765 9766 std::optional<DestSourcePair> 9767 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const { 9768 if ((MI.getOpcode() == AArch64::ORRWrs && 9769 MI.getOperand(1).getReg() == AArch64::WZR && 9770 MI.getOperand(3).getImm() == 0x0) || 9771 (MI.getOpcode() == AArch64::ORRWrr && 9772 MI.getOperand(1).getReg() == AArch64::WZR)) 9773 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9774 return std::nullopt; 9775 } 9776 9777 std::optional<RegImmPair> 9778 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const { 9779 int Sign = 1; 9780 int64_t Offset = 0; 9781 9782 // TODO: Handle cases where Reg is a super- or sub-register of the 9783 // destination register. 
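// Hedged examples (hypothetical operands) of what this returns when Reg is the
// destination register:
//   $x1 = ADDXri $x0, 16, 0   ->  {x0, +16}
//   $x1 = SUBXri $x0, 1, 12   ->  {x0, -4096}   (immediate shifted left by 12)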
9784 const MachineOperand &Op0 = MI.getOperand(0); 9785 if (!Op0.isReg() || Reg != Op0.getReg()) 9786 return std::nullopt; 9787 9788 switch (MI.getOpcode()) { 9789 default: 9790 return std::nullopt; 9791 case AArch64::SUBWri: 9792 case AArch64::SUBXri: 9793 case AArch64::SUBSWri: 9794 case AArch64::SUBSXri: 9795 Sign *= -1; 9796 [[fallthrough]]; 9797 case AArch64::ADDSWri: 9798 case AArch64::ADDSXri: 9799 case AArch64::ADDWri: 9800 case AArch64::ADDXri: { 9801 // TODO: Third operand can be global address (usually some string). 9802 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 9803 !MI.getOperand(2).isImm()) 9804 return std::nullopt; 9805 int Shift = MI.getOperand(3).getImm(); 9806 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 9807 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 9808 } 9809 } 9810 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 9811 } 9812 9813 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 9814 /// the destination register then, if possible, describe the value in terms of 9815 /// the source register. 9816 static std::optional<ParamLoadedValue> 9817 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 9818 const TargetInstrInfo *TII, 9819 const TargetRegisterInfo *TRI) { 9820 auto DestSrc = TII->isCopyLikeInstr(MI); 9821 if (!DestSrc) 9822 return std::nullopt; 9823 9824 Register DestReg = DestSrc->Destination->getReg(); 9825 Register SrcReg = DestSrc->Source->getReg(); 9826 9827 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 9828 9829 // If the described register is the destination, just return the source. 9830 if (DestReg == DescribedReg) 9831 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9832 9833 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 9834 if (MI.getOpcode() == AArch64::ORRWrs && 9835 TRI->isSuperRegister(DestReg, DescribedReg)) 9836 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9837 9838 // We may need to describe the lower part of a ORRXrs move. 9839 if (MI.getOpcode() == AArch64::ORRXrs && 9840 TRI->isSubRegister(DestReg, DescribedReg)) { 9841 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 9842 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 9843 } 9844 9845 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 9846 "Unhandled ORR[XW]rs copy case"); 9847 9848 return std::nullopt; 9849 } 9850 9851 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const { 9852 // Functions cannot be split to different sections on AArch64 if they have 9853 // a red zone. This is because relaxing a cross-section branch may require 9854 // incrementing the stack pointer to spill a register, which would overwrite 9855 // the red zone. 9856 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true)) 9857 return false; 9858 9859 return TargetInstrInfo::isFunctionSafeToSplit(MF); 9860 } 9861 9862 bool AArch64InstrInfo::isMBBSafeToSplitToCold( 9863 const MachineBasicBlock &MBB) const { 9864 // Asm Goto blocks can contain conditional branches to goto labels, which can 9865 // get moved out of range of the branch instruction. 
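// (For instance, a conditional branch emitted inside an "asm goto" block has a
// fixed, limited displacement that the backend cannot relax, so moving either
// the block or its target into the cold section could put the label out of
// range.)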
9866 auto isAsmGoto = [](const MachineInstr &MI) { 9867 return MI.getOpcode() == AArch64::INLINEASM_BR; 9868 }; 9869 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget()) 9870 return false; 9871 9872 // Because jump tables are label-relative instead of table-relative, they all 9873 // must be in the same section or relocation fixup handling will fail. 9874 9875 // Check if MBB is a jump table target 9876 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo(); 9877 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) { 9878 return llvm::is_contained(JTE.MBBs, &MBB); 9879 }; 9880 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB)) 9881 return false; 9882 9883 // Check if MBB contains a jump table lookup 9884 for (const MachineInstr &MI : MBB) { 9885 switch (MI.getOpcode()) { 9886 case TargetOpcode::G_BRJT: 9887 case AArch64::JumpTableDest32: 9888 case AArch64::JumpTableDest16: 9889 case AArch64::JumpTableDest8: 9890 return false; 9891 default: 9892 continue; 9893 } 9894 } 9895 9896 // MBB isn't a special case, so it's safe to be split to the cold section. 9897 return true; 9898 } 9899 9900 std::optional<ParamLoadedValue> 9901 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 9902 Register Reg) const { 9903 const MachineFunction *MF = MI.getMF(); 9904 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 9905 switch (MI.getOpcode()) { 9906 case AArch64::MOVZWi: 9907 case AArch64::MOVZXi: { 9908 // MOVZWi may be used for producing zero-extended 32-bit immediates in 9909 // 64-bit parameters, so we need to consider super-registers. 9910 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 9911 return std::nullopt; 9912 9913 if (!MI.getOperand(1).isImm()) 9914 return std::nullopt; 9915 int64_t Immediate = MI.getOperand(1).getImm(); 9916 int Shift = MI.getOperand(2).getImm(); 9917 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 9918 nullptr); 9919 } 9920 case AArch64::ORRWrs: 9921 case AArch64::ORRXrs: 9922 return describeORRLoadedValue(MI, Reg, this, TRI); 9923 } 9924 9925 return TargetInstrInfo::describeLoadedValue(MI, Reg); 9926 } 9927 9928 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 9929 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 9930 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 9931 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 9932 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 9933 9934 // Anyexts are nops. 9935 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 9936 return true; 9937 9938 Register DefReg = ExtMI.getOperand(0).getReg(); 9939 if (!MRI.hasOneNonDBGUse(DefReg)) 9940 return false; 9941 9942 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 9943 // addressing mode. 9944 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 9945 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 9946 } 9947 9948 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 9949 return get(Opc).TSFlags & AArch64::ElementSizeMask; 9950 } 9951 9952 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 9953 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 9954 } 9955 9956 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 9957 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 9958 } 9959 9960 unsigned int 9961 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const { 9962 return OptLevel >= CodeGenOptLevel::Aggressive ? 
                                                              6 : 2;
9963 }
9964
9965 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9966                                              unsigned Scale) const {
9967   if (Offset && Scale)
9968     return false;
9969
9970   // Check Reg + Imm
9971   if (!Scale) {
9972     // 9-bit signed offset
9973     if (isInt<9>(Offset))
9974       return true;
9975
9976     // 12-bit unsigned offset
9977     unsigned Shift = Log2_64(NumBytes);
9978     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9979         // Must be a multiple of NumBytes (NumBytes is a power of 2)
9980         (Offset >> Shift) << Shift == Offset)
9981       return true;
9982     return false;
9983   }
9984
9985   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9986   return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9987 }
9988
9989 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9990   if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9991     return AArch64::BLRNoIP;
9992   else
9993     return AArch64::BLR;
9994 }
9995
9996 MachineBasicBlock::iterator
9997 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9998                                    Register TargetReg, bool FrameSetup) const {
9999   assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
10000
10001   MachineBasicBlock &MBB = *MBBI->getParent();
10002   MachineFunction &MF = *MBB.getParent();
10003   const AArch64InstrInfo *TII =
10004       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
10005   int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
10006   DebugLoc DL = MBB.findDebugLoc(MBBI);
10007
10008   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
10009   MachineBasicBlock *LoopTestMBB =
10010       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10011   MF.insert(MBBInsertPoint, LoopTestMBB);
10012   MachineBasicBlock *LoopBodyMBB =
10013       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10014   MF.insert(MBBInsertPoint, LoopBodyMBB);
10015   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10016   MF.insert(MBBInsertPoint, ExitMBB);
10017   MachineInstr::MIFlag Flags =
10018       FrameSetup ?
MachineInstr::FrameSetup : MachineInstr::NoFlags; 10019 10020 // LoopTest: 10021 // SUB SP, SP, #ProbeSize 10022 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, 10023 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags); 10024 10025 // CMP SP, TargetReg 10026 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), 10027 AArch64::XZR) 10028 .addReg(AArch64::SP) 10029 .addReg(TargetReg) 10030 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) 10031 .setMIFlags(Flags); 10032 10033 // B.<Cond> LoopExit 10034 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) 10035 .addImm(AArch64CC::LE) 10036 .addMBB(ExitMBB) 10037 .setMIFlags(Flags); 10038 10039 // STR XZR, [SP] 10040 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) 10041 .addReg(AArch64::XZR) 10042 .addReg(AArch64::SP) 10043 .addImm(0) 10044 .setMIFlags(Flags); 10045 10046 // B loop 10047 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) 10048 .addMBB(LoopTestMBB) 10049 .setMIFlags(Flags); 10050 10051 // LoopExit: 10052 // MOV SP, TargetReg 10053 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) 10054 .addReg(TargetReg) 10055 .addImm(0) 10056 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 10057 .setMIFlags(Flags); 10058 10059 // LDR XZR, [SP] 10060 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui)) 10061 .addReg(AArch64::XZR, RegState::Define) 10062 .addReg(AArch64::SP) 10063 .addImm(0) 10064 .setMIFlags(Flags); 10065 10066 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); 10067 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); 10068 10069 LoopTestMBB->addSuccessor(ExitMBB); 10070 LoopTestMBB->addSuccessor(LoopBodyMBB); 10071 LoopBodyMBB->addSuccessor(LoopTestMBB); 10072 MBB.addSuccessor(LoopTestMBB); 10073 10074 // Update liveins. 
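  // The new blocks use SP, TargetReg and NZCV, and ExitMBB has taken over the
  // tail of MBB, so recompute the live-in lists of all three blocks rather
  // than trying to patch them up by hand.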
10075 if (MF.getRegInfo().reservedRegsFrozen()) 10076 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB}); 10077 10078 return ExitMBB->begin(); 10079 } 10080 10081 namespace { 10082 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { 10083 MachineFunction *MF; 10084 const TargetInstrInfo *TII; 10085 const TargetRegisterInfo *TRI; 10086 MachineRegisterInfo &MRI; 10087 10088 /// The block of the loop 10089 MachineBasicBlock *LoopBB; 10090 /// The conditional branch of the loop 10091 MachineInstr *CondBranch; 10092 /// The compare instruction for loop control 10093 MachineInstr *Comp; 10094 /// The number of the operand of the loop counter value in Comp 10095 unsigned CompCounterOprNum; 10096 /// The instruction that updates the loop counter value 10097 MachineInstr *Update; 10098 /// The number of the operand of the loop counter value in Update 10099 unsigned UpdateCounterOprNum; 10100 /// The initial value of the loop counter 10101 Register Init; 10102 /// True iff Update is a predecessor of Comp 10103 bool IsUpdatePriorComp; 10104 10105 /// The normalized condition used by createTripCountGreaterCondition() 10106 SmallVector<MachineOperand, 4> Cond; 10107 10108 public: 10109 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch, 10110 MachineInstr *Comp, unsigned CompCounterOprNum, 10111 MachineInstr *Update, unsigned UpdateCounterOprNum, 10112 Register Init, bool IsUpdatePriorComp, 10113 const SmallVectorImpl<MachineOperand> &Cond) 10114 : MF(Comp->getParent()->getParent()), 10115 TII(MF->getSubtarget().getInstrInfo()), 10116 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()), 10117 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp), 10118 CompCounterOprNum(CompCounterOprNum), Update(Update), 10119 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init), 10120 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {} 10121 10122 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { 10123 // Make the instructions for loop control be placed in stage 0. 10124 // The predecessors of Comp are considered by the caller. 10125 return MI == Comp; 10126 } 10127 10128 std::optional<bool> createTripCountGreaterCondition( 10129 int TC, MachineBasicBlock &MBB, 10130 SmallVectorImpl<MachineOperand> &CondParam) override { 10131 // A branch instruction will be inserted as "if (Cond) goto epilogue". 10132 // Cond is normalized for such use. 10133 // The predecessors of the branch are assumed to have already been inserted. 10134 CondParam = Cond; 10135 return {}; 10136 } 10137 10138 void createRemainingIterationsGreaterCondition( 10139 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, 10140 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override; 10141 10142 void setPreheader(MachineBasicBlock *NewPreheader) override {} 10143 10144 void adjustTripCount(int TripCountAdjust) override {} 10145 10146 bool isMVEExpanderSupported() override { return true; } 10147 }; 10148 } // namespace 10149 10150 /// Clone an instruction from MI. The register of ReplaceOprNum-th operand 10151 /// is replaced by ReplaceReg. The output register is newly created. 10152 /// The other operands are unchanged from MI. 
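/// Illustrative sketch (MIR-style example, not taken from the source): cloning
///   %2 = ADDXri %1, 1, 0
/// with ReplaceOprNum = 1 and ReplaceReg = %5 inserts
///   %new = ADDXri %5, 1, 0
/// before InsertTo and returns %new.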
10153 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, 10154 Register ReplaceReg, MachineBasicBlock &MBB, 10155 MachineBasicBlock::iterator InsertTo) { 10156 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 10157 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); 10158 const TargetRegisterInfo *TRI = 10159 MBB.getParent()->getSubtarget().getRegisterInfo(); 10160 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); 10161 Register Result = 0; 10162 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { 10163 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) { 10164 Result = MRI.createVirtualRegister( 10165 MRI.getRegClass(NewMI->getOperand(0).getReg())); 10166 NewMI->getOperand(I).setReg(Result); 10167 } else if (I == ReplaceOprNum) { 10168 MRI.constrainRegClass( 10169 ReplaceReg, 10170 TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent())); 10171 NewMI->getOperand(I).setReg(ReplaceReg); 10172 } 10173 } 10174 MBB.insert(InsertTo, NewMI); 10175 return Result; 10176 } 10177 10178 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition( 10179 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, 10180 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) { 10181 // Create and accumulate conditions for next TC iterations. 10182 // Example: 10183 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last 10184 // # iteration of the kernel 10185 // 10186 // # insert the following instructions 10187 // cond = CSINCXr 0, 0, C, implicit $nzcv 10188 // counter = ADDXri counter, 1 # clone from this->Update 10189 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp 10190 // cond = CSINCXr cond, cond, C, implicit $nzcv 10191 // ... (repeat TC times) 10192 // SUBSXri cond, 0, implicit-def $nzcv 10193 10194 assert(CondBranch->getOpcode() == AArch64::Bcc); 10195 // CondCode to exit the loop 10196 AArch64CC::CondCode CC = 10197 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm(); 10198 if (CondBranch->getOperand(1).getMBB() == LoopBB) 10199 CC = AArch64CC::getInvertedCondCode(CC); 10200 10201 // Accumulate conditions to exit the loop 10202 Register AccCond = AArch64::XZR; 10203 10204 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned. 
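  // This is implemented with a single CSINC: "CSINC NewCond, CurCond, CurCond,
  // !CC" computes NewCond = CC ? CurCond + 1 : CurCond, so AccCond ends up
  // counting how many of the accumulated exit conditions held.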
10205   auto AccumulateCond = [&](Register CurCond,
10206                             AArch64CC::CondCode CC) -> Register {
10207     Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
10208     BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
10209         .addReg(NewCond, RegState::Define)
10210         .addReg(CurCond)
10211         .addReg(CurCond)
10212         .addImm(AArch64CC::getInvertedCondCode(CC));
10213     return NewCond;
10214   };
10215
10216   if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
10217     // Update and Comp for I == 0 already exist in MBB
10218     // (MBB is an unrolled kernel).
10219     Register Counter;
10220     for (int I = 0; I <= TC; ++I) {
10221       Register NextCounter;
10222       if (I != 0)
10223         NextCounter =
10224             cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10225
10226       AccCond = AccumulateCond(AccCond, CC);
10227
10228       if (I != TC) {
10229         if (I == 0) {
10230           if (Update != Comp && IsUpdatePriorComp) {
10231             Counter =
10232                 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10233             NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
10234                                      MBB.end());
10235           } else {
10236             // We can use the already-computed value.
10237             NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
10238           }
10239         } else if (Update != Comp) {
10240           NextCounter =
10241               cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10242         }
10243       }
10244       Counter = NextCounter;
10245     }
10246   } else {
10247     Register Counter;
10248     if (LastStage0Insts.empty()) {
10249       // Use the initial counter value (this tests whether the trip count is
10250       // sufficient for the pipelined code to execute).
10251       Counter = Init;
10252       if (IsUpdatePriorComp)
10253         Counter =
10254             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10255     } else {
10256       // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
10257       Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10258     }
10259
10260     for (int I = 0; I <= TC; ++I) {
10261       Register NextCounter;
10262       NextCounter =
10263           cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10264       AccCond = AccumulateCond(AccCond, CC);
10265       if (I != TC && Update != Comp)
10266         NextCounter =
10267             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10268       Counter = NextCounter;
10269     }
10270   }
10271
10272   // If AccCond == 0, the remainder is greater than TC.
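  // Materialize that test: SUBS XZR, AccCond, #0 sets NZCV from AccCond, and
  // the returned condition is EQ, which holds exactly when none of the TC + 1
  // exit checks above fired, i.e. when more than TC iterations remain.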
10273 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri)) 10274 .addReg(AArch64::XZR, RegState::Define | RegState::Dead) 10275 .addReg(AccCond) 10276 .addImm(0) 10277 .addImm(0); 10278 Cond.clear(); 10279 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ)); 10280 } 10281 10282 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, 10283 Register &RegMBB, Register &RegOther) { 10284 assert(Phi.getNumOperands() == 5); 10285 if (Phi.getOperand(2).getMBB() == MBB) { 10286 RegMBB = Phi.getOperand(1).getReg(); 10287 RegOther = Phi.getOperand(3).getReg(); 10288 } else { 10289 assert(Phi.getOperand(4).getMBB() == MBB); 10290 RegMBB = Phi.getOperand(3).getReg(); 10291 RegOther = Phi.getOperand(1).getReg(); 10292 } 10293 } 10294 10295 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) { 10296 if (!Reg.isVirtual()) 10297 return false; 10298 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 10299 return MRI.getVRegDef(Reg)->getParent() != BB; 10300 } 10301 10302 /// If Reg is an induction variable, return true and set some parameters 10303 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, 10304 MachineInstr *&UpdateInst, 10305 unsigned &UpdateCounterOprNum, Register &InitReg, 10306 bool &IsUpdatePriorComp) { 10307 // Example: 10308 // 10309 // Preheader: 10310 // InitReg = ... 10311 // LoopBB: 10312 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB) 10313 // Reg = COPY Reg0 ; COPY is ignored. 10314 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value. 10315 // ; Reg is the value calculated in the previous 10316 // ; iteration, so IsUpdatePriorComp == false. 10317 10318 if (LoopBB->pred_size() != 2) 10319 return false; 10320 if (!Reg.isVirtual()) 10321 return false; 10322 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); 10323 UpdateInst = nullptr; 10324 UpdateCounterOprNum = 0; 10325 InitReg = 0; 10326 IsUpdatePriorComp = true; 10327 Register CurReg = Reg; 10328 while (true) { 10329 MachineInstr *Def = MRI.getVRegDef(CurReg); 10330 if (Def->getParent() != LoopBB) 10331 return false; 10332 if (Def->isCopy()) { 10333 // Ignore copy instructions unless they contain subregisters 10334 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg()) 10335 return false; 10336 CurReg = Def->getOperand(1).getReg(); 10337 } else if (Def->isPHI()) { 10338 if (InitReg != 0) 10339 return false; 10340 if (!UpdateInst) 10341 IsUpdatePriorComp = false; 10342 extractPhiReg(*Def, LoopBB, CurReg, InitReg); 10343 } else { 10344 if (UpdateInst) 10345 return false; 10346 switch (Def->getOpcode()) { 10347 case AArch64::ADDSXri: 10348 case AArch64::ADDSWri: 10349 case AArch64::SUBSXri: 10350 case AArch64::SUBSWri: 10351 case AArch64::ADDXri: 10352 case AArch64::ADDWri: 10353 case AArch64::SUBXri: 10354 case AArch64::SUBWri: 10355 UpdateInst = Def; 10356 UpdateCounterOprNum = 1; 10357 break; 10358 case AArch64::ADDSXrr: 10359 case AArch64::ADDSWrr: 10360 case AArch64::SUBSXrr: 10361 case AArch64::SUBSWrr: 10362 case AArch64::ADDXrr: 10363 case AArch64::ADDWrr: 10364 case AArch64::SUBXrr: 10365 case AArch64::SUBWrr: 10366 UpdateInst = Def; 10367 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB)) 10368 UpdateCounterOprNum = 1; 10369 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB)) 10370 UpdateCounterOprNum = 2; 10371 else 10372 return false; 10373 break; 10374 default: 10375 return false; 10376 } 10377 CurReg = 
Def->getOperand(UpdateCounterOprNum).getReg(); 10378 } 10379 10380 if (!CurReg.isVirtual()) 10381 return false; 10382 if (Reg == CurReg) 10383 break; 10384 } 10385 10386 if (!UpdateInst) 10387 return false; 10388 10389 return true; 10390 } 10391 10392 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> 10393 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { 10394 // Accept loops that meet the following conditions 10395 // * The conditional branch is BCC 10396 // * The compare instruction is ADDS/SUBS/WHILEXX 10397 // * One operand of the compare is an induction variable and the other is a 10398 // loop invariant value 10399 // * The induction variable is incremented/decremented by a single instruction 10400 // * Does not contain CALL or instructions which have unmodeled side effects 10401 10402 for (MachineInstr &MI : *LoopBB) 10403 if (MI.isCall() || MI.hasUnmodeledSideEffects()) 10404 // This instruction may use NZCV, which interferes with the instruction to 10405 // be inserted for loop control. 10406 return nullptr; 10407 10408 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; 10409 SmallVector<MachineOperand, 4> Cond; 10410 if (analyzeBranch(*LoopBB, TBB, FBB, Cond)) 10411 return nullptr; 10412 10413 // Infinite loops are not supported 10414 if (TBB == LoopBB && FBB == LoopBB) 10415 return nullptr; 10416 10417 // Must be conditional branch 10418 if (TBB != LoopBB && FBB == nullptr) 10419 return nullptr; 10420 10421 assert((TBB == LoopBB || FBB == LoopBB) && 10422 "The Loop must be a single-basic-block loop"); 10423 10424 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); 10425 const TargetRegisterInfo &TRI = getRegisterInfo(); 10426 10427 if (CondBranch->getOpcode() != AArch64::Bcc) 10428 return nullptr; 10429 10430 // Normalization for createTripCountGreaterCondition() 10431 if (TBB == LoopBB) 10432 reverseBranchCondition(Cond); 10433 10434 MachineInstr *Comp = nullptr; 10435 unsigned CompCounterOprNum = 0; 10436 for (MachineInstr &MI : reverse(*LoopBB)) { 10437 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { 10438 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the 10439 // operands is a loop invariant value 10440 10441 switch (MI.getOpcode()) { 10442 case AArch64::SUBSXri: 10443 case AArch64::SUBSWri: 10444 case AArch64::ADDSXri: 10445 case AArch64::ADDSWri: 10446 Comp = &MI; 10447 CompCounterOprNum = 1; 10448 break; 10449 case AArch64::ADDSWrr: 10450 case AArch64::ADDSXrr: 10451 case AArch64::SUBSWrr: 10452 case AArch64::SUBSXrr: 10453 Comp = &MI; 10454 break; 10455 default: 10456 if (isWhileOpcode(MI.getOpcode())) { 10457 Comp = &MI; 10458 break; 10459 } 10460 return nullptr; 10461 } 10462 10463 if (CompCounterOprNum == 0) { 10464 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB)) 10465 CompCounterOprNum = 2; 10466 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB)) 10467 CompCounterOprNum = 1; 10468 else 10469 return nullptr; 10470 } 10471 break; 10472 } 10473 } 10474 if (!Comp) 10475 return nullptr; 10476 10477 MachineInstr *Update = nullptr; 10478 Register Init; 10479 bool IsUpdatePriorComp; 10480 unsigned UpdateCounterOprNum; 10481 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB, 10482 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp)) 10483 return nullptr; 10484 10485 return std::make_unique<AArch64PipelinerLoopInfo>( 10486 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum, 10487 Init, IsUpdatePriorComp, Cond); 10488 } 10489 10490 /// 
verifyInstruction - Perform target specific instruction verification. 10491 bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI, 10492 StringRef &ErrInfo) const { 10493 10494 // Verify that immediate offsets on load/store instructions are within range. 10495 // Stack objects with an FI operand are excluded as they can be fixed up 10496 // during PEI. 10497 TypeSize Scale(0U, false), Width(0U, false); 10498 int64_t MinOffset, MaxOffset; 10499 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) { 10500 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode()); 10501 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) { 10502 int64_t Imm = MI.getOperand(ImmIdx).getImm(); 10503 if (Imm < MinOffset || Imm > MaxOffset) { 10504 ErrInfo = "Unexpected immediate on load/store instruction"; 10505 return false; 10506 } 10507 } 10508 } 10509 return true; 10510 } 10511 10512 #define GET_INSTRINFO_HELPERS 10513 #define GET_INSTRMAP_INFO 10514 #include "AArch64GenInstrInfo.inc" 10515