//===- ARMFrameLowering.cpp - ARM Frame Information -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the ARM implementation of TargetFrameLowering class.
//
//===----------------------------------------------------------------------===//

#include "ARMFrameLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <utility>
#include <vector>

#define DEBUG_TYPE "arm-frame-lowering"

using namespace llvm;

static cl::opt<bool>
SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
                     cl::desc("Align ARM NEON spills in prolog and epilog"));

static cl::opt<bool> EnableExtraSpills(
    "arm-extra-spills", cl::Hidden, cl::init(false),
    cl::desc("Preserve extra registers when useful for IPRA"));

// Testing option to bypass some profitability checks.
static cl::opt<bool> ForceExtraSpills("arm-extra-spills-force", cl::Hidden,
                                      cl::init(false));

static MachineBasicBlock::iterator
skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                        unsigned NumAlignedDPRCS2Regs);

ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
      STI(sti) {}

bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const {
  // iOS always has a FP for backtracking, force other targets to keep their FP
  // when doing FastISel. The emitted code is currently superior, and in cases
  // like test-suite's lencod FastISel isn't quite correct when FP is
  // eliminated.
  return MF.getSubtarget<ARMSubtarget>().useFastISel();
}

/// Returns true if the target can safely skip saving callee-saved registers
/// for noreturn nounwind functions.
bool ARMFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const {
  assert(MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
         MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
         !MF.getFunction().hasFnAttribute(Attribute::UWTable));

  // Frame pointer and link register are not treated as normal CSR, thus we
  // can always skip CSR saves for nonreturning functions.
  return true;
}

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // ABI-required frame pointer.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;

  // Frame pointer required for use within this function.
  return (RegInfo->needsStackRealignment(MF) ||
          MFI.hasVarSizedObjects() ||
          MFI.isFrameAddressTaken());
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  unsigned CFSize = MFI.getMaxCallFrameSize();
  // It's not always a good idea to include the call frame as part of the
  // stack frame. ARM (especially Thumb) has small immediate offsets to
  // address the stack frame. So a large call frame can cause poor codegen
  // and may even make it impossible to scavenge a register.
  if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
    return false;

  return !MFI.hasVarSizedObjects();
}

/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
/// call frame pseudos can be simplified. Unlike most targets, having a FP
/// is not sufficient here since we still may reference some objects via SP
/// even when FP is available in Thumb2 mode.
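/// (In Thumb2, negative FP-relative offsets only reach down to -255, so
/// objects far below the FP are still addressed off SP; see
/// ResolveFrameIndexReference below.)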
bool
ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
  return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
}

static void emitRegPlusImmediate(
    bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
    const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
    unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
    ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
  if (isARM)
    emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                            Pred, PredReg, TII, MIFlags);
  else
    emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                           Pred, PredReg, TII, MIFlags);
}

static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator &MBBI, const DebugLoc &dl,
                         const ARMBaseInstrInfo &TII, int NumBytes,
                         unsigned MIFlags = MachineInstr::NoFlags,
                         ARMCC::CondCodes Pred = ARMCC::AL,
                         unsigned PredReg = 0) {
  emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
                       MIFlags, Pred, PredReg);
}

static int sizeOfSPAdjustment(const MachineInstr &MI) {
  int RegSize;
  switch (MI.getOpcode()) {
  case ARM::VSTMDDB_UPD:
    RegSize = 8;
    break;
  case ARM::STMDB_UPD:
  case ARM::t2STMDB_UPD:
    RegSize = 4;
    break;
  case ARM::t2STR_PRE:
  case ARM::STR_PRE_IMM:
    return 4;
  default:
    llvm_unreachable("Unknown push or pop like instruction");
  }

  int count = 0;
  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
  // predicate) so the register list starts at operand 4.
  for (int i = MI.getNumOperands() - 1; i >= 4; --i)
    count += RegSize;
  return count;
}

static bool WindowsRequiresStackProbe(const MachineFunction &MF,
                                      size_t StackSizeInBytes) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  unsigned StackProbeSize =
      (MFI.getStackProtectorIndex() > 0) ? 4080 : 4096;
  if (F.hasFnAttribute("stack-probe-size"))
    F.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(0, StackProbeSize);
  return (StackSizeInBytes >= StackProbeSize) &&
         !F.hasFnAttribute("no-stack-arg-probe");
}

namespace {

struct StackAdjustingInsts {
  struct InstInfo {
    MachineBasicBlock::iterator I;
    unsigned SPAdjust;
    bool BeforeFPSet;
  };

  SmallVector<InstInfo, 4> Insts;

  void addInst(MachineBasicBlock::iterator I, unsigned SPAdjust,
               bool BeforeFPSet = false) {
    InstInfo Info = {I, SPAdjust, BeforeFPSet};
    Insts.push_back(Info);
  }

  void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
    auto Info =
        llvm::find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
    assert(Info != Insts.end() && "invalid sp adjusting instruction");
    Info->SPAdjust += ExtraBytes;
  }

  void emitDefCFAOffsets(MachineBasicBlock &MBB, const DebugLoc &dl,
                         const ARMBaseInstrInfo &TII, bool HasFP) {
    MachineFunction &MF = *MBB.getParent();
    unsigned CFAOffset = 0;
    for (auto &Info : Insts) {
      if (HasFP && !Info.BeforeFPSet)
        return;

      CFAOffset -= Info.SPAdjust;
      unsigned CFIIndex = MF.addFrameInst(
          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
      BuildMI(MBB, std::next(Info.I), dl,
              TII.get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }
  }
};

} // end anonymous namespace

/// Emit an instruction sequence that will align the address in
/// register Reg by zero-ing out the lower bits. For versions of the
/// architecture that support Neon, this must be done in a single
/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a
/// single instruction. That function only gets called when optimizing
/// spilling of D registers on a core with the Neon instruction set
/// present.
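/// For example, aligning a register to 16 bytes with BFC available is the
/// single instruction "bfc Reg, #0, #4"; without BFC it falls back to
/// "bic Reg, Reg, #15" or an lsr/lsl pair (see the strategy comments in the
/// body below).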
static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
                                     const TargetInstrInfo &TII,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, const unsigned Reg,
                                     const Align Alignment,
                                     const bool MustBeSingleInstruction) {
  const ARMSubtarget &AST =
      static_cast<const ARMSubtarget &>(MF.getSubtarget());
  const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
  const unsigned AlignMask = Alignment.value() - 1U;
  const unsigned NrBitsToZero = Log2(Alignment);
  assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
  if (!AFI->isThumbFunction()) {
    // If the BFC instruction is available, use that to zero the lower
    // bits:
    //   bfc Reg, #0, log2(Alignment)
    // Otherwise use BIC, if the mask to zero the required number of bits
    // can be encoded in the bic immediate field:
    //   bic Reg, Reg, Alignment-1
    // Otherwise, emit a pair of shifts:
    //   lsr Reg, Reg, log2(Alignment)
    //   lsl Reg, Reg, log2(Alignment)
    if (CanUseBFC) {
      BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(~AlignMask)
          .add(predOps(ARMCC::AL));
    } else if (AlignMask <= 255) {
      BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(AlignMask)
          .add(predOps(ARMCC::AL))
          .add(condCodeOp());
    } else {
      assert(!MustBeSingleInstruction &&
             "Shouldn't call emitAligningInstructions demanding a single "
             "instruction to be emitted for large stack alignment for a target "
             "without BFC.");
      BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))
          .add(predOps(ARMCC::AL))
          .add(condCodeOp());
      BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))
          .add(predOps(ARMCC::AL))
          .add(condCodeOp());
    }
  } else {
    // Since this is only reached for Thumb-2 targets, the BFC instruction
    // should always be available.
    assert(CanUseBFC);
    BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
        .addReg(Reg, RegState::Kill)
        .addImm(~AlignMask)
        .add(predOps(ARMCC::AL));
  }
}

/// We need the offset of the frame pointer relative to other MachineFrameInfo
/// offsets which are encoded relative to SP at function begin.
/// See also emitPrologue() for how the FP is set up.
/// Unfortunately we cannot determine this value in determineCalleeSaves() yet
/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
/// this to produce a conservative estimate that we check in an assert() later.
static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
  // For Thumb1, push.w isn't available, so the first push will always push
  // r7 and lr onto the stack first.
  if (AFI.isThumb1OnlyFunction())
    return -AFI.getArgRegsSaveSize() - (2 * 4);
  // This is a conservative estimate: assume the frame pointer is r7 and
  // pc ("r15") up to r8 get spilled before it (= 8 registers).
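  // For example, with 16 bytes of vararg register saves the estimate is
  // -16 - 32 = -48: a lower bound on the FP spill slot's offset from the
  // incoming SP.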
  return -AFI.getArgRegsSaveSize() - (8 * 4);
}

void ARMFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  MachineModuleInfo &MMI = MF.getMMI();
  MCContext &Context = MMI.getContext();
  const TargetMachine &TM = MF.getTarget();
  const MCRegisterInfo *MRI = Context.getRegisterInfo();
  const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
  assert(!AFI->isThumb1OnlyFunction() &&
         "This emitPrologue does not support Thumb1!");
  bool isARM = !AFI->isThumbFunction();
  Align Alignment = STI.getFrameLowering()->getStackAlign();
  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
  unsigned NumBytes = MFI.getStackSize();
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc dl;

  Register FramePtr = RegInfo->getFrameRegister(MF);

  // Determine the sizes of each callee-saved spill area and record which
  // frame index belongs to which area.
  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
  int FramePtrSpillFI = 0;
  int D8SpillFI = 0;

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  StackAdjustingInsts DefCFAOffsetCandidates;
  bool HasFP = hasFP(MF);

  // Allocate the vararg register save area.
  if (ArgRegsSaveSize) {
    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
                 MachineInstr::FrameSetup);
    DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true);
  }

  if (!AFI->hasStackFrame() &&
      (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) {
    if (NumBytes - ArgRegsSaveSize != 0) {
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
                   MachineInstr::FrameSetup);
      DefCFAOffsetCandidates.addInst(std::prev(MBBI),
                                     NumBytes - ArgRegsSaveSize, true);
    }
    DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);
    return;
  }

  // Determine spill area sizes.
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    unsigned Reg = CSI[i].getReg();
    int FI = CSI[i].getFrameIdx();
    switch (Reg) {
    case ARM::R8:
    case ARM::R9:
    case ARM::R10:
    case ARM::R11:
    case ARM::R12:
      if (STI.splitFramePushPop(MF)) {
        GPRCS2Size += 4;
        break;
      }
      LLVM_FALLTHROUGH;
    case ARM::R0:
    case ARM::R1:
    case ARM::R2:
    case ARM::R3:
    case ARM::R4:
    case ARM::R5:
    case ARM::R6:
    case ARM::R7:
    case ARM::LR:
      if (Reg == FramePtr)
        FramePtrSpillFI = FI;
      GPRCS1Size += 4;
      break;
    default:
      // This is a DPR. Exclude the aligned DPRCS2 spills.
      if (Reg == ARM::D8)
        D8SpillFI = FI;
      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
        DPRCSSize += 8;
    }
  }

  // Move past area 1.
  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
  if (GPRCS1Size > 0) {
    GPRCS1Push = LastPush = MBBI++;
    DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
  }

  // Determine starting offsets of spill areas.
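  // Conceptually, the frame below the incoming SP is laid out as (stack
  // grows downwards):
  //   [ vararg register save area ]  ArgRegsSaveSize
  //   [ GPR callee-save area 1    ]  GPRCS1Size
  //   [ GPR callee-save area 2    ]  GPRCS2Size
  //   [ DPR alignment gap         ]  DPRGapSize
  //   [ DPR callee-save area      ]  DPRCSSize
  //   [ locals, spill slots, ...  ]
  // The offsets computed next follow exactly this layout.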
  unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
  unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
  Align DPRAlign = DPRCSSize ? std::min(Align(8), Alignment) : Align(4);
  unsigned DPRGapSize =
      (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign.value();
  unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
  int FramePtrOffsetInPush = 0;
  if (HasFP) {
    int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
    assert(getMaxFPOffset(MF.getFunction(), *AFI) <= FPOffset &&
           "Max FP estimation is wrong");
    FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize;
    AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
                                NumBytes);
  }
  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);

  // Move past area 2.
  if (GPRCS2Size > 0) {
    GPRCS2Push = LastPush = MBBI++;
    DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
  }

  // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so
  // our .cfi_offset operations will reflect that.
  if (DPRGapSize) {
    assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
    if (LastPush != MBB.end() &&
        tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize))
      DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
    else {
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
                   MachineInstr::FrameSetup);
      DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize);
    }
  }

  // Move past area 3.
  if (DPRCSSize > 0) {
    // Since vpush register list cannot have gaps, there may be multiple vpush
    // instructions in the prologue.
    while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
      DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
      LastPush = MBBI++;
    }
  }

  // Move past the aligned DPRCS2 area.
  if (AFI->getNumAlignedDPRCS2Regs() > 0) {
    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
    // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
    // leaves the stack pointer pointing to the DPRCS2 area.
    //
    // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
    NumBytes += MFI.getObjectOffset(D8SpillFI);
  } else
    NumBytes = DPRCSOffset;

  if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
    uint32_t NumWords = NumBytes >> 2;

    if (NumWords < 65536)
      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);

    switch (TM.getCodeModel()) {
    case CodeModel::Tiny:
      llvm_unreachable("Tiny code model not available on ARM.");
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
          .add(predOps(ARMCC::AL))
          .addExternalSymbol("__chkstk")
          .addReg(ARM::R4, RegState::Implicit)
          .setMIFlags(MachineInstr::FrameSetup);
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12)
          .addExternalSymbol("__chkstk")
          .setMIFlags(MachineInstr::FrameSetup);

      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
          .add(predOps(ARMCC::AL))
          .addReg(ARM::R12, RegState::Kill)
          .addReg(ARM::R4, RegState::Implicit)
          .setMIFlags(MachineInstr::FrameSetup);
      break;
    }

    BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP)
        .addReg(ARM::SP, RegState::Kill)
        .addReg(ARM::R4, RegState::Kill)
        .setMIFlags(MachineInstr::FrameSetup)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    NumBytes = 0;
  }

  if (NumBytes) {
    // Adjust SP after all the callee-save spills.
    if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
        tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
      DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
    else {
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
                   MachineInstr::FrameSetup);
      DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
    }

    if (HasFP && isARM)
      // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
      // Note it's not safe to do this in Thumb2 mode because it would have
      // taken two instructions:
      //   mov sp, r7
      //   sub sp, #24
      // If an interrupt is taken between the two instructions, then sp is in
      // an inconsistent state (pointing to the middle of the callee-saved
      // area). The interrupt handler can end up clobbering the registers.
      AFI->setShouldRestoreSPFromFP(true);
  }

  // Set FP to point to the stack slot that contains the previous FP.
  // For iOS, FP is R7, which has now been stored in spill area 1.
  // Otherwise, if this is not iOS, all the callee-saved registers go
  // into spill area 1, including the FP in R11. In either case, it
  // is in area one and the adjustment needs to take place just after
  // that push.
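  // On iOS, for example, after "push {r4-r7, lr}" this emits something like
  // "add r7, sp, #12" so that r7 points at the spilled r7/lr pair.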
  if (HasFP) {
    MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
    unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
                         dl, TII, FramePtr, ARM::SP,
                         PushSize + FramePtrOffsetInPush,
                         MachineInstr::FrameSetup);
    if (FramePtrOffsetInPush + PushSize != 0) {
      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
          nullptr, MRI->getDwarfRegNum(FramePtr, true),
          -(ArgRegsSaveSize - FramePtrOffsetInPush)));
      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    } else {
      unsigned CFIIndex =
          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }
  }

  // Now that the prologue's actual instructions are finalised, we can insert
  // the necessary DWARF cf instructions to describe the situation. Start by
  // recording where each register ended up:
  if (GPRCS1Size > 0) {
    MachineBasicBlock::iterator Pos = std::next(GPRCS1Push);
    int CFIIndex;
    for (const auto &Entry : CSI) {
      unsigned Reg = Entry.getReg();
      int FI = Entry.getFrameIdx();
      switch (Reg) {
      case ARM::R8:
      case ARM::R9:
      case ARM::R10:
      case ARM::R11:
      case ARM::R12:
        if (STI.splitFramePushPop(MF))
          break;
        LLVM_FALLTHROUGH;
      case ARM::R0:
      case ARM::R1:
      case ARM::R2:
      case ARM::R3:
      case ARM::R4:
      case ARM::R5:
      case ARM::R6:
      case ARM::R7:
      case ARM::LR:
        CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
            nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
        break;
      }
    }
  }

  if (GPRCS2Size > 0) {
    MachineBasicBlock::iterator Pos = std::next(GPRCS2Push);
    for (const auto &Entry : CSI) {
      unsigned Reg = Entry.getReg();
      int FI = Entry.getFrameIdx();
      switch (Reg) {
      case ARM::R8:
      case ARM::R9:
      case ARM::R10:
      case ARM::R11:
      case ARM::R12:
        if (STI.splitFramePushPop(MF)) {
          unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
          unsigned Offset = MFI.getObjectOffset(FI);
          unsigned CFIIndex = MF.addFrameInst(
              MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
          BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
              .addCFIIndex(CFIIndex)
              .setMIFlags(MachineInstr::FrameSetup);
        }
        break;
      }
    }
  }

  if (DPRCSSize > 0) {
    // Since vpush register list cannot have gaps, there may be multiple vpush
    // instructions in the prologue.
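    // For example, spilling d8, d10 and d11 produces "vpush {d8}" followed
    // by "vpush {d10, d11}".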
    MachineBasicBlock::iterator Pos = std::next(LastPush);
    for (const auto &Entry : CSI) {
      unsigned Reg = Entry.getReg();
      int FI = Entry.getFrameIdx();
      if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
          (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
        unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
        unsigned Offset = MFI.getObjectOffset(FI);
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }
  }

  // Now we can emit descriptions of where the canonical frame address was
  // throughout the process. If we have a frame pointer, it takes over the job
  // half-way through, so only the first few .cfi_def_cfa_offset instructions
  // actually get emitted.
  DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);

  if (STI.isTargetELF() && hasFP(MF))
    MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
                            AFI->getFramePtrSpillOffset());

  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
  AFI->setDPRCalleeSavedGapSize(DPRGapSize);
  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);

  // If we need dynamic stack realignment, do it here. Be paranoid and make
  // sure if we also have VLAs, we have a base pointer for frame access.
  // If aligned NEON registers were spilled, the stack has already been
  // realigned.
  if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
    Align MaxAlign = MFI.getMaxAlign();
    assert(!AFI->isThumb1OnlyFunction());
    if (!AFI->isThumbFunction()) {
      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
                               false);
    } else {
      // We cannot use sp as source/dest register here, thus we're using r4 to
      // perform the calculations. We're emitting the following sequence:
      //   mov r4, sp
      //   -- use emitAligningInstructions to produce best sequence to zero
      //   -- out lower bits in r4
      //   mov sp, r4
      // FIXME: It will be better just to find spare register here.
      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
          .addReg(ARM::SP, RegState::Kill)
          .add(predOps(ARMCC::AL));
      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
                               false);
      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
          .addReg(ARM::R4, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    AFI->setShouldRestoreSPFromFP(true);
  }

  // If we need a base pointer, set it up here. It's whatever the value
  // of the stack pointer is at this point. Any variable size objects
  // will be allocated after this, so we can still use the base pointer
  // to reference locals.
  // FIXME: Clarify FrameSetup flags here.
  if (RegInfo->hasBasePointer(MF)) {
    if (isARM)
      BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), RegInfo->getBaseRegister())
          .addReg(ARM::SP)
          .add(predOps(ARMCC::AL))
          .add(condCodeOp());
    else
      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), RegInfo->getBaseRegister())
          .addReg(ARM::SP)
          .add(predOps(ARMCC::AL));
  }

  // If the frame has variable sized objects then the epilogue must restore
  // the sp from fp. We can assume there's an FP here since hasFP already
  // checks for hasVarSizedObjects.
  if (MFI.hasVarSizedObjects())
    AFI->setShouldRestoreSPFromFP(true);
}

void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  const ARMBaseInstrInfo &TII =
      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
  assert(!AFI->isThumb1OnlyFunction() &&
         "This emitEpilogue does not support Thumb1!");
  bool isARM = !AFI->isThumbFunction();

  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
  int NumBytes = (int)MFI.getStackSize();
  Register FramePtr = RegInfo->getFrameRegister(MF);

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // First put ourselves on the first (from top) terminator instruction.
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

  if (!AFI->hasStackFrame()) {
    if (NumBytes - ArgRegsSaveSize != 0)
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
                   MachineInstr::FrameDestroy);
  } else {
    // Unwind MBBI to point to the first LDR / VLDRD.
    if (MBBI != MBB.begin()) {
      do {
        --MBBI;
      } while (MBBI != MBB.begin() &&
               MBBI->getFlag(MachineInstr::FrameDestroy));
      if (!MBBI->getFlag(MachineInstr::FrameDestroy))
        ++MBBI;
    }

    // Move SP to start of FP callee save spill area.
    NumBytes -= (ArgRegsSaveSize +
                 AFI->getGPRCalleeSavedArea1Size() +
                 AFI->getGPRCalleeSavedArea2Size() +
                 AFI->getDPRCalleeSavedGapSize() +
                 AFI->getDPRCalleeSavedAreaSize());

    // Reset SP based on frame pointer only if the stack frame extends beyond
    // the frame pointer stack slot, or the target is ELF and the function has
    // an FP.
    if (AFI->shouldRestoreSPFromFP()) {
      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
      if (NumBytes) {
        if (isARM)
          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
                                  ARMCC::AL, 0, TII,
                                  MachineInstr::FrameDestroy);
        else {
          // It's not possible to restore SP from FP in a single instruction.
          // For iOS, this looks like:
          //   mov sp, r7
          //   sub sp, #24
          // This is bad, if an interrupt is taken after the mov, sp is in an
          // inconsistent state.
          // Use the first callee-saved register as a scratch register.
          assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
                 "No scratch register to restore SP from FP!");
          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
                                 ARMCC::AL, 0, TII, MachineInstr::FrameDestroy);
          BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
              .addReg(ARM::R4)
              .add(predOps(ARMCC::AL))
              .setMIFlag(MachineInstr::FrameDestroy);
        }
      } else {
        // Thumb2 or ARM.
        if (isARM)
          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
              .addReg(FramePtr)
              .add(predOps(ARMCC::AL))
              .add(condCodeOp())
              .setMIFlag(MachineInstr::FrameDestroy);
        else
          BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
              .addReg(FramePtr)
              .add(predOps(ARMCC::AL))
              .setMIFlag(MachineInstr::FrameDestroy);
      }
    } else if (NumBytes &&
               !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes,
                   MachineInstr::FrameDestroy);

    // Increment past our save areas.
    if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
      MBBI++;
      // Since vpop register list cannot have gaps, there may be multiple vpop
      // instructions in the epilogue.
      while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VLDMDIA_UPD)
        MBBI++;
    }
    if (AFI->getDPRCalleeSavedGapSize()) {
      assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
             "unexpected DPR alignment gap");
      emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize(),
                   MachineInstr::FrameDestroy);
    }

    if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
    if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
  }

  if (ArgRegsSaveSize)
    emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
                 MachineInstr::FrameDestroy);
}

/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
int
ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                         unsigned &FrameReg) const {
  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
}

int
ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
                                             int FI, unsigned &FrameReg,
                                             int SPAdj) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
  bool isFixed = MFI.isFixedObjectIndex(FI);

  FrameReg = ARM::SP;
  Offset += SPAdj;

  // SP can move around if there are allocas. We may also lose track of SP
  // when emergency spilling inside a non-reserved call frame setup.
  bool hasMovingSP = !hasReservedCallFrame(MF);

  // When dynamically realigning the stack, use the frame pointer for
  // parameters, and the stack/base pointer for locals.
  if (RegInfo->needsStackRealignment(MF)) {
    assert(hasFP(MF) && "dynamic stack realignment without a FP!");
    if (isFixed) {
      FrameReg = RegInfo->getFrameRegister(MF);
      Offset = FPOffset;
    } else if (hasMovingSP) {
      assert(RegInfo->hasBasePointer(MF) &&
             "VLAs and dynamic stack alignment, but missing base pointer!");
      FrameReg = RegInfo->getBaseRegister();
      Offset -= SPAdj;
    }
    return Offset;
  }

  // If there is a frame pointer, use it when we can.
  if (hasFP(MF) && AFI->hasStackFrame()) {
    // Use frame pointer to reference fixed objects. Use it for locals if
    // there are VLAs (and thus the SP isn't reliable as a base).
    if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) {
      FrameReg = RegInfo->getFrameRegister(MF);
      return FPOffset;
    } else if (hasMovingSP) {
      assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
      if (AFI->isThumb2Function()) {
        // Try to use the frame pointer if we can, else use the base pointer
        // since it's available. This is handy for the emergency spill slot, in
        // particular.
        if (FPOffset >= -255 && FPOffset < 0) {
          FrameReg = RegInfo->getFrameRegister(MF);
          return FPOffset;
        }
      }
    } else if (AFI->isThumbFunction()) {
      // Prefer SP to base pointer, if the offset is suitably aligned and in
      // range as the effective range of the immediate offset is bigger when
      // basing off SP.
      // Use  add <rd>, sp, #<imm8>
      //      ldr <rd>, [sp, #<imm8>]
      if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
        return Offset;
      // In Thumb2 mode, the negative offset is very limited. Try to avoid
      // out of range references. ldr <rt>,[<rn>, #-<imm8>]
      if (AFI->isThumb2Function() && FPOffset >= -255 && FPOffset < 0) {
        FrameReg = RegInfo->getFrameRegister(MF);
        return FPOffset;
      }
    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
      // Otherwise, use SP or FP, whichever is closer to the stack slot.
      FrameReg = RegInfo->getFrameRegister(MF);
      return FPOffset;
    }
  }
  // Use the base pointer if we have one.
  // FIXME: Maybe prefer sp on Thumb1 if it's legal and the offset is cheaper?
  // That can happen if we forced a base pointer for a large call frame.
  if (RegInfo->hasBasePointer(MF)) {
    FrameReg = RegInfo->getBaseRegister();
    Offset -= SPAdj;
  }
  return Offset;
}

void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    ArrayRef<CalleeSavedInfo> CSI,
                                    unsigned StmOpc, unsigned StrOpc,
                                    bool NoGap, bool (*Func)(unsigned, bool),
                                    unsigned NumAlignedDPRCS2Regs,
                                    unsigned MIFlags) const {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

  DebugLoc DL;

  using RegAndKill = std::pair<unsigned, bool>;

  SmallVector<RegAndKill, 4> Regs;
  unsigned i = CSI.size();
  while (i != 0) {
    unsigned LastReg = 0;
    for (; i != 0; --i) {
      unsigned Reg = CSI[i-1].getReg();
      if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;

      // D-registers in the aligned area DPRCS2 are NOT spilled here.
      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
        continue;

      const MachineRegisterInfo &MRI = MF.getRegInfo();
      bool isLiveIn = MRI.isLiveIn(Reg);
      if (!isLiveIn && !MRI.isReserved(Reg))
        MBB.addLiveIn(Reg);
      // If NoGap is true, push consecutive registers and then leave the rest
      // for other instructions. e.g.
      //   vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
      if (NoGap && LastReg && LastReg != Reg-1)
        break;
      LastReg = Reg;
      // Do not set a kill flag on values that are also marked as live-in. This
      // happens with the @llvm.returnaddress intrinsic and with arguments
      // passed in callee saved registers.
      // Omitting the kill flags is conservatively correct even if the live-in
      // is not used after all.
      Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
    }

    if (Regs.empty())
      continue;

    llvm::sort(Regs, [&](const RegAndKill &LHS, const RegAndKill &RHS) {
      return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
    });

    if (Regs.size() > 1 || StrOpc == 0) {
      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
                                    .addReg(ARM::SP)
                                    .setMIFlags(MIFlags)
                                    .add(predOps(ARMCC::AL));
      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
        MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
    } else if (Regs.size() == 1) {
      BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP)
          .addReg(Regs[0].first, getKillRegState(Regs[0].second))
          .addReg(ARM::SP)
          .setMIFlags(MIFlags)
          .addImm(-4)
          .add(predOps(ARMCC::AL));
    }
    Regs.clear();

    // Put any subsequent vpush instructions before this one: they will refer
    // to higher register numbers so need to be pushed first in order to
    // preserve monotonicity.
    if (MI != MBB.begin())
      --MI;
  }
}

void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   MutableArrayRef<CalleeSavedInfo> CSI,
                                   unsigned LdmOpc, unsigned LdrOpc,
                                   bool isVarArg, bool NoGap,
                                   bool (*Func)(unsigned, bool),
                                   unsigned NumAlignedDPRCS2Regs) const {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  DebugLoc DL;
  bool isTailCall = false;
  bool isInterrupt = false;
  bool isTrap = false;
  if (MBB.end() != MI) {
    DL = MI->getDebugLoc();
    unsigned RetOpcode = MI->getOpcode();
    isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri);
    isInterrupt =
        RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
    isTrap =
        RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
        RetOpcode == ARM::tTRAP;
  }

  SmallVector<unsigned, 4> Regs;
  unsigned i = CSI.size();
  while (i != 0) {
    unsigned LastReg = 0;
    bool DeleteRet = false;
    for (; i != 0; --i) {
      CalleeSavedInfo &Info = CSI[i-1];
      unsigned Reg = Info.getReg();
      if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;

      // The aligned reloads from area DPRCS2 are not inserted here.
      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
        continue;

      if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
          !isTrap && STI.hasV5TOps()) {
        if (MBB.succ_empty()) {
          Reg = ARM::PC;
          // Fold the return instruction into the LDM.
          DeleteRet = true;
          LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
          // We 'restore' LR into PC so it is not live out of the return block:
          // Clear Restored bit.
          Info.setRestored(false);
        } else
          LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
      }

      // If NoGap is true, pop consecutive registers and then leave the rest
      // for other instructions. e.g.
      //   vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
      if (NoGap && LastReg && LastReg != Reg-1)
        break;

      LastReg = Reg;
      Regs.push_back(Reg);
    }

    if (Regs.empty())
      continue;

    llvm::sort(Regs, [&](unsigned LHS, unsigned RHS) {
      return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
    });

    if (Regs.size() > 1 || LdrOpc == 0) {
      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
                                    .addReg(ARM::SP)
                                    .add(predOps(ARMCC::AL))
                                    .setMIFlags(MachineInstr::FrameDestroy);
      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
        MIB.addReg(Regs[i], getDefRegState(true));
      if (DeleteRet) {
        if (MI != MBB.end()) {
          MIB.copyImplicitOps(*MI);
          MI->eraseFromParent();
        }
      }
      MI = MIB;
    } else if (Regs.size() == 1) {
      // If we adjusted the reg to PC from LR above, switch it back here. We
      // only do that for LDM.
      if (Regs[0] == ARM::PC)
        Regs[0] = ARM::LR;
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
              .addReg(ARM::SP, RegState::Define)
              .addReg(ARM::SP)
              .setMIFlags(MachineInstr::FrameDestroy);
      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
      // that refactoring is complete (eventually).
      if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
        MIB.addReg(0);
        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
      } else
        MIB.addImm(4);
      MIB.add(predOps(ARMCC::AL));
    }
    Regs.clear();

    // Put any subsequent vpop instructions after this one: they will refer to
    // higher register numbers so need to be popped afterwards.
    if (MI != MBB.end())
      ++MI;
  }
}

/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers
/// starting from d8. Also insert stack realignment code and leave the stack
/// pointer pointing to the d8 spill slot.
static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    unsigned NumAlignedDPRCS2Regs,
                                    ArrayRef<CalleeSavedInfo> CSI,
                                    const TargetRegisterInfo *TRI) {
  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Mark the D-register spill slots as properly aligned. Since MFI computes
  // stack slot layout backwards, this can actually mean that the d-reg stack
  // slot offsets can be wrong. The offset for d8 will always be correct.
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    unsigned DNum = CSI[i].getReg() - ARM::D8;
    if (DNum > NumAlignedDPRCS2Regs - 1)
      continue;
    int FI = CSI[i].getFrameIdx();
    // The even-numbered registers will be 16-byte aligned, the odd-numbered
    // registers will be 8-byte aligned.
    MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16));

    // The stack slot for D8 needs to be maximally aligned because this is
    // actually the point where we align the stack pointer. MachineFrameInfo
    // computes all offsets relative to the incoming stack pointer which is a
    // bit weird when realigning the stack.
    // Any extra padding for this over-alignment is not realized because the
    // code inserted below adjusts the stack pointer by numregs * 8 before
    // aligning the stack pointer.
    if (DNum == 0)
      MFI.setObjectAlignment(FI, MFI.getMaxAlign());
  }

  // Move the stack pointer to the d8 spill slot, and align it at the same
  // time. Leave the stack slot address in the scratch register r4.
  //
  //   sub r4, sp, #numregs * 8
  //   bic r4, r4, #align - 1
  //   mov sp, r4
  //
  bool isThumb = AFI->isThumbFunction();
  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
  AFI->setShouldRestoreSPFromFP(true);

  // sub r4, sp, #numregs * 8
  // The immediate is <= 64, so it doesn't need any special encoding.
  unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
  BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
      .addReg(ARM::SP)
      .addImm(8 * NumAlignedDPRCS2Regs)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  Align MaxAlign = MF.getFrameInfo().getMaxAlign();
  // We must set parameter MustBeSingleInstruction to true, since
  // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
  // stack alignment. Luckily, this can always be done since all ARM
  // architecture versions that support Neon also support the BFC
  // instruction.
  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);

  // mov sp, r4
  // The stack pointer must be adjusted before spilling anything, otherwise
  // the stack slots could be clobbered by an interrupt handler.
  // Leave r4 live, it is used below.
  Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
                                .addReg(ARM::R4)
                                .add(predOps(ARMCC::AL));
  if (!isThumb)
    MIB.add(condCodeOp());

  // Now spill NumAlignedDPRCS2Regs registers starting from d8.
  // r4 holds the stack slot address.
  unsigned NextReg = ARM::D8;

  // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
  // The writeback is only needed when emitting two vst1.64 instructions.
  if (NumAlignedDPRCS2Regs >= 6) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QQPRRegClass);
    MBB.addLiveIn(SupReg);
    BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4)
        .addReg(ARM::R4, RegState::Kill)
        .addImm(16)
        .addReg(NextReg)
        .addReg(SupReg, RegState::ImplicitKill)
        .add(predOps(ARMCC::AL));
    NextReg += 4;
    NumAlignedDPRCS2Regs -= 4;
  }

  // We won't modify r4 beyond this point. It currently points to the next
  // register to be spilled.
  unsigned R4BaseReg = NextReg;

  // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
  if (NumAlignedDPRCS2Regs >= 4) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QQPRRegClass);
    MBB.addLiveIn(SupReg);
    BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
        .addReg(ARM::R4)
        .addImm(16)
        .addReg(NextReg)
        .addReg(SupReg, RegState::ImplicitKill)
        .add(predOps(ARMCC::AL));
    NextReg += 4;
    NumAlignedDPRCS2Regs -= 4;
  }

  // 16-byte aligned vst1.64 with 2 d-regs.
  if (NumAlignedDPRCS2Regs >= 2) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QPRRegClass);
    MBB.addLiveIn(SupReg);
    BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
        .addReg(ARM::R4)
        .addImm(16)
        .addReg(SupReg)
        .add(predOps(ARMCC::AL));
    NextReg += 2;
    NumAlignedDPRCS2Regs -= 2;
  }

  // Finally, use a vanilla vstr.64 for the odd last register.
  if (NumAlignedDPRCS2Regs) {
    MBB.addLiveIn(NextReg);
    // vstr.64 uses addrmode5 which has an offset scale of 4.
    BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
        .addReg(NextReg)
        .addReg(ARM::R4)
        .addImm((NextReg - R4BaseReg) * 2)
        .add(predOps(ARMCC::AL));
  }

  // The last spill instruction inserted should kill the scratch register r4.
  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}

/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
/// iterator to the following instruction.
static MachineBasicBlock::iterator
skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                        unsigned NumAlignedDPRCS2Regs) {
  //   sub r4, sp, #numregs * 8
  //   bic r4, r4, #align - 1
  //   mov sp, r4
  ++MI; ++MI; ++MI;
  assert(MI->mayStore() && "Expecting spill instruction");

  // These switches all fall through.
  switch(NumAlignedDPRCS2Regs) {
  case 7:
    ++MI;
    assert(MI->mayStore() && "Expecting spill instruction");
    LLVM_FALLTHROUGH;
  default:
    ++MI;
    assert(MI->mayStore() && "Expecting spill instruction");
    LLVM_FALLTHROUGH;
  case 1:
  case 2:
  case 4:
    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
    ++MI;
  }
  return MI;
}

/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers
/// starting from d8. These instructions are assumed to execute while the
/// stack is still aligned, unlike the code inserted by emitPopInst.
static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned NumAlignedDPRCS2Regs,
                                      ArrayRef<CalleeSavedInfo> CSI,
                                      const TargetRegisterInfo *TRI) {
  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();

  // Find the frame index assigned to d8.
  int D8SpillFI = 0;
  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
    if (CSI[i].getReg() == ARM::D8) {
      D8SpillFI = CSI[i].getFrameIdx();
      break;
    }

  // Materialize the address of the d8 spill slot into the scratch register r4.
  // This can be fairly complicated if the stack frame is large, so just use
  // the normal frame index elimination mechanism to do it. This code runs as
  // the initial part of the epilog where the stack and base pointers haven't
  // been changed yet.
  bool isThumb = AFI->isThumbFunction();
  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");

  unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
  BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
      .addFrameIndex(D8SpillFI)
      .addImm(0)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  // Now restore NumAlignedDPRCS2Regs registers starting from d8.
  unsigned NextReg = ARM::D8;

  // 16-byte aligned vld1.64 with 4 d-regs and writeback.
  if (NumAlignedDPRCS2Regs >= 6) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QQPRRegClass);
    BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
        .addReg(ARM::R4, RegState::Define)
        .addReg(ARM::R4, RegState::Kill)
        .addImm(16)
        .addReg(SupReg, RegState::ImplicitDefine)
        .add(predOps(ARMCC::AL));
    NextReg += 4;
    NumAlignedDPRCS2Regs -= 4;
  }

  // We won't modify r4 beyond this point. It currently points to the next
  // register to be reloaded.
  unsigned R4BaseReg = NextReg;

  // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
  if (NumAlignedDPRCS2Regs >= 4) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QQPRRegClass);
    BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
        .addReg(ARM::R4)
        .addImm(16)
        .addReg(SupReg, RegState::ImplicitDefine)
        .add(predOps(ARMCC::AL));
    NextReg += 4;
    NumAlignedDPRCS2Regs -= 4;
  }

  // 16-byte aligned vld1.64 with 2 d-regs.
  if (NumAlignedDPRCS2Regs >= 2) {
    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                               &ARM::QPRRegClass);
    BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
        .addReg(ARM::R4)
        .addImm(16)
        .add(predOps(ARMCC::AL));
    NextReg += 2;
    NumAlignedDPRCS2Regs -= 2;
  }

  // Finally, use a vanilla vldr.64 for the remaining odd register.
  if (NumAlignedDPRCS2Regs)
    BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
        .addReg(ARM::R4)
        .addImm(2 * (NextReg - R4BaseReg))
        .add(predOps(ARMCC::AL));

  // The last reload kills r4.
  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}

bool ARMFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  if (CSI.empty())
    return false;

  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
  unsigned PushOneOpc = AFI->isThumbFunction() ?
    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
  unsigned FltOpc = ARM::VSTMDDB_UPD;
  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
               MachineInstr::FrameSetup);
  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
               MachineInstr::FrameSetup);
  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register,
               NumAlignedDPRCS2Regs, MachineInstr::FrameSetup);

  // The code above does not insert spill code for the aligned DPRCS2
  // registers. The stack realignment code will be inserted between the
  // push instructions and these spills.
  if (NumAlignedDPRCS2Regs)
    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);

  return true;
}

bool ARMFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  if (CSI.empty())
    return false;

  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();

  // The emitPopInst calls below do not insert reloads for the aligned DPRCS2
  // registers. Do that here instead.
  if (NumAlignedDPRCS2Regs)
    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);

  unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
  unsigned LdrOpc =
      AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
  unsigned FltOpc = ARM::VLDMDIA_UPD;
  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
              NumAlignedDPRCS2Regs);
  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
              &isARMArea2Register, 0);
  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
              &isARMArea1Register, 0);

  return true;
}

// FIXME: Make generic?
static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
                                            const ARMBaseInstrInfo &TII) {
  unsigned FnSize = 0;
  for (auto &MBB : MF) {
    for (auto &MI : MBB)
      FnSize += TII.getInstSizeInBytes(MI);
  }
  if (MF.getJumpTableInfo())
    for (auto &Table : MF.getJumpTableInfo()->getJumpTables())
      FnSize += Table.MBBs.size() * 4;
  FnSize += MF.getConstantPool()->getConstants().size() * 4;
  return FnSize;
}

/// estimateRSStackSizeLimit - Look at each instruction that references stack
/// frames and return the stack size limit beyond which some of these
/// instructions will require a scratch register during their expansion later.
// FIXME: Move to TII?
static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
                                         const TargetFrameLowering *TFI,
                                         bool &HasNonSPFrameIndex) {
  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  const ARMBaseInstrInfo &TII =
      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  unsigned Limit = (1 << 12) - 1;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
        if (!MI.getOperand(i).isFI())
          continue;

        // When using ADDri to get the address of a stack object, 255 is the
        // largest offset guaranteed to fit in the immediate offset.
        if (MI.getOpcode() == ARM::ADDri) {
          Limit = std::min(Limit, (1U << 8) - 1);
          break;
        }
        // t2ADDri will not require an extra register, it can reuse the
        // destination.
        if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12)
          break;

        const MCInstrDesc &MCID = MI.getDesc();
        const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF);
        if (RegClass && !RegClass->contains(ARM::SP))
          HasNonSPFrameIndex = true;

        // Otherwise check the addressing mode.
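        // Each addressing mode has a different immediate-offset reach, e.g.
        // AddrMode3 (ldrd/strd) encodes only 8 bits while AddrMode5 scales an
        // 8-bit immediate by 4, so the limit below becomes the smallest reach
        // of any frame reference seen so far.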
        switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
        case ARMII::AddrMode_i12:
        case ARMII::AddrMode2:
          // Default 12 bit limit.
          break;
        case ARMII::AddrMode3:
        case ARMII::AddrModeT2_i8:
          Limit = std::min(Limit, (1U << 8) - 1);
          break;
        case ARMII::AddrMode5FP16:
          Limit = std::min(Limit, ((1U << 8) - 1) * 2);
          break;
        case ARMII::AddrMode5:
        case ARMII::AddrModeT2_i8s4:
        case ARMII::AddrModeT2_ldrex:
          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
          break;
        case ARMII::AddrModeT2_i12:
          // i12 supports only positive offset so these will be converted to
          // i8 opcodes. See llvm::rewriteT2FrameIndex.
          if (TFI->hasFP(MF) && AFI->hasStackFrame())
            Limit = std::min(Limit, (1U << 8) - 1);
          break;
        case ARMII::AddrMode4:
        case ARMII::AddrMode6:
          // Addressing modes 4 & 6 (load/store) instructions can't encode an
          // immediate offset for stack references.
          return 0;
        case ARMII::AddrModeT2_i7:
          Limit = std::min(Limit, ((1U << 7) - 1) * 1);
          break;
        case ARMII::AddrModeT2_i7s2:
          Limit = std::min(Limit, ((1U << 7) - 1) * 2);
          break;
        case ARMII::AddrModeT2_i7s4:
          Limit = std::min(Limit, ((1U << 7) - 1) * 4);
          break;
        default:
          llvm_unreachable(
              "Unhandled addressing mode in stack size limit calculation");
        }
        break; // At most one FI per instruction
      }
    }
  }

  return Limit;
}

// In functions that realign the stack, it can be an advantage to spill the
// callee-saved vector registers after realigning the stack. The vst1 and vld1
// instructions take alignment hints that can improve performance.
static void
checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
  if (!SpillAlignedNEONRegs)
    return;

  // Naked functions don't spill callee-saved registers.
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    return;

  // We are planning to use NEON instructions vst1 / vld1.
  if (!static_cast<const ARMSubtarget &>(MF.getSubtarget()).hasNEON())
    return;

  // Don't bother if the default stack alignment is sufficiently high.
  if (MF.getSubtarget().getFrameLowering()->getStackAlign() >= Align(8))
    return;

  // Aligned spills require stack realignment.
  if (!static_cast<const ARMBaseRegisterInfo *>(
           MF.getSubtarget().getRegisterInfo())->canRealignStack(MF))
    return;

  // We always spill contiguous d-registers starting from d8. Count how many
  // need spilling. The register allocator will almost always use the
  // callee-saved registers in order, but it can happen that there are holes in
  // the range. Registers above the hole will be spilled to the standard DPRCS
  // area.
  unsigned NumSpills = 0;
  for (; NumSpills < 8; ++NumSpills)
    if (!SavedRegs.test(ARM::D8 + NumSpills))
      break;

  // Don't do this for just one d-register. It's not worth it.
  if (NumSpills < 2)
    return;

  // Spill the first NumSpills D-registers after realigning the stack.
  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);

  // A scratch register is required for the vst1 / vld1 instructions.
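  // r4 is used as the base register addressing the realigned spill area; see
  // the aligned vst1/vld1 sequences in emitAlignedDPRCS2Spills and
  // emitAlignedDPRCS2Restores.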
  SavedRegs.set(ARM::R4);
}

// Compute the set of registers which cannot be preserved, because they are
// either modified outside the PUSH/POP instructions, or are live at the point
// where the POP will be inserted. This only considers r0-r3, which are
// currently the only registers we voluntarily save when the PCS doesn't
// require it.
void ARMFrameLowering::findRegDefsOutsideSaveRestore(
    MachineFunction &MF, BitVector &UnsaveableRegs) const {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  SmallSet<MachineBasicBlock *, 2> SaveBlocks;
  SmallSet<MachineBasicBlock *, 2> RestoreBlocks;

  if (MFI.getSavePoint()) {
    SaveBlocks.insert(MFI.getSavePoint());
    RestoreBlocks.insert(MFI.getRestorePoint());
  } else {
    SaveBlocks.insert(&MF.front());
    for (MachineBasicBlock &MBB : MF)
      if (MBB.isReturnBlock())
        RestoreBlocks.insert(&MBB);
  }

  // Walk blocks from the function entry and exits (following control flow both
  // ways), stopping when we get to a save/restore block. Check for
  // instructions which modify any of the registers we care about.
  SmallVector<MachineBasicBlock *, 4> WorkList;
  SmallSet<MachineBasicBlock *, 4> VisitedBlocks;
  LLVM_DEBUG(dbgs() << "Entry block: " << MF.front().getName() << "\n");
  WorkList.push_back(&MF.front());
  for (MachineBasicBlock &MBB : MF) {
    if (MBB.isReturnBlock()) {
      LLVM_DEBUG(dbgs() << "Return block: " << MBB.getName() << "\n");
      WorkList.push_back(&MBB);
    }
  }

  auto CheckOutsideInst = [&UnsaveableRegs, TRI](MachineInstr &MI) {
    for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
      if (MI.modifiesRegister(Reg, TRI)) {
        UnsaveableRegs.set(Reg);
        LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
                          << " modified by instruction " << MI << "\n");
      }
    }
  };

  while (!WorkList.empty()) {
    MachineBasicBlock *MBB = WorkList.pop_back_val();

    if (VisitedBlocks.count(MBB))
      continue;
    VisitedBlocks.insert(MBB);

    bool IsSave = SaveBlocks.count(MBB);
    bool IsRestore = RestoreBlocks.count(MBB);

    LLVM_DEBUG(dbgs() << "Visiting block " << MBB->getName() << ", IsSave="
                      << IsSave << ", IsRestore=" << IsRestore << "\n");

    // If this is a restore block, the POP instruction will be inserted just
    // before the terminator, so we need to consider any terminator
    // instructions to be outside the preserved region. We also need to check
    // for registers which are live at the POP insertion point, because these
    // can't be restored without changing their value.
    if (IsRestore) {
      LivePhysRegs LPR(*TRI);
      LPR.addLiveOuts(*MBB);
      for (auto &Term : reverse(MBB->terminators())) {
        LPR.stepBackward(Term);
        CheckOutsideInst(Term);
      }

      for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
        if (LPR.contains(Reg)) {
          UnsaveableRegs.set(Reg);
          LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
                            << " live-out of restore block " << MBB->getName()
                            << "\n");
        }
      }
    }

    // If this block is completely outside the save/restore region, then any
    // modified registers can't be preserved.
    // A save block counts as being inside the saved region, with the possible
    // exception of the last few instructions if it's also a restore block,
    // handled above. We don't visit blocks which are completely inside the
    // saved region and don't have any save/restore instructions, so we don't
    // need to check that here.
    if (!IsSave && !IsRestore)
      for (auto &MI : *MBB)
        CheckOutsideInst(MI);

    // Walk the control flow graph in both directions, except for blocks which
    // are inside the PUSH/POP region.
    if (IsSave || !IsRestore)
      for (auto Pred : MBB->predecessors())
        WorkList.push_back(Pred);
    if (!IsSave || IsRestore)
      for (auto Succ : MBB->successors())
        WorkList.push_back(Succ);
  }
}

bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
  // Shrink wrapping is detrimental to code size because it prevents merging
  // the CSR restore and function return into one POP instruction. It also
  // conflicts with saving extra registers for IPRA, because it makes more
  // registers live at the PUSH/POP.
  if (MF.getFunction().hasMinSize())
    return false;

  return true;
}

// When doing inter-procedural register allocation, saving extra registers in
// [r0,r3] will allow us to keep live values in them in any callers. The extra
// saves and restores don't cost us any code-size if we are already emitting
// PUSH and POP instructions.
unsigned ARMFrameLowering::spillExtraRegsForIPRA(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 bool HasFPRegSaves) const {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  LLVM_DEBUG(dbgs() << "Extra spills for " << MF.getName() << ": ");

  if (!EnableExtraSpills) {
    LLVM_DEBUG(dbgs() << "optimisation not enabled\n");
    return 0;
  }

  // If IPRA is not enabled, nothing will be able to take advantage of the
  // extra saved registers.
  if (!MF.getTarget().Options.EnableIPRA) {
    LLVM_DEBUG(dbgs() << "IPRA disabled\n");
    return 0;
  }

  // These registers will take extra time to save and restore, and will often
  // go unused, so only do this at -Oz.
  if (!MF.getFunction().hasMinSize()) {
    LLVM_DEBUG(dbgs() << "not minsize\n");
    return 0;
  }

  // If we are not currently spilling any registers, we'd need to add an extra
  // PUSH/POP pair, so this isn't worth it.
  if (!SavedRegs.any()) {
    LLVM_DEBUG(dbgs() << "no existing push/pop\n");
    return 0;
  }

  // If we can't guarantee that this definition of the function is the one
  // which will be picked by the linker, then IPRA can't make use of any extra
  // saved registers.
  if (!MF.getFunction().isDefinitionExact()) {
    LLVM_DEBUG(dbgs() << "inexact definition\n");
    return 0;
  }

  int NumVisibleCallers = 0;
  for (const User *U : MF.getFunction().users()) {
    if (const CallBase *Call = dyn_cast<CallBase>(U)) {
      if (Call->getCalledOperand() == &MF.getFunction()) {
        ++NumVisibleCallers;
      }
    }
  }

  // If we don't have any direct callers in the current translation unit,
  // nothing will be able to take advantage of the extra saved registers.
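  // (Only direct calls where this function is the called operand are counted
  // above; callers that reach us through a function pointer can't benefit,
  // since IPRA propagates register usage along direct call edges.)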
  if (NumVisibleCallers == 0 && !ForceExtraSpills) {
    LLVM_DEBUG(dbgs() << "no visible callers\n");
    return 0;
  }

  // If we need to emit unwind tables, these will be longer if we need to
  // preserve r0-r3, so we need a lot of visible callers to make this
  // worthwhile.
  if (MF.getFunction().needsUnwindTableEntry() && NumVisibleCallers <= 8 &&
      !ForceExtraSpills) {
    LLVM_DEBUG(dbgs() << "needs unwind table\n");
    return 0;
  }

  // OK, we've decided we are going to try the optimisation.
  LLVM_DEBUG(dbgs() << "enabled\n");

  // Compute the registers which can't be preserved because they are either
  // modified before the PUSH or after the POP, or are live at the point where
  // the POP will be inserted.
  BitVector NonPreserveableRegisters;
  NonPreserveableRegisters.resize(TRI->getNumRegs());
  findRegDefsOutsideSaveRestore(MF, NonPreserveableRegisters);

  unsigned NumExtraRegs = 0;

  // We'd also like to leave some registers free so that we can use them to
  // fold a small SP update into the PUSH/POP. We can't know exactly what this
  // optimisation can do, because stack layout isn't finalised, but we can make
  // a good enough estimate.
  unsigned StackSize = MFI.estimateStackSize(MF);

  // If the stack space is large, we probably won't be able to fold the SP
  // update into the push/pop, so we should use all the registers we want. If
  // we have FP register saves, then the SP update will be folded into the
  // VPUSH/VPOP instead, and we can use the GPRs freely.
  if (StackSize > 16 || HasFPRegSaves)
    StackSize = 0;

  LLVM_DEBUG(dbgs() << "Estimated " << StackSize
                    << " bytes of SP update being folded into push/pop\n");

  for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
    if (StackSize) {
      StackSize -= 4;
      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
                        << ", wanted for SP update\n");
      continue;
    }

    // If we don't modify the register anywhere in this function, IPRA will
    // already know that it is preserved, and there's no point in saving it.
    if (!MRI.isPhysRegModified(Reg)) {
      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
                        << ", not modified\n");
      continue;
    }

    if (NonPreserveableRegisters[Reg]) {
      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
                        << ", modified outside save region\n");
      continue;
    }

    LLVM_DEBUG(dbgs() << "also saving " << TRI->getName(Reg) << " for IPRA\n");
    SavedRegs.set(Reg);
    MRI.enableCalleeSavedRegister(Reg);
    ++NumExtraRegs;
  }

  return NumExtraRegs;
}

void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedRegs,
                                            RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  // This tells PEI to spill the FP as if it is any other callee-save register
  // to take advantage of the eliminateFrameIndex machinery. This also ensures
  // it is spilled in the order specified by getCalleeSavedRegs() to make it
  // easier to combine multiple loads / stores.
  bool CanEliminateFrame = true;
  bool CS1Spilled = false;
  bool LRSpilled = false;
  unsigned NumGPRSpills = 0;
  unsigned NumFPRSpills = 0;
  SmallVector<unsigned, 4> UnspilledCS1GPRs;
  SmallVector<unsigned, 4> UnspilledCS2GPRs;
  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const ARMBaseInstrInfo &TII =
      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  (void)TRI; // Silence unused warning in non-assert builds.
  Register FramePtr = RegInfo->getFrameRegister(MF);

  // Spill R4 if Thumb2 function requires stack realignment - it will be used
  // as a scratch register. Also spill R4 if Thumb2 function has varsized
  // objects, since it's not always possible to restore sp from fp in a single
  // instruction.
  // FIXME: It will be better just to find a spare register here.
  if (AFI->isThumb2Function() &&
      (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
    SavedRegs.set(ARM::R4);

  // If a stack probe will be emitted, spill R4 and LR, since they are
  // clobbered by the stack probe call.
  // This estimate should be a safe, conservative estimate. The actual
  // stack probe is enabled based on the size of the local objects;
  // this estimate also includes the varargs store size.
  if (STI.isTargetWindows() &&
      WindowsRequiresStackProbe(MF, MFI.estimateStackSize(MF))) {
    SavedRegs.set(ARM::R4);
    SavedRegs.set(ARM::LR);
  }

  if (AFI->isThumb1OnlyFunction()) {
    // Spill LR if Thumb1 function uses variable length argument lists.
    if (AFI->getArgRegsSaveSize() > 0)
      SavedRegs.set(ARM::LR);

    // Spill R4 if Thumb1 epilogue has to restore SP from FP or the function
    // requires stack alignment. We don't know for sure what the stack size
    // will be, but for this, an estimate is good enough. If anything changes
    // it, it'll be a spill, which implies we've used all the registers and so
    // R4 is already used, so not marking it here will be OK.
    // FIXME: It will be better just to find a spare register here.
    if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF) ||
        MFI.estimateStackSize(MF) > 508)
      SavedRegs.set(ARM::R4);
  }

  // See if we can spill vector registers to aligned stack.
  checkNumAlignedDPRCS2Regs(MF, SavedRegs);

  // Spill the BasePtr if it's used.
  if (RegInfo->hasBasePointer(MF))
    SavedRegs.set(RegInfo->getBaseRegister());

  // Don't spill FP if the frame can be eliminated. This is determined
  // by scanning the callee-save registers to see if any is modified.
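  // The scan below also counts GPR and FP-register spill slots in 4-byte
  // words for the later stack-size estimate and, on subtargets which split
  // the frame push/pop, partitions the unspilled GPRs into the low area
  // (r0-r7, lr) and the high area (r8-r11).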
  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
  for (unsigned i = 0; CSRegs[i]; ++i) {
    unsigned Reg = CSRegs[i];
    bool Spilled = false;
    if (SavedRegs.test(Reg)) {
      Spilled = true;
      CanEliminateFrame = false;
    }

    if (!ARM::GPRRegClass.contains(Reg)) {
      if (Spilled) {
        if (ARM::SPRRegClass.contains(Reg))
          NumFPRSpills++;
        else if (ARM::DPRRegClass.contains(Reg))
          NumFPRSpills += 2;
        else if (ARM::QPRRegClass.contains(Reg))
          NumFPRSpills += 4;
      }
      continue;
    }

    if (Spilled) {
      NumGPRSpills++;

      if (!STI.splitFramePushPop(MF)) {
        if (Reg == ARM::LR)
          LRSpilled = true;
        CS1Spilled = true;
        continue;
      }

      // Keep track of whether LR and any of R4, R5, R6, or R7 are spilled.
      switch (Reg) {
      case ARM::LR:
        LRSpilled = true;
        LLVM_FALLTHROUGH;
      case ARM::R0: case ARM::R1:
      case ARM::R2: case ARM::R3:
      case ARM::R4: case ARM::R5:
      case ARM::R6: case ARM::R7:
        CS1Spilled = true;
        break;
      default:
        break;
      }
    } else {
      if (!STI.splitFramePushPop(MF)) {
        UnspilledCS1GPRs.push_back(Reg);
        continue;
      }

      switch (Reg) {
      case ARM::R0: case ARM::R1:
      case ARM::R2: case ARM::R3:
      case ARM::R4: case ARM::R5:
      case ARM::R6: case ARM::R7:
      case ARM::LR:
        UnspilledCS1GPRs.push_back(Reg);
        break;
      default:
        UnspilledCS2GPRs.push_back(Reg);
        break;
      }
    }
  }

  bool ForceLRSpill = false;
  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
    unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII);
    // Force LR to be spilled if the Thumb function size is >= 2048. This
    // enables use of BL to implement far jump.
    if (FnSize >= (1 << 11)) {
      CanEliminateFrame = false;
      ForceLRSpill = true;
    }
  }

  // If any of the stack slot references may be out of range of an immediate
  // offset, make sure a register (or a spill slot) is available for the
  // register scavenger. Note that if we're indexing off the frame pointer, the
  // effective stack size is 4 bytes larger since the FP points to the stack
  // slot of the previous FP. Also, if we have variable sized objects in the
  // function, stack slot references will often be negative, and some of
  // our instructions are positive-offset only, so conservatively consider
  // that case to want a spill slot (or register) as well. Similarly, if
  // the function adjusts the stack pointer during execution and the
  // adjustments aren't already part of our stack size estimate, our offset
  // calculations may be off, so be conservative.
  // FIXME: We could add logic to be more precise about negative offsets
  // and which instructions will need a scratch register for them. Is it
  // worth the effort and added fragility?
  unsigned EstimatedStackSize =
      MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);

  // Determine the biggest (positive) SP offset in MachineFrameInfo.
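  // Fixed objects have negative frame indices and describe the incoming
  // argument area above the entry SP; e.g. (illustrative) an argument at
  // offset 8 with size 4 yields a candidate offset of 12.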
  int MaxFixedOffset = 0;
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    int MaxObjectOffset = MFI.getObjectOffset(I) + MFI.getObjectSize(I);
    MaxFixedOffset = std::max(MaxFixedOffset, MaxObjectOffset);
  }

  bool HasFP = hasFP(MF);
  if (HasFP) {
    if (AFI->hasStackFrame())
      EstimatedStackSize += 4;
  } else {
    // If FP is not used, SP will be used to access arguments, so count the
    // size of arguments into the estimation.
    EstimatedStackSize += MaxFixedOffset;
  }
  EstimatedStackSize += 16; // For possible paddings.

  unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
  bool HasNonSPFrameIndex = false;
  if (AFI->isThumb1OnlyFunction()) {
    // For Thumb1, don't bother to iterate over the function. The only
    // instruction that requires an emergency spill slot is a store to a
    // frame index.
    //
    // tSTRspi, which is used for sp-relative accesses, has an 8-bit unsigned
    // immediate. tSTRi, which is used for bp- and fp-relative accesses, has
    // a 5-bit unsigned immediate.
    //
    // We could try to check if the function actually contains a tSTRspi
    // that might need the spill slot, but it's not really important.
    // Functions with VLAs or extremely large call frames are rare, and
    // if a function is allocating more than 1KB of stack, an extra 4-byte
    // slot probably isn't relevant.
    if (RegInfo->hasBasePointer(MF))
      EstimatedRSStackSizeLimit = (1U << 5) * 4;
    else
      EstimatedRSStackSizeLimit = (1U << 8) * 4;
    EstimatedRSFixedSizeLimit = (1U << 5) * 4;
  } else {
    EstimatedRSStackSizeLimit =
        estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex);
    EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
  }
  // Final estimate of whether sp- or bp-relative accesses might require
  // scavenging.
  bool HasLargeStack = EstimatedStackSize > EstimatedRSStackSizeLimit;

  // If the stack pointer moves and we don't have a base pointer, the
  // estimate logic doesn't work. The actual offsets might be larger when
  // we're constructing a call frame, or we might need to use negative
  // offsets from fp.
  bool HasMovingSP = MFI.hasVarSizedObjects() ||
                     (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
  bool HasBPOrFixedSP = RegInfo->hasBasePointer(MF) || !HasMovingSP;

  // If we have a frame pointer, we assume arguments will be accessed
  // relative to the frame pointer. Check whether fp-relative accesses to
  // arguments require scavenging.
  //
  // We could do slightly better on Thumb1; in some cases, an sp-relative
  // offset would be legal even though an fp-relative offset is not.
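  // Illustrative example: with the frame pointer saved 8 bytes below the
  // entry SP (MaxFPOffset = -8) and an incoming argument ending 16 bytes
  // above it (MaxFixedOffset = 16), fp-relative argument accesses need
  // offsets up to 24 bytes.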
  int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
  bool HasLargeArgumentList =
      HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;

  bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
                         HasLargeArgumentList || HasNonSPFrameIndex;
  LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
                    << "; EstimatedStack: " << EstimatedStackSize
                    << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
                    << "; BigFrameOffsets: " << BigFrameOffsets << "\n");
  if (BigFrameOffsets || !CanEliminateFrame ||
      RegInfo->cannotEliminateFrame(MF)) {
    AFI->setHasStackFrame(true);

    if (HasFP) {
      SavedRegs.set(FramePtr);
      // If the frame pointer is required by the ABI, also spill LR so that we
      // emit a complete frame record.
      if (MF.getTarget().Options.DisableFramePointerElim(MF) && !LRSpilled) {
        SavedRegs.set(ARM::LR);
        LRSpilled = true;
        NumGPRSpills++;
        auto LRPos = llvm::find(UnspilledCS1GPRs, ARM::LR);
        if (LRPos != UnspilledCS1GPRs.end())
          UnspilledCS1GPRs.erase(LRPos);
      }
      auto FPPos = llvm::find(UnspilledCS1GPRs, FramePtr);
      if (FPPos != UnspilledCS1GPRs.end())
        UnspilledCS1GPRs.erase(FPPos);
      NumGPRSpills++;
      if (FramePtr == ARM::R7)
        CS1Spilled = true;
    }

    // This is true when we inserted a spill for a callee-save GPR which is
    // not otherwise used by the function. This guarantees it is possible
    // to scavenge a register to hold the address of a stack slot. On Thumb1,
    // the register must be a valid operand to tSTRi, i.e. r4-r7. For other
    // subtargets, this is any GPR, i.e. r4-r11 or lr.
    //
    // If we don't insert a spill, we instead allocate an emergency spill
    // slot, which can be used by scavenging to spill an arbitrary register.
    //
    // We currently don't try to figure out whether any specific instruction
    // requires scavenging an additional register.
    bool ExtraCSSpill = false;

    if (AFI->isThumb1OnlyFunction()) {
      // For Thumb1-only targets, we need some low registers when we save and
      // restore the high registers (which aren't allocatable, but could be
      // used by inline assembly) because the push/pop instructions cannot
      // access high registers. If necessary, we might need to push more low
      // registers to ensure that there is at least one free that can be used
      // for the saving & restoring, and preferably we should ensure that as
      // many as are needed are available so that fewer push/pop instructions
      // are required.

      // Low registers which are not currently pushed, but could be (r4-r7).
      SmallVector<unsigned, 4> AvailableRegs;

      // Unused argument registers (r0-r3) can be clobbered in the prologue
      // for free.
      int EntryRegDeficit = 0;
      for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
        if (!MF.getRegInfo().isLiveIn(Reg)) {
          --EntryRegDeficit;
          LLVM_DEBUG(dbgs()
                     << printReg(Reg, TRI)
                     << " is unused argument register, EntryRegDeficit = "
                     << EntryRegDeficit << "\n");
        }
      }

      // Unused return registers can be clobbered in the epilogue for free.
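      // e.g. (illustrative) a function returning a single i32 occupies only
      // r0, so r1-r3 are free in the epilogue and ExitRegDeficit below is
      // 1 - 4 = -3.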
      int ExitRegDeficit = AFI->getReturnRegsCount() - 4;
      LLVM_DEBUG(dbgs() << AFI->getReturnRegsCount()
                        << " return regs used, ExitRegDeficit = "
                        << ExitRegDeficit << "\n");

      int RegDeficit = std::max(EntryRegDeficit, ExitRegDeficit);
      LLVM_DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");

      // r4-r6 can be used in the prologue if they are pushed by the first
      // push instruction.
      for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6}) {
        if (SavedRegs.test(Reg)) {
          --RegDeficit;
          LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
                            << " is saved low register, RegDeficit = "
                            << RegDeficit << "\n");
        } else {
          AvailableRegs.push_back(Reg);
          LLVM_DEBUG(
              dbgs()
              << printReg(Reg, TRI)
              << " is non-saved low register, adding to AvailableRegs\n");
        }
      }

      // r7 can be used if it is not being used as the frame pointer.
      if (!HasFP) {
        if (SavedRegs.test(ARM::R7)) {
          --RegDeficit;
          LLVM_DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = "
                            << RegDeficit << "\n");
        } else {
          AvailableRegs.push_back(ARM::R7);
          LLVM_DEBUG(
              dbgs()
              << "%r7 is non-saved low register, adding to AvailableRegs\n");
        }
      }

      // Each of r8-r11 needs to be copied to a low register, then pushed.
      for (unsigned Reg : {ARM::R8, ARM::R9, ARM::R10, ARM::R11}) {
        if (SavedRegs.test(Reg)) {
          ++RegDeficit;
          LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
                            << " is saved high register, RegDeficit = "
                            << RegDeficit << "\n");
        }
      }

      // LR can only be used by PUSH, not POP, and can't be used at all if the
      // llvm.returnaddress intrinsic is used. This is only worth doing if we
      // are more limited at function entry than exit.
      if ((EntryRegDeficit > ExitRegDeficit) &&
          !(MF.getRegInfo().isLiveIn(ARM::LR) &&
            MF.getFrameInfo().isReturnAddressTaken())) {
        if (SavedRegs.test(ARM::LR)) {
          --RegDeficit;
          LLVM_DEBUG(dbgs() << "%lr is saved register, RegDeficit = "
                            << RegDeficit << "\n");
        } else {
          AvailableRegs.push_back(ARM::LR);
          LLVM_DEBUG(dbgs() << "%lr is not saved, adding to AvailableRegs\n");
        }
      }

      // If there are more high registers that need pushing than low registers
      // available, push some more low registers so that we can use fewer push
      // instructions. This might not reduce RegDeficit all the way to zero,
      // because we can only guarantee that r4-r6 are available, but r8-r11 may
      // need saving.
      LLVM_DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
      for (; RegDeficit > 0 && !AvailableRegs.empty(); --RegDeficit) {
        unsigned Reg = AvailableRegs.pop_back_val();
        LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                          << " to make up reg deficit\n");
        SavedRegs.set(Reg);
        NumGPRSpills++;
        CS1Spilled = true;
        assert(!MRI.isReserved(Reg) && "Should not be reserved");
        if (Reg != ARM::LR && !MRI.isPhysRegUsed(Reg))
          ExtraCSSpill = true;
        UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
        if (Reg == ARM::LR)
          LRSpilled = true;
      }
      LLVM_DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit
                        << "\n");
    }

    // When using IPRA, we might want to preserve some of r0-r3, to reduce
    // register pressure in our callers.
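    // spillExtraRegsForIPRA applies the profitability heuristics and returns
    // how many of r0-r3 it added to SavedRegs.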
    unsigned ExtraIPRASpills =
        spillExtraRegsForIPRA(MF, SavedRegs, NumFPRSpills != 0);
    NumGPRSpills += ExtraIPRASpills;
    if (ExtraIPRASpills)
      CS1Spilled = true;

    // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to
    // restore LR in that case.
    bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall();

    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is, spill
    // LR as well so we can fold BX_RET into the register restore (LDM).
    if (!LRSpilled && CS1Spilled && !ExpensiveLRRestore) {
      SavedRegs.set(ARM::LR);
      NumGPRSpills++;
      SmallVectorImpl<unsigned>::iterator LRPos;
      LRPos = llvm::find(UnspilledCS1GPRs, (unsigned)ARM::LR);
      if (LRPos != UnspilledCS1GPRs.end())
        UnspilledCS1GPRs.erase(LRPos);

      ForceLRSpill = false;
      if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR) &&
          !AFI->isThumb1OnlyFunction())
        ExtraCSSpill = true;
    }

    // If stack and double are 8-byte aligned and we are spilling an odd number
    // of GPRs, spill one extra callee save GPR so we won't have to pad between
    // the integer and double callee save areas.
    LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
    const Align TargetAlign = getStackAlign();
    if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) {
      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
          unsigned Reg = UnspilledCS1GPRs[i];
          // Don't spill high register if the function is thumb. In the case
          // of Windows on ARM, accept R11 (frame pointer).
          if (!AFI->isThumbFunction() ||
              (STI.isTargetWindows() && Reg == ARM::R11) ||
              isARMLowRegister(Reg) ||
              (Reg == ARM::LR && !ExpensiveLRRestore)) {
            SavedRegs.set(Reg);
            LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                              << " to make up alignment\n");
            if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg) &&
                !(Reg == ARM::LR && AFI->isThumb1OnlyFunction()))
              ExtraCSSpill = true;
            break;
          }
        }
      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
        unsigned Reg = UnspilledCS2GPRs.front();
        SavedRegs.set(Reg);
        LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                          << " to make up alignment\n");
        if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
          ExtraCSSpill = true;
      }
    }

    // Estimate if we might need to scavenge a register at some point in order
    // to materialize a stack offset. If so, either spill one additional
    // callee-saved register or reserve a special spill slot to facilitate
    // register scavenging. Thumb1 needs a spill slot for stack pointer
    // adjustments also, even when the frame itself is small.
    if (BigFrameOffsets && !ExtraCSSpill) {
      // If any non-reserved CS register isn't spilled, just spill one or two
      // extra. That should take care of it!
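      // With 8-byte stack alignment, NumExtras below is 2, so picking two
      // spare registers also keeps the GPR push area aligned (illustrative).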
      unsigned NumExtras = TargetAlign.value() / 4;
      SmallVector<unsigned, 2> Extras;
      while (NumExtras && !UnspilledCS1GPRs.empty()) {
        unsigned Reg = UnspilledCS1GPRs.back();
        UnspilledCS1GPRs.pop_back();
        if (!MRI.isReserved(Reg) &&
            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
          Extras.push_back(Reg);
          NumExtras--;
        }
      }
      // For non-Thumb1 functions, also check for hi-reg CS registers.
      if (!AFI->isThumb1OnlyFunction()) {
        while (NumExtras && !UnspilledCS2GPRs.empty()) {
          unsigned Reg = UnspilledCS2GPRs.back();
          UnspilledCS2GPRs.pop_back();
          if (!MRI.isReserved(Reg)) {
            Extras.push_back(Reg);
            NumExtras--;
          }
        }
      }
      if (NumExtras == 0) {
        for (unsigned Reg : Extras) {
          SavedRegs.set(Reg);
          if (!MRI.isPhysRegUsed(Reg))
            ExtraCSSpill = true;
        }
      }
      if (!ExtraCSSpill && RS) {
        // Reserve a slot closest to SP or frame pointer.
        LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
        const TargetRegisterClass &RC = ARM::GPRRegClass;
        unsigned Size = TRI->getSpillSize(RC);
        unsigned Align = TRI->getSpillAlignment(RC);
        RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
      }
    }
  }

  if (ForceLRSpill)
    SavedRegs.set(ARM::LR);
  AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
}

void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF,
                                      BitVector &SavedRegs) const {
  TargetFrameLowering::getCalleeSaves(MF, SavedRegs);

  // If we have the "returned" parameter attribute, which guarantees that we
  // return the value which was passed in r0 unmodified (e.g. C++ 'structors),
  // record that fact for IPRA.
  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  if (AFI->getPreservesR0())
    SavedRegs.set(ARM::R0);
}

MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const ARMBaseInstrInfo &TII =
      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
  if (!hasReservedCallFrame(MF)) {
    // If we have alloca, convert as follows:
    // ADJCALLSTACKDOWN -> sub sp, sp, amount
    // ADJCALLSTACKUP   -> add sp, sp, amount
    MachineInstr &Old = *I;
    DebugLoc dl = Old.getDebugLoc();
    unsigned Amount = TII.getFrameSize(Old);
    if (Amount != 0) {
      // We need to keep the stack aligned properly. To do this, we round the
      // amount of space needed for the outgoing arguments up to the next
      // alignment boundary.
      Amount = alignSPAdjust(Amount);

      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
      assert(!AFI->isThumb1OnlyFunction() &&
             "This eliminateCallFramePseudoInstr does not support Thumb1!");
      bool isARM = !AFI->isThumbFunction();

      // Replace the pseudo instruction with a new instruction...
      unsigned Opc = Old.getOpcode();
      int PIdx = Old.findFirstPredOperandIdx();
      ARMCC::CondCodes Pred = (PIdx == -1)
          ? ARMCC::AL
          : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
      unsigned PredReg = TII.getFramePred(Old);
      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
                     Pred, PredReg);
      } else {
        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
                     Pred, PredReg);
      }
    }
  }
  return MBB.erase(I);
}

/// Get the minimum constant for ARM that is greater than or equal to the
/// argument. In ARM, constants can have any value that can be produced by
/// rotating an 8-bit value to the right by an even number of bits within a
/// 32-bit word.
static uint32_t alignToARMConstant(uint32_t Value) {
  unsigned Shifted = 0;

  if (Value == 0)
    return 0;

  while (!(Value & 0xC0000000)) {
    Value = Value << 2;
    Shifted += 2;
  }

  bool Carry = (Value & 0x00FFFFFF);
  Value = ((Value & 0xFF000000) >> 24) + Carry;

  if (Value & 0x0000100)
    Value = Value & 0x000001FC;

  if (Shifted > 24)
    Value = Value >> (Shifted - 24);
  else
    Value = Value << (24 - Shifted);

  return Value;
}

// The stack limit in the TCB is set to this many bytes above the actual
// stack limit.
static const uint64_t kSplitStackAvailable = 256;

// Adjust the function prologue to enable split stacks. This currently only
// supports android and linux.
//
// The ABI of the segmented stack prologue is a little arbitrarily chosen, but
// must be well defined in order to allow for consistent implementations of the
// __morestack helper function. The ABI is also not a normal ABI in that it
// doesn't follow the normal calling conventions because this allows the
// prologue of each function to be optimized further.
//
// Currently, the ABI looks like (when calling __morestack)
//
// * r4 holds the minimum stack size requested for this function call
// * r5 holds the stack size of the arguments to the function
// * the beginning of the function is 3 instructions after the call to
//   __morestack
//
// Implementations of __morestack should use r4 to allocate a new stack, r5 to
// place the arguments on to the new stack, and the 3-instruction knowledge to
// jump directly to the body of the function when working on the new stack.
//
// An old (and possibly no longer compatible) implementation of __morestack for
// ARM can be found at [1].
//
// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
void ARMFrameLowering::adjustForSegmentedStacks(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  unsigned Opcode;
  unsigned CFIIndex;
  const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
  bool Thumb = ST->isThumb();

  // Sadly, this currently doesn't support varargs, nor platforms other than
  // android/linux. Note that Thumb1/Thumb2 are supported on android/linux.
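  // As a rough sketch (illustrative; exact opcodes differ between ARM and
  // Thumb, and small frames compare SP directly), the emitted sequence is:
  //
  //   push {r4, r5}                  ; save scratch registers
  //   sub  r5, sp, #StackSize        ; r5 = SP the function needs
  //   mrc  p15, #0, r4, c13, c0, #3  ; r4 = TCB address
  //   ldr  r4, [r4, #4*TlsOffset]    ; r4 = current stack limit
  //   cmp  r4, r5
  //   blo  .LPostStack               ; enough stack: skip the call
  //   mov  r4, #StackSize            ; arguments for __morestack
  //   mov  r5, #ArgSize
  //   push {lr}
  //   bl   __morestack
  //   pop  {lr}
  //   pop  {r4, r5}
  //   bx   lr                        ; body already ran on the new stack
  // .LPostStack:
  //   pop  {r4, r5}                  ; fall through to the normal prologue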
  if (MF.getFunction().isVarArg())
    report_fatal_error("Segmented stacks do not support vararg functions.");
  if (!ST->isTargetAndroid() && !ST->isTargetLinux())
    report_fatal_error("Segmented stacks not supported on this platform.");

  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  MCContext &Context = MMI.getContext();
  const MCRegisterInfo *MRI = Context.getRegisterInfo();
  const ARMBaseInstrInfo &TII =
      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
  ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
  DebugLoc DL;

  uint64_t StackSize = MFI.getStackSize();

  // Do not generate a prologue for leaf functions with a stack of size zero.
  // For non-leaf functions we have to allow for the possibility that the call
  // is to a non-split function, as in PR37807. This function could also take
  // the address of a non-split function. When the linker tries to adjust its
  // non-existent prologue, it would fail with an error. Mark the object file
  // so that such failures are not errors. See this Go language bug-report:
  // https://go-review.googlesource.com/c/go/+/148819/
  if (StackSize == 0 && !MFI.hasTailCall()) {
    MF.getMMI().setHasNosplitStack(true);
    return;
  }

  // Use R4 and R5 as scratch registers.
  // We save R4 and R5 before use and restore them before leaving the function.
  unsigned ScratchReg0 = ARM::R4;
  unsigned ScratchReg1 = ARM::R5;
  uint64_t AlignedStackSize;

  MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();

  // Grab everything that reaches PrologueMBB to update their liveness as well.
  SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion;
  SmallVector<MachineBasicBlock *, 2> WalkList;
  WalkList.push_back(&PrologueMBB);

  do {
    MachineBasicBlock *CurMBB = WalkList.pop_back_val();
    for (MachineBasicBlock *PredBB : CurMBB->predecessors()) {
      if (BeforePrologueRegion.insert(PredBB).second)
        WalkList.push_back(PredBB);
    }
  } while (!WalkList.empty());

  // The order in that list is important.
  // The blocks will all be inserted before PrologueMBB using that order.
  // Therefore the block that should appear first in the CFG should appear
  // first in the list.
  MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
                                      PostStackMBB};

  for (MachineBasicBlock *B : AddedBlocks)
    BeforePrologueRegion.insert(B);

  for (const auto &LI : PrologueMBB.liveins()) {
    for (MachineBasicBlock *PredBB : BeforePrologueRegion)
      PredBB->addLiveIn(LI);
  }

  // Remove the newly added blocks from the list, since we know
  // we do not have to do the following updates for them.
  for (MachineBasicBlock *B : AddedBlocks) {
    BeforePrologueRegion.erase(B);
    MF.insert(PrologueMBB.getIterator(), B);
  }

  for (MachineBasicBlock *MBB : BeforePrologueRegion) {
    // Make sure the LiveIns are still sorted and unique.
    MBB->sortUniqueLiveIns();
    // Replace the edges to PrologueMBB by edges to the sequences
    // we are about to add.
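    // (AddedBlocks[0] is PrevStackMBB, the entry block of the inserted
    // prologue sequence.)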
    MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
  }

  // The required stack size, aligned to the ARM constant criterion.
  AlignedStackSize = alignToARMConstant(StackSize);

  // When the frame size is less than 256 we just compare the stack
  // boundary directly to the value of the stack pointer, per gcc.
  bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;

  // We will use two of the callee save registers as scratch registers so we
  // need to save those registers onto the stack.
  // We will use SR0 to hold the stack limit and SR1 to hold the requested
  // stack size; these are also the arguments to __morestack().
  // SR0: Scratch Register #0
  // SR1: Scratch Register #1
  // push {SR0, SR1}
  if (Thumb) {
    BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH))
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  } else {
    BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  }

  // Emit the relevant DWARF information about the change in stack pointer as
  // well as where to find both r4 and r5 (the callee-save registers).
  CFIIndex =
      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
      nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
      nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);

  // mov SR1, sp
  if (Thumb) {
    BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL));
  } else if (CompareStackPointer) {
    BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
  }

  // sub SR1, sp, #StackSize
  if (!CompareStackPointer && Thumb) {
    BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1)
        .add(condCodeOp())
        .addReg(ScratchReg1)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL));
  } else if (!CompareStackPointer) {
    BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
        .addReg(ARM::SP)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
  }

  if (Thumb && ST->isThumb1Only()) {
    unsigned PCLabelId = ARMFI->createPICLabelUId();
    ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
        MF.getFunction().getContext(), "__STACK_LIMIT", PCLabelId, 0);
    MachineConstantPool *MCP = MF.getConstantPool();
    unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);

    // ldr SR0, [pc, offset(STACK_LIMIT)]
    BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
        .addConstantPoolIndex(CPI)
        .add(predOps(ARMCC::AL));

    // ldr SR0, [SR0]
    BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
        .addReg(ScratchReg0)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else {
    // Get TLS base address from the coprocessor.
    // mrc p15, #0, SR0, c13, c0, #3
    BuildMI(McrMBB,
            DL, TII.get(Thumb ? ARM::t2MRC : ARM::MRC), ScratchReg0)
        .addImm(15)
        .addImm(0)
        .addImm(13)
        .addImm(0)
        .addImm(3)
        .add(predOps(ARMCC::AL));

    // Use the last TLS slot on android and a private field of the TCB on
    // linux.
    assert(ST->isTargetAndroid() || ST->isTargetLinux());
    unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;

    // Get the stack limit from the right offset.
    // ldr SR0, [sr0, #4 * TlsOffset]
    BuildMI(GetMBB, DL, TII.get(Thumb ? ARM::t2LDRi12 : ARM::LDRi12),
            ScratchReg0)
        .addReg(ScratchReg0)
        .addImm(4 * TlsOffset)
        .add(predOps(ARMCC::AL));
  }

  // Compare stack limit with stack size requested.
  // cmp SR0, SR1
  Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
  BuildMI(GetMBB, DL, TII.get(Opcode))
      .addReg(ScratchReg0)
      .addReg(ScratchReg1)
      .add(predOps(ARMCC::AL));

  // This jump is taken if StackLimit < SP - stack required.
  Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
  BuildMI(GetMBB, DL, TII.get(Opcode))
      .addMBB(PostStackMBB)
      .addImm(ARMCC::LO)
      .addReg(ARM::CPSR);

  // Calling __morestack(StackSize, Size of stack arguments).
  // __morestack knows that the stack size requested is in SR0(r4)
  // and the size of the stack arguments is in SR1(r5).

  // Pass the first argument to __morestack in Scratch Register #0:
  // the amount of stack required.
  if (Thumb) {
    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0)
        .add(condCodeOp())
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL));
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
  }
  // Pass the second argument to __morestack in Scratch Register #1:
  // the amount of stack consumed to save the function arguments.
  if (Thumb) {
    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1)
        .add(condCodeOp())
        .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
        .add(predOps(ARMCC::AL));
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
        .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
  }

  // push {lr} - Save the return address of this function.
  if (Thumb) {
    BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH))
        .add(predOps(ARMCC::AL))
        .addReg(ARM::LR);
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .addReg(ARM::LR);
  }

  // Emit the DWARF info about the change in stack as well as where to find
  // the previous link register.
  CFIIndex =
      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
      nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);

  // Call __morestack().
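  // Per the 3-instruction contract described above, __morestack re-enters the
  // function body on the new stack; the instructions after this call only run
  // once the body has finished, on the way back to our original caller.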
  if (Thumb) {
    BuildMI(AllocMBB, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__morestack");
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::BL))
        .addExternalSymbol("__morestack");
  }

  // pop {lr} - Restore the return address of the original function.
  if (Thumb) {
    if (ST->isThumb1Only()) {
      BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
          .add(predOps(ARMCC::AL))
          .addReg(ScratchReg0);
      BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
          .addReg(ScratchReg0)
          .add(predOps(ARMCC::AL));
    } else {
      BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
          .addReg(ARM::LR, RegState::Define)
          .addReg(ARM::SP, RegState::Define)
          .addReg(ARM::SP)
          .addImm(4)
          .add(predOps(ARMCC::AL));
    }
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .addReg(ARM::LR);
  }

  // Restore SR0 and SR1 in case __morestack() was called.
  // __morestack() will skip the PostStackMBB block, so we need to restore
  // the scratch registers from here.
  // pop {SR0, SR1}
  if (Thumb) {
    BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  } else {
    BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  }

  // Update the CFA offset now that we've popped.
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);

  // Return from this function.
  BuildMI(AllocMBB, DL, TII.get(ST->getReturnOpcode())).add(predOps(ARMCC::AL));

  // Restore SR0 and SR1 in case __morestack() was not called.
  // pop {SR0, SR1}
  if (Thumb) {
    BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP))
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  } else {
    BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .addReg(ScratchReg0)
        .addReg(ScratchReg1);
  }

  // Update the CFA offset now that we've popped.
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);

  // Tell debuggers that r4 and r5 are now the same as they were in the
  // previous function, that they're the "Same Value".
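  // These lower to .cfi_same_value directives in the emitted assembly
  // (illustrative):
  //   .cfi_same_value r4
  //   .cfi_same_value r5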
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
      nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
  CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
      nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);

  // Organize the MBB successor lists.
  PostStackMBB->addSuccessor(&PrologueMBB);

  AllocMBB->addSuccessor(PostStackMBB);

  GetMBB->addSuccessor(PostStackMBB);
  GetMBB->addSuccessor(AllocMBB);

  McrMBB->addSuccessor(GetMBB);

  PrevStackMBB->addSuccessor(McrMBB);

#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}