//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMAllocLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
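///
/// A representative rewrite (illustrative registers only):
///
///   ldr r4, [r0]
///   ldr r5, [r0, #4]
///   ldr r6, [r0, #8]
/// =>
///   ldmia r0, {r4, r5, r6}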

namespace {
  struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    NumLDMGened++;
    return ARM::LDM;
  case ARM::STR:
    NumSTMGened++;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    NumLDMGened++;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    NumSTMGened++;
    return ARM::t2STM;
  case ARM::FLDS:
    NumFLDMGened++;
    return ARM::FLDMS;
  case ARM::FSTS:
    NumFSTMGened++;
    return ARM::FSTMS;
  case ARM::FLDD:
    NumFLDMGened++;
    return ARM::FLDMD;
  case ARM::FSTD:
    NumFSTMGened++;
    return ARM::FSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDR || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STR || isT2i32Store(Opc);
}

/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
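///
/// Illustrative sketch of the non-zero starting offset case (hypothetical
/// registers): with a starting offset of 8 and three stores, the base is
/// first rematerialized into the scratch register:
///
///   add r3, r0, #8
///   stmia r3, {r4, r5, r6}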
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If the starting offset isn't zero, insert a MI to materialize a new
    // base. But only do so if it is cost effective, i.e. merging more than
    // two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Otherwise use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
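///
/// Scans MemOps starting at SIndex; when an entry cannot extend the current
/// run (wrong offset or register order), the run collected so far is merged
/// and the routine recurses on the remainder. Illustrative: for offsets
/// {0, 4, 12} off the same base, the first two entries merge into one
/// ldm / stm and the recursion restarts at the #12 entry.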
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
  bool isKill = Loc->getOperand(0).isKill();

  SmallVector<std::pair<unsigned,bool>, 8> Regs;
  Regs.push_back(std::make_pair(PReg, isKill));
  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    isKill = MemOps[i].MBBI->getOperand(0).isKill();
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      Regs.push_back(std::make_pair(Reg, isKill));
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
      if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                   Scratch, dl, Regs)) {
        Merges.push_back(prior(Loc));
        for (unsigned j = SIndex; j < i; ++j) {
          MBB.erase(MemOps[j].MBBI);
          MemOps[j].Merged = true;
        }
      }
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
               Scratch, dl, Regs)) {
    Merges.push_back(prior(Loc));
    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
      MBB.erase(MemOps[i].MBBI);
      MemOps[i].Merged = true;
    }
  }

  return;
}

static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::FLDS:
  case ARM::FSTS:
    return 4;
  case ARM::FLDD:
  case ARM::FSTD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
    return (MI->getNumOperands() - 5) * 4;
  case ARM::FLDMS:
  case ARM::FSTMS:
  case ARM::FLDMD:
  case ARM::FSTMD:
    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
  }
}

/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                                bool &Advance,
                                                MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
    Opcode == ARM::STM || Opcode == ARM::t2STM;

  if (isAM4) {
    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
      return false;

    // Can't use the updating AM4 sub-mode if the base register is also a dest
    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
      if (MI->getOperand(i).getReg() == Base)
        return false;
    }

    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
        MI->getOperand(4).setReg(Base);
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      } else if (Mode == ARM_AM::ib &&
                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
        MI->getOperand(4).setReg(Base); // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = next(MBBI);
      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base); // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base); // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      }
    }
  } else {
    // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
      return false;

    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
        MI->getOperand(4).setReg(Base); // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = next(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
        MI->getOperand(4).setReg(Base); // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        // Only report a change when a trailing increment was actually folded.
        return true;
      }
    }
  }

  return false;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_PRE;
  case ARM::STR: return ARM::STR_PRE;
  case ARM::FLDS: return ARM::FLDMS;
  case ARM::FLDD: return ARM::FLDMD;
  case ARM::FSTS: return ARM::FSTMS;
  case ARM::FSTD: return ARM::FSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::FLDS: return ARM::FLDMS;
  case ARM::FLDD: return ARM::FLDMD;
  case ARM::FSTS: return ARM::FSTMS;
  case ARM::FSTD: return ARM::FSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
    Opcode == ARM::FSTD || Opcode == ARM::FSTS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
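  // Illustrative: rewriting "ldr r0, [r0]; add r0, r0, #4" as
  // "ldr r0, [r0], #4" would leave the writeback result unpredictable.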
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit, Pred,
                                   PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit, Pred,
                                   PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // FLDMS, FLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // FSTMS, FSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation (that this
/// pass is capable of operating on).
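/// For LDR / STR this additionally requires the register-offset field to be
/// reg0, i.e. only the plain immediate addressing form is handled here.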
static bool isMemoryOp(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::LDR:
  case ARM::STR:
    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
  case ARM::FLDS:
  case ARM::FSTS:
    return MI->getOperand(1).isReg();
  case ARM::FLDD:
  case ARM::FSTD:
    return MI->getOperand(1).isReg();
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}

static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
    return OffField;

  int Offset = isAM2
    ? ARM_AM::getAM2Offset(OffField)
    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
             : ARM_AM::getAM5Offset(OffField) * 4);
  if (isAM2) {
    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int OffImm, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          unsigned OffReg, bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  int Offset = OffImm;
  if (!isT2) {
    if (OffImm < 0)
      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
    else
      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
  }
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

bool
ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to
      // an ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load; make sure the first load does not clobber it.
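      // Illustrative: splitting "ldrd r1, r2, [r1]" must load r2 from
      // [r1, #4] first, since loading r1 first would clobber the base
      // address.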
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance = false;
    bool TryMerge = false;
    bool Clobber = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
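          // The queue stays sorted by offset: the common case appends at
          // the back; otherwise the entry is inserted in place, and an
          // equal offset (a collision) stops the chain instead.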
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // ops that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII, Advance,
                                         MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If the iterator hasn't been advanced and this is not a memory op,
      // skip it. It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
/// (bx lr) into the preceding stack restore so it directly restores the value
/// of LR into pc.
/// ldmfd sp!, {r7, lr}
/// bx lr
/// =>
/// ldmfd sp!, {r7, pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
    MachineInstr *PrevMI = prior(MBBI);
    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.

namespace {
  struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: FLDS / FSTS -> FLDD / FSTD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
    return false;

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative: if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
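        // ARM ldrd / strd want an (even, odd) register pair, so register
        // allocation hints are set below (illustrative pairs: r0/r1, r2/r3).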
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::FLDS || Opc == ARM::FLDD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}