//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag, or, alternatively, it should apply an
// ad-hoc fix to the unwind information.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumConstOffsetFolded,
          "Number of const offset of index address folded");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
82 static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit", 83 cl::init(10), cl::Hidden); 84 85 // Enable register renaming to find additional store pairing opportunities. 86 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming", 87 cl::init(true), cl::Hidden); 88 89 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" 90 91 namespace { 92 93 using LdStPairFlags = struct LdStPairFlags { 94 // If a matching instruction is found, MergeForward is set to true if the 95 // merge is to remove the first instruction and replace the second with 96 // a pair-wise insn, and false if the reverse is true. 97 bool MergeForward = false; 98 99 // SExtIdx gives the index of the result of the load pair that must be 100 // extended. The value of SExtIdx assumes that the paired load produces the 101 // value in this order: (I, returned iterator), i.e., -1 means no value has 102 // to be extended, 0 means I, and 1 means the returned iterator. 103 int SExtIdx = -1; 104 105 // If not none, RenameReg can be used to rename the result register of the 106 // first store in a pair. Currently this only works when merging stores 107 // forward. 108 std::optional<MCPhysReg> RenameReg; 109 110 LdStPairFlags() = default; 111 112 void setMergeForward(bool V = true) { MergeForward = V; } 113 bool getMergeForward() const { return MergeForward; } 114 115 void setSExtIdx(int V) { SExtIdx = V; } 116 int getSExtIdx() const { return SExtIdx; } 117 118 void setRenameReg(MCPhysReg R) { RenameReg = R; } 119 void clearRenameReg() { RenameReg = std::nullopt; } 120 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; } 121 }; 122 123 struct AArch64LoadStoreOpt : public MachineFunctionPass { 124 static char ID; 125 126 AArch64LoadStoreOpt() : MachineFunctionPass(ID) { 127 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); 128 } 129 130 AliasAnalysis *AA; 131 const AArch64InstrInfo *TII; 132 const TargetRegisterInfo *TRI; 133 const AArch64Subtarget *Subtarget; 134 135 // Track which register units have been modified and used. 136 LiveRegUnits ModifiedRegUnits, UsedRegUnits; 137 LiveRegUnits DefinedInBB; 138 139 void getAnalysisUsage(AnalysisUsage &AU) const override { 140 AU.addRequired<AAResultsWrapperPass>(); 141 MachineFunctionPass::getAnalysisUsage(AU); 142 } 143 144 // Scan the instructions looking for a load/store that can be combined 145 // with the current instruction into a load/store pair. 146 // Return the matching instruction if one is found, else MBB->end(). 147 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, 148 LdStPairFlags &Flags, 149 unsigned Limit, 150 bool FindNarrowMerge); 151 152 // Scan the instructions looking for a store that writes to the address from 153 // which the current load instruction reads. Return true if one is found. 154 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, 155 MachineBasicBlock::iterator &StoreI); 156 157 // Merge the two instructions indicated into a wider narrow store instruction. 158 MachineBasicBlock::iterator 159 mergeNarrowZeroStores(MachineBasicBlock::iterator I, 160 MachineBasicBlock::iterator MergeMI, 161 const LdStPairFlags &Flags); 162 163 // Merge the two instructions indicated into a single pair-wise instruction. 
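  // For illustration, the typical rewrite performed here is, e.g.:
  //   ldr w0, [x2]
  //   ldr w1, [x2, #4]
  // =>
  //   ldp w0, w1, [x2]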
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/str instruction into a base ld/st instruction.
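  // For illustration, the intended rewrite is, e.g.:
  //   mov x8, #4
  //   ldrb w0, [x1, x8]
  // =>
  //   ldrb w0, [x1, #4]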
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    return true;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return
AArch64::STPDi; 356 case AArch64::STRDpre: 357 return AArch64::STPDpre; 358 case AArch64::STRQui: 359 case AArch64::STURQi: 360 return AArch64::STPQi; 361 case AArch64::STRQpre: 362 return AArch64::STPQpre; 363 case AArch64::STRWui: 364 case AArch64::STURWi: 365 return AArch64::STPWi; 366 case AArch64::STRWpre: 367 return AArch64::STPWpre; 368 case AArch64::STRXui: 369 case AArch64::STURXi: 370 return AArch64::STPXi; 371 case AArch64::STRXpre: 372 return AArch64::STPXpre; 373 case AArch64::LDRSui: 374 case AArch64::LDURSi: 375 return AArch64::LDPSi; 376 case AArch64::LDRSpre: 377 return AArch64::LDPSpre; 378 case AArch64::LDRDui: 379 case AArch64::LDURDi: 380 return AArch64::LDPDi; 381 case AArch64::LDRDpre: 382 return AArch64::LDPDpre; 383 case AArch64::LDRQui: 384 case AArch64::LDURQi: 385 return AArch64::LDPQi; 386 case AArch64::LDRQpre: 387 return AArch64::LDPQpre; 388 case AArch64::LDRWui: 389 case AArch64::LDURWi: 390 return AArch64::LDPWi; 391 case AArch64::LDRWpre: 392 return AArch64::LDPWpre; 393 case AArch64::LDRXui: 394 case AArch64::LDURXi: 395 return AArch64::LDPXi; 396 case AArch64::LDRXpre: 397 return AArch64::LDPXpre; 398 case AArch64::LDRSWui: 399 case AArch64::LDURSWi: 400 return AArch64::LDPSWi; 401 case AArch64::LDRSWpre: 402 return AArch64::LDPSWpre; 403 } 404 } 405 406 static unsigned isMatchingStore(MachineInstr &LoadInst, 407 MachineInstr &StoreInst) { 408 unsigned LdOpc = LoadInst.getOpcode(); 409 unsigned StOpc = StoreInst.getOpcode(); 410 switch (LdOpc) { 411 default: 412 llvm_unreachable("Unsupported load instruction!"); 413 case AArch64::LDRBBui: 414 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || 415 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 416 case AArch64::LDURBBi: 417 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || 418 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 419 case AArch64::LDRHHui: 420 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || 421 StOpc == AArch64::STRXui; 422 case AArch64::LDURHHi: 423 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || 424 StOpc == AArch64::STURXi; 425 case AArch64::LDRWui: 426 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 427 case AArch64::LDURWi: 428 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 429 case AArch64::LDRXui: 430 return StOpc == AArch64::STRXui; 431 case AArch64::LDURXi: 432 return StOpc == AArch64::STURXi; 433 } 434 } 435 436 static unsigned getPreIndexedOpcode(unsigned Opc) { 437 // FIXME: We don't currently support creating pre-indexed loads/stores when 438 // the load or store is the unscaled version. If we decide to perform such an 439 // optimization in the future the cases for the unscaled loads/stores will 440 // need to be added here. 
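  // For illustration, the pre-index fold enabled by this mapping is, e.g.:
  //   add x0, x0, #16
  //   ldr x1, [x0]
  // =>
  //   ldr x1, [x0, #16]!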
441 switch (Opc) { 442 default: 443 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 444 case AArch64::STRSui: 445 return AArch64::STRSpre; 446 case AArch64::STRDui: 447 return AArch64::STRDpre; 448 case AArch64::STRQui: 449 return AArch64::STRQpre; 450 case AArch64::STRBBui: 451 return AArch64::STRBBpre; 452 case AArch64::STRHHui: 453 return AArch64::STRHHpre; 454 case AArch64::STRWui: 455 return AArch64::STRWpre; 456 case AArch64::STRXui: 457 return AArch64::STRXpre; 458 case AArch64::LDRSui: 459 return AArch64::LDRSpre; 460 case AArch64::LDRDui: 461 return AArch64::LDRDpre; 462 case AArch64::LDRQui: 463 return AArch64::LDRQpre; 464 case AArch64::LDRBBui: 465 return AArch64::LDRBBpre; 466 case AArch64::LDRHHui: 467 return AArch64::LDRHHpre; 468 case AArch64::LDRWui: 469 return AArch64::LDRWpre; 470 case AArch64::LDRXui: 471 return AArch64::LDRXpre; 472 case AArch64::LDRSWui: 473 return AArch64::LDRSWpre; 474 case AArch64::LDPSi: 475 return AArch64::LDPSpre; 476 case AArch64::LDPSWi: 477 return AArch64::LDPSWpre; 478 case AArch64::LDPDi: 479 return AArch64::LDPDpre; 480 case AArch64::LDPQi: 481 return AArch64::LDPQpre; 482 case AArch64::LDPWi: 483 return AArch64::LDPWpre; 484 case AArch64::LDPXi: 485 return AArch64::LDPXpre; 486 case AArch64::STPSi: 487 return AArch64::STPSpre; 488 case AArch64::STPDi: 489 return AArch64::STPDpre; 490 case AArch64::STPQi: 491 return AArch64::STPQpre; 492 case AArch64::STPWi: 493 return AArch64::STPWpre; 494 case AArch64::STPXi: 495 return AArch64::STPXpre; 496 case AArch64::STGi: 497 return AArch64::STGPreIndex; 498 case AArch64::STZGi: 499 return AArch64::STZGPreIndex; 500 case AArch64::ST2Gi: 501 return AArch64::ST2GPreIndex; 502 case AArch64::STZ2Gi: 503 return AArch64::STZ2GPreIndex; 504 case AArch64::STGPi: 505 return AArch64::STGPpre; 506 } 507 } 508 509 static unsigned getBaseAddressOpcode(unsigned Opc) { 510 // TODO: Add more index address loads/stores. 
511 switch (Opc) { 512 default: 513 llvm_unreachable("Opcode has no base address equivalent!"); 514 case AArch64::LDRBBroX: 515 return AArch64::LDRBBui; 516 } 517 } 518 519 static unsigned getPostIndexedOpcode(unsigned Opc) { 520 switch (Opc) { 521 default: 522 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 523 case AArch64::STRSui: 524 case AArch64::STURSi: 525 return AArch64::STRSpost; 526 case AArch64::STRDui: 527 case AArch64::STURDi: 528 return AArch64::STRDpost; 529 case AArch64::STRQui: 530 case AArch64::STURQi: 531 return AArch64::STRQpost; 532 case AArch64::STRBBui: 533 return AArch64::STRBBpost; 534 case AArch64::STRHHui: 535 return AArch64::STRHHpost; 536 case AArch64::STRWui: 537 case AArch64::STURWi: 538 return AArch64::STRWpost; 539 case AArch64::STRXui: 540 case AArch64::STURXi: 541 return AArch64::STRXpost; 542 case AArch64::LDRSui: 543 case AArch64::LDURSi: 544 return AArch64::LDRSpost; 545 case AArch64::LDRDui: 546 case AArch64::LDURDi: 547 return AArch64::LDRDpost; 548 case AArch64::LDRQui: 549 case AArch64::LDURQi: 550 return AArch64::LDRQpost; 551 case AArch64::LDRBBui: 552 return AArch64::LDRBBpost; 553 case AArch64::LDRHHui: 554 return AArch64::LDRHHpost; 555 case AArch64::LDRWui: 556 case AArch64::LDURWi: 557 return AArch64::LDRWpost; 558 case AArch64::LDRXui: 559 case AArch64::LDURXi: 560 return AArch64::LDRXpost; 561 case AArch64::LDRSWui: 562 return AArch64::LDRSWpost; 563 case AArch64::LDPSi: 564 return AArch64::LDPSpost; 565 case AArch64::LDPSWi: 566 return AArch64::LDPSWpost; 567 case AArch64::LDPDi: 568 return AArch64::LDPDpost; 569 case AArch64::LDPQi: 570 return AArch64::LDPQpost; 571 case AArch64::LDPWi: 572 return AArch64::LDPWpost; 573 case AArch64::LDPXi: 574 return AArch64::LDPXpost; 575 case AArch64::STPSi: 576 return AArch64::STPSpost; 577 case AArch64::STPDi: 578 return AArch64::STPDpost; 579 case AArch64::STPQi: 580 return AArch64::STPQpost; 581 case AArch64::STPWi: 582 return AArch64::STPWpost; 583 case AArch64::STPXi: 584 return AArch64::STPXpost; 585 case AArch64::STGi: 586 return AArch64::STGPostIndex; 587 case AArch64::STZGi: 588 return AArch64::STZGPostIndex; 589 case AArch64::ST2Gi: 590 return AArch64::ST2GPostIndex; 591 case AArch64::STZ2Gi: 592 return AArch64::STZ2GPostIndex; 593 case AArch64::STGPi: 594 return AArch64::STGPpost; 595 } 596 } 597 598 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { 599 600 unsigned OpcA = FirstMI.getOpcode(); 601 unsigned OpcB = MI.getOpcode(); 602 603 switch (OpcA) { 604 default: 605 return false; 606 case AArch64::STRSpre: 607 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi); 608 case AArch64::STRDpre: 609 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi); 610 case AArch64::STRQpre: 611 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi); 612 case AArch64::STRWpre: 613 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi); 614 case AArch64::STRXpre: 615 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi); 616 case AArch64::LDRSpre: 617 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi); 618 case AArch64::LDRDpre: 619 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi); 620 case AArch64::LDRQpre: 621 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi); 622 case AArch64::LDRWpre: 623 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi); 624 case AArch64::LDRXpre: 625 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi); 626 case AArch64::LDRSWpre: 627 return 
(OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi); 628 } 629 } 630 631 // Returns the scale and offset range of pre/post indexed variants of MI. 632 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, 633 int &MinOffset, int &MaxOffset) { 634 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI); 635 bool IsTagStore = isTagStore(MI); 636 // ST*G and all paired ldst have the same scale in pre/post-indexed variants 637 // as in the "unsigned offset" variant. 638 // All other pre/post indexed ldst instructions are unscaled. 639 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1; 640 641 if (IsPaired) { 642 MinOffset = -64; 643 MaxOffset = 63; 644 } else { 645 MinOffset = -256; 646 MaxOffset = 255; 647 } 648 } 649 650 static MachineOperand &getLdStRegOp(MachineInstr &MI, 651 unsigned PairedRegOp = 0) { 652 assert(PairedRegOp < 2 && "Unexpected register operand idx."); 653 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI); 654 if (IsPreLdSt) 655 PairedRegOp += 1; 656 unsigned Idx = 657 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; 658 return MI.getOperand(Idx); 659 } 660 661 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, 662 MachineInstr &StoreInst, 663 const AArch64InstrInfo *TII) { 664 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); 665 int LoadSize = TII->getMemScale(LoadInst); 666 int StoreSize = TII->getMemScale(StoreInst); 667 int UnscaledStOffset = 668 TII->hasUnscaledLdStOffset(StoreInst) 669 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() 670 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize; 671 int UnscaledLdOffset = 672 TII->hasUnscaledLdStOffset(LoadInst) 673 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() 674 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize; 675 return (UnscaledStOffset <= UnscaledLdOffset) && 676 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); 677 } 678 679 static bool isPromotableZeroStoreInst(MachineInstr &MI) { 680 unsigned Opc = MI.getOpcode(); 681 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi || 682 isNarrowStore(Opc)) && 683 getLdStRegOp(MI).getReg() == AArch64::WZR; 684 } 685 686 static bool isPromotableLoadFromStore(MachineInstr &MI) { 687 switch (MI.getOpcode()) { 688 default: 689 return false; 690 // Scaled instructions. 691 case AArch64::LDRBBui: 692 case AArch64::LDRHHui: 693 case AArch64::LDRWui: 694 case AArch64::LDRXui: 695 // Unscaled instructions. 696 case AArch64::LDURBBi: 697 case AArch64::LDURHHi: 698 case AArch64::LDURWi: 699 case AArch64::LDURXi: 700 return true; 701 } 702 } 703 704 static bool isMergeableLdStUpdate(MachineInstr &MI) { 705 unsigned Opc = MI.getOpcode(); 706 switch (Opc) { 707 default: 708 return false; 709 // Scaled instructions. 710 case AArch64::STRSui: 711 case AArch64::STRDui: 712 case AArch64::STRQui: 713 case AArch64::STRXui: 714 case AArch64::STRWui: 715 case AArch64::STRHHui: 716 case AArch64::STRBBui: 717 case AArch64::LDRSui: 718 case AArch64::LDRDui: 719 case AArch64::LDRQui: 720 case AArch64::LDRXui: 721 case AArch64::LDRWui: 722 case AArch64::LDRHHui: 723 case AArch64::LDRBBui: 724 case AArch64::STGi: 725 case AArch64::STZGi: 726 case AArch64::ST2Gi: 727 case AArch64::STZ2Gi: 728 case AArch64::STGPi: 729 // Unscaled instructions. 
730 case AArch64::STURSi: 731 case AArch64::STURDi: 732 case AArch64::STURQi: 733 case AArch64::STURWi: 734 case AArch64::STURXi: 735 case AArch64::LDURSi: 736 case AArch64::LDURDi: 737 case AArch64::LDURQi: 738 case AArch64::LDURWi: 739 case AArch64::LDURXi: 740 // Paired instructions. 741 case AArch64::LDPSi: 742 case AArch64::LDPSWi: 743 case AArch64::LDPDi: 744 case AArch64::LDPQi: 745 case AArch64::LDPWi: 746 case AArch64::LDPXi: 747 case AArch64::STPSi: 748 case AArch64::STPDi: 749 case AArch64::STPQi: 750 case AArch64::STPWi: 751 case AArch64::STPXi: 752 // Make sure this is a reg+imm (as opposed to an address reloc). 753 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) 754 return false; 755 756 return true; 757 } 758 } 759 760 // Make sure this is a reg+reg Ld/St 761 static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { 762 unsigned Opc = MI.getOpcode(); 763 switch (Opc) { 764 default: 765 return false; 766 // Scaled instructions. 767 // TODO: Add more index address loads/stores. 768 case AArch64::LDRBBroX: 769 Scale = 1; 770 return true; 771 } 772 } 773 774 static bool isRewritableImplicitDef(unsigned Opc) { 775 switch (Opc) { 776 default: 777 return false; 778 case AArch64::ORRWrs: 779 case AArch64::ADDWri: 780 return true; 781 } 782 } 783 784 MachineBasicBlock::iterator 785 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, 786 MachineBasicBlock::iterator MergeMI, 787 const LdStPairFlags &Flags) { 788 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && 789 "Expected promotable zero stores."); 790 791 MachineBasicBlock::iterator E = I->getParent()->end(); 792 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 793 // If NextI is the second of the two instructions to be merged, we need 794 // to skip one further. Either way we merge will invalidate the iterator, 795 // and we don't need to scan the new instruction, as it's a pairwise 796 // instruction, which we're not considering for further action anyway. 797 if (NextI == MergeMI) 798 NextI = next_nodbg(NextI, E); 799 800 unsigned Opc = I->getOpcode(); 801 unsigned MergeMIOpc = MergeMI->getOpcode(); 802 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc); 803 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc); 804 int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1; 805 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1; 806 807 bool MergeForward = Flags.getMergeForward(); 808 // Insert our new paired instruction after whichever of the paired 809 // instructions MergeForward indicates. 810 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; 811 // Also based on MergeForward is from where we copy the base register operand 812 // so we get the flags compatible with the input code. 813 const MachineOperand &BaseRegOp = 814 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI) 815 : AArch64InstrInfo::getLdStBaseOp(*I); 816 817 // Which register is Rt and which is Rt2 depends on the offset order. 818 int64_t IOffsetInBytes = 819 AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride; 820 int64_t MIOffsetInBytes = 821 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() * 822 MergeMIOffsetStride; 823 // Select final offset based on the offset order. 
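  // For illustration, the lower of the two offsets ends up in the wide store,
  // e.g.:
  //   strh wzr, [x0, #2]
  //   strh wzr, [x0]
  // =>
  //   str wzr, [x0]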
824 int64_t OffsetImm; 825 if (IOffsetInBytes > MIOffsetInBytes) 826 OffsetImm = MIOffsetInBytes; 827 else 828 OffsetImm = IOffsetInBytes; 829 830 int NewOpcode = getMatchingWideOpcode(Opc); 831 bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode); 832 833 // Adjust final offset if the result opcode is a scaled store. 834 if (FinalIsScaled) { 835 int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1; 836 assert(((OffsetImm % NewOffsetStride) == 0) && 837 "Offset should be a multiple of the store memory scale"); 838 OffsetImm = OffsetImm / NewOffsetStride; 839 } 840 841 // Construct the new instruction. 842 DebugLoc DL = I->getDebugLoc(); 843 MachineBasicBlock *MBB = I->getParent(); 844 MachineInstrBuilder MIB; 845 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 846 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) 847 .add(BaseRegOp) 848 .addImm(OffsetImm) 849 .cloneMergedMemRefs({&*I, &*MergeMI}) 850 .setMIFlags(I->mergeFlagsWith(*MergeMI)); 851 (void)MIB; 852 853 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); 854 LLVM_DEBUG(I->print(dbgs())); 855 LLVM_DEBUG(dbgs() << " "); 856 LLVM_DEBUG(MergeMI->print(dbgs())); 857 LLVM_DEBUG(dbgs() << " with instruction:\n "); 858 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 859 LLVM_DEBUG(dbgs() << "\n"); 860 861 // Erase the old instructions. 862 I->eraseFromParent(); 863 MergeMI->eraseFromParent(); 864 return NextI; 865 } 866 867 // Apply Fn to all instructions between MI and the beginning of the block, until 868 // a def for DefReg is reached. Returns true, iff Fn returns true for all 869 // visited instructions. Stop after visiting Limit iterations. 870 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, 871 const TargetRegisterInfo *TRI, unsigned Limit, 872 std::function<bool(MachineInstr &, bool)> &Fn) { 873 auto MBB = MI.getParent(); 874 for (MachineInstr &I : 875 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) { 876 if (!Limit) 877 return false; 878 --Limit; 879 880 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) { 881 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && 882 TRI->regsOverlap(MOP.getReg(), DefReg); 883 }); 884 if (!Fn(I, isDef)) 885 return false; 886 if (isDef) 887 break; 888 } 889 return true; 890 } 891 892 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units, 893 const TargetRegisterInfo *TRI) { 894 895 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 896 if (MOP.isReg() && MOP.isKill()) 897 Units.removeReg(MOP.getReg()); 898 899 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 900 if (MOP.isReg() && !MOP.isKill()) 901 Units.addReg(MOP.getReg()); 902 } 903 904 MachineBasicBlock::iterator 905 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 906 MachineBasicBlock::iterator Paired, 907 const LdStPairFlags &Flags) { 908 MachineBasicBlock::iterator E = I->getParent()->end(); 909 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 910 // If NextI is the second of the two instructions to be merged, we need 911 // to skip one further. Either way we merge will invalidate the iterator, 912 // and we don't need to scan the new instruction, as it's a pairwise 913 // instruction, which we're not considering for further action anyway. 914 if (NextI == Paired) 915 NextI = next_nodbg(NextI, E); 916 917 int SExtIdx = Flags.getSExtIdx(); 918 unsigned Opc = 919 SExtIdx == -1 ? 
I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); 920 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc); 921 int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1; 922 923 bool MergeForward = Flags.getMergeForward(); 924 925 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg(); 926 if (RenameReg) { 927 MCRegister RegToRename = getLdStRegOp(*I).getReg(); 928 DefinedInBB.addReg(*RenameReg); 929 930 // Return the sub/super register for RenameReg, matching the size of 931 // OriginalReg. 932 auto GetMatchingSubReg = 933 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg { 934 for (MCPhysReg SubOrSuper : 935 TRI->sub_and_superregs_inclusive(*RenameReg)) { 936 if (C->contains(SubOrSuper)) 937 return SubOrSuper; 938 } 939 llvm_unreachable("Should have found matching sub or super register!"); 940 }; 941 942 std::function<bool(MachineInstr &, bool)> UpdateMIs = 943 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI, 944 bool IsDef) { 945 if (IsDef) { 946 bool SeenDef = false; 947 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 948 MachineOperand &MOP = MI.getOperand(OpIdx); 949 // Rename the first explicit definition and all implicit 950 // definitions matching RegToRename. 951 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 952 (!MergeForward || !SeenDef || 953 (MOP.isDef() && MOP.isImplicit())) && 954 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 955 assert((MOP.isImplicit() || 956 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 957 "Need renamable operands"); 958 Register MatchingReg; 959 if (const TargetRegisterClass *RC = 960 MI.getRegClassConstraint(OpIdx, TII, TRI)) 961 MatchingReg = GetMatchingSubReg(RC); 962 else { 963 if (!isRewritableImplicitDef(MI.getOpcode())) 964 continue; 965 MatchingReg = GetMatchingSubReg( 966 TRI->getMinimalPhysRegClass(MOP.getReg())); 967 } 968 MOP.setReg(MatchingReg); 969 SeenDef = true; 970 } 971 } 972 } else { 973 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 974 MachineOperand &MOP = MI.getOperand(OpIdx); 975 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 976 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 977 assert((MOP.isImplicit() || 978 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 979 "Need renamable operands"); 980 Register MatchingReg; 981 if (const TargetRegisterClass *RC = 982 MI.getRegClassConstraint(OpIdx, TII, TRI)) 983 MatchingReg = GetMatchingSubReg(RC); 984 else 985 MatchingReg = GetMatchingSubReg( 986 TRI->getMinimalPhysRegClass(MOP.getReg())); 987 assert(MatchingReg != AArch64::NoRegister && 988 "Cannot find matching regs for renaming"); 989 MOP.setReg(MatchingReg); 990 } 991 } 992 } 993 LLVM_DEBUG(dbgs() << "Renamed " << MI); 994 return true; 995 }; 996 forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI, 997 UINT32_MAX, UpdateMIs); 998 999 #if !defined(NDEBUG) 1000 // For forward merging store: 1001 // Make sure the register used for renaming is not used between the 1002 // paired instructions. That would trash the content before the new 1003 // paired instruction. 1004 MCPhysReg RegToCheck = *RenameReg; 1005 // For backward merging load: 1006 // Make sure the register being renamed is not used between the 1007 // paired instructions. That would trash the content after the new 1008 // paired instruction. 1009 if (!MergeForward) 1010 RegToCheck = RegToRename; 1011 for (auto &MI : 1012 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>( 1013 MergeForward ? std::next(I) : I, 1014 MergeForward ? 
std::next(Paired) : Paired)) 1015 assert(all_of(MI.operands(), 1016 [this, RegToCheck](const MachineOperand &MOP) { 1017 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 1018 MOP.isUndef() || 1019 !TRI->regsOverlap(MOP.getReg(), RegToCheck); 1020 }) && 1021 "Rename register used between paired instruction, trashing the " 1022 "content"); 1023 #endif 1024 } 1025 1026 // Insert our new paired instruction after whichever of the paired 1027 // instructions MergeForward indicates. 1028 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; 1029 // Also based on MergeForward is from where we copy the base register operand 1030 // so we get the flags compatible with the input code. 1031 const MachineOperand &BaseRegOp = 1032 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired) 1033 : AArch64InstrInfo::getLdStBaseOp(*I); 1034 1035 int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm(); 1036 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm(); 1037 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode()); 1038 if (IsUnscaled != PairedIsUnscaled) { 1039 // We're trying to pair instructions that differ in how they are scaled. If 1040 // I is scaled then scale the offset of Paired accordingly. Otherwise, do 1041 // the opposite (i.e., make Paired's offset unscaled). 1042 int MemSize = TII->getMemScale(*Paired); 1043 if (PairedIsUnscaled) { 1044 // If the unscaled offset isn't a multiple of the MemSize, we can't 1045 // pair the operations together. 1046 assert(!(PairedOffset % TII->getMemScale(*Paired)) && 1047 "Offset should be a multiple of the stride!"); 1048 PairedOffset /= MemSize; 1049 } else { 1050 PairedOffset *= MemSize; 1051 } 1052 } 1053 1054 // Which register is Rt and which is Rt2 depends on the offset order. 1055 // However, for pre load/stores the Rt should be the one of the pre 1056 // load/store. 1057 MachineInstr *RtMI, *Rt2MI; 1058 if (Offset == PairedOffset + OffsetStride && 1059 !AArch64InstrInfo::isPreLdSt(*I)) { 1060 RtMI = &*Paired; 1061 Rt2MI = &*I; 1062 // Here we swapped the assumption made for SExtIdx. 1063 // I.e., we turn ldp I, Paired into ldp Paired, I. 1064 // Update the index accordingly. 1065 if (SExtIdx != -1) 1066 SExtIdx = (SExtIdx + 1) % 2; 1067 } else { 1068 RtMI = &*I; 1069 Rt2MI = &*Paired; 1070 } 1071 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); 1072 // Scale the immediate offset, if necessary. 1073 if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) { 1074 assert(!(OffsetImm % TII->getMemScale(*RtMI)) && 1075 "Unscaled offset cannot be scaled."); 1076 OffsetImm /= TII->getMemScale(*RtMI); 1077 } 1078 1079 // Construct the new instruction. 1080 MachineInstrBuilder MIB; 1081 DebugLoc DL = I->getDebugLoc(); 1082 MachineBasicBlock *MBB = I->getParent(); 1083 MachineOperand RegOp0 = getLdStRegOp(*RtMI); 1084 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); 1085 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1; 1086 // Kill flags may become invalid when moving stores for pairing. 1087 if (RegOp0.isUse()) { 1088 if (!MergeForward) { 1089 // Clear kill flags on store if moving upwards. Example: 1090 // STRWui kill %w0, ... 1091 // USE %w1 1092 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards 1093 // We are about to move the store of w1, so its kill flag may become 1094 // invalid; not the case for w0. 1095 // Since w1 is used between the stores, the kill flag on w1 is cleared 1096 // after merging. 1097 // STPWi kill %w0, %w1, ... 
1098 // USE %w1 1099 for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It) 1100 if (It->readsRegister(PairedRegOp.getReg(), TRI)) 1101 PairedRegOp.setIsKill(false); 1102 } else { 1103 // Clear kill flags of the first stores register. Example: 1104 // STRWui %w1, ... 1105 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards 1106 // STRW %w0 1107 Register Reg = getLdStRegOp(*I).getReg(); 1108 for (MachineInstr &MI : make_range(std::next(I), Paired)) 1109 MI.clearRegisterKills(Reg, TRI); 1110 } 1111 } 1112 1113 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc); 1114 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode)); 1115 1116 // Adds the pre-index operand for pre-indexed ld/st pairs. 1117 if (AArch64InstrInfo::isPreLdSt(*RtMI)) 1118 MIB.addReg(BaseRegOp.getReg(), RegState::Define); 1119 1120 MIB.add(RegOp0) 1121 .add(RegOp1) 1122 .add(BaseRegOp) 1123 .addImm(OffsetImm) 1124 .cloneMergedMemRefs({&*I, &*Paired}) 1125 .setMIFlags(I->mergeFlagsWith(*Paired)); 1126 1127 (void)MIB; 1128 1129 LLVM_DEBUG( 1130 dbgs() << "Creating pair load/store. Replacing instructions:\n "); 1131 LLVM_DEBUG(I->print(dbgs())); 1132 LLVM_DEBUG(dbgs() << " "); 1133 LLVM_DEBUG(Paired->print(dbgs())); 1134 LLVM_DEBUG(dbgs() << " with instruction:\n "); 1135 if (SExtIdx != -1) { 1136 // Generate the sign extension for the proper result of the ldp. 1137 // I.e., with X1, that would be: 1138 // %w1 = KILL %w1, implicit-def %x1 1139 // %x1 = SBFMXri killed %x1, 0, 31 1140 MachineOperand &DstMO = MIB->getOperand(SExtIdx); 1141 // Right now, DstMO has the extended register, since it comes from an 1142 // extended opcode. 1143 Register DstRegX = DstMO.getReg(); 1144 // Get the W variant of that register. 1145 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); 1146 // Update the result of LDP to use the W instead of the X variant. 1147 DstMO.setReg(DstRegW); 1148 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1149 LLVM_DEBUG(dbgs() << "\n"); 1150 // Make the machine verifier happy by providing a definition for 1151 // the X register. 1152 // Insert this definition right after the generated LDP, i.e., before 1153 // InsertionPoint. 1154 MachineInstrBuilder MIBKill = 1155 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) 1156 .addReg(DstRegW) 1157 .addReg(DstRegX, RegState::Define); 1158 MIBKill->getOperand(2).setImplicit(); 1159 // Create the sign extension. 1160 MachineInstrBuilder MIBSXTW = 1161 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) 1162 .addReg(DstRegX) 1163 .addImm(0) 1164 .addImm(31); 1165 (void)MIBSXTW; 1166 LLVM_DEBUG(dbgs() << " Extend operand:\n "); 1167 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); 1168 } else { 1169 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1170 } 1171 LLVM_DEBUG(dbgs() << "\n"); 1172 1173 if (MergeForward) 1174 for (const MachineOperand &MOP : phys_regs_and_masks(*I)) 1175 if (MOP.isReg() && MOP.isKill()) 1176 DefinedInBB.addReg(MOP.getReg()); 1177 1178 // Erase the old instructions. 
1179 I->eraseFromParent(); 1180 Paired->eraseFromParent(); 1181 1182 return NextI; 1183 } 1184 1185 MachineBasicBlock::iterator 1186 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 1187 MachineBasicBlock::iterator StoreI) { 1188 MachineBasicBlock::iterator NextI = 1189 next_nodbg(LoadI, LoadI->getParent()->end()); 1190 1191 int LoadSize = TII->getMemScale(*LoadI); 1192 int StoreSize = TII->getMemScale(*StoreI); 1193 Register LdRt = getLdStRegOp(*LoadI).getReg(); 1194 const MachineOperand &StMO = getLdStRegOp(*StoreI); 1195 Register StRt = getLdStRegOp(*StoreI).getReg(); 1196 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); 1197 1198 assert((IsStoreXReg || 1199 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && 1200 "Unexpected RegClass"); 1201 1202 MachineInstr *BitExtMI; 1203 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { 1204 // Remove the load, if the destination register of the loads is the same 1205 // register for stored value. 1206 if (StRt == LdRt && LoadSize == 8) { 1207 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1208 LoadI->getIterator())) { 1209 if (MI.killsRegister(StRt, TRI)) { 1210 MI.clearRegisterKills(StRt, TRI); 1211 break; 1212 } 1213 } 1214 LLVM_DEBUG(dbgs() << "Remove load instruction:\n "); 1215 LLVM_DEBUG(LoadI->print(dbgs())); 1216 LLVM_DEBUG(dbgs() << "\n"); 1217 LoadI->eraseFromParent(); 1218 return NextI; 1219 } 1220 // Replace the load with a mov if the load and store are in the same size. 1221 BitExtMI = 1222 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1223 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) 1224 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) 1225 .add(StMO) 1226 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 1227 .setMIFlags(LoadI->getFlags()); 1228 } else { 1229 // FIXME: Currently we disable this transformation in big-endian targets as 1230 // performance and correctness are verified only in little-endian. 1231 if (!Subtarget->isLittleEndian()) 1232 return NextI; 1233 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI); 1234 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) && 1235 "Unsupported ld/st match"); 1236 assert(LoadSize <= StoreSize && "Invalid load size"); 1237 int UnscaledLdOffset = 1238 IsUnscaled 1239 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() 1240 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize; 1241 int UnscaledStOffset = 1242 IsUnscaled 1243 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() 1244 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize; 1245 int Width = LoadSize * 8; 1246 Register DestReg = 1247 IsStoreXReg ? Register(TRI->getMatchingSuperReg( 1248 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) 1249 : LdRt; 1250 1251 assert((UnscaledLdOffset >= UnscaledStOffset && 1252 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && 1253 "Invalid offset"); 1254 1255 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 1256 int Imms = Immr + Width - 1; 1257 if (UnscaledLdOffset == UnscaledStOffset) { 1258 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N 1259 | ((Immr) << 6) // immr 1260 | ((Imms) << 0) // imms 1261 ; 1262 1263 BitExtMI = 1264 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1265 TII->get(IsStoreXReg ? 
                               AArch64::ANDXri : AArch64::ANDWri),
                      DestReg)
                  .add(StMO)
                  .addImm(AndMaskEncoded)
                  .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instructions:\n ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns) {
    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
      LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
      return true;
    }
  }

  LLVM_DEBUG(dbgs() << "No aliases found\n");
  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // no matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
1365 if (!MI.isTransient()) 1366 ++Count; 1367 1368 // If the load instruction reads directly from the address to which the 1369 // store instruction writes and the stored value is not modified, we can 1370 // promote the load. Since we do not handle stores with pre-/post-index, 1371 // it's unnecessary to check if BaseReg is modified by the store itself. 1372 // Also we can't handle stores without an immediate offset operand, 1373 // while the operand might be the address for a global variable. 1374 if (MI.mayStore() && isMatchingStore(LoadMI, MI) && 1375 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() && 1376 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() && 1377 isLdOffsetInRangeOfSt(LoadMI, MI, TII) && 1378 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { 1379 StoreI = MBBI; 1380 return true; 1381 } 1382 1383 if (MI.isCall()) 1384 return false; 1385 1386 // Update modified / uses register units. 1387 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1388 1389 // Otherwise, if the base register is modified, we have no match, so 1390 // return early. 1391 if (!ModifiedRegUnits.available(BaseReg)) 1392 return false; 1393 1394 // If we encounter a store aliased with the load, return early. 1395 if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false)) 1396 return false; 1397 } while (MBBI != B && Count < Limit); 1398 return false; 1399 } 1400 1401 static bool needsWinCFI(const MachineFunction *MF) { 1402 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() && 1403 MF->getFunction().needsUnwindTableEntry(); 1404 } 1405 1406 // Returns true if FirstMI and MI are candidates for merging or pairing. 1407 // Otherwise, returns false. 1408 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, 1409 LdStPairFlags &Flags, 1410 const AArch64InstrInfo *TII) { 1411 // If this is volatile or if pairing is suppressed, not a candidate. 1412 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 1413 return false; 1414 1415 // We should have already checked FirstMI for pair suppression and volatility. 1416 assert(!FirstMI.hasOrderedMemoryRef() && 1417 !TII->isLdStPairSuppressed(FirstMI) && 1418 "FirstMI shouldn't get here if either of these checks are true."); 1419 1420 if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) || 1421 MI.getFlag(MachineInstr::FrameDestroy))) 1422 return false; 1423 1424 unsigned OpcA = FirstMI.getOpcode(); 1425 unsigned OpcB = MI.getOpcode(); 1426 1427 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check. 1428 if (OpcA == OpcB) 1429 return !AArch64InstrInfo::isPreLdSt(FirstMI); 1430 1431 // Two pre ld/st of different opcodes cannot be merged either 1432 if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI)) 1433 return false; 1434 1435 // Try to match a sign-extended load/store with a zero-extended load/store. 1436 bool IsValidLdStrOpc, PairIsValidLdStrOpc; 1437 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); 1438 assert(IsValidLdStrOpc && 1439 "Given Opc should be a Load or Store with an immediate"); 1440 // OpcA will be the first instruction in the pair. 1441 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { 1442 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); 1443 return true; 1444 } 1445 1446 // If the second instruction isn't even a mergable/pairable load/store, bail 1447 // out. 
1448 if (!PairIsValidLdStrOpc) 1449 return false; 1450 1451 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled 1452 // offsets. 1453 if (isNarrowStore(OpcA) || isNarrowStore(OpcB)) 1454 return false; 1455 1456 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and 1457 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui 1458 // are candidate pairs that can be merged. 1459 if (isPreLdStPairCandidate(FirstMI, MI)) 1460 return true; 1461 1462 // Try to match an unscaled load/store with a scaled load/store. 1463 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) && 1464 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); 1465 1466 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? 1467 } 1468 1469 static bool canRenameMOP(const MachineOperand &MOP, 1470 const TargetRegisterInfo *TRI) { 1471 if (MOP.isReg()) { 1472 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); 1473 // Renaming registers with multiple disjunct sub-registers (e.g. the 1474 // result of a LD3) means that all sub-registers are renamed, potentially 1475 // impacting other instructions we did not check. Bail out. 1476 // Note that this relies on the structure of the AArch64 register file. In 1477 // particular, a subregister cannot be written without overwriting the 1478 // whole register. 1479 if (RegClass->HasDisjunctSubRegs) { 1480 LLVM_DEBUG( 1481 dbgs() 1482 << " Cannot rename operands with multiple disjunct subregisters (" 1483 << MOP << ")\n"); 1484 return false; 1485 } 1486 1487 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite 1488 // them must be known. For example, in ORRWrs the implicit-def 1489 // corresponds to the result register. 1490 if (MOP.isImplicit() && MOP.isDef()) { 1491 if (!isRewritableImplicitDef(MOP.getParent()->getOpcode())) 1492 return false; 1493 return TRI->isSuperOrSubRegisterEq( 1494 MOP.getParent()->getOperand(0).getReg(), MOP.getReg()); 1495 } 1496 } 1497 return MOP.isImplicit() || 1498 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); 1499 } 1500 1501 static bool 1502 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, 1503 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1504 const TargetRegisterInfo *TRI) { 1505 if (!FirstMI.mayStore()) 1506 return false; 1507 1508 // Check if we can find an unused register which we can use to rename 1509 // the register used by the first load/store. 1510 1511 auto RegToRename = getLdStRegOp(FirstMI).getReg(); 1512 // For now, we only rename if the store operand gets killed at the store. 1513 if (!getLdStRegOp(FirstMI).isKill() && 1514 !any_of(FirstMI.operands(), 1515 [TRI, RegToRename](const MachineOperand &MOP) { 1516 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 1517 MOP.isImplicit() && MOP.isKill() && 1518 TRI->regsOverlap(RegToRename, MOP.getReg()); 1519 })) { 1520 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI); 1521 return false; 1522 } 1523 1524 bool FoundDef = false; 1525 1526 // For each instruction between FirstMI and the previous def for RegToRename, 1527 // we 1528 // * check if we can rename RegToRename in this instruction 1529 // * collect the registers used and required register classes for RegToRename. 1530 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI, 1531 bool IsDef) { 1532 LLVM_DEBUG(dbgs() << "Checking " << MI); 1533 // Currently we do not try to rename across frame-setup instructions. 
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those
      // cases in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find a non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// We want to merge the second load into the first by rewriting the usages of
// the same reg between first (incl.) and second (excl.). We don't need to care
// about any insns before FirstLoad or after SecondLoad.
// 1. The second load writes a new value into the same reg.
//    - The renaming cannot impact later uses of the reg.
//    - The second load always trashes the value written by the first load,
//      which means the reg must be killed before the second load.
// 2. The first load must be a def for the same reg so we don't need to look
//    into anything before it.
static bool canRenameUntilSecondLoad(
    MachineInstr &FirstLoad, MachineInstr &SecondLoad,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  if (FirstLoad.isPseudo())
    return false;

  UsedInBetween.accumulate(FirstLoad);
  auto RegToRename = getLdStRegOp(FirstLoad).getReg();
  bool Success = std::all_of(
      FirstLoad.getIterator(), SecondLoad.getIterator(),
      [&](MachineInstr &MI) {
        LLVM_DEBUG(dbgs() << "Checking " << MI);
        // Currently we do not try to rename across frame-setup instructions.
1621 if (MI.getFlag(MachineInstr::FrameSetup)) {
1622 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1623 << "currently\n");
1624 return false;
1625 }
1626
1627 for (auto &MOP : MI.operands()) {
1628 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1629 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1630 continue;
1631 if (!canRenameMOP(MOP, TRI)) {
1632 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1633 return false;
1634 }
1635 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1636 }
1637
1638 return true;
1639 });
1640 return Success;
1641 }
1642
1643 // Check if we can find a physical register for renaming \p Reg. This register
1644 // must:
1645 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1646 // defined registers up to the point where the renamed register will be used,
1647 // * not be used in \p UsedInBetween; UsedInBetween must contain all registers
1648 // accessed in the range in which the rename register will be used, and
1649 // * be available in all required register classes (checked using RequiredClasses).
1650 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1651 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1652 LiveRegUnits &UsedInBetween,
1653 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1654 const TargetRegisterInfo *TRI) {
1655 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1656
1657 // Checks if any sub- or super-register of PR is callee saved.
1658 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1659 return any_of(TRI->sub_and_superregs_inclusive(PR),
1660 [&MF, TRI](MCPhysReg SubOrSuper) {
1661 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1662 });
1663 };
1664
1665 // Check if PR or one of its sub- or super-registers can be used for all
1666 // required register classes.
1667 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1668 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1669 return any_of(
1670 TRI->sub_and_superregs_inclusive(PR),
1671 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
1672 });
1673 };
1674
1675 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1676 for (const MCPhysReg &PR : *RegClass) {
1677 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1678 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1679 CanBeUsedForAllClasses(PR)) {
1680 DefinedInBB.addReg(PR);
1681 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1682 << "\n");
1683 return {PR};
1684 }
1685 }
1686 LLVM_DEBUG(dbgs() << "No rename register found from "
1687 << TRI->getRegClassName(RegClass) << "\n");
1688 return std::nullopt;
1689 }
1690
1691 // For store pairs: returns a register that can be renamed in the range from
1692 // FirstMI back to the beginning of the block.
1693 // For load pairs: returns a register that can be renamed in the range from FirstMI to MI.
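// A rough illustration (the registers below are hypothetical): to merge
//   str w1, [x0]
//   ...                 ; w1 is redefined somewhere in here
//   str w1, [x0, #4]
// forward into a single pair, the first w1 (its def and any uses up to the
// first store) must be rewritable to a spare register such as w9, so that
// something like stp w9, w1, [x0] can be emitted at the second store.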
1694 static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair( 1695 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI, 1696 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween, 1697 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1698 const TargetRegisterInfo *TRI) { 1699 std::optional<MCPhysReg> RenameReg; 1700 if (!DebugCounter::shouldExecute(RegRenamingCounter)) 1701 return RenameReg; 1702 1703 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); 1704 MachineFunction &MF = *FirstMI.getParent()->getParent(); 1705 if (!RegClass || !MF.getRegInfo().tracksLiveness()) 1706 return RenameReg; 1707 1708 const bool IsLoad = FirstMI.mayLoad(); 1709 1710 if (!MaybeCanRename) { 1711 if (IsLoad) 1712 MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween, 1713 RequiredClasses, TRI)}; 1714 else 1715 MaybeCanRename = { 1716 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)}; 1717 } 1718 1719 if (*MaybeCanRename) { 1720 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween, 1721 RequiredClasses, TRI); 1722 } 1723 return RenameReg; 1724 } 1725 1726 /// Scan the instructions looking for a load/store that can be combined with the 1727 /// current instruction into a wider equivalent or a load/store pair. 1728 MachineBasicBlock::iterator 1729 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 1730 LdStPairFlags &Flags, unsigned Limit, 1731 bool FindNarrowMerge) { 1732 MachineBasicBlock::iterator E = I->getParent()->end(); 1733 MachineBasicBlock::iterator MBBI = I; 1734 MachineBasicBlock::iterator MBBIWithRenameReg; 1735 MachineInstr &FirstMI = *I; 1736 MBBI = next_nodbg(MBBI, E); 1737 1738 bool MayLoad = FirstMI.mayLoad(); 1739 bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); 1740 Register Reg = getLdStRegOp(FirstMI).getReg(); 1741 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); 1742 int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); 1743 int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; 1744 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); 1745 1746 std::optional<bool> MaybeCanRename; 1747 if (!EnableRenaming) 1748 MaybeCanRename = {false}; 1749 1750 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses; 1751 LiveRegUnits UsedInBetween; 1752 UsedInBetween.init(*TRI); 1753 1754 Flags.clearRenameReg(); 1755 1756 // Track which register units have been modified and used between the first 1757 // insn (inclusive) and the second insn. 1758 ModifiedRegUnits.clear(); 1759 UsedRegUnits.clear(); 1760 1761 // Remember any instructions that read/write memory between FirstMI and MI. 1762 SmallVector<MachineInstr *, 4> MemInsns; 1763 1764 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump()); 1765 for (unsigned Count = 0; MBBI != E && Count < Limit; 1766 MBBI = next_nodbg(MBBI, E)) { 1767 MachineInstr &MI = *MBBI; 1768 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump()); 1769 1770 UsedInBetween.accumulate(MI); 1771 1772 // Don't count transient instructions towards the search limit since there 1773 // may be different numbers of them if e.g. debug information is present. 
1774 if (!MI.isTransient())
1775 ++Count;
1776
1777 Flags.setSExtIdx(-1);
1778 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1779 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1780 assert(MI.mayLoadOrStore() && "Expected memory operation.");
1781 // If we've found another instruction with the same opcode, check to see
1782 // if the base and offset are compatible with our starting instruction.
1783 // These instructions all have scaled immediate operands, so we just
1784 // check for +1/-1. Make sure to check the new instruction offset is
1785 // actually an immediate and not a symbolic reference destined for
1786 // a relocation.
1787 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1788 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1789 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1790 if (IsUnscaled != MIIsUnscaled) {
1791 // We're trying to pair instructions that differ in how they are scaled.
1792 // If FirstMI is scaled then scale the offset of MI accordingly.
1793 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1794 int MemSize = TII->getMemScale(MI);
1795 if (MIIsUnscaled) {
1796 // If the unscaled offset isn't a multiple of the MemSize, we can't
1797 // pair the operations together: bail and keep looking.
1798 if (MIOffset % MemSize) {
1799 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1800 UsedRegUnits, TRI);
1801 MemInsns.push_back(&MI);
1802 continue;
1803 }
1804 MIOffset /= MemSize;
1805 } else {
1806 MIOffset *= MemSize;
1807 }
1808 }
1809
1810 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1811
1812 if (BaseReg == MIBaseReg) {
1813 // If the offset of the second ld/st is not equal to the size of the
1814 // destination register, it can't be combined into a pre-index ld/st
1815 // pair. Additionally, if the base reg is used or modified, the operations
1816 // can't be paired: bail and keep looking.
1817 if (IsPreLdSt) {
1818 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1819 bool IsBaseRegUsed = !UsedRegUnits.available(
1820 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1821 bool IsBaseRegModified = !ModifiedRegUnits.available(
1822 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1823 // If the stored value and the address of the second instruction are
1824 // the same, it needs to be using the updated register and therefore
1825 // it must not be folded.
1826 bool IsMIRegTheSame =
1827 TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1828 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1829 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1830 IsMIRegTheSame) {
1831 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1832 UsedRegUnits, TRI);
1833 MemInsns.push_back(&MI);
1834 continue;
1835 }
1836 } else {
1837 if ((Offset != MIOffset + OffsetStride) &&
1838 (Offset + OffsetStride != MIOffset)) {
1839 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1840 UsedRegUnits, TRI);
1841 MemInsns.push_back(&MI);
1842 continue;
1843 }
1844 }
1845
1846 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1847 if (FindNarrowMerge) {
1848 // If the alignment requirements of the scaled wide load/store
1849 // instruction can't express the offset of the scaled narrow input,
1850 // bail and keep looking. For promotable zero stores, allow only when
1851 // the stored value is the same (i.e., WZR).
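// For example (illustrative): two scaled narrow stores
//   strh wzr, [x0, #2]   ; scaled immediate 1
//   strh wzr, [x0, #4]   ; scaled immediate 2
// have MinOffset 1, which is odd, so the merged word-sized str wzr would
// need the unencodable byte offset 2 and we keep looking instead.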
1852 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || 1853 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { 1854 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1855 UsedRegUnits, TRI); 1856 MemInsns.push_back(&MI); 1857 continue; 1858 } 1859 } else { 1860 // Pairwise instructions have a 7-bit signed offset field. Single 1861 // insns have a 12-bit unsigned offset field. If the resultant 1862 // immediate offset of merging these instructions is out of range for 1863 // a pairwise instruction, bail and keep looking. 1864 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { 1865 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1866 UsedRegUnits, TRI); 1867 MemInsns.push_back(&MI); 1868 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, " 1869 << "keep looking.\n"); 1870 continue; 1871 } 1872 // If the alignment requirements of the paired (scaled) instruction 1873 // can't express the offset of the unscaled input, bail and keep 1874 // looking. 1875 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { 1876 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1877 UsedRegUnits, TRI); 1878 MemInsns.push_back(&MI); 1879 LLVM_DEBUG(dbgs() 1880 << "Offset doesn't fit due to alignment requirements, " 1881 << "keep looking.\n"); 1882 continue; 1883 } 1884 } 1885 1886 // If the BaseReg has been modified, then we cannot do the optimization. 1887 // For example, in the following pattern 1888 // ldr x1 [x2] 1889 // ldr x2 [x3] 1890 // ldr x4 [x2, #8], 1891 // the first and third ldr cannot be converted to ldp x1, x4, [x2] 1892 if (!ModifiedRegUnits.available(BaseReg)) 1893 return E; 1894 1895 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq( 1896 Reg, getLdStRegOp(MI).getReg()); 1897 1898 // If the Rt of the second instruction (destination register of the 1899 // load) was not modified or used between the two instructions and none 1900 // of the instructions between the second and first alias with the 1901 // second, we can combine the second into the first. 1902 bool RtNotModified = 1903 ModifiedRegUnits.available(getLdStRegOp(MI).getReg()); 1904 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg && 1905 !UsedRegUnits.available(getLdStRegOp(MI).getReg())); 1906 1907 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n" 1908 << "Reg '" << getLdStRegOp(MI) << "' not modified: " 1909 << (RtNotModified ? "true" : "false") << "\n" 1910 << "Reg '" << getLdStRegOp(MI) << "' not used: " 1911 << (RtNotUsed ? "true" : "false") << "\n"); 1912 1913 if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) { 1914 // For pairs loading into the same reg, try to find a renaming 1915 // opportunity to allow the renaming of Reg between FirstMI and MI 1916 // and combine MI into FirstMI; otherwise bail and keep looking. 
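// A rough illustration (registers are hypothetical):
//   ldr x1, [x0]
//   ...                 ; uses of the first x1 value
//   ldr x1, [x0, #8]
// can only be combined if the first x1 and its uses before the second load
// can be rewritten to a spare register such as x9, e.g. giving
//   ldp x9, x1, [x0]
//   ...                 ; uses of x9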
1917 if (SameLoadReg) {
1918 std::optional<MCPhysReg> RenameReg =
1919 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
1920 Reg, DefinedInBB, UsedInBetween,
1921 RequiredClasses, TRI);
1922 if (!RenameReg) {
1923 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1924 UsedRegUnits, TRI);
1925 MemInsns.push_back(&MI);
1926 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
1927 << "keep looking.\n");
1928 continue;
1929 }
1930 Flags.setRenameReg(*RenameReg);
1931 }
1932
1933 Flags.setMergeForward(false);
1934 if (!SameLoadReg)
1935 Flags.clearRenameReg();
1936 return MBBI;
1937 }
1938
1939 // Likewise, if the Rt of the first instruction is not modified or used
1940 // between the two instructions and none of the instructions between the
1941 // first and the second alias with the first, we can combine the first
1942 // into the second.
1943 RtNotModified = !(
1944 MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));
1945
1946 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
1947 << "Reg '" << getLdStRegOp(FirstMI)
1948 << "' not modified: "
1949 << (RtNotModified ? "true" : "false") << "\n");
1950
1951 if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
1952 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1953 Flags.setMergeForward(true);
1954 Flags.clearRenameReg();
1955 return MBBI;
1956 }
1957
1958 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
1959 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
1960 RequiredClasses, TRI);
1961 if (RenameReg) {
1962 Flags.setMergeForward(true);
1963 Flags.setRenameReg(*RenameReg);
1964 MBBIWithRenameReg = MBBI;
1965 }
1966 }
1967 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
1968 << "interference in between, keep looking.\n");
1969 }
1970 }
1971
1972 if (Flags.getRenameReg())
1973 return MBBIWithRenameReg;
1974
1975 // The instruction wasn't a matching load or store. Stop searching if we
1976 // encounter a call instruction that might modify memory.
1977 if (MI.isCall()) {
1978 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
1979 return E;
1980 }
1981
1982 // Update modified / used register units.
1983 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1984
1985 // Otherwise, if the base register is modified, we have no match, so
1986 // return early.
1987 if (!ModifiedRegUnits.available(BaseReg)) {
1988 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
1989 return E;
1990 }
1991
1992 // Update list of instructions that read/write memory.
1993 if (MI.mayLoadOrStore())
1994 MemInsns.push_back(&MI);
1995 }
1996 return E;
1997 }
1998
1999 static MachineBasicBlock::iterator
2000 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2001 auto End = MI.getParent()->end();
2002 if (MaybeCFI == End ||
2003 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2004 !(MI.getFlag(MachineInstr::FrameSetup) ||
2005 MI.getFlag(MachineInstr::FrameDestroy)) ||
2006 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
2007 return End;
2008
2009 const MachineFunction &MF = *MI.getParent()->getParent();
2010 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
2011 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2012 switch (CFI.getOperation()) {
2013 case MCCFIInstruction::OpDefCfa:
2014 case MCCFIInstruction::OpDefCfaOffset:
2015 return MaybeCFI;
2016 default:
2017 return End;
2018 }
2019 }
2020
2021 MachineBasicBlock::iterator
2022 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
2023 MachineBasicBlock::iterator Update,
2024 bool IsPreIdx) {
2025 assert((Update->getOpcode() == AArch64::ADDXri ||
2026 Update->getOpcode() == AArch64::SUBXri) &&
2027 "Unexpected base register update instruction to merge!");
2028 MachineBasicBlock::iterator E = I->getParent()->end();
2029 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
2030
2031 // If updating the SP and the following instruction is a CFA-offset-related
2032 // CFI instruction, move it after the merged instruction.
2033 MachineBasicBlock::iterator CFI =
2034 IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
2035
2036 // Return the instruction following the merged instruction, which is
2037 // the instruction following our unmerged load, unless that's the add/sub
2038 // instruction we're merging, in which case it's the one after that.
2039 if (NextI == Update)
2040 NextI = next_nodbg(NextI, E);
2041
2042 int Value = Update->getOperand(2).getImm();
2043 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2044 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2045 if (Update->getOpcode() == AArch64::SUBXri)
2046 Value = -Value;
2047
2048 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
2049 : getPostIndexedOpcode(I->getOpcode());
2050 MachineInstrBuilder MIB;
2051 int Scale, MinOffset, MaxOffset;
2052 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
2053 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
2054 // Non-paired instruction.
2055 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2056 .add(getLdStRegOp(*Update))
2057 .add(getLdStRegOp(*I))
2058 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2059 .addImm(Value / Scale)
2060 .setMemRefs(I->memoperands())
2061 .setMIFlags(I->mergeFlagsWith(*Update));
2062 } else {
2063 // Paired instruction.
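// For example (illustrative), merging
//   stp x0, x1, [sp]
//   add sp, sp, #16
// yields the post-indexed form
//   stp x0, x1, [sp], #16
// where the base register update becomes the extra first operand (the
// writeback def) added below.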
2064 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 2065 .add(getLdStRegOp(*Update)) 2066 .add(getLdStRegOp(*I, 0)) 2067 .add(getLdStRegOp(*I, 1)) 2068 .add(AArch64InstrInfo::getLdStBaseOp(*I)) 2069 .addImm(Value / Scale) 2070 .setMemRefs(I->memoperands()) 2071 .setMIFlags(I->mergeFlagsWith(*Update)); 2072 } 2073 if (CFI != E) { 2074 MachineBasicBlock *MBB = I->getParent(); 2075 MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI); 2076 } 2077 2078 if (IsPreIdx) { 2079 ++NumPreFolded; 2080 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store."); 2081 } else { 2082 ++NumPostFolded; 2083 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store."); 2084 } 2085 LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); 2086 LLVM_DEBUG(I->print(dbgs())); 2087 LLVM_DEBUG(dbgs() << " "); 2088 LLVM_DEBUG(Update->print(dbgs())); 2089 LLVM_DEBUG(dbgs() << " with instruction:\n "); 2090 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 2091 LLVM_DEBUG(dbgs() << "\n"); 2092 2093 // Erase the old instructions for the block. 2094 I->eraseFromParent(); 2095 Update->eraseFromParent(); 2096 2097 return NextI; 2098 } 2099 2100 MachineBasicBlock::iterator 2101 AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, 2102 MachineBasicBlock::iterator Update, 2103 unsigned Offset, int Scale) { 2104 assert((Update->getOpcode() == AArch64::MOVKWi) && 2105 "Unexpected const mov instruction to merge!"); 2106 MachineBasicBlock::iterator E = I->getParent()->end(); 2107 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 2108 MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); 2109 MachineInstr &MemMI = *I; 2110 unsigned Mask = (1 << 12) * Scale - 1; 2111 unsigned Low = Offset & Mask; 2112 unsigned High = Offset - Low; 2113 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); 2114 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); 2115 MachineInstrBuilder AddMIB, MemMIB; 2116 2117 // Add IndexReg, BaseReg, High (the BaseReg may be SP) 2118 AddMIB = 2119 BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) 2120 .addDef(IndexReg) 2121 .addUse(BaseReg) 2122 .addImm(High >> 12) // shifted value 2123 .addImm(12); // shift 12 2124 (void)AddMIB; 2125 // Ld/St DestReg, IndexReg, Imm12 2126 unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); 2127 MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 2128 .add(getLdStRegOp(MemMI)) 2129 .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) 2130 .addImm(Low / Scale) 2131 .setMemRefs(I->memoperands()) 2132 .setMIFlags(I->mergeFlagsWith(*Update)); 2133 (void)MemMIB; 2134 2135 ++NumConstOffsetFolded; 2136 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); 2137 LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); 2138 LLVM_DEBUG(PrevI->print(dbgs())); 2139 LLVM_DEBUG(dbgs() << " "); 2140 LLVM_DEBUG(Update->print(dbgs())); 2141 LLVM_DEBUG(dbgs() << " "); 2142 LLVM_DEBUG(I->print(dbgs())); 2143 LLVM_DEBUG(dbgs() << " with instruction:\n "); 2144 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); 2145 LLVM_DEBUG(dbgs() << " "); 2146 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); 2147 LLVM_DEBUG(dbgs() << "\n"); 2148 2149 // Erase the old instructions for the block. 
2150 I->eraseFromParent(); 2151 PrevI->eraseFromParent(); 2152 Update->eraseFromParent(); 2153 2154 return NextI; 2155 } 2156 2157 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, 2158 MachineInstr &MI, 2159 unsigned BaseReg, int Offset) { 2160 switch (MI.getOpcode()) { 2161 default: 2162 break; 2163 case AArch64::SUBXri: 2164 case AArch64::ADDXri: 2165 // Make sure it's a vanilla immediate operand, not a relocation or 2166 // anything else we can't handle. 2167 if (!MI.getOperand(2).isImm()) 2168 break; 2169 // Watch out for 1 << 12 shifted value. 2170 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) 2171 break; 2172 2173 // The update instruction source and destination register must be the 2174 // same as the load/store base register. 2175 if (MI.getOperand(0).getReg() != BaseReg || 2176 MI.getOperand(1).getReg() != BaseReg) 2177 break; 2178 2179 int UpdateOffset = MI.getOperand(2).getImm(); 2180 if (MI.getOpcode() == AArch64::SUBXri) 2181 UpdateOffset = -UpdateOffset; 2182 2183 // The immediate must be a multiple of the scaling factor of the pre/post 2184 // indexed instruction. 2185 int Scale, MinOffset, MaxOffset; 2186 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); 2187 if (UpdateOffset % Scale != 0) 2188 break; 2189 2190 // Scaled offset must fit in the instruction immediate. 2191 int ScaledOffset = UpdateOffset / Scale; 2192 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) 2193 break; 2194 2195 // If we have a non-zero Offset, we check that it matches the amount 2196 // we're adding to the register. 2197 if (!Offset || Offset == UpdateOffset) 2198 return true; 2199 break; 2200 } 2201 return false; 2202 } 2203 2204 bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, 2205 MachineInstr &MI, 2206 unsigned IndexReg, 2207 unsigned &Offset) { 2208 // The update instruction source and destination register must be the 2209 // same as the load/store index register. 2210 if (MI.getOpcode() == AArch64::MOVKWi && 2211 TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { 2212 2213 // movz + movk hold a large offset of a Ld/St instruction. 2214 MachineBasicBlock::iterator B = MI.getParent()->begin(); 2215 MachineBasicBlock::iterator MBBI = &MI; 2216 MBBI = prev_nodbg(MBBI, B); 2217 MachineInstr &MovzMI = *MBBI; 2218 if (MovzMI.getOpcode() == AArch64::MOVZWi) { 2219 unsigned Low = MovzMI.getOperand(1).getImm(); 2220 unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); 2221 Offset = High + Low; 2222 // 12-bit optionally shifted immediates are legal for adds. 2223 return Offset >> 24 == 0; 2224 } 2225 } 2226 return false; 2227 } 2228 2229 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 2230 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { 2231 MachineBasicBlock::iterator E = I->getParent()->end(); 2232 MachineInstr &MemMI = *I; 2233 MachineBasicBlock::iterator MBBI = I; 2234 2235 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); 2236 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * 2237 TII->getMemScale(MemMI); 2238 2239 // Scan forward looking for post-index opportunities. Updating instructions 2240 // can't be formed if the memory instruction doesn't have the offset we're 2241 // looking for. 2242 if (MIUnscaledOffset != UnscaledOffset) 2243 return E; 2244 2245 // If the base register overlaps a source/destination register, we can't 2246 // merge the update. 
This does not apply to tag store instructions, which
2247 // ignore the address part of the source register.
2248 // Nor does it apply to STGPi which, unlike normal stores, has no
2249 // unpredictable behavior in this case and always performs the writeback
2250 // after reading the source register value.
2251 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2252 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2253 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2254 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2255 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2256 return E;
2257 }
2258 }
2259
2260 // Track which register units have been modified and used between the first
2261 // insn (inclusive) and the second insn.
2262 ModifiedRegUnits.clear();
2263 UsedRegUnits.clear();
2264 MBBI = next_nodbg(MBBI, E);
2265
2266 // We can't post-increment the stack pointer if any instruction between
2267 // the memory access (I) and the increment (MBBI) can access the memory
2268 // region defined by [SP, MBBI].
2269 const bool BaseRegSP = BaseReg == AArch64::SP;
2270 if (BaseRegSP && needsWinCFI(I->getMF())) {
2271 // FIXME: For now, we always block the optimization over SP on Windows
2272 // targets, as it requires adjusting the unwind/debug info; messing up
2273 // the unwind info can actually cause a miscompile.
2274 return E;
2275 }
2276
2277 for (unsigned Count = 0; MBBI != E && Count < Limit;
2278 MBBI = next_nodbg(MBBI, E)) {
2279 MachineInstr &MI = *MBBI;
2280
2281 // Don't count transient instructions towards the search limit since there
2282 // may be different numbers of them if e.g. debug information is present.
2283 if (!MI.isTransient())
2284 ++Count;
2285
2286 // If we found a match, return it.
2287 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
2288 return MBBI;
2289
2290 // Update the status of what the instruction clobbered and used.
2291 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2292
2293 // Otherwise, if the base register is used or modified, we have no match, so
2294 // return early.
2295 // If we are optimizing SP, do not allow instructions that may load or store
2296 // in between the load and the optimized value update.
2297 if (!ModifiedRegUnits.available(BaseReg) ||
2298 !UsedRegUnits.available(BaseReg) ||
2299 (BaseRegSP && MBBI->mayLoadOrStore()))
2300 return E;
2301 }
2302 return E;
2303 }
2304
2305 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2306 MachineBasicBlock::iterator I, unsigned Limit) {
2307 MachineBasicBlock::iterator B = I->getParent()->begin();
2308 MachineBasicBlock::iterator E = I->getParent()->end();
2309 MachineInstr &MemMI = *I;
2310 MachineBasicBlock::iterator MBBI = I;
2311 MachineFunction &MF = *MemMI.getMF();
2312
2313 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2314 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2315
2316 // If the load/store is the first instruction in the block, there's obviously
2317 // not any matching update. Ditto if the memory offset isn't zero.
2318 if (MBBI == B || Offset != 0)
2319 return E;
2320 // If the base register overlaps a destination register, we can't
2321 // merge the update.
2322 if (!isTagStore(MemMI)) {
2323 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2324 for (unsigned i = 0, e = IsPairedInsn ?
2 : 1; i != e; ++i) {
2325 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2326 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2327 return E;
2328 }
2329 }
2330
2331 const bool BaseRegSP = BaseReg == AArch64::SP;
2332 if (BaseRegSP && needsWinCFI(I->getMF())) {
2333 // FIXME: For now, we always block the optimization over SP on Windows
2334 // targets, as it requires adjusting the unwind/debug info; messing up
2335 // the unwind info can actually cause a miscompile.
2336 return E;
2337 }
2338
2339 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2340 unsigned RedZoneSize =
2341 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2342
2343 // Track which register units have been modified and used between the first
2344 // insn (inclusive) and the second insn.
2345 ModifiedRegUnits.clear();
2346 UsedRegUnits.clear();
2347 unsigned Count = 0;
2348 bool MemAccessBeforeSPPreInc = false;
2349 do {
2350 MBBI = prev_nodbg(MBBI, B);
2351 MachineInstr &MI = *MBBI;
2352
2353 // Don't count transient instructions towards the search limit since there
2354 // may be different numbers of them if e.g. debug information is present.
2355 if (!MI.isTransient())
2356 ++Count;
2357
2358 // If we found a match, return it.
2359 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2360 // Check that the update value is within our red zone limit (which may be
2361 // zero).
2362 if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2363 return E;
2364 return MBBI;
2365 }
2366
2367 // Update the status of what the instruction clobbered and used.
2368 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2369
2370 // Otherwise, if the base register is used or modified, we have no match, so
2371 // return early.
2372 if (!ModifiedRegUnits.available(BaseReg) ||
2373 !UsedRegUnits.available(BaseReg))
2374 return E;
2375 // Keep track of whether we have a memory access before an SP pre-increment;
2376 // in that case we need to validate later that the update amount respects
2377 // the red zone.
2378 if (BaseRegSP && MBBI->mayLoadOrStore())
2379 MemAccessBeforeSPPreInc = true;
2380 } while (MBBI != B && Count < Limit);
2381 return E;
2382 }
2383
2384 MachineBasicBlock::iterator
2385 AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2386 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2387 MachineBasicBlock::iterator B = I->getParent()->begin();
2388 MachineBasicBlock::iterator E = I->getParent()->end();
2389 MachineInstr &MemMI = *I;
2390 MachineBasicBlock::iterator MBBI = I;
2391
2392 // If the load is the first instruction in the block, there's obviously
2393 // not any matching load or store.
2394 if (MBBI == B)
2395 return E;
2396
2397 // Make sure the IndexReg is killed and the shift amount is zero.
2398 // TODO: Relax this restriction to allow an extend; keep processing simple for now.
2399 if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
2400 !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
2401 (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
2402 return E;
2403
2404 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
2405
2406 // Track which register units have been modified and used between the first
2407 // insn (inclusive) and the second insn.
2408 ModifiedRegUnits.clear(); 2409 UsedRegUnits.clear(); 2410 unsigned Count = 0; 2411 do { 2412 MBBI = prev_nodbg(MBBI, B); 2413 MachineInstr &MI = *MBBI; 2414 2415 // Don't count transient instructions towards the search limit since there 2416 // may be different numbers of them if e.g. debug information is present. 2417 if (!MI.isTransient()) 2418 ++Count; 2419 2420 // If we found a match, return it. 2421 if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { 2422 return MBBI; 2423 } 2424 2425 // Update the status of what the instruction clobbered and used. 2426 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 2427 2428 // Otherwise, if the index register is used or modified, we have no match, 2429 // so return early. 2430 if (!ModifiedRegUnits.available(IndexReg) || 2431 !UsedRegUnits.available(IndexReg)) 2432 return E; 2433 2434 } while (MBBI != B && Count < Limit); 2435 return E; 2436 } 2437 2438 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( 2439 MachineBasicBlock::iterator &MBBI) { 2440 MachineInstr &MI = *MBBI; 2441 // If this is a volatile load, don't mess with it. 2442 if (MI.hasOrderedMemoryRef()) 2443 return false; 2444 2445 if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy)) 2446 return false; 2447 2448 // Make sure this is a reg+imm. 2449 // FIXME: It is possible to extend it to handle reg+reg cases. 2450 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) 2451 return false; 2452 2453 // Look backward up to LdStLimit instructions. 2454 MachineBasicBlock::iterator StoreI; 2455 if (findMatchingStore(MBBI, LdStLimit, StoreI)) { 2456 ++NumLoadsFromStoresPromoted; 2457 // Promote the load. Keeping the iterator straight is a 2458 // pain, so we let the merge routine tell us what the next instruction 2459 // is after it's done mucking about. 2460 MBBI = promoteLoadFromStore(MBBI, StoreI); 2461 return true; 2462 } 2463 return false; 2464 } 2465 2466 // Merge adjacent zero stores into a wider store. 2467 bool AArch64LoadStoreOpt::tryToMergeZeroStInst( 2468 MachineBasicBlock::iterator &MBBI) { 2469 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store."); 2470 MachineInstr &MI = *MBBI; 2471 MachineBasicBlock::iterator E = MI.getParent()->end(); 2472 2473 if (!TII->isCandidateToMergeOrPair(MI)) 2474 return false; 2475 2476 // Look ahead up to LdStLimit instructions for a mergable instruction. 2477 LdStPairFlags Flags; 2478 MachineBasicBlock::iterator MergeMI = 2479 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); 2480 if (MergeMI != E) { 2481 ++NumZeroStoresPromoted; 2482 2483 // Keeping the iterator straight is a pain, so we let the merge routine tell 2484 // us what the next instruction is after it's done mucking about. 2485 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags); 2486 return true; 2487 } 2488 return false; 2489 } 2490 2491 // Find loads and stores that can be merged into a single load or store pair 2492 // instruction. 2493 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { 2494 MachineInstr &MI = *MBBI; 2495 MachineBasicBlock::iterator E = MI.getParent()->end(); 2496 2497 if (!TII->isCandidateToMergeOrPair(MI)) 2498 return false; 2499 2500 // If disable-ldp feature is opted, do not emit ldp. 2501 if (MI.mayLoad() && Subtarget->hasDisableLdp()) 2502 return false; 2503 2504 // If disable-stp feature is opted, do not emit stp. 
2505 if (MI.mayStore() && Subtarget->hasDisableStp()) 2506 return false; 2507 2508 // Early exit if the offset is not possible to match. (6 bits of positive 2509 // range, plus allow an extra one in case we find a later insn that matches 2510 // with Offset-1) 2511 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI); 2512 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); 2513 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1; 2514 // Allow one more for offset. 2515 if (Offset > 0) 2516 Offset -= OffsetStride; 2517 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) 2518 return false; 2519 2520 // Look ahead up to LdStLimit instructions for a pairable instruction. 2521 LdStPairFlags Flags; 2522 MachineBasicBlock::iterator Paired = 2523 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); 2524 if (Paired != E) { 2525 ++NumPairCreated; 2526 if (TII->hasUnscaledLdStOffset(MI)) 2527 ++NumUnscaledPairCreated; 2528 // Keeping the iterator straight is a pain, so we let the merge routine tell 2529 // us what the next instruction is after it's done mucking about. 2530 auto Prev = std::prev(MBBI); 2531 2532 // Fetch the memoperand of the load/store that is a candidate for 2533 // combination. 2534 MachineMemOperand *MemOp = 2535 MI.memoperands_empty() ? nullptr : MI.memoperands().front(); 2536 2537 // Get the needed alignments to check them if 2538 // ldp-aligned-only/stp-aligned-only features are opted. 2539 uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1; 2540 uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1; 2541 2542 // If a load arrives and ldp-aligned-only feature is opted, check that the 2543 // alignment of the source pointer is at least double the alignment of the 2544 // type. 2545 if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp && 2546 MemAlignment < 2 * TypeAlignment) 2547 return false; 2548 2549 // If a store arrives and stp-aligned-only feature is opted, check that the 2550 // alignment of the source pointer is at least double the alignment of the 2551 // type. 2552 if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp && 2553 MemAlignment < 2 * TypeAlignment) 2554 return false; 2555 2556 MBBI = mergePairedInsns(MBBI, Paired, Flags); 2557 // Collect liveness info for instructions between Prev and the new position 2558 // MBBI. 2559 for (auto I = std::next(Prev); I != MBBI; I++) 2560 updateDefinedRegisters(*I, DefinedInBB, TRI); 2561 2562 return true; 2563 } 2564 return false; 2565 } 2566 2567 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate 2568 (MachineBasicBlock::iterator &MBBI) { 2569 MachineInstr &MI = *MBBI; 2570 MachineBasicBlock::iterator E = MI.getParent()->end(); 2571 MachineBasicBlock::iterator Update; 2572 2573 // Look forward to try to form a post-index instruction. For example, 2574 // ldr x0, [x20] 2575 // add x20, x20, #32 2576 // merged into: 2577 // ldr x0, [x20], #32 2578 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); 2579 if (Update != E) { 2580 // Merge the update into the ld/st. 2581 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); 2582 return true; 2583 } 2584 2585 // Don't know how to handle unscaled pre/post-index versions below, so bail. 2586 if (TII->hasUnscaledLdStOffset(MI.getOpcode())) 2587 return false; 2588 2589 // Look back to try to find a pre-index instruction. For example, 2590 // add x0, x0, #8 2591 // ldr x1, [x0] 2592 // merged into: 2593 // ldr x1, [x0, #8]! 
2594 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2595 if (Update != E) {
2596 // Merge the update into the ld/st.
2597 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2598 return true;
2599 }
2600
2601 // The immediate in the load/store is scaled by the size of the memory
2602 // operation. The immediate in the add we're looking for,
2603 // however, is not, so adjust here.
2604 int UnscaledOffset =
2605 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2606
2607 // Look forward to try to find a pre-index instruction. For example,
2608 // ldr x1, [x0, #64]
2609 // add x0, x0, #64
2610 // merged into:
2611 // ldr x1, [x0, #64]!
2612 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2613 if (Update != E) {
2614 // Merge the update into the ld/st.
2615 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2616 return true;
2617 }
2618
2619 return false;
2620 }
2621
2622 bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2623 int Scale) {
2624 MachineInstr &MI = *MBBI;
2625 MachineBasicBlock::iterator E = MI.getParent()->end();
2626 MachineBasicBlock::iterator Update;
2627
2628 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2629 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2630 return false;
2631
2632 // Look back to try to find a const offset for an index LdSt instruction. For
2633 // example,
2634 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2635 // ldr x1, [x0, x8]
2636 // merged into:
2637 // add x8, x0, a * (1<<12)
2638 // ldr x1, [x8, imm12]
2639 unsigned Offset;
2640 Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
2641 if (Update != E && (Offset & (Scale - 1)) == 0) {
2642 // Merge the imm12 into the ld/st.
2643 MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
2644 return true;
2645 }
2646
2647 return false;
2648 }
2649
2650 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2651 bool EnableNarrowZeroStOpt) {
2652
2653 bool Modified = false;
2654 // Five transformations to do here:
2655 // 1) Find loads that directly read from stores and promote them by
2656 // replacing with mov instructions. If the store is wider than the load,
2657 // the load will be replaced with a bitfield extract.
2658 // e.g.,
2659 // str w1, [x0, #4]
2660 // ldrh w2, [x0, #6]
2661 // ; becomes
2662 // str w1, [x0, #4]
2663 // lsr w2, w1, #16
2664 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2665 MBBI != E;) {
2666 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2667 Modified = true;
2668 else
2669 ++MBBI;
2670 }
2671 // 2) Merge adjacent zero stores into a wider store.
2672 // e.g.,
2673 // strh wzr, [x0]
2674 // strh wzr, [x0, #2]
2675 // ; becomes
2676 // str wzr, [x0]
2677 // e.g.,
2678 // str wzr, [x0]
2679 // str wzr, [x0, #4]
2680 // ; becomes
2681 // str xzr, [x0]
2682 if (EnableNarrowZeroStOpt)
2683 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2684 MBBI != E;) {
2685 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2686 Modified = true;
2687 else
2688 ++MBBI;
2689 }
2690 // 3) Find loads and stores that can be merged into a single load or store
2691 // pair instruction.
2692 // e.g.,
2693 // ldr x0, [x2]
2694 // ldr x1, [x2, #8]
2695 // ; becomes
2696 // ldp x0, x1, [x2]
2697
2698 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2699 DefinedInBB.clear();
2700 DefinedInBB.addLiveIns(MBB);
2701 }
2702
2703 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2704 MBBI != E;) {
2705 // Track currently live registers up to this point, to help with
2706 // searching for a rename register on demand.
2707 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2708 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2709 Modified = true;
2710 else
2711 ++MBBI;
2712 }
2713 // 4) Find base register updates that can be merged into the load or store
2714 // as a base-reg writeback.
2715 // e.g.,
2716 // ldr x0, [x2]
2717 // add x2, x2, #4
2718 // ; becomes
2719 // ldr x0, [x2], #4
2720 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2721 MBBI != E;) {
2722 if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2723 Modified = true;
2724 else
2725 ++MBBI;
2726 }
2727
2728 // 5) Find a register assigned with a const value that can be combined
2729 // into the load or store. e.g.,
2730 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2731 // ldr x1, [x0, x8]
2732 // ; becomes
2733 // add x8, x0, a * (1<<12)
2734 // ldr x1, [x8, imm12]
2735 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2736 MBBI != E;) {
2737 int Scale;
2738 if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
2739 Modified = true;
2740 else
2741 ++MBBI;
2742 }
2743
2744 return Modified;
2745 }
2746
2747 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2748 if (skipFunction(Fn.getFunction()))
2749 return false;
2750
2751 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2752 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2753 TRI = Subtarget->getRegisterInfo();
2754 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2755
2756 // Resize the modified and used register unit trackers. We do this once
2757 // per function and then clear the register units each time we optimize a load
2758 // or store.
2759 ModifiedRegUnits.init(*TRI);
2760 UsedRegUnits.init(*TRI);
2761 DefinedInBB.init(*TRI);
2762
2763 bool Modified = false;
2764 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2765 for (auto &MBB : Fn) {
2766 auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2767 Modified |= M;
2768 }
2769
2770 return Modified;
2771 }
2772
2773 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2774 // stores near one another? Note: The pre-RA instruction scheduler already has
2775 // hooks to try and schedule pairable loads/stores together to improve pairing
2776 // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
2777
2778 // FIXME: When pairing store instructions it's very possible for this pass to
2779 // hoist a store with a KILL marker above another use (without a KILL marker).
2780 // The resulting IR is invalid, but nothing uses the KILL markers after this
2781 // pass, so it's never caused a problem in practice.
2782
2783 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2784 /// load / store optimization pass.
2785 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2786 return new AArch64LoadStoreOpt();
2787 }
2788