//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because loading the constant
//   into the data register is placed between the stores, although this is
//   arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in 8 bits, we can add to the
//   base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ?
DMask < Other.DMask : Offset < Other.Offset; 183 } 184 }; 185 186 struct BaseRegisters { 187 Register LoReg; 188 Register HiReg; 189 190 unsigned LoSubReg = 0; 191 unsigned HiSubReg = 0; 192 }; 193 194 struct MemAddress { 195 BaseRegisters Base; 196 int64_t Offset = 0; 197 }; 198 199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 200 201 private: 202 const GCNSubtarget *STM = nullptr; 203 const SIInstrInfo *TII = nullptr; 204 const SIRegisterInfo *TRI = nullptr; 205 MachineRegisterInfo *MRI = nullptr; 206 AliasAnalysis *AA = nullptr; 207 bool OptimizeAgain; 208 209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 210 const DenseSet<Register> &ARegUses, 211 const MachineInstr &A, const MachineInstr &B) const; 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 223 const CombineInfo &Paired); 224 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 225 226 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 227 228 unsigned read2Opcode(unsigned EltSize) const; 229 unsigned read2ST64Opcode(unsigned EltSize) const; 230 MachineBasicBlock::iterator 231 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 232 MachineBasicBlock::iterator InsertBefore); 233 234 unsigned write2Opcode(unsigned EltSize) const; 235 unsigned write2ST64Opcode(unsigned EltSize) const; 236 MachineBasicBlock::iterator 237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 238 MachineBasicBlock::iterator InsertBefore); 239 MachineBasicBlock::iterator 240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 241 MachineBasicBlock::iterator InsertBefore); 242 MachineBasicBlock::iterator 243 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 244 MachineBasicBlock::iterator InsertBefore); 245 MachineBasicBlock::iterator 246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 247 MachineBasicBlock::iterator InsertBefore); 248 MachineBasicBlock::iterator 249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 250 MachineBasicBlock::iterator InsertBefore); 251 MachineBasicBlock::iterator 252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 253 MachineBasicBlock::iterator InsertBefore); 254 MachineBasicBlock::iterator 255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 256 MachineBasicBlock::iterator InsertBefore); 257 MachineBasicBlock::iterator 258 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 259 MachineBasicBlock::iterator InsertBefore); 260 MachineBasicBlock::iterator 261 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 262 MachineBasicBlock::iterator InsertBefore); 263 264 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 265 int32_t NewOffset) const; 266 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 267 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 268 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 269 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 270 /// Promotes constant offset to the immediate by adjusting the base. It 271 /// tries to use a base from the nearby instructions that allows it to have 272 /// a 13bit constant offset which gets promoted to the immediate. 273 bool promoteConstantOffsetToImm(MachineInstr &CI, 274 MemInfoMap &Visited, 275 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 276 void addInstToMergeableList(const CombineInfo &CI, 277 std::list<std::list<CombineInfo> > &MergeableInsts) const; 278 279 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 280 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 281 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 282 std::list<std::list<CombineInfo>> &MergeableInsts) const; 283 284 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 285 const CombineInfo &Paired); 286 287 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 288 const CombineInfo &Paired); 289 290 public: 291 static char ID; 292 293 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 294 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 295 } 296 297 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 298 bool &OptimizeListAgain); 299 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 300 301 bool runOnMachineFunction(MachineFunction &MF) override; 302 303 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 304 305 void getAnalysisUsage(AnalysisUsage &AU) const override { 306 AU.setPreservesCFG(); 307 AU.addRequired<AAResultsWrapperPass>(); 308 309 MachineFunctionPass::getAnalysisUsage(AU); 310 } 311 312 MachineFunctionProperties getRequiredProperties() const override { 313 return MachineFunctionProperties() 314 .set(MachineFunctionProperties::Property::IsSSA); 315 } 316 }; 317 318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 319 const unsigned Opc = MI.getOpcode(); 320 321 if (TII.isMUBUF(Opc)) { 322 // FIXME: Handle d16 correctly 323 return AMDGPU::getMUBUFElements(Opc); 324 } 325 if (TII.isImage(MI)) { 326 uint64_t DMaskImm = 327 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 328 return llvm::popcount(DMaskImm); 329 } 330 if (TII.isMTBUF(Opc)) { 331 return AMDGPU::getMTBUFElements(Opc); 332 } 333 334 switch (Opc) { 335 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 336 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 337 case AMDGPU::S_LOAD_DWORD_IMM: 338 case AMDGPU::GLOBAL_LOAD_DWORD: 339 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 340 case AMDGPU::GLOBAL_STORE_DWORD: 341 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 342 case AMDGPU::FLAT_LOAD_DWORD: 343 case AMDGPU::FLAT_STORE_DWORD: 344 return 1; 345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 347 case AMDGPU::S_LOAD_DWORDX2_IMM: 348 case AMDGPU::GLOBAL_LOAD_DWORDX2: 349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 350 case AMDGPU::GLOBAL_STORE_DWORDX2: 351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 352 case AMDGPU::FLAT_LOAD_DWORDX2: 353 case AMDGPU::FLAT_STORE_DWORDX2: 354 return 2; 355 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 356 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 357 case AMDGPU::S_LOAD_DWORDX3_IMM: 358 case AMDGPU::GLOBAL_LOAD_DWORDX3: 359 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 360 case AMDGPU::GLOBAL_STORE_DWORDX3: 361 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 362 case AMDGPU::FLAT_LOAD_DWORDX3: 363 case AMDGPU::FLAT_STORE_DWORDX3: 364 return 3; 365 case 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 366 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 367 case AMDGPU::S_LOAD_DWORDX4_IMM: 368 case AMDGPU::GLOBAL_LOAD_DWORDX4: 369 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 370 case AMDGPU::GLOBAL_STORE_DWORDX4: 371 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 372 case AMDGPU::FLAT_LOAD_DWORDX4: 373 case AMDGPU::FLAT_STORE_DWORDX4: 374 return 4; 375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 377 case AMDGPU::S_LOAD_DWORDX8_IMM: 378 return 8; 379 case AMDGPU::DS_READ_B32: [[fallthrough]]; 380 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]]; 381 case AMDGPU::DS_WRITE_B32: [[fallthrough]]; 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 return 1; 384 case AMDGPU::DS_READ_B64: [[fallthrough]]; 385 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]]; 386 case AMDGPU::DS_WRITE_B64: [[fallthrough]]; 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return 2; 389 default: 390 return 0; 391 } 392 } 393 394 /// Maps instruction opcode to enum InstClassEnum. 395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 396 switch (Opc) { 397 default: 398 if (TII.isMUBUF(Opc)) { 399 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 400 default: 401 return UNKNOWN; 402 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 403 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 404 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 405 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 406 return BUFFER_LOAD; 407 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 408 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 409 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 410 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 411 return BUFFER_STORE; 412 } 413 } 414 if (TII.isImage(Opc)) { 415 // Ignore instructions encoded without vaddr. 416 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 417 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 418 return UNKNOWN; 419 // Ignore BVH instructions 420 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 421 return UNKNOWN; 422 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
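      // Only plain image loads are merged: instructions that may store
      // (image stores and atomics), instructions that do not load, and
      // gather4 variants are rejected below.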
423 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 424 TII.isGather4(Opc)) 425 return UNKNOWN; 426 return MIMG; 427 } 428 if (TII.isMTBUF(Opc)) { 429 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 430 default: 431 return UNKNOWN; 432 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 433 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 434 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 435 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 436 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 437 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 438 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 439 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 440 return TBUFFER_LOAD; 441 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 442 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 443 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 444 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 445 return TBUFFER_STORE; 446 } 447 } 448 return UNKNOWN; 449 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 450 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 451 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 452 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 453 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 454 return S_BUFFER_LOAD_IMM; 455 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 456 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 457 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 458 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 459 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 460 return S_BUFFER_LOAD_SGPR_IMM; 461 case AMDGPU::S_LOAD_DWORD_IMM: 462 case AMDGPU::S_LOAD_DWORDX2_IMM: 463 case AMDGPU::S_LOAD_DWORDX3_IMM: 464 case AMDGPU::S_LOAD_DWORDX4_IMM: 465 case AMDGPU::S_LOAD_DWORDX8_IMM: 466 return S_LOAD_IMM; 467 case AMDGPU::DS_READ_B32: 468 case AMDGPU::DS_READ_B32_gfx9: 469 case AMDGPU::DS_READ_B64: 470 case AMDGPU::DS_READ_B64_gfx9: 471 return DS_READ; 472 case AMDGPU::DS_WRITE_B32: 473 case AMDGPU::DS_WRITE_B32_gfx9: 474 case AMDGPU::DS_WRITE_B64: 475 case AMDGPU::DS_WRITE_B64_gfx9: 476 return DS_WRITE; 477 case AMDGPU::GLOBAL_LOAD_DWORD: 478 case AMDGPU::GLOBAL_LOAD_DWORDX2: 479 case AMDGPU::GLOBAL_LOAD_DWORDX3: 480 case AMDGPU::GLOBAL_LOAD_DWORDX4: 481 case AMDGPU::FLAT_LOAD_DWORD: 482 case AMDGPU::FLAT_LOAD_DWORDX2: 483 case AMDGPU::FLAT_LOAD_DWORDX3: 484 case AMDGPU::FLAT_LOAD_DWORDX4: 485 return FLAT_LOAD; 486 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 487 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 488 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 489 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 490 return GLOBAL_LOAD_SADDR; 491 case AMDGPU::GLOBAL_STORE_DWORD: 492 case AMDGPU::GLOBAL_STORE_DWORDX2: 493 case AMDGPU::GLOBAL_STORE_DWORDX3: 494 case AMDGPU::GLOBAL_STORE_DWORDX4: 495 case AMDGPU::FLAT_STORE_DWORD: 496 case AMDGPU::FLAT_STORE_DWORDX2: 497 case AMDGPU::FLAT_STORE_DWORDX3: 498 case AMDGPU::FLAT_STORE_DWORDX4: 499 return FLAT_STORE; 500 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 501 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 502 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 503 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 504 return GLOBAL_STORE_SADDR; 505 } 506 } 507 508 /// Determines instruction subclass from opcode. Only instructions 509 /// of the same subclass can be merged together. The merged instruction may have 510 /// a different subclass but must have the same class. 
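/// For example, S_LOAD_DWORD_IMM and S_LOAD_DWORDX2_IMM both map to the
/// S_LOAD_DWORD_IMM subclass and may be paired; the merged S_LOAD_DWORDX3_IMM
/// remains in class S_LOAD_IMM.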
511 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 512 switch (Opc) { 513 default: 514 if (TII.isMUBUF(Opc)) 515 return AMDGPU::getMUBUFBaseOpcode(Opc); 516 if (TII.isImage(Opc)) { 517 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 518 assert(Info); 519 return Info->BaseOpcode; 520 } 521 if (TII.isMTBUF(Opc)) 522 return AMDGPU::getMTBUFBaseOpcode(Opc); 523 return -1; 524 case AMDGPU::DS_READ_B32: 525 case AMDGPU::DS_READ_B32_gfx9: 526 case AMDGPU::DS_READ_B64: 527 case AMDGPU::DS_READ_B64_gfx9: 528 case AMDGPU::DS_WRITE_B32: 529 case AMDGPU::DS_WRITE_B32_gfx9: 530 case AMDGPU::DS_WRITE_B64: 531 case AMDGPU::DS_WRITE_B64_gfx9: 532 return Opc; 533 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 538 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 539 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 540 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 541 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 542 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 543 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 544 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 545 case AMDGPU::S_LOAD_DWORD_IMM: 546 case AMDGPU::S_LOAD_DWORDX2_IMM: 547 case AMDGPU::S_LOAD_DWORDX3_IMM: 548 case AMDGPU::S_LOAD_DWORDX4_IMM: 549 case AMDGPU::S_LOAD_DWORDX8_IMM: 550 return AMDGPU::S_LOAD_DWORD_IMM; 551 case AMDGPU::GLOBAL_LOAD_DWORD: 552 case AMDGPU::GLOBAL_LOAD_DWORDX2: 553 case AMDGPU::GLOBAL_LOAD_DWORDX3: 554 case AMDGPU::GLOBAL_LOAD_DWORDX4: 555 case AMDGPU::FLAT_LOAD_DWORD: 556 case AMDGPU::FLAT_LOAD_DWORDX2: 557 case AMDGPU::FLAT_LOAD_DWORDX3: 558 case AMDGPU::FLAT_LOAD_DWORDX4: 559 return AMDGPU::FLAT_LOAD_DWORD; 560 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 561 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 562 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 563 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 564 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 565 case AMDGPU::GLOBAL_STORE_DWORD: 566 case AMDGPU::GLOBAL_STORE_DWORDX2: 567 case AMDGPU::GLOBAL_STORE_DWORDX3: 568 case AMDGPU::GLOBAL_STORE_DWORDX4: 569 case AMDGPU::FLAT_STORE_DWORD: 570 case AMDGPU::FLAT_STORE_DWORDX2: 571 case AMDGPU::FLAT_STORE_DWORDX3: 572 case AMDGPU::FLAT_STORE_DWORDX4: 573 return AMDGPU::FLAT_STORE_DWORD; 574 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 575 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 576 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 577 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 578 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 579 } 580 } 581 582 // GLOBAL loads and stores are classified as FLAT initially. If both combined 583 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 584 // If either or both instructions are non segment specific FLAT the resulting 585 // combined operation will be FLAT, potentially promoting one of the GLOBAL 586 // operations to FLAT. 587 // For other instructions return the original unmodified class. 588 InstClassEnum 589 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 590 const CombineInfo &Paired) { 591 assert(CI.InstClass == Paired.InstClass); 592 593 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 594 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 595 return (CI.InstClass == FLAT_STORE) ? 
GLOBAL_STORE : GLOBAL_LOAD; 596 597 return CI.InstClass; 598 } 599 600 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 601 AddressRegs Result; 602 603 if (TII.isMUBUF(Opc)) { 604 if (AMDGPU::getMUBUFHasVAddr(Opc)) 605 Result.VAddr = true; 606 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 607 Result.SRsrc = true; 608 if (AMDGPU::getMUBUFHasSoffset(Opc)) 609 Result.SOffset = true; 610 611 return Result; 612 } 613 614 if (TII.isImage(Opc)) { 615 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 616 if (VAddr0Idx >= 0) { 617 int RsrcName = 618 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 619 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); 620 Result.NumVAddrs = RsrcIdx - VAddr0Idx; 621 } else { 622 Result.VAddr = true; 623 } 624 Result.SRsrc = true; 625 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 626 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 627 Result.SSamp = true; 628 629 return Result; 630 } 631 if (TII.isMTBUF(Opc)) { 632 if (AMDGPU::getMTBUFHasVAddr(Opc)) 633 Result.VAddr = true; 634 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 635 Result.SRsrc = true; 636 if (AMDGPU::getMTBUFHasSoffset(Opc)) 637 Result.SOffset = true; 638 639 return Result; 640 } 641 642 switch (Opc) { 643 default: 644 return Result; 645 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 646 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 647 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 648 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 649 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 650 Result.SOffset = true; 651 [[fallthrough]]; 652 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 653 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 654 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 655 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 656 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 657 case AMDGPU::S_LOAD_DWORD_IMM: 658 case AMDGPU::S_LOAD_DWORDX2_IMM: 659 case AMDGPU::S_LOAD_DWORDX3_IMM: 660 case AMDGPU::S_LOAD_DWORDX4_IMM: 661 case AMDGPU::S_LOAD_DWORDX8_IMM: 662 Result.SBase = true; 663 return Result; 664 case AMDGPU::DS_READ_B32: 665 case AMDGPU::DS_READ_B64: 666 case AMDGPU::DS_READ_B32_gfx9: 667 case AMDGPU::DS_READ_B64_gfx9: 668 case AMDGPU::DS_WRITE_B32: 669 case AMDGPU::DS_WRITE_B64: 670 case AMDGPU::DS_WRITE_B32_gfx9: 671 case AMDGPU::DS_WRITE_B64_gfx9: 672 Result.Addr = true; 673 return Result; 674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 678 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 679 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 680 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 681 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 682 Result.SAddr = true; 683 [[fallthrough]]; 684 case AMDGPU::GLOBAL_LOAD_DWORD: 685 case AMDGPU::GLOBAL_LOAD_DWORDX2: 686 case AMDGPU::GLOBAL_LOAD_DWORDX3: 687 case AMDGPU::GLOBAL_LOAD_DWORDX4: 688 case AMDGPU::GLOBAL_STORE_DWORD: 689 case AMDGPU::GLOBAL_STORE_DWORDX2: 690 case AMDGPU::GLOBAL_STORE_DWORDX3: 691 case AMDGPU::GLOBAL_STORE_DWORDX4: 692 case AMDGPU::FLAT_LOAD_DWORD: 693 case AMDGPU::FLAT_LOAD_DWORDX2: 694 case AMDGPU::FLAT_LOAD_DWORDX3: 695 case AMDGPU::FLAT_LOAD_DWORDX4: 696 case AMDGPU::FLAT_STORE_DWORD: 697 case AMDGPU::FLAT_STORE_DWORDX2: 698 case AMDGPU::FLAT_STORE_DWORDX3: 699 case AMDGPU::FLAT_STORE_DWORDX4: 700 Result.VAddr = true; 701 return Result; 702 } 703 } 704 705 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 706 const SILoadStoreOptimizer &LSO) { 707 I = MI; 708 unsigned Opc 
= MI->getOpcode(); 709 InstClass = getInstClass(Opc, *LSO.TII); 710 711 if (InstClass == UNKNOWN) 712 return; 713 714 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 715 716 switch (InstClass) { 717 case DS_READ: 718 EltSize = 719 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 720 : 4; 721 break; 722 case DS_WRITE: 723 EltSize = 724 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 725 : 4; 726 break; 727 case S_BUFFER_LOAD_IMM: 728 case S_BUFFER_LOAD_SGPR_IMM: 729 case S_LOAD_IMM: 730 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 731 break; 732 default: 733 EltSize = 4; 734 break; 735 } 736 737 if (InstClass == MIMG) { 738 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 739 // Offset is not considered for MIMG instructions. 740 Offset = 0; 741 } else { 742 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 743 Offset = I->getOperand(OffsetIdx).getImm(); 744 } 745 746 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 747 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 748 749 Width = getOpcodeWidth(*I, *LSO.TII); 750 751 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 752 Offset &= 0xffff; 753 } else if (InstClass != MIMG) { 754 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 755 } 756 757 AddressRegs Regs = getRegs(Opc, *LSO.TII); 758 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); 759 760 NumAddresses = 0; 761 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 762 AddrIdx[NumAddresses++] = 763 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 764 if (Regs.Addr) 765 AddrIdx[NumAddresses++] = 766 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 767 if (Regs.SBase) 768 AddrIdx[NumAddresses++] = 769 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 770 if (Regs.SRsrc) 771 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 772 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); 773 if (Regs.SOffset) 774 AddrIdx[NumAddresses++] = 775 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 776 if (Regs.SAddr) 777 AddrIdx[NumAddresses++] = 778 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 779 if (Regs.VAddr) 780 AddrIdx[NumAddresses++] = 781 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 782 if (Regs.SSamp) 783 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 784 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); 785 assert(NumAddresses <= MaxAddressRegs); 786 787 for (unsigned J = 0; J < NumAddresses; J++) 788 AddrReg[J] = &I->getOperand(AddrIdx[J]); 789 } 790 791 } // end anonymous namespace. 
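// Register the legacy pass and its alias-analysis dependency.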
792 793 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 794 "SI Load Store Optimizer", false, false) 795 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 796 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 797 false, false) 798 799 char SILoadStoreOptimizer::ID = 0; 800 801 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 802 803 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 804 return new SILoadStoreOptimizer(); 805 } 806 807 static void addDefsUsesToList(const MachineInstr &MI, 808 DenseSet<Register> &RegDefs, 809 DenseSet<Register> &RegUses) { 810 for (const auto &Op : MI.operands()) { 811 if (!Op.isReg()) 812 continue; 813 if (Op.isDef()) 814 RegDefs.insert(Op.getReg()); 815 if (Op.readsReg()) 816 RegUses.insert(Op.getReg()); 817 } 818 } 819 820 bool SILoadStoreOptimizer::canSwapInstructions( 821 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 822 const MachineInstr &A, const MachineInstr &B) const { 823 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 824 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 825 return false; 826 for (const auto &BOp : B.operands()) { 827 if (!BOp.isReg()) 828 continue; 829 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 830 return false; 831 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 832 return false; 833 } 834 return true; 835 } 836 837 // Given that \p CI and \p Paired are adjacent memory operations produce a new 838 // MMO for the combined operation with a new access size. 839 MachineMemOperand * 840 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 841 const CombineInfo &Paired) { 842 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 843 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 844 845 unsigned Size = MMOa->getSize() + MMOb->getSize(); 846 847 // A base pointer for the combined operation is the same as the leading 848 // operation's pointer. 849 if (Paired < CI) 850 std::swap(MMOa, MMOb); 851 852 MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); 853 // If merging FLAT and GLOBAL set address space to FLAT. 854 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 855 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; 856 857 MachineFunction *MF = CI.I->getMF(); 858 return MF->getMachineMemOperand(MMOa, PtrInfo, Size); 859 } 860 861 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 862 const SIInstrInfo &TII, 863 const CombineInfo &Paired) { 864 assert(CI.InstClass == MIMG); 865 866 // Ignore instructions with tfe/lwe set. 867 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 868 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 869 870 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 871 return false; 872 873 // Check other optional immediate operands for equality. 874 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 875 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 876 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 877 878 for (auto op : OperandsToMatch) { 879 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 880 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 881 return false; 882 if (Idx != -1 && 883 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 884 return false; 885 } 886 887 // Check DMask for overlaps. 
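  // The two dmasks are combinable only when every bit of the smaller mask lies
  // strictly below the lowest set bit of the larger one, e.g. 0b0011 and
  // 0b1100 can be combined, while 0b0010 and 0b0101 interleave and are
  // rejected.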
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
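  // For these the two accesses must be exactly adjacent in elements (one
  // immediately follows the other) and carry the same cache policy bits.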
981 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 982 if (EltOffset0 + CI.Width != EltOffset1 && 983 EltOffset1 + Paired.Width != EltOffset0) 984 return false; 985 if (CI.CPol != Paired.CPol) 986 return false; 987 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || 988 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { 989 // Reject cases like: 990 // dword + dwordx2 -> dwordx3 991 // dword + dwordx3 -> dwordx4 992 // If we tried to combine these cases, we would fail to extract a subreg 993 // for the result of the second load due to SGPR alignment requirements. 994 if (CI.Width != Paired.Width && 995 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) 996 return false; 997 } 998 return true; 999 } 1000 1001 // If the offset in elements doesn't fit in 8-bits, we might be able to use 1002 // the stride 64 versions. 1003 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 1004 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 1005 if (Modify) { 1006 CI.Offset = EltOffset0 / 64; 1007 Paired.Offset = EltOffset1 / 64; 1008 CI.UseST64 = true; 1009 } 1010 return true; 1011 } 1012 1013 // Check if the new offsets fit in the reduced 8-bit range. 1014 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 1015 if (Modify) { 1016 CI.Offset = EltOffset0; 1017 Paired.Offset = EltOffset1; 1018 } 1019 return true; 1020 } 1021 1022 // Try to shift base address to decrease offsets. 1023 uint32_t Min = std::min(EltOffset0, EltOffset1); 1024 uint32_t Max = std::max(EltOffset0, EltOffset1); 1025 1026 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 1027 if (((Max - Min) & ~Mask) == 0) { 1028 if (Modify) { 1029 // From the range of values we could use for BaseOff, choose the one that 1030 // is aligned to the highest power of two, to maximise the chance that 1031 // the same offset can be reused for other load/store pairs. 1032 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 1033 // Copy the low bits of the offsets, so that when we adjust them by 1034 // subtracting BaseOff they will be multiples of 64. 1035 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 1036 CI.BaseOff = BaseOff * CI.EltSize; 1037 CI.Offset = (EltOffset0 - BaseOff) / 64; 1038 Paired.Offset = (EltOffset1 - BaseOff) / 64; 1039 CI.UseST64 = true; 1040 } 1041 return true; 1042 } 1043 1044 if (isUInt<8>(Max - Min)) { 1045 if (Modify) { 1046 // From the range of values we could use for BaseOff, choose the one that 1047 // is aligned to the highest power of two, to maximise the chance that 1048 // the same offset can be reused for other load/store pairs. 
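      // E.g. with element offsets 200 and 300, BaseOff may be anywhere in
      // [300 - 255, 200] and mostAlignedValueInRange picks 128, leaving the
      // encodable 8-bit offsets 72 and 172.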
1049 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1050 CI.BaseOff = BaseOff * CI.EltSize; 1051 CI.Offset = EltOffset0 - BaseOff; 1052 Paired.Offset = EltOffset1 - BaseOff; 1053 } 1054 return true; 1055 } 1056 1057 return false; 1058 } 1059 1060 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1061 const CombineInfo &CI, 1062 const CombineInfo &Paired) { 1063 const unsigned Width = (CI.Width + Paired.Width); 1064 switch (CI.InstClass) { 1065 default: 1066 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1067 case S_BUFFER_LOAD_IMM: 1068 case S_BUFFER_LOAD_SGPR_IMM: 1069 case S_LOAD_IMM: 1070 switch (Width) { 1071 default: 1072 return false; 1073 case 2: 1074 case 4: 1075 case 8: 1076 return true; 1077 case 3: 1078 return STM.hasScalarDwordx3Loads(); 1079 } 1080 } 1081 } 1082 1083 const TargetRegisterClass * 1084 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1085 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1086 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1087 } 1088 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1089 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1090 } 1091 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1092 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1093 } 1094 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1095 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1096 } 1097 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1098 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1099 } 1100 return nullptr; 1101 } 1102 1103 /// This function assumes that CI comes before Paired in a basic block. Return 1104 /// an insertion point for the merged instruction or nullptr on failure. 1105 SILoadStoreOptimizer::CombineInfo * 1106 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1107 CombineInfo &Paired) { 1108 // If another instruction has already been merged into CI, it may now be a 1109 // type that we can't do any further merging into. 1110 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1111 return nullptr; 1112 assert(CI.InstClass == Paired.InstClass); 1113 1114 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1115 getInstSubclass(Paired.I->getOpcode(), *TII)) 1116 return nullptr; 1117 1118 // Check both offsets (or masks for MIMG) can be combined and fit in the 1119 // reduced range. 1120 if (CI.InstClass == MIMG) { 1121 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1122 return nullptr; 1123 } else { 1124 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1125 return nullptr; 1126 } 1127 1128 DenseSet<Register> RegDefs; 1129 DenseSet<Register> RegUses; 1130 CombineInfo *Where; 1131 if (CI.I->mayLoad()) { 1132 // Try to hoist Paired up to CI. 1133 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1134 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1135 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1136 return nullptr; 1137 } 1138 Where = &CI; 1139 } else { 1140 // Try to sink CI down to Paired. 1141 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1142 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1143 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1144 return nullptr; 1145 } 1146 Where = &Paired; 1147 } 1148 1149 // Call offsetsCanBeCombined with modify = true so that the offsets are 1150 // correct for the new instruction. 
This should return true, because 1151 // this function should only be called on CombineInfo objects that 1152 // have already been confirmed to be mergeable. 1153 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1154 offsetsCanBeCombined(CI, *STM, Paired, true); 1155 return Where; 1156 } 1157 1158 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1159 if (STM->ldsRequiresM0Init()) 1160 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1161 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1162 } 1163 1164 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1165 if (STM->ldsRequiresM0Init()) 1166 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1167 1168 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1169 : AMDGPU::DS_READ2ST64_B64_gfx9; 1170 } 1171 1172 MachineBasicBlock::iterator 1173 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1174 MachineBasicBlock::iterator InsertBefore) { 1175 MachineBasicBlock *MBB = CI.I->getParent(); 1176 1177 // Be careful, since the addresses could be subregisters themselves in weird 1178 // cases, like vectors of pointers. 1179 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1180 1181 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1182 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1183 1184 unsigned NewOffset0 = CI.Offset; 1185 unsigned NewOffset1 = Paired.Offset; 1186 unsigned Opc = 1187 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1188 1189 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1190 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1191 1192 if (NewOffset0 > NewOffset1) { 1193 // Canonicalize the merged instruction so the smaller offset comes first. 1194 std::swap(NewOffset0, NewOffset1); 1195 std::swap(SubRegIdx0, SubRegIdx1); 1196 } 1197 1198 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1199 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1200 1201 const MCInstrDesc &Read2Desc = TII->get(Opc); 1202 1203 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1204 Register DestReg = MRI->createVirtualRegister(SuperRC); 1205 1206 DebugLoc DL = CI.I->getDebugLoc(); 1207 1208 Register BaseReg = AddrReg->getReg(); 1209 unsigned BaseSubReg = AddrReg->getSubReg(); 1210 unsigned BaseRegFlags = 0; 1211 if (CI.BaseOff) { 1212 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1213 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1214 .addImm(CI.BaseOff); 1215 1216 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1217 BaseRegFlags = RegState::Kill; 1218 1219 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1220 .addReg(ImmReg) 1221 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1222 .addImm(0); // clamp bit 1223 BaseSubReg = 0; 1224 } 1225 1226 MachineInstrBuilder Read2 = 1227 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1228 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1229 .addImm(NewOffset0) // offset0 1230 .addImm(NewOffset1) // offset1 1231 .addImm(0) // gds 1232 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1233 1234 (void)Read2; 1235 1236 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1237 1238 // Copy to the old destination registers. 
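  // The wide read2 result is split back into the original vregs with
  // subregister copies, so later users of Dest0/Dest1 are unaffected.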
1239 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1240 .add(*Dest0) // Copy to same destination including flags and sub reg. 1241 .addReg(DestReg, 0, SubRegIdx0); 1242 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1243 .add(*Dest1) 1244 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1245 1246 CI.I->eraseFromParent(); 1247 Paired.I->eraseFromParent(); 1248 1249 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1250 return Read2; 1251 } 1252 1253 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1254 if (STM->ldsRequiresM0Init()) 1255 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1256 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1257 : AMDGPU::DS_WRITE2_B64_gfx9; 1258 } 1259 1260 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1261 if (STM->ldsRequiresM0Init()) 1262 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1263 : AMDGPU::DS_WRITE2ST64_B64; 1264 1265 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1266 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1267 } 1268 1269 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1270 CombineInfo &CI, CombineInfo &Paired, 1271 MachineBasicBlock::iterator InsertBefore) { 1272 MachineBasicBlock *MBB = CI.I->getParent(); 1273 1274 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1275 // sure we preserve the subregister index and any register flags set on them. 1276 const MachineOperand *AddrReg = 1277 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1278 const MachineOperand *Data0 = 1279 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1280 const MachineOperand *Data1 = 1281 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1282 1283 unsigned NewOffset0 = CI.Offset; 1284 unsigned NewOffset1 = Paired.Offset; 1285 unsigned Opc = 1286 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1287 1288 if (NewOffset0 > NewOffset1) { 1289 // Canonicalize the merged instruction so the smaller offset comes first. 
1290 std::swap(NewOffset0, NewOffset1); 1291 std::swap(Data0, Data1); 1292 } 1293 1294 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1295 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1296 1297 const MCInstrDesc &Write2Desc = TII->get(Opc); 1298 DebugLoc DL = CI.I->getDebugLoc(); 1299 1300 Register BaseReg = AddrReg->getReg(); 1301 unsigned BaseSubReg = AddrReg->getSubReg(); 1302 unsigned BaseRegFlags = 0; 1303 if (CI.BaseOff) { 1304 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1305 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1306 .addImm(CI.BaseOff); 1307 1308 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1309 BaseRegFlags = RegState::Kill; 1310 1311 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1312 .addReg(ImmReg) 1313 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1314 .addImm(0); // clamp bit 1315 BaseSubReg = 0; 1316 } 1317 1318 MachineInstrBuilder Write2 = 1319 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1320 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1321 .add(*Data0) // data0 1322 .add(*Data1) // data1 1323 .addImm(NewOffset0) // offset0 1324 .addImm(NewOffset1) // offset1 1325 .addImm(0) // gds 1326 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1327 1328 CI.I->eraseFromParent(); 1329 Paired.I->eraseFromParent(); 1330 1331 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1332 return Write2; 1333 } 1334 1335 MachineBasicBlock::iterator 1336 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1337 MachineBasicBlock::iterator InsertBefore) { 1338 MachineBasicBlock *MBB = CI.I->getParent(); 1339 DebugLoc DL = CI.I->getDebugLoc(); 1340 const unsigned Opcode = getNewOpcode(CI, Paired); 1341 1342 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1343 1344 Register DestReg = MRI->createVirtualRegister(SuperRC); 1345 unsigned MergedDMask = CI.DMask | Paired.DMask; 1346 unsigned DMaskIdx = 1347 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1348 1349 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1350 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1351 if (I == DMaskIdx) 1352 MIB.addImm(MergedDMask); 1353 else 1354 MIB.add((*CI.I).getOperand(I)); 1355 } 1356 1357 // It shouldn't be possible to get this far if the two instructions 1358 // don't have a single memoperand, because MachineInstr::mayAlias() 1359 // will return true if this is the case. 1360 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1361 1362 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1363 1364 unsigned SubRegIdx0, SubRegIdx1; 1365 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1366 1367 // Copy to the old destination registers. 1368 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1369 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1370 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1371 1372 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1373 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1374 .addReg(DestReg, 0, SubRegIdx0); 1375 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1376 .add(*Dest1) 1377 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1378 1379 CI.I->eraseFromParent(); 1380 Paired.I->eraseFromParent(); 1381 return New; 1382 } 1383 1384 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1385 CombineInfo &CI, CombineInfo &Paired, 1386 MachineBasicBlock::iterator InsertBefore) { 1387 MachineBasicBlock *MBB = CI.I->getParent(); 1388 DebugLoc DL = CI.I->getDebugLoc(); 1389 const unsigned Opcode = getNewOpcode(CI, Paired); 1390 1391 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1392 1393 Register DestReg = MRI->createVirtualRegister(SuperRC); 1394 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1395 1396 // It shouldn't be possible to get this far if the two instructions 1397 // don't have a single memoperand, because MachineInstr::mayAlias() 1398 // will return true if this is the case. 1399 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1400 1401 MachineInstrBuilder New = 1402 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1403 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1404 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1405 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1406 New.addImm(MergedOffset); 1407 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1408 1409 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1410 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1411 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1412 1413 // Copy to the old destination registers. 1414 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1415 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1416 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1417 1418 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1419 .add(*Dest0) // Copy to same destination including flags and sub reg. 1420 .addReg(DestReg, 0, SubRegIdx0); 1421 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1422 .add(*Dest1) 1423 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1424 1425 CI.I->eraseFromParent(); 1426 Paired.I->eraseFromParent(); 1427 return New; 1428 } 1429 1430 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1431 CombineInfo &CI, CombineInfo &Paired, 1432 MachineBasicBlock::iterator InsertBefore) { 1433 MachineBasicBlock *MBB = CI.I->getParent(); 1434 DebugLoc DL = CI.I->getDebugLoc(); 1435 1436 const unsigned Opcode = getNewOpcode(CI, Paired); 1437 1438 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1439 1440 // Copy to the new source register. 1441 Register DestReg = MRI->createVirtualRegister(SuperRC); 1442 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1443 1444 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1445 1446 AddressRegs Regs = getRegs(Opcode, *TII); 1447 1448 if (Regs.VAddr) 1449 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1450 1451 // It shouldn't be possible to get this far if the two instructions 1452 // don't have a single memoperand, because MachineInstr::mayAlias() 1453 // will return true if this is the case. 
1454 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1455 1456 MachineInstr *New = 1457 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1458 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1459 .addImm(MergedOffset) // offset 1460 .addImm(CI.CPol) // cpol 1461 .addImm(0) // swz 1462 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1463 1464 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1465 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1466 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1467 1468 // Copy to the old destination registers. 1469 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1470 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1471 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1472 1473 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1474 .add(*Dest0) // Copy to same destination including flags and sub reg. 1475 .addReg(DestReg, 0, SubRegIdx0); 1476 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1477 .add(*Dest1) 1478 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1479 1480 CI.I->eraseFromParent(); 1481 Paired.I->eraseFromParent(); 1482 return New; 1483 } 1484 1485 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1486 CombineInfo &CI, CombineInfo &Paired, 1487 MachineBasicBlock::iterator InsertBefore) { 1488 MachineBasicBlock *MBB = CI.I->getParent(); 1489 DebugLoc DL = CI.I->getDebugLoc(); 1490 1491 const unsigned Opcode = getNewOpcode(CI, Paired); 1492 1493 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1494 1495 // Copy to the new source register. 1496 Register DestReg = MRI->createVirtualRegister(SuperRC); 1497 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1498 1499 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1500 1501 AddressRegs Regs = getRegs(Opcode, *TII); 1502 1503 if (Regs.VAddr) 1504 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1505 1506 unsigned JoinedFormat = 1507 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1508 1509 // It shouldn't be possible to get this far if the two instructions 1510 // don't have a single memoperand, because MachineInstr::mayAlias() 1511 // will return true if this is the case. 1512 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1513 1514 MachineInstr *New = 1515 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1516 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1517 .addImm(MergedOffset) // offset 1518 .addImm(JoinedFormat) // format 1519 .addImm(CI.CPol) // cpol 1520 .addImm(0) // swz 1521 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1522 1523 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1524 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1525 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1526 1527 // Copy to the old destination registers. 1528 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1529 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1530 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1531 1532 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1533 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1534 .addReg(DestReg, 0, SubRegIdx0); 1535 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1536 .add(*Dest1) 1537 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1538 1539 CI.I->eraseFromParent(); 1540 Paired.I->eraseFromParent(); 1541 return New; 1542 } 1543 1544 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1545 CombineInfo &CI, CombineInfo &Paired, 1546 MachineBasicBlock::iterator InsertBefore) { 1547 MachineBasicBlock *MBB = CI.I->getParent(); 1548 DebugLoc DL = CI.I->getDebugLoc(); 1549 1550 const unsigned Opcode = getNewOpcode(CI, Paired); 1551 1552 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1553 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1554 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1555 1556 // Copy to the new source register. 1557 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1558 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1559 1560 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1561 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1562 1563 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1564 .add(*Src0) 1565 .addImm(SubRegIdx0) 1566 .add(*Src1) 1567 .addImm(SubRegIdx1); 1568 1569 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1570 .addReg(SrcReg, RegState::Kill); 1571 1572 AddressRegs Regs = getRegs(Opcode, *TII); 1573 1574 if (Regs.VAddr) 1575 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1576 1577 unsigned JoinedFormat = 1578 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1579 1580 // It shouldn't be possible to get this far if the two instructions 1581 // don't have a single memoperand, because MachineInstr::mayAlias() 1582 // will return true if this is the case. 1583 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1584 1585 MachineInstr *New = 1586 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1587 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1588 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1589 .addImm(JoinedFormat) // format 1590 .addImm(CI.CPol) // cpol 1591 .addImm(0) // swz 1592 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1593 1594 CI.I->eraseFromParent(); 1595 Paired.I->eraseFromParent(); 1596 return New; 1597 } 1598 1599 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1600 CombineInfo &CI, CombineInfo &Paired, 1601 MachineBasicBlock::iterator InsertBefore) { 1602 MachineBasicBlock *MBB = CI.I->getParent(); 1603 DebugLoc DL = CI.I->getDebugLoc(); 1604 1605 const unsigned Opcode = getNewOpcode(CI, Paired); 1606 1607 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1608 Register DestReg = MRI->createVirtualRegister(SuperRC); 1609 1610 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1611 1612 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1613 MIB.add(*SAddr); 1614 1615 MachineInstr *New = 1616 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1617 .addImm(std::min(CI.Offset, Paired.Offset)) 1618 .addImm(CI.CPol) 1619 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1620 1621 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1622 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1623 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1624 1625 // Copy to the old destination registers. 
1626 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1627 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1628 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1629 1630 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1631 .add(*Dest0) // Copy to same destination including flags and sub reg. 1632 .addReg(DestReg, 0, SubRegIdx0); 1633 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1634 .add(*Dest1) 1635 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1636 1637 CI.I->eraseFromParent(); 1638 Paired.I->eraseFromParent(); 1639 return New; 1640 } 1641 1642 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1643 CombineInfo &CI, CombineInfo &Paired, 1644 MachineBasicBlock::iterator InsertBefore) { 1645 MachineBasicBlock *MBB = CI.I->getParent(); 1646 DebugLoc DL = CI.I->getDebugLoc(); 1647 1648 const unsigned Opcode = getNewOpcode(CI, Paired); 1649 1650 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1651 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1652 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1653 1654 // Copy to the new source register. 1655 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1656 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1657 1658 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1659 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1660 1661 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1662 .add(*Src0) 1663 .addImm(SubRegIdx0) 1664 .add(*Src1) 1665 .addImm(SubRegIdx1); 1666 1667 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1668 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1669 .addReg(SrcReg, RegState::Kill); 1670 1671 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1672 MIB.add(*SAddr); 1673 1674 MachineInstr *New = 1675 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1676 .addImm(CI.CPol) 1677 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1678 1679 CI.I->eraseFromParent(); 1680 Paired.I->eraseFromParent(); 1681 return New; 1682 } 1683 1684 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1685 const CombineInfo &Paired) { 1686 const unsigned Width = CI.Width + Paired.Width; 1687 1688 switch (getCommonInstClass(CI, Paired)) { 1689 default: 1690 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1691 // FIXME: Handle d16 correctly 1692 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1693 Width); 1694 case TBUFFER_LOAD: 1695 case TBUFFER_STORE: 1696 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1697 Width); 1698 1699 case UNKNOWN: 1700 llvm_unreachable("Unknown instruction class"); 1701 case S_BUFFER_LOAD_IMM: 1702 switch (Width) { 1703 default: 1704 return 0; 1705 case 2: 1706 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1707 case 3: 1708 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1709 case 4: 1710 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1711 case 8: 1712 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1713 } 1714 case S_BUFFER_LOAD_SGPR_IMM: 1715 switch (Width) { 1716 default: 1717 return 0; 1718 case 2: 1719 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1720 case 3: 1721 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1722 case 4: 1723 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1724 case 8: 1725 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1726 } 1727 case S_LOAD_IMM: 1728 switch (Width) { 1729 default: 1730 return 
0; 1731 case 2: 1732 return AMDGPU::S_LOAD_DWORDX2_IMM; 1733 case 3: 1734 return AMDGPU::S_LOAD_DWORDX3_IMM; 1735 case 4: 1736 return AMDGPU::S_LOAD_DWORDX4_IMM; 1737 case 8: 1738 return AMDGPU::S_LOAD_DWORDX8_IMM; 1739 } 1740 case GLOBAL_LOAD: 1741 switch (Width) { 1742 default: 1743 return 0; 1744 case 2: 1745 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1746 case 3: 1747 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1748 case 4: 1749 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1750 } 1751 case GLOBAL_LOAD_SADDR: 1752 switch (Width) { 1753 default: 1754 return 0; 1755 case 2: 1756 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1757 case 3: 1758 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1759 case 4: 1760 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1761 } 1762 case GLOBAL_STORE: 1763 switch (Width) { 1764 default: 1765 return 0; 1766 case 2: 1767 return AMDGPU::GLOBAL_STORE_DWORDX2; 1768 case 3: 1769 return AMDGPU::GLOBAL_STORE_DWORDX3; 1770 case 4: 1771 return AMDGPU::GLOBAL_STORE_DWORDX4; 1772 } 1773 case GLOBAL_STORE_SADDR: 1774 switch (Width) { 1775 default: 1776 return 0; 1777 case 2: 1778 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1779 case 3: 1780 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1781 case 4: 1782 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1783 } 1784 case FLAT_LOAD: 1785 switch (Width) { 1786 default: 1787 return 0; 1788 case 2: 1789 return AMDGPU::FLAT_LOAD_DWORDX2; 1790 case 3: 1791 return AMDGPU::FLAT_LOAD_DWORDX3; 1792 case 4: 1793 return AMDGPU::FLAT_LOAD_DWORDX4; 1794 } 1795 case FLAT_STORE: 1796 switch (Width) { 1797 default: 1798 return 0; 1799 case 2: 1800 return AMDGPU::FLAT_STORE_DWORDX2; 1801 case 3: 1802 return AMDGPU::FLAT_STORE_DWORDX3; 1803 case 4: 1804 return AMDGPU::FLAT_STORE_DWORDX4; 1805 } 1806 case MIMG: 1807 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1808 "No overlaps"); 1809 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1810 } 1811 } 1812 1813 std::pair<unsigned, unsigned> 1814 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1815 const CombineInfo &Paired) { 1816 assert((CI.InstClass != MIMG || 1817 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1818 CI.Width + Paired.Width)) && 1819 "No overlaps"); 1820 1821 unsigned Idx0; 1822 unsigned Idx1; 1823 1824 static const unsigned Idxs[5][4] = { 1825 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1826 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1827 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1828 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1829 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1830 }; 1831 1832 assert(CI.Width >= 1 && CI.Width <= 4); 1833 assert(Paired.Width >= 1 && Paired.Width <= 4); 1834 1835 if (Paired < CI) { 1836 Idx1 = Idxs[0][Paired.Width - 1]; 1837 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1838 } else { 1839 Idx0 = Idxs[0][CI.Width - 1]; 1840 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1841 } 1842 1843 return std::pair(Idx0, Idx1); 1844 } 1845 1846 const TargetRegisterClass * 1847 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1848 const CombineInfo &Paired) { 1849 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1850 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1851 switch (CI.Width + Paired.Width) { 1852 default: 1853 return nullptr; 1854 case 2: 1855 return &AMDGPU::SReg_64_XEXECRegClass; 1856 case 3: 1857 
return &AMDGPU::SGPR_96RegClass; 1858 case 4: 1859 return &AMDGPU::SGPR_128RegClass; 1860 case 8: 1861 return &AMDGPU::SGPR_256RegClass; 1862 case 16: 1863 return &AMDGPU::SGPR_512RegClass; 1864 } 1865 } 1866 1867 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1868 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1869 ? TRI->getAGPRClassForBitWidth(BitWidth) 1870 : TRI->getVGPRClassForBitWidth(BitWidth); 1871 } 1872 1873 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1874 CombineInfo &CI, CombineInfo &Paired, 1875 MachineBasicBlock::iterator InsertBefore) { 1876 MachineBasicBlock *MBB = CI.I->getParent(); 1877 DebugLoc DL = CI.I->getDebugLoc(); 1878 1879 const unsigned Opcode = getNewOpcode(CI, Paired); 1880 1881 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1882 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1883 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1884 1885 // Copy to the new source register. 1886 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1887 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1888 1889 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1890 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1891 1892 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1893 .add(*Src0) 1894 .addImm(SubRegIdx0) 1895 .add(*Src1) 1896 .addImm(SubRegIdx1); 1897 1898 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1899 .addReg(SrcReg, RegState::Kill); 1900 1901 AddressRegs Regs = getRegs(Opcode, *TII); 1902 1903 if (Regs.VAddr) 1904 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1905 1906 1907 // It shouldn't be possible to get this far if the two instructions 1908 // don't have a single memoperand, because MachineInstr::mayAlias() 1909 // will return true if this is the case. 1910 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1911 1912 MachineInstr *New = 1913 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1914 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1915 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1916 .addImm(CI.CPol) // cpol 1917 .addImm(0) // swz 1918 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1919 1920 CI.I->eraseFromParent(); 1921 Paired.I->eraseFromParent(); 1922 return New; 1923 } 1924 1925 MachineOperand 1926 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1927 APInt V(32, Val, true); 1928 if (TII->isInlineConstant(V)) 1929 return MachineOperand::CreateImm(Val); 1930 1931 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1932 MachineInstr *Mov = 1933 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1934 TII->get(AMDGPU::S_MOV_B32), Reg) 1935 .addImm(Val); 1936 (void)Mov; 1937 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1938 return MachineOperand::CreateReg(Reg, false); 1939 } 1940 1941 // Compute base address using Addr and return the final register. 
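// For example (an illustrative sketch; register names are invented), with
// Addr.Offset == 0x1000 this conceptually emits:
//   %off:sgpr_32 = S_MOV_B32 4096
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Base.LoReg, %off, 0
//   %hi:vgpr_32, dead %dc = V_ADDC_U32_e64 %Base.HiReg, 0, killed %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// and returns %newbase.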
1942 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1943 const MemAddress &Addr) const { 1944 MachineBasicBlock *MBB = MI.getParent(); 1945 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1946 DebugLoc DL = MI.getDebugLoc(); 1947 1948 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1949 Addr.Base.LoSubReg) && 1950 "Expected 32-bit Base-Register-Low!!"); 1951 1952 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1953 Addr.Base.HiSubReg) && 1954 "Expected 32-bit Base-Register-Hi!!"); 1955 1956 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1957 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1958 MachineOperand OffsetHi = 1959 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1960 1961 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1962 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1963 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1964 1965 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1966 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1967 MachineInstr *LoHalf = 1968 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1969 .addReg(CarryReg, RegState::Define) 1970 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1971 .add(OffsetLo) 1972 .addImm(0); // clamp bit 1973 (void)LoHalf; 1974 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1975 1976 MachineInstr *HiHalf = 1977 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1978 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1979 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1980 .add(OffsetHi) 1981 .addReg(CarryReg, RegState::Kill) 1982 .addImm(0); // clamp bit 1983 (void)HiHalf; 1984 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1985 1986 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1987 MachineInstr *FullBase = 1988 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1989 .addReg(DestSub0) 1990 .addImm(AMDGPU::sub0) 1991 .addReg(DestSub1) 1992 .addImm(AMDGPU::sub1); 1993 (void)FullBase; 1994 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1995 1996 return FullDestReg; 1997 } 1998 1999 // Update base and offset with the NewBase and NewOffset in MI. 
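// For example (illustrative), a global load that originally addressed the old
// base with a zero immediate offset can be rewritten as
//   GLOBAL_LOAD_DWORDX2 %oldbase, 0  -->  GLOBAL_LOAD_DWORDX2 %newbase, -4096
// Only the vaddr register and the immediate offset change; the rewritten base
// is not marked killed because it may be shared by several rewritten accesses.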
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32bit base registers, subregisters
// - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (one that has
  // the same base registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the larger distance can presumably accommodate
  // more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1 through load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
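    // As a concrete sketch (illustrative numbers, mirroring the Step2/Step3
    // example above): while scanning below a global_load_dwordx2 whose base
    // decomposes to {&a, 4096}, another global_load_dwordx2 with a zero
    // immediate offset and a base of {&a, 8192} is recorded as an anchor
    // candidate, since the resulting distance of -4096 still fits the legal
    // signed offset range; instructions with a different opcode or a nonzero
    // immediate offset are skipped by the check below.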
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can still look for separate merges after the barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However, we report that ds_write2 takes only VGPR data so
      // that machine copy propagation does not create an illegal instruction
      // with mixed VGPR and AGPR sources. Consequently, if we create such an
      // instruction, the verifier will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a
      // merge, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
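// For example (illustrative), four s_buffer_load_dword from consecutive
// offsets are first merged pairwise into two s_buffer_load_dwordx2; because
// the combined width is still below the maximum, OptimizeListAgain is set and
// the next pass over the same list merges them into one s_buffer_load_dwordx4.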
2348 bool SILoadStoreOptimizer::optimizeBlock( 2349 std::list<std::list<CombineInfo> > &MergeableInsts) { 2350 bool Modified = false; 2351 2352 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2353 E = MergeableInsts.end(); I != E;) { 2354 std::list<CombineInfo> &MergeList = *I; 2355 2356 bool OptimizeListAgain = false; 2357 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2358 // We weren't able to make any changes, so delete the list so we don't 2359 // process the same instructions the next time we try to optimize this 2360 // block. 2361 I = MergeableInsts.erase(I); 2362 continue; 2363 } 2364 2365 Modified = true; 2366 2367 // We made changes, but also determined that there were no more optimization 2368 // opportunities, so we don't need to reprocess the list 2369 if (!OptimizeListAgain) { 2370 I = MergeableInsts.erase(I); 2371 continue; 2372 } 2373 OptimizeAgain = true; 2374 } 2375 return Modified; 2376 } 2377 2378 bool 2379 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2380 std::list<CombineInfo> &MergeList, 2381 bool &OptimizeListAgain) { 2382 if (MergeList.empty()) 2383 return false; 2384 2385 bool Modified = false; 2386 2387 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2388 Next = std::next(I)) { 2389 2390 auto First = I; 2391 auto Second = Next; 2392 2393 if ((*First).Order > (*Second).Order) 2394 std::swap(First, Second); 2395 CombineInfo &CI = *First; 2396 CombineInfo &Paired = *Second; 2397 2398 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2399 if (!Where) { 2400 ++I; 2401 continue; 2402 } 2403 2404 Modified = true; 2405 2406 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2407 2408 MachineBasicBlock::iterator NewMI; 2409 switch (CI.InstClass) { 2410 default: 2411 llvm_unreachable("unknown InstClass"); 2412 break; 2413 case DS_READ: 2414 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2415 break; 2416 case DS_WRITE: 2417 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2418 break; 2419 case S_BUFFER_LOAD_IMM: 2420 case S_BUFFER_LOAD_SGPR_IMM: 2421 case S_LOAD_IMM: 2422 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2423 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2424 break; 2425 case BUFFER_LOAD: 2426 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2427 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2428 break; 2429 case BUFFER_STORE: 2430 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2431 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2432 break; 2433 case MIMG: 2434 NewMI = mergeImagePair(CI, Paired, Where->I); 2435 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2436 break; 2437 case TBUFFER_LOAD: 2438 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2439 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2440 break; 2441 case TBUFFER_STORE: 2442 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2443 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2444 break; 2445 case FLAT_LOAD: 2446 case GLOBAL_LOAD: 2447 case GLOBAL_LOAD_SADDR: 2448 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2449 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2450 break; 2451 case FLAT_STORE: 2452 case GLOBAL_STORE: 2453 case GLOBAL_STORE_SADDR: 2454 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2455 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2456 break; 2457 } 2458 CI.setMI(NewMI, *this); 2459 CI.Order = Where->Order; 2460 if (I == Second) 2461 I = Next; 2462 2463 MergeList.erase(Second); 2464 } 2465 2466 return Modified; 2467 } 2468 
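// For example (illustrative), a block of the form
//   load A; load B; <atomic or other ordered access>; load C; load D
// is handled as two independent sections {A, B} and {C, D}: the ordered access
// terminates one call to collectMergeableInsts, and the loop in
// runOnMachineFunction below resumes collection immediately after it.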
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for one block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: collect a list of all instructions we know how to merge
      // in a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}