//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset, and then promotes that
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because the load of the
//   constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputation seems inefficient. This currently only
//   matches one pair, recomputes live intervals, and moves on to the next
//   pair. It would be better to compute a list of all merges that need to
//   occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together, we can add to the base pointer
//   and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
91 }; 92 93 struct AddressRegs { 94 unsigned char NumVAddrs = 0; 95 bool SBase = false; 96 bool SRsrc = false; 97 bool SOffset = false; 98 bool SAddr = false; 99 bool VAddr = false; 100 bool Addr = false; 101 bool SSamp = false; 102 }; 103 104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 105 const unsigned MaxAddressRegs = 12 + 1 + 1; 106 107 class SILoadStoreOptimizer : public MachineFunctionPass { 108 struct CombineInfo { 109 MachineBasicBlock::iterator I; 110 unsigned EltSize; 111 unsigned Offset; 112 unsigned Width; 113 unsigned Format; 114 unsigned BaseOff; 115 unsigned DMask; 116 InstClassEnum InstClass; 117 unsigned CPol = 0; 118 bool IsAGPR; 119 bool UseST64; 120 int AddrIdx[MaxAddressRegs]; 121 const MachineOperand *AddrReg[MaxAddressRegs]; 122 unsigned NumAddresses; 123 unsigned Order; 124 125 bool hasSameBaseAddress(const CombineInfo &CI) { 126 if (NumAddresses != CI.NumAddresses) 127 return false; 128 129 const MachineInstr &MI = *CI.I; 130 for (unsigned i = 0; i < NumAddresses; i++) { 131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 132 133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 135 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 136 return false; 137 } 138 continue; 139 } 140 141 // Check same base pointer. Be careful of subregisters, which can occur 142 // with vectors of pointers. 143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 145 return false; 146 } 147 } 148 return true; 149 } 150 151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 152 for (unsigned i = 0; i < NumAddresses; ++i) { 153 const MachineOperand *AddrOp = AddrReg[i]; 154 // Immediates are always OK. 155 if (AddrOp->isImm()) 156 continue; 157 158 // Don't try to merge addresses that aren't either immediates or registers. 159 // TODO: Should be possible to merge FrameIndexes and maybe some other 160 // non-register 161 if (!AddrOp->isReg()) 162 return false; 163 164 // TODO: We should be able to merge instructions with other physical reg 165 // addresses too. 166 if (AddrOp->getReg().isPhysical() && 167 AddrOp->getReg() != AMDGPU::SGPR_NULL) 168 return false; 169 170 // If an address has only one use then there will be no other 171 // instructions with the same address, so we can't merge this one. 172 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 173 return false; 174 } 175 return true; 176 } 177 178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 179 180 // Compare by pointer order. 181 bool operator<(const CombineInfo& Other) const { 182 return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; 183 } 184 }; 185 186 struct BaseRegisters { 187 Register LoReg; 188 Register HiReg; 189 190 unsigned LoSubReg = 0; 191 unsigned HiSubReg = 0; 192 }; 193 194 struct MemAddress { 195 BaseRegisters Base; 196 int64_t Offset = 0; 197 }; 198 199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 200 201 private: 202 const GCNSubtarget *STM = nullptr; 203 const SIInstrInfo *TII = nullptr; 204 const SIRegisterInfo *TRI = nullptr; 205 MachineRegisterInfo *MRI = nullptr; 206 AliasAnalysis *AA = nullptr; 207 bool OptimizeAgain; 208 209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 210 const DenseSet<Register> &ARegUses, 211 const MachineInstr &A, const MachineInstr &B) const; 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 223 const CombineInfo &Paired); 224 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 225 226 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 227 228 unsigned read2Opcode(unsigned EltSize) const; 229 unsigned read2ST64Opcode(unsigned EltSize) const; 230 MachineBasicBlock::iterator 231 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 232 MachineBasicBlock::iterator InsertBefore); 233 234 unsigned write2Opcode(unsigned EltSize) const; 235 unsigned write2ST64Opcode(unsigned EltSize) const; 236 MachineBasicBlock::iterator 237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 238 MachineBasicBlock::iterator InsertBefore); 239 MachineBasicBlock::iterator 240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 241 MachineBasicBlock::iterator InsertBefore); 242 MachineBasicBlock::iterator 243 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 244 MachineBasicBlock::iterator InsertBefore); 245 MachineBasicBlock::iterator 246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 247 MachineBasicBlock::iterator InsertBefore); 248 MachineBasicBlock::iterator 249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 250 MachineBasicBlock::iterator InsertBefore); 251 MachineBasicBlock::iterator 252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 253 MachineBasicBlock::iterator InsertBefore); 254 MachineBasicBlock::iterator 255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 256 MachineBasicBlock::iterator InsertBefore); 257 MachineBasicBlock::iterator 258 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 259 MachineBasicBlock::iterator InsertBefore); 260 MachineBasicBlock::iterator 261 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 262 MachineBasicBlock::iterator InsertBefore); 263 264 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 265 int32_t NewOffset) const; 266 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 267 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 268 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 269 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 270 /// Promotes constant offset to the immediate by adjusting the base. It 271 /// tries to use a base from the nearby instructions that allows it to have 272 /// a 13bit constant offset which gets promoted to the immediate. 273 bool promoteConstantOffsetToImm(MachineInstr &CI, 274 MemInfoMap &Visited, 275 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 276 void addInstToMergeableList(const CombineInfo &CI, 277 std::list<std::list<CombineInfo> > &MergeableInsts) const; 278 279 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 280 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 281 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 282 std::list<std::list<CombineInfo>> &MergeableInsts) const; 283 284 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 285 const CombineInfo &Paired); 286 287 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 288 const CombineInfo &Paired); 289 290 public: 291 static char ID; 292 293 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 294 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 295 } 296 297 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 298 bool &OptimizeListAgain); 299 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 300 301 bool runOnMachineFunction(MachineFunction &MF) override; 302 303 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 304 305 void getAnalysisUsage(AnalysisUsage &AU) const override { 306 AU.setPreservesCFG(); 307 AU.addRequired<AAResultsWrapperPass>(); 308 309 MachineFunctionPass::getAnalysisUsage(AU); 310 } 311 312 MachineFunctionProperties getRequiredProperties() const override { 313 return MachineFunctionProperties() 314 .set(MachineFunctionProperties::Property::IsSSA); 315 } 316 }; 317 318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 319 const unsigned Opc = MI.getOpcode(); 320 321 if (TII.isMUBUF(Opc)) { 322 // FIXME: Handle d16 correctly 323 return AMDGPU::getMUBUFElements(Opc); 324 } 325 if (TII.isImage(MI)) { 326 uint64_t DMaskImm = 327 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 328 return llvm::popcount(DMaskImm); 329 } 330 if (TII.isMTBUF(Opc)) { 331 return AMDGPU::getMTBUFElements(Opc); 332 } 333 334 switch (Opc) { 335 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 336 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 337 case AMDGPU::S_LOAD_DWORD_IMM: 338 case AMDGPU::GLOBAL_LOAD_DWORD: 339 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 340 case AMDGPU::GLOBAL_STORE_DWORD: 341 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 342 case AMDGPU::FLAT_LOAD_DWORD: 343 case AMDGPU::FLAT_STORE_DWORD: 344 return 1; 345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 347 case AMDGPU::S_LOAD_DWORDX2_IMM: 348 case AMDGPU::GLOBAL_LOAD_DWORDX2: 349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 350 case AMDGPU::GLOBAL_STORE_DWORDX2: 351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 352 case AMDGPU::FLAT_LOAD_DWORDX2: 353 case AMDGPU::FLAT_STORE_DWORDX2: 354 return 2; 355 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 356 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 357 case AMDGPU::S_LOAD_DWORDX3_IMM: 358 case AMDGPU::GLOBAL_LOAD_DWORDX3: 359 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 360 case AMDGPU::GLOBAL_STORE_DWORDX3: 361 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 362 case AMDGPU::FLAT_LOAD_DWORDX3: 363 case AMDGPU::FLAT_STORE_DWORDX3: 364 return 3; 365 case 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 366 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 367 case AMDGPU::S_LOAD_DWORDX4_IMM: 368 case AMDGPU::GLOBAL_LOAD_DWORDX4: 369 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 370 case AMDGPU::GLOBAL_STORE_DWORDX4: 371 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 372 case AMDGPU::FLAT_LOAD_DWORDX4: 373 case AMDGPU::FLAT_STORE_DWORDX4: 374 return 4; 375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 377 case AMDGPU::S_LOAD_DWORDX8_IMM: 378 return 8; 379 case AMDGPU::DS_READ_B32: 380 case AMDGPU::DS_READ_B32_gfx9: 381 case AMDGPU::DS_WRITE_B32: 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 return 1; 384 case AMDGPU::DS_READ_B64: 385 case AMDGPU::DS_READ_B64_gfx9: 386 case AMDGPU::DS_WRITE_B64: 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return 2; 389 default: 390 return 0; 391 } 392 } 393 394 /// Maps instruction opcode to enum InstClassEnum. 395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 396 switch (Opc) { 397 default: 398 if (TII.isMUBUF(Opc)) { 399 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 400 default: 401 return UNKNOWN; 402 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: 403 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: 404 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: 405 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: 406 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 407 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 408 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 409 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 410 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: 411 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: 412 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: 413 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: 414 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: 415 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: 416 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: 417 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: 418 return BUFFER_LOAD; 419 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: 420 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: 421 case AMDGPU::BUFFER_STORE_DWORD_IDXEN: 422 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: 423 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 424 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 425 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 426 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 427 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: 428 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: 429 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: 430 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: 431 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: 432 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: 433 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: 434 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: 435 return BUFFER_STORE; 436 } 437 } 438 if (TII.isImage(Opc)) { 439 // Ignore instructions encoded without vaddr. 440 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 441 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 442 return UNKNOWN; 443 // Ignore BVH instructions 444 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 445 return UNKNOWN; 446 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
447 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 448 TII.isGather4(Opc)) 449 return UNKNOWN; 450 return MIMG; 451 } 452 if (TII.isMTBUF(Opc)) { 453 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 454 default: 455 return UNKNOWN; 456 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 457 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 458 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 459 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 460 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 461 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 462 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 463 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 464 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: 465 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: 466 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: 467 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: 468 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: 469 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: 470 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: 471 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: 472 return TBUFFER_LOAD; 473 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 474 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 475 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 476 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 477 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: 478 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: 479 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: 480 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: 481 return TBUFFER_STORE; 482 } 483 } 484 return UNKNOWN; 485 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 486 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 487 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 488 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 489 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 490 return S_BUFFER_LOAD_IMM; 491 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 492 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 493 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 494 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 495 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 496 return S_BUFFER_LOAD_SGPR_IMM; 497 case AMDGPU::S_LOAD_DWORD_IMM: 498 case AMDGPU::S_LOAD_DWORDX2_IMM: 499 case AMDGPU::S_LOAD_DWORDX3_IMM: 500 case AMDGPU::S_LOAD_DWORDX4_IMM: 501 case AMDGPU::S_LOAD_DWORDX8_IMM: 502 return S_LOAD_IMM; 503 case AMDGPU::DS_READ_B32: 504 case AMDGPU::DS_READ_B32_gfx9: 505 case AMDGPU::DS_READ_B64: 506 case AMDGPU::DS_READ_B64_gfx9: 507 return DS_READ; 508 case AMDGPU::DS_WRITE_B32: 509 case AMDGPU::DS_WRITE_B32_gfx9: 510 case AMDGPU::DS_WRITE_B64: 511 case AMDGPU::DS_WRITE_B64_gfx9: 512 return DS_WRITE; 513 case AMDGPU::GLOBAL_LOAD_DWORD: 514 case AMDGPU::GLOBAL_LOAD_DWORDX2: 515 case AMDGPU::GLOBAL_LOAD_DWORDX3: 516 case AMDGPU::GLOBAL_LOAD_DWORDX4: 517 case AMDGPU::FLAT_LOAD_DWORD: 518 case AMDGPU::FLAT_LOAD_DWORDX2: 519 case AMDGPU::FLAT_LOAD_DWORDX3: 520 case AMDGPU::FLAT_LOAD_DWORDX4: 521 return FLAT_LOAD; 522 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 523 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 524 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 525 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 526 return GLOBAL_LOAD_SADDR; 527 case AMDGPU::GLOBAL_STORE_DWORD: 528 case AMDGPU::GLOBAL_STORE_DWORDX2: 529 case AMDGPU::GLOBAL_STORE_DWORDX3: 530 case AMDGPU::GLOBAL_STORE_DWORDX4: 531 case AMDGPU::FLAT_STORE_DWORD: 532 case AMDGPU::FLAT_STORE_DWORDX2: 533 case AMDGPU::FLAT_STORE_DWORDX3: 534 case AMDGPU::FLAT_STORE_DWORDX4: 535 return FLAT_STORE; 536 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 537 case 
AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 538 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 539 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 540 return GLOBAL_STORE_SADDR; 541 } 542 } 543 544 /// Determines instruction subclass from opcode. Only instructions 545 /// of the same subclass can be merged together. The merged instruction may have 546 /// a different subclass but must have the same class. 547 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 548 switch (Opc) { 549 default: 550 if (TII.isMUBUF(Opc)) 551 return AMDGPU::getMUBUFBaseOpcode(Opc); 552 if (TII.isImage(Opc)) { 553 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 554 assert(Info); 555 return Info->BaseOpcode; 556 } 557 if (TII.isMTBUF(Opc)) 558 return AMDGPU::getMTBUFBaseOpcode(Opc); 559 return -1; 560 case AMDGPU::DS_READ_B32: 561 case AMDGPU::DS_READ_B32_gfx9: 562 case AMDGPU::DS_READ_B64: 563 case AMDGPU::DS_READ_B64_gfx9: 564 case AMDGPU::DS_WRITE_B32: 565 case AMDGPU::DS_WRITE_B32_gfx9: 566 case AMDGPU::DS_WRITE_B64: 567 case AMDGPU::DS_WRITE_B64_gfx9: 568 return Opc; 569 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 570 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 571 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 572 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 573 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 574 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 575 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 576 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 577 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 578 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 579 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 580 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 581 case AMDGPU::S_LOAD_DWORD_IMM: 582 case AMDGPU::S_LOAD_DWORDX2_IMM: 583 case AMDGPU::S_LOAD_DWORDX3_IMM: 584 case AMDGPU::S_LOAD_DWORDX4_IMM: 585 case AMDGPU::S_LOAD_DWORDX8_IMM: 586 return AMDGPU::S_LOAD_DWORD_IMM; 587 case AMDGPU::GLOBAL_LOAD_DWORD: 588 case AMDGPU::GLOBAL_LOAD_DWORDX2: 589 case AMDGPU::GLOBAL_LOAD_DWORDX3: 590 case AMDGPU::GLOBAL_LOAD_DWORDX4: 591 case AMDGPU::FLAT_LOAD_DWORD: 592 case AMDGPU::FLAT_LOAD_DWORDX2: 593 case AMDGPU::FLAT_LOAD_DWORDX3: 594 case AMDGPU::FLAT_LOAD_DWORDX4: 595 return AMDGPU::FLAT_LOAD_DWORD; 596 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 597 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 598 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 599 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 600 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 601 case AMDGPU::GLOBAL_STORE_DWORD: 602 case AMDGPU::GLOBAL_STORE_DWORDX2: 603 case AMDGPU::GLOBAL_STORE_DWORDX3: 604 case AMDGPU::GLOBAL_STORE_DWORDX4: 605 case AMDGPU::FLAT_STORE_DWORD: 606 case AMDGPU::FLAT_STORE_DWORDX2: 607 case AMDGPU::FLAT_STORE_DWORDX3: 608 case AMDGPU::FLAT_STORE_DWORDX4: 609 return AMDGPU::FLAT_STORE_DWORD; 610 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 611 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 612 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 613 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 614 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 615 } 616 } 617 618 // GLOBAL loads and stores are classified as FLAT initially. If both combined 619 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 620 // If either or both instructions are non segment specific FLAT the resulting 621 // combined operation will be FLAT, potentially promoting one of the GLOBAL 622 // operations to FLAT. 623 // For other instructions return the original unmodified class. 
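// For illustration, two cases implied by the classification above:
//  - GLOBAL_LOAD_DWORD paired with GLOBAL_LOAD_DWORD: both are classified as
//    FLAT_LOAD, and both are segment-specific global accesses, so the common
//    class becomes GLOBAL_LOAD.
//  - GLOBAL_LOAD_DWORD paired with FLAT_LOAD_DWORD: one access is plain FLAT,
//    so the common class stays FLAT_LOAD and the merged instruction will be a
//    FLAT load.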
624 InstClassEnum 625 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 626 const CombineInfo &Paired) { 627 assert(CI.InstClass == Paired.InstClass); 628 629 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 630 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 631 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; 632 633 return CI.InstClass; 634 } 635 636 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 637 AddressRegs Result; 638 639 if (TII.isMUBUF(Opc)) { 640 if (AMDGPU::getMUBUFHasVAddr(Opc)) 641 Result.VAddr = true; 642 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 643 Result.SRsrc = true; 644 if (AMDGPU::getMUBUFHasSoffset(Opc)) 645 Result.SOffset = true; 646 647 return Result; 648 } 649 650 if (TII.isImage(Opc)) { 651 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 652 if (VAddr0Idx >= 0) { 653 int RsrcName = 654 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 655 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); 656 Result.NumVAddrs = RsrcIdx - VAddr0Idx; 657 } else { 658 Result.VAddr = true; 659 } 660 Result.SRsrc = true; 661 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 662 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 663 Result.SSamp = true; 664 665 return Result; 666 } 667 if (TII.isMTBUF(Opc)) { 668 if (AMDGPU::getMTBUFHasVAddr(Opc)) 669 Result.VAddr = true; 670 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 671 Result.SRsrc = true; 672 if (AMDGPU::getMTBUFHasSoffset(Opc)) 673 Result.SOffset = true; 674 675 return Result; 676 } 677 678 switch (Opc) { 679 default: 680 return Result; 681 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 682 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 683 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 684 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 685 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 686 Result.SOffset = true; 687 [[fallthrough]]; 688 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 689 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 690 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 691 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 692 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 693 case AMDGPU::S_LOAD_DWORD_IMM: 694 case AMDGPU::S_LOAD_DWORDX2_IMM: 695 case AMDGPU::S_LOAD_DWORDX3_IMM: 696 case AMDGPU::S_LOAD_DWORDX4_IMM: 697 case AMDGPU::S_LOAD_DWORDX8_IMM: 698 Result.SBase = true; 699 return Result; 700 case AMDGPU::DS_READ_B32: 701 case AMDGPU::DS_READ_B64: 702 case AMDGPU::DS_READ_B32_gfx9: 703 case AMDGPU::DS_READ_B64_gfx9: 704 case AMDGPU::DS_WRITE_B32: 705 case AMDGPU::DS_WRITE_B64: 706 case AMDGPU::DS_WRITE_B32_gfx9: 707 case AMDGPU::DS_WRITE_B64_gfx9: 708 Result.Addr = true; 709 return Result; 710 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 711 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 712 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 713 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 714 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 715 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 716 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 717 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 718 Result.SAddr = true; 719 [[fallthrough]]; 720 case AMDGPU::GLOBAL_LOAD_DWORD: 721 case AMDGPU::GLOBAL_LOAD_DWORDX2: 722 case AMDGPU::GLOBAL_LOAD_DWORDX3: 723 case AMDGPU::GLOBAL_LOAD_DWORDX4: 724 case AMDGPU::GLOBAL_STORE_DWORD: 725 case AMDGPU::GLOBAL_STORE_DWORDX2: 726 case AMDGPU::GLOBAL_STORE_DWORDX3: 727 case AMDGPU::GLOBAL_STORE_DWORDX4: 728 case AMDGPU::FLAT_LOAD_DWORD: 729 case AMDGPU::FLAT_LOAD_DWORDX2: 730 case AMDGPU::FLAT_LOAD_DWORDX3: 731 case 
AMDGPU::FLAT_LOAD_DWORDX4: 732 case AMDGPU::FLAT_STORE_DWORD: 733 case AMDGPU::FLAT_STORE_DWORDX2: 734 case AMDGPU::FLAT_STORE_DWORDX3: 735 case AMDGPU::FLAT_STORE_DWORDX4: 736 Result.VAddr = true; 737 return Result; 738 } 739 } 740 741 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 742 const SILoadStoreOptimizer &LSO) { 743 I = MI; 744 unsigned Opc = MI->getOpcode(); 745 InstClass = getInstClass(Opc, *LSO.TII); 746 747 if (InstClass == UNKNOWN) 748 return; 749 750 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 751 752 switch (InstClass) { 753 case DS_READ: 754 EltSize = 755 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 756 : 4; 757 break; 758 case DS_WRITE: 759 EltSize = 760 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 761 : 4; 762 break; 763 case S_BUFFER_LOAD_IMM: 764 case S_BUFFER_LOAD_SGPR_IMM: 765 case S_LOAD_IMM: 766 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 767 break; 768 default: 769 EltSize = 4; 770 break; 771 } 772 773 if (InstClass == MIMG) { 774 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 775 // Offset is not considered for MIMG instructions. 776 Offset = 0; 777 } else { 778 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 779 Offset = I->getOperand(OffsetIdx).getImm(); 780 } 781 782 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 783 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 784 785 Width = getOpcodeWidth(*I, *LSO.TII); 786 787 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 788 Offset &= 0xffff; 789 } else if (InstClass != MIMG) { 790 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 791 } 792 793 AddressRegs Regs = getRegs(Opc, *LSO.TII); 794 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); 795 796 NumAddresses = 0; 797 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 798 AddrIdx[NumAddresses++] = 799 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 800 if (Regs.Addr) 801 AddrIdx[NumAddresses++] = 802 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 803 if (Regs.SBase) 804 AddrIdx[NumAddresses++] = 805 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 806 if (Regs.SRsrc) 807 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 808 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); 809 if (Regs.SOffset) 810 AddrIdx[NumAddresses++] = 811 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 812 if (Regs.SAddr) 813 AddrIdx[NumAddresses++] = 814 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 815 if (Regs.VAddr) 816 AddrIdx[NumAddresses++] = 817 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 818 if (Regs.SSamp) 819 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 820 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); 821 assert(NumAddresses <= MaxAddressRegs); 822 823 for (unsigned J = 0; J < NumAddresses; J++) 824 AddrReg[J] = &I->getOperand(AddrIdx[J]); 825 } 826 827 } // end anonymous namespace. 
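// As a worked example of the setMI logic above (the register names here are
// purely illustrative), a load such as
//   %d:vreg_64 = DS_READ_B64_gfx9 %a:vgpr_32, 136, 0, implicit $exec
// would be recorded with InstClass = DS_READ, EltSize = 8, Offset = 136
// (DS offsets are masked to 16 bits), Width = 2, CPol left at 0, and a single
// address operand (addr), so NumAddresses = 1.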
828 829 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 830 "SI Load Store Optimizer", false, false) 831 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 832 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 833 false, false) 834 835 char SILoadStoreOptimizer::ID = 0; 836 837 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 838 839 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 840 return new SILoadStoreOptimizer(); 841 } 842 843 static void addDefsUsesToList(const MachineInstr &MI, 844 DenseSet<Register> &RegDefs, 845 DenseSet<Register> &RegUses) { 846 for (const auto &Op : MI.operands()) { 847 if (!Op.isReg()) 848 continue; 849 if (Op.isDef()) 850 RegDefs.insert(Op.getReg()); 851 if (Op.readsReg()) 852 RegUses.insert(Op.getReg()); 853 } 854 } 855 856 bool SILoadStoreOptimizer::canSwapInstructions( 857 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 858 const MachineInstr &A, const MachineInstr &B) const { 859 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 860 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 861 return false; 862 for (const auto &BOp : B.operands()) { 863 if (!BOp.isReg()) 864 continue; 865 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 866 return false; 867 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 868 return false; 869 } 870 return true; 871 } 872 873 // Given that \p CI and \p Paired are adjacent memory operations produce a new 874 // MMO for the combined operation with a new access size. 875 MachineMemOperand * 876 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 877 const CombineInfo &Paired) { 878 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 879 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 880 881 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); 882 883 // A base pointer for the combined operation is the same as the leading 884 // operation's pointer. 885 if (Paired < CI) 886 std::swap(MMOa, MMOb); 887 888 MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); 889 // If merging FLAT and GLOBAL set address space to FLAT. 890 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 891 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; 892 893 MachineFunction *MF = CI.I->getMF(); 894 return MF->getMachineMemOperand(MMOa, PtrInfo, Size); 895 } 896 897 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 898 const SIInstrInfo &TII, 899 const CombineInfo &Paired) { 900 assert(CI.InstClass == MIMG); 901 902 // Ignore instructions with tfe/lwe set. 903 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 904 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 905 906 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 907 return false; 908 909 // Check other optional immediate operands for equality. 910 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 911 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 912 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 913 914 for (auto op : OperandsToMatch) { 915 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 916 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 917 return false; 918 if (Idx != -1 && 919 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 920 return false; 921 } 922 923 // Check DMask for overlaps. 
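  // For example (dmask values invented for illustration): dmasks 0x1 and 0xC
  // are combinable, since MinMask = 0x1 lies entirely below the lowest set
  // bit of MaxMask = 0xC; dmasks 0x3 and 0x6 are rejected below, since
  // MinMask = 0x3 reaches up to the lowest set bit of MaxMask = 0x6.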
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
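  // For instance (offsets invented for illustration): two S_LOAD_DWORDX2_IMM
  // loads at element offsets 4 and 6 satisfy EltOffset0 + CI.Width ==
  // EltOffset1 and can merge into an S_LOAD_DWORDX4_IMM, while a DWORD at
  // element offset 5 next to a DWORDX2 at offset 6 is rejected below because
  // no properly aligned SGPR subregister could be extracted for the second
  // result of the merged DWORDX3.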
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
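      // Sketch with invented numbers: EltOffset0 = 0x105 and EltOffset1 =
      // 0x108 give Min = 0x105, Max = 0x108, and Max - Min = 3, which fits in
      // 8 bits. mostAlignedValueInRange(0x108 - 0xff, 0x105) picks 0x100 (the
      // most aligned value in [0x9, 0x105]), so the rewritten offsets become
      // 0x5 and 0x8 relative to BaseOff = 0x100.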
1085 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1086 CI.BaseOff = BaseOff * CI.EltSize; 1087 CI.Offset = EltOffset0 - BaseOff; 1088 Paired.Offset = EltOffset1 - BaseOff; 1089 } 1090 return true; 1091 } 1092 1093 return false; 1094 } 1095 1096 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1097 const CombineInfo &CI, 1098 const CombineInfo &Paired) { 1099 const unsigned Width = (CI.Width + Paired.Width); 1100 switch (CI.InstClass) { 1101 default: 1102 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1103 case S_BUFFER_LOAD_IMM: 1104 case S_BUFFER_LOAD_SGPR_IMM: 1105 case S_LOAD_IMM: 1106 switch (Width) { 1107 default: 1108 return false; 1109 case 2: 1110 case 4: 1111 case 8: 1112 return true; 1113 case 3: 1114 return STM.hasScalarDwordx3Loads(); 1115 } 1116 } 1117 } 1118 1119 const TargetRegisterClass * 1120 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1121 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1122 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1123 } 1124 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1125 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1126 } 1127 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1128 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1129 } 1130 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1131 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1132 } 1133 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1134 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1135 } 1136 return nullptr; 1137 } 1138 1139 /// This function assumes that CI comes before Paired in a basic block. Return 1140 /// an insertion point for the merged instruction or nullptr on failure. 1141 SILoadStoreOptimizer::CombineInfo * 1142 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1143 CombineInfo &Paired) { 1144 // If another instruction has already been merged into CI, it may now be a 1145 // type that we can't do any further merging into. 1146 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1147 return nullptr; 1148 assert(CI.InstClass == Paired.InstClass); 1149 1150 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1151 getInstSubclass(Paired.I->getOpcode(), *TII)) 1152 return nullptr; 1153 1154 // Check both offsets (or masks for MIMG) can be combined and fit in the 1155 // reduced range. 1156 if (CI.InstClass == MIMG) { 1157 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1158 return nullptr; 1159 } else { 1160 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1161 return nullptr; 1162 } 1163 1164 DenseSet<Register> RegDefs; 1165 DenseSet<Register> RegUses; 1166 CombineInfo *Where; 1167 if (CI.I->mayLoad()) { 1168 // Try to hoist Paired up to CI. 1169 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1170 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1171 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1172 return nullptr; 1173 } 1174 Where = &CI; 1175 } else { 1176 // Try to sink CI down to Paired. 1177 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1178 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1179 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1180 return nullptr; 1181 } 1182 Where = &Paired; 1183 } 1184 1185 // Call offsetsCanBeCombined with modify = true so that the offsets are 1186 // correct for the new instruction. 
This should return true, because 1187 // this function should only be called on CombineInfo objects that 1188 // have already been confirmed to be mergeable. 1189 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1190 offsetsCanBeCombined(CI, *STM, Paired, true); 1191 return Where; 1192 } 1193 1194 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1195 if (STM->ldsRequiresM0Init()) 1196 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1197 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1198 } 1199 1200 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1201 if (STM->ldsRequiresM0Init()) 1202 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1203 1204 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1205 : AMDGPU::DS_READ2ST64_B64_gfx9; 1206 } 1207 1208 MachineBasicBlock::iterator 1209 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1210 MachineBasicBlock::iterator InsertBefore) { 1211 MachineBasicBlock *MBB = CI.I->getParent(); 1212 1213 // Be careful, since the addresses could be subregisters themselves in weird 1214 // cases, like vectors of pointers. 1215 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1216 1217 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1218 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1219 1220 unsigned NewOffset0 = CI.Offset; 1221 unsigned NewOffset1 = Paired.Offset; 1222 unsigned Opc = 1223 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1224 1225 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1226 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1227 1228 if (NewOffset0 > NewOffset1) { 1229 // Canonicalize the merged instruction so the smaller offset comes first. 1230 std::swap(NewOffset0, NewOffset1); 1231 std::swap(SubRegIdx0, SubRegIdx1); 1232 } 1233 1234 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1235 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1236 1237 const MCInstrDesc &Read2Desc = TII->get(Opc); 1238 1239 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1240 Register DestReg = MRI->createVirtualRegister(SuperRC); 1241 1242 DebugLoc DL = CI.I->getDebugLoc(); 1243 1244 Register BaseReg = AddrReg->getReg(); 1245 unsigned BaseSubReg = AddrReg->getSubReg(); 1246 unsigned BaseRegFlags = 0; 1247 if (CI.BaseOff) { 1248 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1249 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1250 .addImm(CI.BaseOff); 1251 1252 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1253 BaseRegFlags = RegState::Kill; 1254 1255 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1256 .addReg(ImmReg) 1257 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1258 .addImm(0); // clamp bit 1259 BaseSubReg = 0; 1260 } 1261 1262 MachineInstrBuilder Read2 = 1263 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1264 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1265 .addImm(NewOffset0) // offset0 1266 .addImm(NewOffset1) // offset1 1267 .addImm(0) // gds 1268 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1269 1270 (void)Read2; 1271 1272 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1273 1274 // Copy to the old destination registers. 
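  // In effect (register names invented for illustration), a pair such as
  //   %v0:vgpr_32 = DS_READ_B32 %a:vgpr_32, 16, 0, implicit $m0, implicit $exec
  //   %v1:vgpr_32 = DS_READ_B32 %a:vgpr_32, 32, 0, implicit $m0, implicit $exec
  // is replaced by
  //   %d:vreg_64 = DS_READ2_B32 %a:vgpr_32, 4, 8, 0, implicit $m0, implicit $exec
  // and the copies below move sub0/sub1 of %d into the original %v0 and %v1.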
1275 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1276 .add(*Dest0) // Copy to same destination including flags and sub reg. 1277 .addReg(DestReg, 0, SubRegIdx0); 1278 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1279 .add(*Dest1) 1280 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1281 1282 CI.I->eraseFromParent(); 1283 Paired.I->eraseFromParent(); 1284 1285 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1286 return Read2; 1287 } 1288 1289 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1290 if (STM->ldsRequiresM0Init()) 1291 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1292 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1293 : AMDGPU::DS_WRITE2_B64_gfx9; 1294 } 1295 1296 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1297 if (STM->ldsRequiresM0Init()) 1298 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1299 : AMDGPU::DS_WRITE2ST64_B64; 1300 1301 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1302 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1303 } 1304 1305 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1306 CombineInfo &CI, CombineInfo &Paired, 1307 MachineBasicBlock::iterator InsertBefore) { 1308 MachineBasicBlock *MBB = CI.I->getParent(); 1309 1310 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1311 // sure we preserve the subregister index and any register flags set on them. 1312 const MachineOperand *AddrReg = 1313 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1314 const MachineOperand *Data0 = 1315 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1316 const MachineOperand *Data1 = 1317 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1318 1319 unsigned NewOffset0 = CI.Offset; 1320 unsigned NewOffset1 = Paired.Offset; 1321 unsigned Opc = 1322 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1323 1324 if (NewOffset0 > NewOffset1) { 1325 // Canonicalize the merged instruction so the smaller offset comes first. 
1326 std::swap(NewOffset0, NewOffset1); 1327 std::swap(Data0, Data1); 1328 } 1329 1330 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1331 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1332 1333 const MCInstrDesc &Write2Desc = TII->get(Opc); 1334 DebugLoc DL = CI.I->getDebugLoc(); 1335 1336 Register BaseReg = AddrReg->getReg(); 1337 unsigned BaseSubReg = AddrReg->getSubReg(); 1338 unsigned BaseRegFlags = 0; 1339 if (CI.BaseOff) { 1340 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1341 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1342 .addImm(CI.BaseOff); 1343 1344 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1345 BaseRegFlags = RegState::Kill; 1346 1347 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1348 .addReg(ImmReg) 1349 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1350 .addImm(0); // clamp bit 1351 BaseSubReg = 0; 1352 } 1353 1354 MachineInstrBuilder Write2 = 1355 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1356 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1357 .add(*Data0) // data0 1358 .add(*Data1) // data1 1359 .addImm(NewOffset0) // offset0 1360 .addImm(NewOffset1) // offset1 1361 .addImm(0) // gds 1362 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1363 1364 CI.I->eraseFromParent(); 1365 Paired.I->eraseFromParent(); 1366 1367 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1368 return Write2; 1369 } 1370 1371 MachineBasicBlock::iterator 1372 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1373 MachineBasicBlock::iterator InsertBefore) { 1374 MachineBasicBlock *MBB = CI.I->getParent(); 1375 DebugLoc DL = CI.I->getDebugLoc(); 1376 const unsigned Opcode = getNewOpcode(CI, Paired); 1377 1378 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1379 1380 Register DestReg = MRI->createVirtualRegister(SuperRC); 1381 unsigned MergedDMask = CI.DMask | Paired.DMask; 1382 unsigned DMaskIdx = 1383 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1384 1385 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1386 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1387 if (I == DMaskIdx) 1388 MIB.addImm(MergedDMask); 1389 else 1390 MIB.add((*CI.I).getOperand(I)); 1391 } 1392 1393 // It shouldn't be possible to get this far if the two instructions 1394 // don't have a single memoperand, because MachineInstr::mayAlias() 1395 // will return true if this is the case. 1396 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1397 1398 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1399 1400 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1401 1402 // Copy to the old destination registers. 1403 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1404 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1405 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1406 1407 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1408 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1409 .addReg(DestReg, 0, SubRegIdx0); 1410 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1411 .add(*Dest1) 1412 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1413 1414 CI.I->eraseFromParent(); 1415 Paired.I->eraseFromParent(); 1416 return New; 1417 } 1418 1419 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1420 CombineInfo &CI, CombineInfo &Paired, 1421 MachineBasicBlock::iterator InsertBefore) { 1422 MachineBasicBlock *MBB = CI.I->getParent(); 1423 DebugLoc DL = CI.I->getDebugLoc(); 1424 const unsigned Opcode = getNewOpcode(CI, Paired); 1425 1426 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1427 1428 Register DestReg = MRI->createVirtualRegister(SuperRC); 1429 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1430 1431 // It shouldn't be possible to get this far if the two instructions 1432 // don't have a single memoperand, because MachineInstr::mayAlias() 1433 // will return true if this is the case. 1434 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1435 1436 MachineInstrBuilder New = 1437 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1438 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1439 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1440 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1441 New.addImm(MergedOffset); 1442 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1443 1444 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1445 1446 // Copy to the old destination registers. 1447 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1448 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1449 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1450 1451 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1452 .add(*Dest0) // Copy to same destination including flags and sub reg. 1453 .addReg(DestReg, 0, SubRegIdx0); 1454 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1455 .add(*Dest1) 1456 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1457 1458 CI.I->eraseFromParent(); 1459 Paired.I->eraseFromParent(); 1460 return New; 1461 } 1462 1463 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1464 CombineInfo &CI, CombineInfo &Paired, 1465 MachineBasicBlock::iterator InsertBefore) { 1466 MachineBasicBlock *MBB = CI.I->getParent(); 1467 DebugLoc DL = CI.I->getDebugLoc(); 1468 1469 const unsigned Opcode = getNewOpcode(CI, Paired); 1470 1471 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1472 1473 // Copy to the new source register. 1474 Register DestReg = MRI->createVirtualRegister(SuperRC); 1475 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1476 1477 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1478 1479 AddressRegs Regs = getRegs(Opcode, *TII); 1480 1481 if (Regs.VAddr) 1482 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1483 1484 // It shouldn't be possible to get this far if the two instructions 1485 // don't have a single memoperand, because MachineInstr::mayAlias() 1486 // will return true if this is the case. 
1487 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1488 1489 MachineInstr *New = 1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1492 .addImm(MergedOffset) // offset 1493 .addImm(CI.CPol) // cpol 1494 .addImm(0) // swz 1495 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1496 1497 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1498 1499 // Copy to the old destination registers. 1500 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1501 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1502 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1503 1504 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1505 .add(*Dest0) // Copy to same destination including flags and sub reg. 1506 .addReg(DestReg, 0, SubRegIdx0); 1507 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1508 .add(*Dest1) 1509 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1510 1511 CI.I->eraseFromParent(); 1512 Paired.I->eraseFromParent(); 1513 return New; 1514 } 1515 1516 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1517 CombineInfo &CI, CombineInfo &Paired, 1518 MachineBasicBlock::iterator InsertBefore) { 1519 MachineBasicBlock *MBB = CI.I->getParent(); 1520 DebugLoc DL = CI.I->getDebugLoc(); 1521 1522 const unsigned Opcode = getNewOpcode(CI, Paired); 1523 1524 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1525 1526 // Copy to the new source register. 1527 Register DestReg = MRI->createVirtualRegister(SuperRC); 1528 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1529 1530 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1531 1532 AddressRegs Regs = getRegs(Opcode, *TII); 1533 1534 if (Regs.VAddr) 1535 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1536 1537 unsigned JoinedFormat = 1538 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1539 1540 // It shouldn't be possible to get this far if the two instructions 1541 // don't have a single memoperand, because MachineInstr::mayAlias() 1542 // will return true if this is the case. 1543 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1544 1545 MachineInstr *New = 1546 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1547 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1548 .addImm(MergedOffset) // offset 1549 .addImm(JoinedFormat) // format 1550 .addImm(CI.CPol) // cpol 1551 .addImm(0) // swz 1552 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1553 1554 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1555 1556 // Copy to the old destination registers. 1557 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1558 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1559 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1560 1561 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1562 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1563 .addReg(DestReg, 0, SubRegIdx0); 1564 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1565 .add(*Dest1) 1566 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1567 1568 CI.I->eraseFromParent(); 1569 Paired.I->eraseFromParent(); 1570 return New; 1571 } 1572 1573 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1574 CombineInfo &CI, CombineInfo &Paired, 1575 MachineBasicBlock::iterator InsertBefore) { 1576 MachineBasicBlock *MBB = CI.I->getParent(); 1577 DebugLoc DL = CI.I->getDebugLoc(); 1578 1579 const unsigned Opcode = getNewOpcode(CI, Paired); 1580 1581 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1582 1583 // Copy to the new source register. 1584 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1585 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1586 1587 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1588 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1589 1590 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1591 .add(*Src0) 1592 .addImm(SubRegIdx0) 1593 .add(*Src1) 1594 .addImm(SubRegIdx1); 1595 1596 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1597 .addReg(SrcReg, RegState::Kill); 1598 1599 AddressRegs Regs = getRegs(Opcode, *TII); 1600 1601 if (Regs.VAddr) 1602 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1603 1604 unsigned JoinedFormat = 1605 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1606 1607 // It shouldn't be possible to get this far if the two instructions 1608 // don't have a single memoperand, because MachineInstr::mayAlias() 1609 // will return true if this is the case. 1610 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1611 1612 MachineInstr *New = 1613 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1614 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1615 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1616 .addImm(JoinedFormat) // format 1617 .addImm(CI.CPol) // cpol 1618 .addImm(0) // swz 1619 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1620 1621 CI.I->eraseFromParent(); 1622 Paired.I->eraseFromParent(); 1623 return New; 1624 } 1625 1626 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1627 CombineInfo &CI, CombineInfo &Paired, 1628 MachineBasicBlock::iterator InsertBefore) { 1629 MachineBasicBlock *MBB = CI.I->getParent(); 1630 DebugLoc DL = CI.I->getDebugLoc(); 1631 1632 const unsigned Opcode = getNewOpcode(CI, Paired); 1633 1634 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1635 Register DestReg = MRI->createVirtualRegister(SuperRC); 1636 1637 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1638 1639 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1640 MIB.add(*SAddr); 1641 1642 MachineInstr *New = 1643 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1644 .addImm(std::min(CI.Offset, Paired.Offset)) 1645 .addImm(CI.CPol) 1646 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1647 1648 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1649 1650 // Copy to the old destination registers. 
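// After the merged load defines the wide register, the original destination
// registers are recreated as subregister copies, so downstream users of the
// unmerged loads are left untouched. Roughly (an illustrative sketch with
// made-up virtual register names and operands):
//   %wide:vreg_64 = GLOBAL_LOAD_DWORDX2 %vaddr, 16, 0
//   %dst0:vgpr_32 = COPY %wide.sub0
//   %dst1:vgpr_32 = COPY killed %wide.sub1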
1651 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1652 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1653 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1654 1655 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1656 .add(*Dest0) // Copy to same destination including flags and sub reg. 1657 .addReg(DestReg, 0, SubRegIdx0); 1658 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1659 .add(*Dest1) 1660 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1661 1662 CI.I->eraseFromParent(); 1663 Paired.I->eraseFromParent(); 1664 return New; 1665 } 1666 1667 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1668 CombineInfo &CI, CombineInfo &Paired, 1669 MachineBasicBlock::iterator InsertBefore) { 1670 MachineBasicBlock *MBB = CI.I->getParent(); 1671 DebugLoc DL = CI.I->getDebugLoc(); 1672 1673 const unsigned Opcode = getNewOpcode(CI, Paired); 1674 1675 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1676 1677 // Copy to the new source register. 1678 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1679 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1680 1681 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1682 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1683 1684 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1685 .add(*Src0) 1686 .addImm(SubRegIdx0) 1687 .add(*Src1) 1688 .addImm(SubRegIdx1); 1689 1690 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1691 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1692 .addReg(SrcReg, RegState::Kill); 1693 1694 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1695 MIB.add(*SAddr); 1696 1697 MachineInstr *New = 1698 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1699 .addImm(CI.CPol) 1700 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1701 1702 CI.I->eraseFromParent(); 1703 Paired.I->eraseFromParent(); 1704 return New; 1705 } 1706 1707 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1708 const CombineInfo &Paired) { 1709 const unsigned Width = CI.Width + Paired.Width; 1710 1711 switch (getCommonInstClass(CI, Paired)) { 1712 default: 1713 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1714 // FIXME: Handle d16 correctly 1715 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1716 Width); 1717 case TBUFFER_LOAD: 1718 case TBUFFER_STORE: 1719 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1720 Width); 1721 1722 case UNKNOWN: 1723 llvm_unreachable("Unknown instruction class"); 1724 case S_BUFFER_LOAD_IMM: 1725 switch (Width) { 1726 default: 1727 return 0; 1728 case 2: 1729 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1730 case 3: 1731 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1732 case 4: 1733 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1734 case 8: 1735 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1736 } 1737 case S_BUFFER_LOAD_SGPR_IMM: 1738 switch (Width) { 1739 default: 1740 return 0; 1741 case 2: 1742 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1743 case 3: 1744 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1745 case 4: 1746 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1747 case 8: 1748 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1749 } 1750 case S_LOAD_IMM: 1751 switch (Width) { 1752 default: 1753 return 0; 1754 case 2: 1755 return AMDGPU::S_LOAD_DWORDX2_IMM; 1756 case 3: 1757 return AMDGPU::S_LOAD_DWORDX3_IMM; 1758 case 4: 
1759 return AMDGPU::S_LOAD_DWORDX4_IMM; 1760 case 8: 1761 return AMDGPU::S_LOAD_DWORDX8_IMM; 1762 } 1763 case GLOBAL_LOAD: 1764 switch (Width) { 1765 default: 1766 return 0; 1767 case 2: 1768 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1769 case 3: 1770 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1771 case 4: 1772 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1773 } 1774 case GLOBAL_LOAD_SADDR: 1775 switch (Width) { 1776 default: 1777 return 0; 1778 case 2: 1779 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1780 case 3: 1781 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1782 case 4: 1783 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1784 } 1785 case GLOBAL_STORE: 1786 switch (Width) { 1787 default: 1788 return 0; 1789 case 2: 1790 return AMDGPU::GLOBAL_STORE_DWORDX2; 1791 case 3: 1792 return AMDGPU::GLOBAL_STORE_DWORDX3; 1793 case 4: 1794 return AMDGPU::GLOBAL_STORE_DWORDX4; 1795 } 1796 case GLOBAL_STORE_SADDR: 1797 switch (Width) { 1798 default: 1799 return 0; 1800 case 2: 1801 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1802 case 3: 1803 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1804 case 4: 1805 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1806 } 1807 case FLAT_LOAD: 1808 switch (Width) { 1809 default: 1810 return 0; 1811 case 2: 1812 return AMDGPU::FLAT_LOAD_DWORDX2; 1813 case 3: 1814 return AMDGPU::FLAT_LOAD_DWORDX3; 1815 case 4: 1816 return AMDGPU::FLAT_LOAD_DWORDX4; 1817 } 1818 case FLAT_STORE: 1819 switch (Width) { 1820 default: 1821 return 0; 1822 case 2: 1823 return AMDGPU::FLAT_STORE_DWORDX2; 1824 case 3: 1825 return AMDGPU::FLAT_STORE_DWORDX3; 1826 case 4: 1827 return AMDGPU::FLAT_STORE_DWORDX4; 1828 } 1829 case MIMG: 1830 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1831 "No overlaps"); 1832 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1833 } 1834 } 1835 1836 std::pair<unsigned, unsigned> 1837 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1838 const CombineInfo &Paired) { 1839 assert((CI.InstClass != MIMG || 1840 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1841 CI.Width + Paired.Width)) && 1842 "No overlaps"); 1843 1844 unsigned Idx0; 1845 unsigned Idx1; 1846 1847 static const unsigned Idxs[5][4] = { 1848 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1849 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1850 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1851 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1852 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1853 }; 1854 1855 assert(CI.Width >= 1 && CI.Width <= 4); 1856 assert(Paired.Width >= 1 && Paired.Width <= 4); 1857 1858 if (Paired < CI) { 1859 Idx1 = Idxs[0][Paired.Width - 1]; 1860 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1861 } else { 1862 Idx0 = Idxs[0][CI.Width - 1]; 1863 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1864 } 1865 1866 return {Idx0, Idx1}; 1867 } 1868 1869 const TargetRegisterClass * 1870 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1871 const CombineInfo &Paired) { 1872 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1873 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1874 switch (CI.Width + Paired.Width) { 1875 default: 1876 return nullptr; 1877 case 2: 1878 return &AMDGPU::SReg_64_XEXECRegClass; 1879 case 3: 1880 return &AMDGPU::SGPR_96RegClass; 1881 case 4: 1882 return &AMDGPU::SGPR_128RegClass; 1883 case 8: 1884 return 
&AMDGPU::SGPR_256RegClass; 1885 case 16: 1886 return &AMDGPU::SGPR_512RegClass; 1887 } 1888 } 1889 1890 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1891 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1892 ? TRI->getAGPRClassForBitWidth(BitWidth) 1893 : TRI->getVGPRClassForBitWidth(BitWidth); 1894 } 1895 1896 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1897 CombineInfo &CI, CombineInfo &Paired, 1898 MachineBasicBlock::iterator InsertBefore) { 1899 MachineBasicBlock *MBB = CI.I->getParent(); 1900 DebugLoc DL = CI.I->getDebugLoc(); 1901 1902 const unsigned Opcode = getNewOpcode(CI, Paired); 1903 1904 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1905 1906 // Copy to the new source register. 1907 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1908 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1909 1910 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1911 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1912 1913 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1914 .add(*Src0) 1915 .addImm(SubRegIdx0) 1916 .add(*Src1) 1917 .addImm(SubRegIdx1); 1918 1919 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1920 .addReg(SrcReg, RegState::Kill); 1921 1922 AddressRegs Regs = getRegs(Opcode, *TII); 1923 1924 if (Regs.VAddr) 1925 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1926 1927 1928 // It shouldn't be possible to get this far if the two instructions 1929 // don't have a single memoperand, because MachineInstr::mayAlias() 1930 // will return true if this is the case. 1931 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1932 1933 MachineInstr *New = 1934 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1935 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1936 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1937 .addImm(CI.CPol) // cpol 1938 .addImm(0) // swz 1939 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1940 1941 CI.I->eraseFromParent(); 1942 Paired.I->eraseFromParent(); 1943 return New; 1944 } 1945 1946 MachineOperand 1947 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1948 APInt V(32, Val, true); 1949 if (TII->isInlineConstant(V)) 1950 return MachineOperand::CreateImm(Val); 1951 1952 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1953 MachineInstr *Mov = 1954 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1955 TII->get(AMDGPU::S_MOV_B32), Reg) 1956 .addImm(Val); 1957 (void)Mov; 1958 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1959 return MachineOperand::CreateReg(Reg, false); 1960 } 1961 1962 // Compute base address using Addr and return the final register. 
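// The new base is materialized right before MI as a 64-bit add of Addr.Offset
// to {Addr.Base.HiReg, Addr.Base.LoReg}; offsets that are not inline constants
// are first loaded with S_MOV_B32 by createRegOrImm(). Roughly (an
// illustrative sketch, virtual register names made up; the carry register
// class depends on the wavefront size):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, %off_lo, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 %base_hi, %off_hi, killed %carry, 0
//   %new_base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1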
1963 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1964 const MemAddress &Addr) const { 1965 MachineBasicBlock *MBB = MI.getParent(); 1966 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1967 DebugLoc DL = MI.getDebugLoc(); 1968 1969 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1970 Addr.Base.LoSubReg) && 1971 "Expected 32-bit Base-Register-Low!!"); 1972 1973 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1974 Addr.Base.HiSubReg) && 1975 "Expected 32-bit Base-Register-Hi!!"); 1976 1977 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1978 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1979 MachineOperand OffsetHi = 1980 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1981 1982 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1983 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1984 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1985 1986 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1987 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1988 MachineInstr *LoHalf = 1989 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1990 .addReg(CarryReg, RegState::Define) 1991 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1992 .add(OffsetLo) 1993 .addImm(0); // clamp bit 1994 (void)LoHalf; 1995 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1996 1997 MachineInstr *HiHalf = 1998 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1999 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 2000 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 2001 .add(OffsetHi) 2002 .addReg(CarryReg, RegState::Kill) 2003 .addImm(0); // clamp bit 2004 (void)HiHalf; 2005 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 2006 2007 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 2008 MachineInstr *FullBase = 2009 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2010 .addReg(DestSub0) 2011 .addImm(AMDGPU::sub0) 2012 .addReg(DestSub1) 2013 .addImm(AMDGPU::sub1); 2014 (void)FullBase; 2015 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 2016 2017 return FullDestReg; 2018 } 2019 2020 // Update base and offset with the NewBase and NewOffset in MI. 
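// Both operands are rewritten in place: vaddr is redirected to NewBase with
// its kill flag cleared (the new base is shared with other rewritten
// accesses), and the offset operand's immediate is replaced by NewOffset.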
2021 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 2022 Register NewBase, 2023 int32_t NewOffset) const { 2024 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 2025 Base->setReg(NewBase); 2026 Base->setIsKill(false); 2027 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 2028 } 2029 2030 std::optional<int32_t> 2031 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 2032 if (Op.isImm()) 2033 return Op.getImm(); 2034 2035 if (!Op.isReg()) 2036 return std::nullopt; 2037 2038 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 2039 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 2040 !Def->getOperand(1).isImm()) 2041 return std::nullopt; 2042 2043 return Def->getOperand(1).getImm(); 2044 } 2045 2046 // Analyze Base and extracts: 2047 // - 32bit base registers, subregisters 2048 // - 64bit constant offset 2049 // Expecting base computation as: 2050 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 2051 // %LO:vgpr_32, %c:sreg_64_xexec = 2052 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 2053 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 2054 // %Base:vreg_64 = 2055 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 2056 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 2057 MemAddress &Addr) const { 2058 if (!Base.isReg()) 2059 return; 2060 2061 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2062 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2063 || Def->getNumOperands() != 5) 2064 return; 2065 2066 MachineOperand BaseLo = Def->getOperand(1); 2067 MachineOperand BaseHi = Def->getOperand(3); 2068 if (!BaseLo.isReg() || !BaseHi.isReg()) 2069 return; 2070 2071 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2072 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2073 2074 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2075 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2076 return; 2077 2078 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2079 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2080 2081 auto Offset0P = extractConstOffset(*Src0); 2082 if (Offset0P) 2083 BaseLo = *Src1; 2084 else { 2085 if (!(Offset0P = extractConstOffset(*Src1))) 2086 return; 2087 BaseLo = *Src0; 2088 } 2089 2090 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2091 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2092 2093 if (Src0->isImm()) 2094 std::swap(Src0, Src1); 2095 2096 if (!Src1->isImm()) 2097 return; 2098 2099 uint64_t Offset1 = Src1->getImm(); 2100 BaseHi = *Src0; 2101 2102 Addr.Base.LoReg = BaseLo.getReg(); 2103 Addr.Base.HiReg = BaseHi.getReg(); 2104 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2105 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2106 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2107 } 2108 2109 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2110 MachineInstr &MI, 2111 MemInfoMap &Visited, 2112 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2113 2114 if (!(MI.mayLoad() ^ MI.mayStore())) 2115 return false; 2116 2117 // TODO: Support flat and scratch. 
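// Only global VMEM opcodes that have a SADDR form are handled here;
// AMDGPU::getGlobalSaddrOp() returns a negative value for everything else,
// which also filters out the flat and scratch cases mentioned above.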
2118 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2119 return false;
2120
2121 if (MI.mayLoad() &&
2122 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2123 return false;
2124
2125 if (AnchorList.count(&MI))
2126 return false;
2127
2128 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2129
2130 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2131 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2132 return false;
2133 }
2134
2135 // Step1: Find the base-registers and a 64bit constant offset.
2136 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2137 MemAddress MAddr;
2138 if (!Visited.contains(&MI)) {
2139 processBaseWithConstOffset(Base, MAddr);
2140 Visited[&MI] = MAddr;
2141 } else
2142 MAddr = Visited[&MI];
2143
2144 if (MAddr.Offset == 0) {
2145 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2146 " constant offsets that can be promoted.\n";);
2147 return false;
2148 }
2149
2150 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2151 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2152
2153 // Step2: Traverse through MI's basic block and find an anchor (that has the
2154 // same base-registers) with the highest 13bit distance from MI's offset.
2155 // E.g. (64bit loads)
2156 // bb:
2157 // addr1 = &a + 4096; load1 = load(addr1, 0)
2158 // addr2 = &a + 6144; load2 = load(addr2, 0)
2159 // addr3 = &a + 8192; load3 = load(addr3, 0)
2160 // addr4 = &a + 10240; load4 = load(addr4, 0)
2161 // addr5 = &a + 12288; load5 = load(addr5, 0)
2162 //
2163 // Starting from the first load, the optimization will try to find a new base
2164 // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
2165 // have a 13bit distance from &a + 4096. The heuristic considers &a + 8192
2166 // as the new base (anchor) because the maximum distance can presumably
2167 // accommodate more intermediate bases.
2168 //
2169 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2170 // (&a + 8192) for load1, load2, load4.
2171 // addr = &a + 8192
2172 // load1 = load(addr, -4096)
2173 // load2 = load(addr, -2048)
2174 // load3 = load(addr, 0)
2175 // load4 = load(addr, 2048)
2176 // addr5 = &a + 12288; load5 = load(addr5, 0)
2177 //
2178 MachineInstr *AnchorInst = nullptr;
2179 MemAddress AnchorAddr;
2180 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2181 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2182
2183 MachineBasicBlock *MBB = MI.getParent();
2184 MachineBasicBlock::iterator E = MBB->end();
2185 MachineBasicBlock::iterator MBBI = MI.getIterator();
2186 ++MBBI;
2187 const SITargetLowering *TLI =
2188 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2189
2190 for ( ; MBBI != E; ++MBBI) {
2191 MachineInstr &MINext = *MBBI;
2192 // TODO: Support finding an anchor (with the same base) from store addresses
2193 // or any other load addresses where the opcodes are different.
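// Candidate anchors are currently limited to instructions with the same
// opcode as MI whose immediate offset is still zero; a non-zero offset
// presumably means the constant part has already been promoted.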
2194 if (MINext.getOpcode() != MI.getOpcode() || 2195 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2196 continue; 2197 2198 const MachineOperand &BaseNext = 2199 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2200 MemAddress MAddrNext; 2201 if (!Visited.contains(&MINext)) { 2202 processBaseWithConstOffset(BaseNext, MAddrNext); 2203 Visited[&MINext] = MAddrNext; 2204 } else 2205 MAddrNext = Visited[&MINext]; 2206 2207 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2208 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2209 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2210 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2211 continue; 2212 2213 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset); 2214 2215 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2216 TargetLoweringBase::AddrMode AM; 2217 AM.HasBaseReg = true; 2218 AM.BaseOffs = Dist; 2219 if (TLI->isLegalGlobalAddressingMode(AM) && 2220 (uint32_t)std::abs(Dist) > MaxDist) { 2221 MaxDist = std::abs(Dist); 2222 2223 AnchorAddr = MAddrNext; 2224 AnchorInst = &MINext; 2225 } 2226 } 2227 2228 if (AnchorInst) { 2229 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2230 AnchorInst->dump()); 2231 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2232 << AnchorAddr.Offset << "\n\n"); 2233 2234 // Instead of moving up, just re-compute anchor-instruction's base address. 2235 Register Base = computeBase(MI, AnchorAddr); 2236 2237 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2238 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2239 2240 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { 2241 TargetLoweringBase::AddrMode AM; 2242 AM.HasBaseReg = true; 2243 AM.BaseOffs = OtherOffset - AnchorAddr.Offset; 2244 2245 if (TLI->isLegalGlobalAddressingMode(AM)) { 2246 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; 2247 OtherMI->dump()); 2248 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); 2249 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); 2250 } 2251 } 2252 AnchorList.insert(AnchorInst); 2253 return true; 2254 } 2255 2256 return false; 2257 } 2258 2259 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2260 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2261 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2262 if (AddrList.front().InstClass == CI.InstClass && 2263 AddrList.front().IsAGPR == CI.IsAGPR && 2264 AddrList.front().hasSameBaseAddress(CI)) { 2265 AddrList.emplace_back(CI); 2266 return; 2267 } 2268 } 2269 2270 // Base address not found, so add a new list. 2271 MergeableInsts.emplace_back(1, CI); 2272 } 2273 2274 std::pair<MachineBasicBlock::iterator, bool> 2275 SILoadStoreOptimizer::collectMergeableInsts( 2276 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2277 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2278 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2279 bool Modified = false; 2280 2281 // Sort potential mergeable instructions into lists. One list per base address. 2282 unsigned Order = 0; 2283 MachineBasicBlock::iterator BlockI = Begin; 2284 for (; BlockI != End; ++BlockI) { 2285 MachineInstr &MI = *BlockI; 2286 2287 // We run this before checking if an address is mergeable, because it can produce 2288 // better code even if the instructions aren't mergeable. 
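// Even when no loads or stores end up being merged, folding the constant
// into the immediate offset removes the per-access base computation (see
// the example in the file header).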
2289 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2290 Modified = true;
2291
2292 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2293 // barriers. The search can resume after such a barrier for separate merges.
2294 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2295 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2296
2297 // Search will resume after this instruction in a separate merge list.
2298 ++BlockI;
2299 break;
2300 }
2301
2302 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2303 if (InstClass == UNKNOWN)
2304 continue;
2305
2306 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2307 int Swizzled =
2308 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2309 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2310 continue;
2311
2312 CombineInfo CI;
2313 CI.setMI(MI, *this);
2314 CI.Order = Order++;
2315
2316 if (!CI.hasMergeableAddress(*MRI))
2317 continue;
2318
2319 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2320 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2321 // operands. However, we report that ds_write2 shall have
2322 // only VGPR data so that machine copy propagation does not
2323 // create an illegal instruction with VGPR and AGPR sources.
2324 // Consequently, if we create such an instruction the verifier
2325 // will complain.
2326 continue;
2327 }
2328
2329 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2330
2331 addInstToMergeableList(CI, MergeableInsts);
2332 }
2333
2334 // At this point we have lists of mergeable instructions.
2335 //
2336 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2337 // list try to find an instruction that can be merged with I. If an instruction
2338 // is found, it is stored in the Paired field. If no instructions are found, then
2339 // the CombineInfo object is deleted from the list.
2340
2341 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2342 E = MergeableInsts.end(); I != E;) {
2343
2344 std::list<CombineInfo> &MergeList = *I;
2345 if (MergeList.size() <= 1) {
2346 // This means we have found only one instruction with a given address
2347 // that can be merged, and we need at least 2 instructions to do a merge,
2348 // so this list can be discarded.
2349 I = MergeableInsts.erase(I);
2350 continue;
2351 }
2352
2353 // Sort the lists by offsets; this way mergeable instructions will be
2354 // adjacent to each other in the list, which will make it easier to find
2355 // matches.
2356 MergeList.sort(
2357 [] (const CombineInfo &A, const CombineInfo &B) {
2358 return A.Offset < B.Offset;
2359 });
2360 ++I;
2361 }
2362
2363 return {BlockI, Modified};
2364 }
2365
2366 // Scan through looking for adjacent LDS operations with constant offsets from
2367 // the same base register. We rely on the scheduler to do the hard work of
2368 // clustering nearby loads, and assume these are all adjacent.
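// optimizeBlock() drains the per-base-address lists built above: each list is
// handed to optimizeInstsWithSameBaseAddr(), lists that produce no merge are
// dropped, and OptimizeAgain is set when a freshly merged result might itself
// pair with another entry (e.g. an x2 result that could later combine into an
// x4).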
2369 bool SILoadStoreOptimizer::optimizeBlock( 2370 std::list<std::list<CombineInfo> > &MergeableInsts) { 2371 bool Modified = false; 2372 2373 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2374 E = MergeableInsts.end(); I != E;) { 2375 std::list<CombineInfo> &MergeList = *I; 2376 2377 bool OptimizeListAgain = false; 2378 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2379 // We weren't able to make any changes, so delete the list so we don't 2380 // process the same instructions the next time we try to optimize this 2381 // block. 2382 I = MergeableInsts.erase(I); 2383 continue; 2384 } 2385 2386 Modified = true; 2387 2388 // We made changes, but also determined that there were no more optimization 2389 // opportunities, so we don't need to reprocess the list 2390 if (!OptimizeListAgain) { 2391 I = MergeableInsts.erase(I); 2392 continue; 2393 } 2394 OptimizeAgain = true; 2395 } 2396 return Modified; 2397 } 2398 2399 bool 2400 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2401 std::list<CombineInfo> &MergeList, 2402 bool &OptimizeListAgain) { 2403 if (MergeList.empty()) 2404 return false; 2405 2406 bool Modified = false; 2407 2408 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2409 Next = std::next(I)) { 2410 2411 auto First = I; 2412 auto Second = Next; 2413 2414 if ((*First).Order > (*Second).Order) 2415 std::swap(First, Second); 2416 CombineInfo &CI = *First; 2417 CombineInfo &Paired = *Second; 2418 2419 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2420 if (!Where) { 2421 ++I; 2422 continue; 2423 } 2424 2425 Modified = true; 2426 2427 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2428 2429 MachineBasicBlock::iterator NewMI; 2430 switch (CI.InstClass) { 2431 default: 2432 llvm_unreachable("unknown InstClass"); 2433 break; 2434 case DS_READ: 2435 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2436 break; 2437 case DS_WRITE: 2438 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2439 break; 2440 case S_BUFFER_LOAD_IMM: 2441 case S_BUFFER_LOAD_SGPR_IMM: 2442 case S_LOAD_IMM: 2443 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2444 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2445 break; 2446 case BUFFER_LOAD: 2447 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2448 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2449 break; 2450 case BUFFER_STORE: 2451 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2452 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2453 break; 2454 case MIMG: 2455 NewMI = mergeImagePair(CI, Paired, Where->I); 2456 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2457 break; 2458 case TBUFFER_LOAD: 2459 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2460 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2461 break; 2462 case TBUFFER_STORE: 2463 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2464 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2465 break; 2466 case FLAT_LOAD: 2467 case GLOBAL_LOAD: 2468 case GLOBAL_LOAD_SADDR: 2469 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2470 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2471 break; 2472 case FLAT_STORE: 2473 case GLOBAL_STORE: 2474 case GLOBAL_STORE_SADDR: 2475 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2476 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2477 break; 2478 } 2479 CI.setMI(NewMI, *this); 2480 CI.Order = Where->Order; 2481 if (I == Second) 2482 I = Next; 2483 2484 MergeList.erase(Second); 2485 } 2486 2487 return Modified; 2488 } 2489 
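// Entry point. Each basic block is processed in sections that end at an
// instruction with ordered memory semantics or unmodeled side effects; within
// a section, mergeable instructions are grouped per base address and
// optimizeBlock() is re-run until no further merges are found.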
2490 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2491 if (skipFunction(MF.getFunction()))
2492 return false;
2493
2494 STM = &MF.getSubtarget<GCNSubtarget>();
2495 if (!STM->loadStoreOptEnabled())
2496 return false;
2497
2498 TII = STM->getInstrInfo();
2499 TRI = &TII->getRegisterInfo();
2500
2501 MRI = &MF.getRegInfo();
2502 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2503
2504 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2505
2506 bool Modified = false;
2507
2508 // Contains the list of instructions for which constant offsets are being
2509 // promoted to the immediate. This is tracked for an entire block at a time.
2510 SmallPtrSet<MachineInstr *, 4> AnchorList;
2511 MemInfoMap Visited;
2512
2513 for (MachineBasicBlock &MBB : MF) {
2514 MachineBasicBlock::iterator SectionEnd;
2515 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2516 I = SectionEnd) {
2517 bool CollectModified;
2518 std::list<std::list<CombineInfo>> MergeableInsts;
2519
2520 // First pass: Collect a list of all instructions we know how to merge in a
2521 // subset of the block.
2522 std::tie(SectionEnd, CollectModified) =
2523 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2524
2525 Modified |= CollectModified;
2526
2527 do {
2528 OptimizeAgain = false;
2529 Modified |= optimizeBlock(MergeableInsts);
2530 } while (OptimizeAgain);
2531 }
2532
2533 Visited.clear();
2534 AnchorList.clear();
2535 }
2536
2537 return Modified;
2538 }
2539