//===- SILoadStoreOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close-by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but that are close enough together to fit in 8 bits, we can add
//   to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
91 }; 92 93 struct AddressRegs { 94 unsigned char NumVAddrs = 0; 95 bool SBase = false; 96 bool SRsrc = false; 97 bool SOffset = false; 98 bool SAddr = false; 99 bool VAddr = false; 100 bool Addr = false; 101 bool SSamp = false; 102 }; 103 104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 105 const unsigned MaxAddressRegs = 12 + 1 + 1; 106 107 class SILoadStoreOptimizer : public MachineFunctionPass { 108 struct CombineInfo { 109 MachineBasicBlock::iterator I; 110 unsigned EltSize; 111 unsigned Offset; 112 unsigned Width; 113 unsigned Format; 114 unsigned BaseOff; 115 unsigned DMask; 116 InstClassEnum InstClass; 117 unsigned CPol = 0; 118 bool IsAGPR; 119 bool UseST64; 120 int AddrIdx[MaxAddressRegs]; 121 const MachineOperand *AddrReg[MaxAddressRegs]; 122 unsigned NumAddresses; 123 unsigned Order; 124 125 bool hasSameBaseAddress(const CombineInfo &CI) { 126 if (NumAddresses != CI.NumAddresses) 127 return false; 128 129 const MachineInstr &MI = *CI.I; 130 for (unsigned i = 0; i < NumAddresses; i++) { 131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 132 133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 135 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 136 return false; 137 } 138 continue; 139 } 140 141 // Check same base pointer. Be careful of subregisters, which can occur 142 // with vectors of pointers. 143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 145 return false; 146 } 147 } 148 return true; 149 } 150 151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 152 for (unsigned i = 0; i < NumAddresses; ++i) { 153 const MachineOperand *AddrOp = AddrReg[i]; 154 // Immediates are always OK. 155 if (AddrOp->isImm()) 156 continue; 157 158 // Don't try to merge addresses that aren't either immediates or registers. 159 // TODO: Should be possible to merge FrameIndexes and maybe some other 160 // non-register 161 if (!AddrOp->isReg()) 162 return false; 163 164 // TODO: We should be able to merge instructions with other physical reg 165 // addresses too. 166 if (AddrOp->getReg().isPhysical() && 167 AddrOp->getReg() != AMDGPU::SGPR_NULL) 168 return false; 169 170 // If an address has only one use then there will be no other 171 // instructions with the same address, so we can't merge this one. 172 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 173 return false; 174 } 175 return true; 176 } 177 178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 179 180 // Compare by pointer order. 181 bool operator<(const CombineInfo& Other) const { 182 return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; 183 } 184 }; 185 186 struct BaseRegisters { 187 Register LoReg; 188 Register HiReg; 189 190 unsigned LoSubReg = 0; 191 unsigned HiSubReg = 0; 192 }; 193 194 struct MemAddress { 195 BaseRegisters Base; 196 int64_t Offset = 0; 197 }; 198 199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 200 201 private: 202 const GCNSubtarget *STM = nullptr; 203 const SIInstrInfo *TII = nullptr; 204 const SIRegisterInfo *TRI = nullptr; 205 MachineRegisterInfo *MRI = nullptr; 206 AliasAnalysis *AA = nullptr; 207 bool OptimizeAgain; 208 209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 210 const DenseSet<Register> &ARegUses, 211 const MachineInstr &A, const MachineInstr &B) const; 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass * 223 getTargetRegisterClass(const CombineInfo &CI, 224 const CombineInfo &Paired) const; 225 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 226 227 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 228 229 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, 230 MachineBasicBlock::iterator InsertBefore, int OpName, 231 Register DestReg) const; 232 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, 233 MachineBasicBlock::iterator InsertBefore, 234 int OpName) const; 235 236 unsigned read2Opcode(unsigned EltSize) const; 237 unsigned read2ST64Opcode(unsigned EltSize) const; 238 MachineBasicBlock::iterator 239 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 240 MachineBasicBlock::iterator InsertBefore); 241 242 unsigned write2Opcode(unsigned EltSize) const; 243 unsigned write2ST64Opcode(unsigned EltSize) const; 244 MachineBasicBlock::iterator 245 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 246 MachineBasicBlock::iterator InsertBefore); 247 MachineBasicBlock::iterator 248 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 249 MachineBasicBlock::iterator InsertBefore); 250 MachineBasicBlock::iterator 251 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 252 MachineBasicBlock::iterator InsertBefore); 253 MachineBasicBlock::iterator 254 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 255 MachineBasicBlock::iterator InsertBefore); 256 MachineBasicBlock::iterator 257 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 258 MachineBasicBlock::iterator InsertBefore); 259 MachineBasicBlock::iterator 260 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 261 MachineBasicBlock::iterator InsertBefore); 262 MachineBasicBlock::iterator 263 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 264 MachineBasicBlock::iterator InsertBefore); 265 MachineBasicBlock::iterator 266 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 267 MachineBasicBlock::iterator InsertBefore); 268 MachineBasicBlock::iterator 269 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 270 MachineBasicBlock::iterator InsertBefore); 271 272 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 273 int32_t NewOffset) 
const; 274 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 275 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 276 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 277 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 278 /// Promotes constant offset to the immediate by adjusting the base. It 279 /// tries to use a base from the nearby instructions that allows it to have 280 /// a 13bit constant offset which gets promoted to the immediate. 281 bool promoteConstantOffsetToImm(MachineInstr &CI, 282 MemInfoMap &Visited, 283 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 284 void addInstToMergeableList(const CombineInfo &CI, 285 std::list<std::list<CombineInfo> > &MergeableInsts) const; 286 287 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 288 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 289 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 290 std::list<std::list<CombineInfo>> &MergeableInsts) const; 291 292 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 293 const CombineInfo &Paired); 294 295 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 296 const CombineInfo &Paired); 297 298 public: 299 static char ID; 300 301 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 302 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 303 } 304 305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 306 bool &OptimizeListAgain); 307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 308 309 bool runOnMachineFunction(MachineFunction &MF) override; 310 311 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 312 313 void getAnalysisUsage(AnalysisUsage &AU) const override { 314 AU.setPreservesCFG(); 315 AU.addRequired<AAResultsWrapperPass>(); 316 317 MachineFunctionPass::getAnalysisUsage(AU); 318 } 319 320 MachineFunctionProperties getRequiredProperties() const override { 321 return MachineFunctionProperties() 322 .set(MachineFunctionProperties::Property::IsSSA); 323 } 324 }; 325 326 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 327 const unsigned Opc = MI.getOpcode(); 328 329 if (TII.isMUBUF(Opc)) { 330 // FIXME: Handle d16 correctly 331 return AMDGPU::getMUBUFElements(Opc); 332 } 333 if (TII.isImage(MI)) { 334 uint64_t DMaskImm = 335 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 336 return llvm::popcount(DMaskImm); 337 } 338 if (TII.isMTBUF(Opc)) { 339 return AMDGPU::getMTBUFElements(Opc); 340 } 341 342 switch (Opc) { 343 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 344 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 345 case AMDGPU::S_LOAD_DWORD_IMM: 346 case AMDGPU::GLOBAL_LOAD_DWORD: 347 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 348 case AMDGPU::GLOBAL_STORE_DWORD: 349 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 350 case AMDGPU::FLAT_LOAD_DWORD: 351 case AMDGPU::FLAT_STORE_DWORD: 352 return 1; 353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 355 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: 356 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: 357 case AMDGPU::S_LOAD_DWORDX2_IMM: 358 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 359 case AMDGPU::GLOBAL_LOAD_DWORDX2: 360 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 361 case AMDGPU::GLOBAL_STORE_DWORDX2: 362 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 363 case AMDGPU::FLAT_LOAD_DWORDX2: 364 case AMDGPU::FLAT_STORE_DWORDX2: 365 
return 2; 366 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 367 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 368 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: 369 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: 370 case AMDGPU::S_LOAD_DWORDX3_IMM: 371 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 372 case AMDGPU::GLOBAL_LOAD_DWORDX3: 373 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 374 case AMDGPU::GLOBAL_STORE_DWORDX3: 375 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 376 case AMDGPU::FLAT_LOAD_DWORDX3: 377 case AMDGPU::FLAT_STORE_DWORDX3: 378 return 3; 379 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 380 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 381 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: 382 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: 383 case AMDGPU::S_LOAD_DWORDX4_IMM: 384 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 385 case AMDGPU::GLOBAL_LOAD_DWORDX4: 386 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 387 case AMDGPU::GLOBAL_STORE_DWORDX4: 388 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 389 case AMDGPU::FLAT_LOAD_DWORDX4: 390 case AMDGPU::FLAT_STORE_DWORDX4: 391 return 4; 392 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 393 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 394 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: 395 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: 396 case AMDGPU::S_LOAD_DWORDX8_IMM: 397 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 398 return 8; 399 case AMDGPU::DS_READ_B32: 400 case AMDGPU::DS_READ_B32_gfx9: 401 case AMDGPU::DS_WRITE_B32: 402 case AMDGPU::DS_WRITE_B32_gfx9: 403 return 1; 404 case AMDGPU::DS_READ_B64: 405 case AMDGPU::DS_READ_B64_gfx9: 406 case AMDGPU::DS_WRITE_B64: 407 case AMDGPU::DS_WRITE_B64_gfx9: 408 return 2; 409 default: 410 return 0; 411 } 412 } 413 414 /// Maps instruction opcode to enum InstClassEnum. 415 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 416 switch (Opc) { 417 default: 418 if (TII.isMUBUF(Opc)) { 419 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 420 default: 421 return UNKNOWN; 422 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: 423 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: 424 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: 425 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: 426 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 427 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 428 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 429 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 430 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: 431 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: 432 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: 433 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: 434 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: 435 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: 436 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: 437 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: 438 return BUFFER_LOAD; 439 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: 440 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: 441 case AMDGPU::BUFFER_STORE_DWORD_IDXEN: 442 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: 443 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 444 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 445 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 446 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 447 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: 448 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: 449 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: 450 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: 451 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: 452 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: 453 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: 454 case 
AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: 455 return BUFFER_STORE; 456 } 457 } 458 if (TII.isImage(Opc)) { 459 // Ignore instructions encoded without vaddr. 460 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 461 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 462 return UNKNOWN; 463 // Ignore BVH instructions 464 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 465 return UNKNOWN; 466 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 467 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 468 TII.isGather4(Opc)) 469 return UNKNOWN; 470 return MIMG; 471 } 472 if (TII.isMTBUF(Opc)) { 473 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 474 default: 475 return UNKNOWN; 476 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 477 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 478 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 479 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 480 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 484 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: 485 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: 486 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: 487 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: 488 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: 489 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: 490 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: 491 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: 492 return TBUFFER_LOAD; 493 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 494 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 495 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 496 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 497 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: 498 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: 499 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: 500 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: 501 return TBUFFER_STORE; 502 } 503 } 504 return UNKNOWN; 505 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 506 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 507 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 508 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 509 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 510 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: 511 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: 512 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: 513 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: 514 return S_BUFFER_LOAD_IMM; 515 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 516 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 517 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 518 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 519 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 520 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: 521 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: 522 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: 523 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: 524 return S_BUFFER_LOAD_SGPR_IMM; 525 case AMDGPU::S_LOAD_DWORD_IMM: 526 case AMDGPU::S_LOAD_DWORDX2_IMM: 527 case AMDGPU::S_LOAD_DWORDX3_IMM: 528 case AMDGPU::S_LOAD_DWORDX4_IMM: 529 case AMDGPU::S_LOAD_DWORDX8_IMM: 530 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 531 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 532 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 533 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 534 return S_LOAD_IMM; 535 case AMDGPU::DS_READ_B32: 536 case AMDGPU::DS_READ_B32_gfx9: 537 case AMDGPU::DS_READ_B64: 538 case AMDGPU::DS_READ_B64_gfx9: 539 return DS_READ; 540 case AMDGPU::DS_WRITE_B32: 541 case 
AMDGPU::DS_WRITE_B32_gfx9: 542 case AMDGPU::DS_WRITE_B64: 543 case AMDGPU::DS_WRITE_B64_gfx9: 544 return DS_WRITE; 545 case AMDGPU::GLOBAL_LOAD_DWORD: 546 case AMDGPU::GLOBAL_LOAD_DWORDX2: 547 case AMDGPU::GLOBAL_LOAD_DWORDX3: 548 case AMDGPU::GLOBAL_LOAD_DWORDX4: 549 case AMDGPU::FLAT_LOAD_DWORD: 550 case AMDGPU::FLAT_LOAD_DWORDX2: 551 case AMDGPU::FLAT_LOAD_DWORDX3: 552 case AMDGPU::FLAT_LOAD_DWORDX4: 553 return FLAT_LOAD; 554 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 555 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 556 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 557 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 558 return GLOBAL_LOAD_SADDR; 559 case AMDGPU::GLOBAL_STORE_DWORD: 560 case AMDGPU::GLOBAL_STORE_DWORDX2: 561 case AMDGPU::GLOBAL_STORE_DWORDX3: 562 case AMDGPU::GLOBAL_STORE_DWORDX4: 563 case AMDGPU::FLAT_STORE_DWORD: 564 case AMDGPU::FLAT_STORE_DWORDX2: 565 case AMDGPU::FLAT_STORE_DWORDX3: 566 case AMDGPU::FLAT_STORE_DWORDX4: 567 return FLAT_STORE; 568 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 569 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 570 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 571 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 572 return GLOBAL_STORE_SADDR; 573 } 574 } 575 576 /// Determines instruction subclass from opcode. Only instructions 577 /// of the same subclass can be merged together. The merged instruction may have 578 /// a different subclass but must have the same class. 579 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 580 switch (Opc) { 581 default: 582 if (TII.isMUBUF(Opc)) 583 return AMDGPU::getMUBUFBaseOpcode(Opc); 584 if (TII.isImage(Opc)) { 585 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 586 assert(Info); 587 return Info->BaseOpcode; 588 } 589 if (TII.isMTBUF(Opc)) 590 return AMDGPU::getMTBUFBaseOpcode(Opc); 591 return -1; 592 case AMDGPU::DS_READ_B32: 593 case AMDGPU::DS_READ_B32_gfx9: 594 case AMDGPU::DS_READ_B64: 595 case AMDGPU::DS_READ_B64_gfx9: 596 case AMDGPU::DS_WRITE_B32: 597 case AMDGPU::DS_WRITE_B32_gfx9: 598 case AMDGPU::DS_WRITE_B64: 599 case AMDGPU::DS_WRITE_B64_gfx9: 600 return Opc; 601 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 602 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 603 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 604 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 605 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 606 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: 607 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: 608 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: 609 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: 610 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 611 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 612 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 613 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 614 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 615 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 616 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: 617 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: 618 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: 619 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: 620 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 621 case AMDGPU::S_LOAD_DWORD_IMM: 622 case AMDGPU::S_LOAD_DWORDX2_IMM: 623 case AMDGPU::S_LOAD_DWORDX3_IMM: 624 case AMDGPU::S_LOAD_DWORDX4_IMM: 625 case AMDGPU::S_LOAD_DWORDX8_IMM: 626 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 627 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 628 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 629 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 630 return AMDGPU::S_LOAD_DWORD_IMM; 631 case AMDGPU::GLOBAL_LOAD_DWORD: 632 case AMDGPU::GLOBAL_LOAD_DWORDX2: 633 case AMDGPU::GLOBAL_LOAD_DWORDX3: 634 case 
AMDGPU::GLOBAL_LOAD_DWORDX4: 635 case AMDGPU::FLAT_LOAD_DWORD: 636 case AMDGPU::FLAT_LOAD_DWORDX2: 637 case AMDGPU::FLAT_LOAD_DWORDX3: 638 case AMDGPU::FLAT_LOAD_DWORDX4: 639 return AMDGPU::FLAT_LOAD_DWORD; 640 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 641 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 642 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 643 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 644 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 645 case AMDGPU::GLOBAL_STORE_DWORD: 646 case AMDGPU::GLOBAL_STORE_DWORDX2: 647 case AMDGPU::GLOBAL_STORE_DWORDX3: 648 case AMDGPU::GLOBAL_STORE_DWORDX4: 649 case AMDGPU::FLAT_STORE_DWORD: 650 case AMDGPU::FLAT_STORE_DWORDX2: 651 case AMDGPU::FLAT_STORE_DWORDX3: 652 case AMDGPU::FLAT_STORE_DWORDX4: 653 return AMDGPU::FLAT_STORE_DWORD; 654 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 655 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 656 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 657 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 658 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 659 } 660 } 661 662 // GLOBAL loads and stores are classified as FLAT initially. If both combined 663 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 664 // If either or both instructions are non segment specific FLAT the resulting 665 // combined operation will be FLAT, potentially promoting one of the GLOBAL 666 // operations to FLAT. 667 // For other instructions return the original unmodified class. 668 InstClassEnum 669 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 670 const CombineInfo &Paired) { 671 assert(CI.InstClass == Paired.InstClass); 672 673 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 674 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 675 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; 676 677 return CI.InstClass; 678 } 679 680 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 681 AddressRegs Result; 682 683 if (TII.isMUBUF(Opc)) { 684 if (AMDGPU::getMUBUFHasVAddr(Opc)) 685 Result.VAddr = true; 686 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 687 Result.SRsrc = true; 688 if (AMDGPU::getMUBUFHasSoffset(Opc)) 689 Result.SOffset = true; 690 691 return Result; 692 } 693 694 if (TII.isImage(Opc)) { 695 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 696 if (VAddr0Idx >= 0) { 697 int RsrcName = 698 TII.isMIMG(Opc) ? 
AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 699 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); 700 Result.NumVAddrs = RsrcIdx - VAddr0Idx; 701 } else { 702 Result.VAddr = true; 703 } 704 Result.SRsrc = true; 705 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 706 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 707 Result.SSamp = true; 708 709 return Result; 710 } 711 if (TII.isMTBUF(Opc)) { 712 if (AMDGPU::getMTBUFHasVAddr(Opc)) 713 Result.VAddr = true; 714 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 715 Result.SRsrc = true; 716 if (AMDGPU::getMTBUFHasSoffset(Opc)) 717 Result.SOffset = true; 718 719 return Result; 720 } 721 722 switch (Opc) { 723 default: 724 return Result; 725 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 726 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 727 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 728 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 729 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 730 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: 731 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: 732 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: 733 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: 734 Result.SOffset = true; 735 [[fallthrough]]; 736 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 737 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 738 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 739 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 740 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 741 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: 742 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: 743 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: 744 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: 745 case AMDGPU::S_LOAD_DWORD_IMM: 746 case AMDGPU::S_LOAD_DWORDX2_IMM: 747 case AMDGPU::S_LOAD_DWORDX3_IMM: 748 case AMDGPU::S_LOAD_DWORDX4_IMM: 749 case AMDGPU::S_LOAD_DWORDX8_IMM: 750 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 751 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 752 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 753 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 754 Result.SBase = true; 755 return Result; 756 case AMDGPU::DS_READ_B32: 757 case AMDGPU::DS_READ_B64: 758 case AMDGPU::DS_READ_B32_gfx9: 759 case AMDGPU::DS_READ_B64_gfx9: 760 case AMDGPU::DS_WRITE_B32: 761 case AMDGPU::DS_WRITE_B64: 762 case AMDGPU::DS_WRITE_B32_gfx9: 763 case AMDGPU::DS_WRITE_B64_gfx9: 764 Result.Addr = true; 765 return Result; 766 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 767 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 768 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 769 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 770 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 771 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 772 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 773 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 774 Result.SAddr = true; 775 [[fallthrough]]; 776 case AMDGPU::GLOBAL_LOAD_DWORD: 777 case AMDGPU::GLOBAL_LOAD_DWORDX2: 778 case AMDGPU::GLOBAL_LOAD_DWORDX3: 779 case AMDGPU::GLOBAL_LOAD_DWORDX4: 780 case AMDGPU::GLOBAL_STORE_DWORD: 781 case AMDGPU::GLOBAL_STORE_DWORDX2: 782 case AMDGPU::GLOBAL_STORE_DWORDX3: 783 case AMDGPU::GLOBAL_STORE_DWORDX4: 784 case AMDGPU::FLAT_LOAD_DWORD: 785 case AMDGPU::FLAT_LOAD_DWORDX2: 786 case AMDGPU::FLAT_LOAD_DWORDX3: 787 case AMDGPU::FLAT_LOAD_DWORDX4: 788 case AMDGPU::FLAT_STORE_DWORD: 789 case AMDGPU::FLAT_STORE_DWORDX2: 790 case AMDGPU::FLAT_STORE_DWORDX3: 791 case AMDGPU::FLAT_STORE_DWORDX4: 792 Result.VAddr = true; 793 return Result; 794 } 795 } 796 797 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 798 const SILoadStoreOptimizer &LSO) { 799 I = MI; 800 unsigned Opc = MI->getOpcode(); 801 InstClass = 
getInstClass(Opc, *LSO.TII); 802 803 if (InstClass == UNKNOWN) 804 return; 805 806 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 807 808 switch (InstClass) { 809 case DS_READ: 810 EltSize = 811 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 812 : 4; 813 break; 814 case DS_WRITE: 815 EltSize = 816 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 817 : 4; 818 break; 819 case S_BUFFER_LOAD_IMM: 820 case S_BUFFER_LOAD_SGPR_IMM: 821 case S_LOAD_IMM: 822 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 823 break; 824 default: 825 EltSize = 4; 826 break; 827 } 828 829 if (InstClass == MIMG) { 830 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 831 // Offset is not considered for MIMG instructions. 832 Offset = 0; 833 } else { 834 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 835 Offset = I->getOperand(OffsetIdx).getImm(); 836 } 837 838 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 839 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 840 841 Width = getOpcodeWidth(*I, *LSO.TII); 842 843 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 844 Offset &= 0xffff; 845 } else if (InstClass != MIMG) { 846 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 847 } 848 849 AddressRegs Regs = getRegs(Opc, *LSO.TII); 850 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); 851 852 NumAddresses = 0; 853 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 854 AddrIdx[NumAddresses++] = 855 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 856 if (Regs.Addr) 857 AddrIdx[NumAddresses++] = 858 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 859 if (Regs.SBase) 860 AddrIdx[NumAddresses++] = 861 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 862 if (Regs.SRsrc) 863 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 864 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); 865 if (Regs.SOffset) 866 AddrIdx[NumAddresses++] = 867 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 868 if (Regs.SAddr) 869 AddrIdx[NumAddresses++] = 870 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 871 if (Regs.VAddr) 872 AddrIdx[NumAddresses++] = 873 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 874 if (Regs.SSamp) 875 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 876 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); 877 assert(NumAddresses <= MaxAddressRegs); 878 879 for (unsigned J = 0; J < NumAddresses; J++) 880 AddrReg[J] = &I->getOperand(AddrIdx[J]); 881 } 882 883 } // end anonymous namespace. 
884 885 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 886 "SI Load Store Optimizer", false, false) 887 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 888 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 889 false, false) 890 891 char SILoadStoreOptimizer::ID = 0; 892 893 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 894 895 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 896 return new SILoadStoreOptimizer(); 897 } 898 899 static void addDefsUsesToList(const MachineInstr &MI, 900 DenseSet<Register> &RegDefs, 901 DenseSet<Register> &RegUses) { 902 for (const auto &Op : MI.operands()) { 903 if (!Op.isReg()) 904 continue; 905 if (Op.isDef()) 906 RegDefs.insert(Op.getReg()); 907 if (Op.readsReg()) 908 RegUses.insert(Op.getReg()); 909 } 910 } 911 912 bool SILoadStoreOptimizer::canSwapInstructions( 913 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 914 const MachineInstr &A, const MachineInstr &B) const { 915 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 916 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 917 return false; 918 for (const auto &BOp : B.operands()) { 919 if (!BOp.isReg()) 920 continue; 921 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 922 return false; 923 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 924 return false; 925 } 926 return true; 927 } 928 929 // Given that \p CI and \p Paired are adjacent memory operations produce a new 930 // MMO for the combined operation with a new access size. 931 MachineMemOperand * 932 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 933 const CombineInfo &Paired) { 934 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 935 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 936 937 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); 938 939 // A base pointer for the combined operation is the same as the leading 940 // operation's pointer. 941 if (Paired < CI) 942 std::swap(MMOa, MMOb); 943 944 MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); 945 // If merging FLAT and GLOBAL set address space to FLAT. 946 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 947 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; 948 949 MachineFunction *MF = CI.I->getMF(); 950 return MF->getMachineMemOperand(MMOa, PtrInfo, Size); 951 } 952 953 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 954 const SIInstrInfo &TII, 955 const CombineInfo &Paired) { 956 assert(CI.InstClass == MIMG); 957 958 // Ignore instructions with tfe/lwe set. 959 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 960 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 961 962 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 963 return false; 964 965 // Check other optional immediate operands for equality. 966 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 967 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 968 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 969 970 for (auto op : OperandsToMatch) { 971 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 972 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 973 return false; 974 if (Idx != -1 && 975 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 976 return false; 977 } 978 979 // Check DMask for overlaps. 
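  // The dmasks are mergeable only when the smaller mask uses nothing but bits
  // below the lowest set bit of the larger mask, e.g. dmask 0b0011 can merge
  // with dmask 0b0100 but not with 0b0110.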
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
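  // For these classes the two accesses must be exactly contiguous in elements,
  // in either order: e.g. a dwordx2 at element offset 4 (Width 2) pairs with a
  // dword at element offset 6.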
1073 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 1074 if (EltOffset0 + CI.Width != EltOffset1 && 1075 EltOffset1 + Paired.Width != EltOffset0) 1076 return false; 1077 if (CI.CPol != Paired.CPol) 1078 return false; 1079 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || 1080 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { 1081 // Reject cases like: 1082 // dword + dwordx2 -> dwordx3 1083 // dword + dwordx3 -> dwordx4 1084 // If we tried to combine these cases, we would fail to extract a subreg 1085 // for the result of the second load due to SGPR alignment requirements. 1086 if (CI.Width != Paired.Width && 1087 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) 1088 return false; 1089 } 1090 return true; 1091 } 1092 1093 // If the offset in elements doesn't fit in 8-bits, we might be able to use 1094 // the stride 64 versions. 1095 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 1096 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 1097 if (Modify) { 1098 CI.Offset = EltOffset0 / 64; 1099 Paired.Offset = EltOffset1 / 64; 1100 CI.UseST64 = true; 1101 } 1102 return true; 1103 } 1104 1105 // Check if the new offsets fit in the reduced 8-bit range. 1106 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 1107 if (Modify) { 1108 CI.Offset = EltOffset0; 1109 Paired.Offset = EltOffset1; 1110 } 1111 return true; 1112 } 1113 1114 // Try to shift base address to decrease offsets. 1115 uint32_t Min = std::min(EltOffset0, EltOffset1); 1116 uint32_t Max = std::max(EltOffset0, EltOffset1); 1117 1118 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 1119 if (((Max - Min) & ~Mask) == 0) { 1120 if (Modify) { 1121 // From the range of values we could use for BaseOff, choose the one that 1122 // is aligned to the highest power of two, to maximise the chance that 1123 // the same offset can be reused for other load/store pairs. 1124 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 1125 // Copy the low bits of the offsets, so that when we adjust them by 1126 // subtracting BaseOff they will be multiples of 64. 1127 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 1128 CI.BaseOff = BaseOff * CI.EltSize; 1129 CI.Offset = (EltOffset0 - BaseOff) / 64; 1130 Paired.Offset = (EltOffset1 - BaseOff) / 64; 1131 CI.UseST64 = true; 1132 } 1133 return true; 1134 } 1135 1136 if (isUInt<8>(Max - Min)) { 1137 if (Modify) { 1138 // From the range of values we could use for BaseOff, choose the one that 1139 // is aligned to the highest power of two, to maximise the chance that 1140 // the same offset can be reused for other load/store pairs. 
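      // For example, with Min = 0x105 and Max = 0x180, any BaseOff in
      // [0x81, 0x105] would work; mostAlignedValueInRange picks 0x100,
      // leaving element offsets 0x5 and 0x80.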
1141 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1142 CI.BaseOff = BaseOff * CI.EltSize; 1143 CI.Offset = EltOffset0 - BaseOff; 1144 Paired.Offset = EltOffset1 - BaseOff; 1145 } 1146 return true; 1147 } 1148 1149 return false; 1150 } 1151 1152 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1153 const CombineInfo &CI, 1154 const CombineInfo &Paired) { 1155 const unsigned Width = (CI.Width + Paired.Width); 1156 switch (CI.InstClass) { 1157 default: 1158 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1159 case S_BUFFER_LOAD_IMM: 1160 case S_BUFFER_LOAD_SGPR_IMM: 1161 case S_LOAD_IMM: 1162 switch (Width) { 1163 default: 1164 return false; 1165 case 2: 1166 case 4: 1167 case 8: 1168 return true; 1169 case 3: 1170 return STM.hasScalarDwordx3Loads(); 1171 } 1172 } 1173 } 1174 1175 const TargetRegisterClass * 1176 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1177 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1178 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1179 } 1180 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1181 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1182 } 1183 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1184 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1185 } 1186 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1187 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1188 } 1189 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1190 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1191 } 1192 return nullptr; 1193 } 1194 1195 /// This function assumes that CI comes before Paired in a basic block. Return 1196 /// an insertion point for the merged instruction or nullptr on failure. 1197 SILoadStoreOptimizer::CombineInfo * 1198 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1199 CombineInfo &Paired) { 1200 // If another instruction has already been merged into CI, it may now be a 1201 // type that we can't do any further merging into. 1202 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1203 return nullptr; 1204 assert(CI.InstClass == Paired.InstClass); 1205 1206 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1207 getInstSubclass(Paired.I->getOpcode(), *TII)) 1208 return nullptr; 1209 1210 // Check both offsets (or masks for MIMG) can be combined and fit in the 1211 // reduced range. 1212 if (CI.InstClass == MIMG) { 1213 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1214 return nullptr; 1215 } else { 1216 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1217 return nullptr; 1218 } 1219 1220 DenseSet<Register> RegDefs; 1221 DenseSet<Register> RegUses; 1222 CombineInfo *Where; 1223 if (CI.I->mayLoad()) { 1224 // Try to hoist Paired up to CI. 1225 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1226 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1227 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1228 return nullptr; 1229 } 1230 Where = &CI; 1231 } else { 1232 // Try to sink CI down to Paired. 1233 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1234 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1235 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1236 return nullptr; 1237 } 1238 Where = &Paired; 1239 } 1240 1241 // Call offsetsCanBeCombined with modify = true so that the offsets are 1242 // correct for the new instruction. 
This should return true, because 1243 // this function should only be called on CombineInfo objects that 1244 // have already been confirmed to be mergeable. 1245 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1246 offsetsCanBeCombined(CI, *STM, Paired, true); 1247 return Where; 1248 } 1249 1250 // Copy the merged load result from DestReg to the original dest regs of CI and 1251 // Paired. 1252 void SILoadStoreOptimizer::copyToDestRegs( 1253 CombineInfo &CI, CombineInfo &Paired, 1254 MachineBasicBlock::iterator InsertBefore, int OpName, 1255 Register DestReg) const { 1256 MachineBasicBlock *MBB = CI.I->getParent(); 1257 DebugLoc DL = CI.I->getDebugLoc(); 1258 1259 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1260 1261 // Copy to the old destination registers. 1262 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1263 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); 1264 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); 1265 1266 // The constrained sload instructions in S_LOAD_IMM class will have 1267 // `early-clobber` flag in the dst operand. Remove the flag before using the 1268 // MOs in copies. 1269 Dest0->setIsEarlyClobber(false); 1270 Dest1->setIsEarlyClobber(false); 1271 1272 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1273 .add(*Dest0) // Copy to same destination including flags and sub reg. 1274 .addReg(DestReg, 0, SubRegIdx0); 1275 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1276 .add(*Dest1) 1277 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1278 } 1279 1280 // Return a register for the source of the merged store after copying the 1281 // original source regs of CI and Paired into it. 1282 Register 1283 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, 1284 MachineBasicBlock::iterator InsertBefore, 1285 int OpName) const { 1286 MachineBasicBlock *MBB = CI.I->getParent(); 1287 DebugLoc DL = CI.I->getDebugLoc(); 1288 1289 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1290 1291 // Copy to the new source register. 1292 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1293 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1294 1295 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName); 1296 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName); 1297 1298 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1299 .add(*Src0) 1300 .addImm(SubRegIdx0) 1301 .add(*Src1) 1302 .addImm(SubRegIdx1); 1303 1304 return SrcReg; 1305 } 1306 1307 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1308 if (STM->ldsRequiresM0Init()) 1309 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1310 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1311 } 1312 1313 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1314 if (STM->ldsRequiresM0Init()) 1315 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1316 1317 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1318 : AMDGPU::DS_READ2ST64_B64_gfx9; 1319 } 1320 1321 MachineBasicBlock::iterator 1322 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1323 MachineBasicBlock::iterator InsertBefore) { 1324 MachineBasicBlock *MBB = CI.I->getParent(); 1325 1326 // Be careful, since the addresses could be subregisters themselves in weird 1327 // cases, like vectors of pointers. 
1328 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1329 1330 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset); 1331 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset); 1332 unsigned Opc = 1333 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1334 1335 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1336 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1337 1338 const MCInstrDesc &Read2Desc = TII->get(Opc); 1339 1340 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1341 Register DestReg = MRI->createVirtualRegister(SuperRC); 1342 1343 DebugLoc DL = CI.I->getDebugLoc(); 1344 1345 Register BaseReg = AddrReg->getReg(); 1346 unsigned BaseSubReg = AddrReg->getSubReg(); 1347 unsigned BaseRegFlags = 0; 1348 if (CI.BaseOff) { 1349 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1350 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1351 .addImm(CI.BaseOff); 1352 1353 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1354 BaseRegFlags = RegState::Kill; 1355 1356 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1357 .addReg(ImmReg) 1358 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1359 .addImm(0); // clamp bit 1360 BaseSubReg = 0; 1361 } 1362 1363 MachineInstrBuilder Read2 = 1364 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1365 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1366 .addImm(NewOffset0) // offset0 1367 .addImm(NewOffset1) // offset1 1368 .addImm(0) // gds 1369 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1370 1371 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); 1372 1373 CI.I->eraseFromParent(); 1374 Paired.I->eraseFromParent(); 1375 1376 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1377 return Read2; 1378 } 1379 1380 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1381 if (STM->ldsRequiresM0Init()) 1382 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1383 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1384 : AMDGPU::DS_WRITE2_B64_gfx9; 1385 } 1386 1387 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1388 if (STM->ldsRequiresM0Init()) 1389 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1390 : AMDGPU::DS_WRITE2ST64_B64; 1391 1392 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1393 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1394 } 1395 1396 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1397 CombineInfo &CI, CombineInfo &Paired, 1398 MachineBasicBlock::iterator InsertBefore) { 1399 MachineBasicBlock *MBB = CI.I->getParent(); 1400 1401 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1402 // sure we preserve the subregister index and any register flags set on them. 1403 const MachineOperand *AddrReg = 1404 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1405 const MachineOperand *Data0 = 1406 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1407 const MachineOperand *Data1 = 1408 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1409 1410 unsigned NewOffset0 = CI.Offset; 1411 unsigned NewOffset1 = Paired.Offset; 1412 unsigned Opc = 1413 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1414 1415 if (NewOffset0 > NewOffset1) { 1416 // Canonicalize the merged instruction so the smaller offset comes first. 
1417 std::swap(NewOffset0, NewOffset1); 1418 std::swap(Data0, Data1); 1419 } 1420 1421 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1422 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1423 1424 const MCInstrDesc &Write2Desc = TII->get(Opc); 1425 DebugLoc DL = CI.I->getDebugLoc(); 1426 1427 Register BaseReg = AddrReg->getReg(); 1428 unsigned BaseSubReg = AddrReg->getSubReg(); 1429 unsigned BaseRegFlags = 0; 1430 if (CI.BaseOff) { 1431 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1432 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1433 .addImm(CI.BaseOff); 1434 1435 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1436 BaseRegFlags = RegState::Kill; 1437 1438 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1439 .addReg(ImmReg) 1440 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1441 .addImm(0); // clamp bit 1442 BaseSubReg = 0; 1443 } 1444 1445 MachineInstrBuilder Write2 = 1446 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1447 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1448 .add(*Data0) // data0 1449 .add(*Data1) // data1 1450 .addImm(NewOffset0) // offset0 1451 .addImm(NewOffset1) // offset1 1452 .addImm(0) // gds 1453 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1454 1455 CI.I->eraseFromParent(); 1456 Paired.I->eraseFromParent(); 1457 1458 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1459 return Write2; 1460 } 1461 1462 MachineBasicBlock::iterator 1463 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1464 MachineBasicBlock::iterator InsertBefore) { 1465 MachineBasicBlock *MBB = CI.I->getParent(); 1466 DebugLoc DL = CI.I->getDebugLoc(); 1467 const unsigned Opcode = getNewOpcode(CI, Paired); 1468 1469 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1470 1471 Register DestReg = MRI->createVirtualRegister(SuperRC); 1472 unsigned MergedDMask = CI.DMask | Paired.DMask; 1473 unsigned DMaskIdx = 1474 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1475 1476 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1477 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1478 if (I == DMaskIdx) 1479 MIB.addImm(MergedDMask); 1480 else 1481 MIB.add((*CI.I).getOperand(I)); 1482 } 1483 1484 // It shouldn't be possible to get this far if the two instructions 1485 // don't have a single memoperand, because MachineInstr::mayAlias() 1486 // will return true if this is the case. 
1487 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1488 1489 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1490 1491 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1492 1493 CI.I->eraseFromParent(); 1494 Paired.I->eraseFromParent(); 1495 return New; 1496 } 1497 1498 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1499 CombineInfo &CI, CombineInfo &Paired, 1500 MachineBasicBlock::iterator InsertBefore) { 1501 MachineBasicBlock *MBB = CI.I->getParent(); 1502 DebugLoc DL = CI.I->getDebugLoc(); 1503 const unsigned Opcode = getNewOpcode(CI, Paired); 1504 1505 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1506 1507 Register DestReg = MRI->createVirtualRegister(SuperRC); 1508 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1509 1510 // It shouldn't be possible to get this far if the two instructions 1511 // don't have a single memoperand, because MachineInstr::mayAlias() 1512 // will return true if this is the case. 1513 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1514 1515 MachineInstrBuilder New = 1516 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1517 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1518 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1519 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1520 New.addImm(MergedOffset); 1521 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1522 1523 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); 1524 1525 CI.I->eraseFromParent(); 1526 Paired.I->eraseFromParent(); 1527 return New; 1528 } 1529 1530 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1531 CombineInfo &CI, CombineInfo &Paired, 1532 MachineBasicBlock::iterator InsertBefore) { 1533 MachineBasicBlock *MBB = CI.I->getParent(); 1534 DebugLoc DL = CI.I->getDebugLoc(); 1535 1536 const unsigned Opcode = getNewOpcode(CI, Paired); 1537 1538 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1539 1540 // Copy to the new source register. 1541 Register DestReg = MRI->createVirtualRegister(SuperRC); 1542 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1543 1544 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1545 1546 AddressRegs Regs = getRegs(Opcode, *TII); 1547 1548 if (Regs.VAddr) 1549 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1550 1551 // It shouldn't be possible to get this far if the two instructions 1552 // don't have a single memoperand, because MachineInstr::mayAlias() 1553 // will return true if this is the case. 
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

static bool needsConstrainedOpcode(const GCNSubtarget &STM,
                                   ArrayRef<MachineMemOperand *> MMOs,
                                   unsigned Width) {
  // Conservatively return true if there is no single MMO whose alignment can
  // be checked.
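  // For example, a merged load of two dwords (Width == 2) needs the single MMO
  // to be at least 8-byte aligned (Width * 4 bytes) to use the unconstrained
  // opcode when XNACK is enabled.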
  return STM.isXNACKEnabled() &&
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  }
  case S_BUFFER_LOAD_SGPR_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  }
  case S_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  }
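  // For the FLAT/GLOBAL classes below, the new opcode is selected purely from
  // the merged width; MIMG is handled via getMaskedMIMGOp using the merged
  // dmask popcount. There are no constrained variants to choose between here.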
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return {Idx0, Idx1};
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
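// The new base is materialized as a 64-bit add of Addr.Offset to
// {Addr.Base.HiReg, Addr.Base.LoReg}: a V_ADD_CO_U32_e64 for the low half, a
// V_ADDC_U32_e64 for the high half, and a REG_SEQUENCE combining the two
// halves into a 64-bit VGPR pair.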
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
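// Only the vaddr register operand and the offset immediate of MI are
// rewritten; the operand's kill flag is cleared because the new base register
// may also be used by other instructions whose offsets get promoted.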
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
    return false;

  // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
  if (SIInstrInfo::isFLATScratch(MI))
    return false;

  unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                              : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset has the largest 13-bit distance
  // from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for (; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
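    // For now, only instructions with the same opcode as MI and a still-zero
    // offset operand are considered as anchor candidates.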
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalFlatAddressingMode(AM, AS) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalFlatAddressingMode(AM, AS)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
                   OtherMI->dump());
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Accesses after the barrier can still be merged, but in a
    // separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset so that mergeable candidates become
  // adjacent, and discard any list with fewer than two entries, since a merge
  // needs at least two instructions. Pairing itself happens later in
  // optimizeInstsWithSameBaseAddr.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
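// Each inner list of MergeableInsts holds candidates that share a base
// address; a list is erased as soon as it can no longer produce a merge so it
// is not rescanned on the next round.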
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

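// Pass entry point. Each basic block is processed in sections delimited by
// memory barriers (see collectMergeableInsts), and each section is
// re-optimized until optimizeBlock stops requesting another round.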
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}