//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offsets to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

public:
  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
  bool run(MachineFunction &MF);
};

class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
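      // Descriptive note (added for clarity): only plain image loads are
      // classified as MIMG here; instructions that may store (image stores and
      // atomics), that do not load, or that are gather4 are rejected below.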
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT. For other instructions return the original
// unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
  return new SILoadStoreOptimizerLegacy();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
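  // Descriptive note (added for clarity): the two dmasks are only combinable
  // when every bit of the smaller mask lies below the lowest set bit of the
  // larger one. For example (illustrative), dmasks 0b0011 and 0b1100 can be
  // combined, while 0b0101 and 0b0010 cannot because their bits interleave.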
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
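  // Descriptive note (added for clarity): for these classes the two accesses
  // must be exactly contiguous in element units. E.g. (illustrative), two
  // dword loads at element offsets 4 and 5 can merge into a dwordx2, whereas
  // offsets 4 and 6 cannot.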
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
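      // Illustrative example (values assumed for this sketch): with element
      // offsets 100 and 300, Max - Min = 200 fits in 8 bits. BaseOff is then
      // picked from [Max - 255, Min] = [45, 100] as the most aligned value,
      // 64, so the rewritten offsets become 36 and 236, and CI.BaseOff is
      // 64 scaled by EltSize.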
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.
  // This should return true, because this function should only be called on
  // CombineInfo objects that have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in the S_LOAD_IMM class will have the
  // `early-clobber` flag on the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
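  // Illustrative result of this merge (assuming EltSize == 4 and no BaseOff):
  //   ds_read_b32 %v0, %addr offset:8
  //   ds_read_b32 %v1, %addr offset:12
  // becomes
  //   ds_read2_b32 %dest, %addr offset0:2 offset1:3
  // followed by COPYs of %dest.sub0/%dest.sub1 into the original dests.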
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
1559 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1560 1561 MachineInstr *New = 1562 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1563 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1564 .addImm(MergedOffset) // offset 1565 .addImm(CI.CPol) // cpol 1566 .addImm(0) // swz 1567 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1568 1569 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1570 1571 CI.I->eraseFromParent(); 1572 Paired.I->eraseFromParent(); 1573 return New; 1574 } 1575 1576 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1577 CombineInfo &CI, CombineInfo &Paired, 1578 MachineBasicBlock::iterator InsertBefore) { 1579 MachineBasicBlock *MBB = CI.I->getParent(); 1580 DebugLoc DL = CI.I->getDebugLoc(); 1581 1582 const unsigned Opcode = getNewOpcode(CI, Paired); 1583 1584 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1585 1586 // Copy to the new source register. 1587 Register DestReg = MRI->createVirtualRegister(SuperRC); 1588 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1589 1590 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1591 1592 AddressRegs Regs = getRegs(Opcode, *TII); 1593 1594 if (Regs.VAddr) 1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1596 1597 unsigned JoinedFormat = 1598 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1599 1600 // It shouldn't be possible to get this far if the two instructions 1601 // don't have a single memoperand, because MachineInstr::mayAlias() 1602 // will return true if this is the case. 1603 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1604 1605 MachineInstr *New = 1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1607 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1608 .addImm(MergedOffset) // offset 1609 .addImm(JoinedFormat) // format 1610 .addImm(CI.CPol) // cpol 1611 .addImm(0) // swz 1612 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1613 1614 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1615 1616 CI.I->eraseFromParent(); 1617 Paired.I->eraseFromParent(); 1618 return New; 1619 } 1620 1621 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1622 CombineInfo &CI, CombineInfo &Paired, 1623 MachineBasicBlock::iterator InsertBefore) { 1624 MachineBasicBlock *MBB = CI.I->getParent(); 1625 DebugLoc DL = CI.I->getDebugLoc(); 1626 1627 const unsigned Opcode = getNewOpcode(CI, Paired); 1628 1629 Register SrcReg = 1630 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1631 1632 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1633 .addReg(SrcReg, RegState::Kill); 1634 1635 AddressRegs Regs = getRegs(Opcode, *TII); 1636 1637 if (Regs.VAddr) 1638 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1639 1640 unsigned JoinedFormat = 1641 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1642 1643 // It shouldn't be possible to get this far if the two instructions 1644 // don't have a single memoperand, because MachineInstr::mayAlias() 1645 // will return true if this is the case. 
1646 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1647
1648 MachineInstr *New =
1649 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1650 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1651 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1652 .addImm(JoinedFormat) // format
1653 .addImm(CI.CPol) // cpol
1654 .addImm(0) // swz
1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656
1657 CI.I->eraseFromParent();
1658 Paired.I->eraseFromParent();
1659 return New;
1660 }
1661
1662 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1663 CombineInfo &CI, CombineInfo &Paired,
1664 MachineBasicBlock::iterator InsertBefore) {
1665 MachineBasicBlock *MBB = CI.I->getParent();
1666 DebugLoc DL = CI.I->getDebugLoc();
1667
1668 const unsigned Opcode = getNewOpcode(CI, Paired);
1669
1670 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1671 Register DestReg = MRI->createVirtualRegister(SuperRC);
1672
1673 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1674
1675 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1676 MIB.add(*SAddr);
1677
1678 MachineInstr *New =
1679 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1680 .addImm(std::min(CI.Offset, Paired.Offset))
1681 .addImm(CI.CPol)
1682 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683
1684 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1685
1686 CI.I->eraseFromParent();
1687 Paired.I->eraseFromParent();
1688 return New;
1689 }
1690
1691 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1692 CombineInfo &CI, CombineInfo &Paired,
1693 MachineBasicBlock::iterator InsertBefore) {
1694 MachineBasicBlock *MBB = CI.I->getParent();
1695 DebugLoc DL = CI.I->getDebugLoc();
1696
1697 const unsigned Opcode = getNewOpcode(CI, Paired);
1698
1699 Register SrcReg =
1700 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1701
1702 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1703 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1704 .addReg(SrcReg, RegState::Kill);
1705
1706 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1707 MIB.add(*SAddr);
1708
1709 MachineInstr *New =
1710 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1711 .addImm(CI.CPol)
1712 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1713
1714 CI.I->eraseFromParent();
1715 Paired.I->eraseFromParent();
1716 return New;
1717 }
1718
1719 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1720 ArrayRef<MachineMemOperand *> MMOs,
1721 unsigned Width) {
1722 // Conservatively returns true if the MMO is not found.
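// Illustrative sketch (made-up offsets, not from a real compilation) of when
// the constrained ("_ec") opcode variants are selected, assuming XNACK is
// enabled:
//   s_load_dword s4, s[0:3], 0x0   ; single MMO, align 4
//   s_load_dword s5, s[0:3], 0x4   ; single MMO, align 4
// Merging these gives Width = 2, which requires Width * 4 = 8 bytes of
// alignment, but the first load's MMO only guarantees 4, so the merged load
// would use S_LOAD_DWORDX2_IMM_ec rather than S_LOAD_DWORDX2_IMM.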
1723 return STM.isXNACKEnabled() && 1724 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4); 1725 } 1726 1727 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1728 const CombineInfo &Paired) { 1729 const unsigned Width = CI.Width + Paired.Width; 1730 1731 switch (getCommonInstClass(CI, Paired)) { 1732 default: 1733 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1734 // FIXME: Handle d16 correctly 1735 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1736 Width); 1737 case TBUFFER_LOAD: 1738 case TBUFFER_STORE: 1739 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1740 Width); 1741 1742 case UNKNOWN: 1743 llvm_unreachable("Unknown instruction class"); 1744 case S_BUFFER_LOAD_IMM: { 1745 // If XNACK is enabled, use the constrained opcodes when the first load is 1746 // under-aligned. 1747 bool NeedsConstrainedOpc = 1748 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); 1749 switch (Width) { 1750 default: 1751 return 0; 1752 case 2: 1753 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec 1754 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1755 case 3: 1756 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec 1757 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1758 case 4: 1759 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec 1760 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1761 case 8: 1762 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec 1763 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1764 } 1765 } 1766 case S_BUFFER_LOAD_SGPR_IMM: { 1767 // If XNACK is enabled, use the constrained opcodes when the first load is 1768 // under-aligned. 1769 bool NeedsConstrainedOpc = 1770 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); 1771 switch (Width) { 1772 default: 1773 return 0; 1774 case 2: 1775 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec 1776 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1777 case 3: 1778 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec 1779 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1780 case 4: 1781 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec 1782 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1783 case 8: 1784 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec 1785 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1786 } 1787 } 1788 case S_LOAD_IMM: { 1789 // If XNACK is enabled, use the constrained opcodes when the first load is 1790 // under-aligned. 1791 bool NeedsConstrainedOpc = 1792 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); 1793 switch (Width) { 1794 default: 1795 return 0; 1796 case 2: 1797 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec 1798 : AMDGPU::S_LOAD_DWORDX2_IMM; 1799 case 3: 1800 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec 1801 : AMDGPU::S_LOAD_DWORDX3_IMM; 1802 case 4: 1803 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec 1804 : AMDGPU::S_LOAD_DWORDX4_IMM; 1805 case 8: 1806 return NeedsConstrainedOpc ? 
AMDGPU::S_LOAD_DWORDX8_IMM_ec 1807 : AMDGPU::S_LOAD_DWORDX8_IMM; 1808 } 1809 } 1810 case GLOBAL_LOAD: 1811 switch (Width) { 1812 default: 1813 return 0; 1814 case 2: 1815 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1816 case 3: 1817 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1818 case 4: 1819 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1820 } 1821 case GLOBAL_LOAD_SADDR: 1822 switch (Width) { 1823 default: 1824 return 0; 1825 case 2: 1826 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1827 case 3: 1828 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1829 case 4: 1830 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1831 } 1832 case GLOBAL_STORE: 1833 switch (Width) { 1834 default: 1835 return 0; 1836 case 2: 1837 return AMDGPU::GLOBAL_STORE_DWORDX2; 1838 case 3: 1839 return AMDGPU::GLOBAL_STORE_DWORDX3; 1840 case 4: 1841 return AMDGPU::GLOBAL_STORE_DWORDX4; 1842 } 1843 case GLOBAL_STORE_SADDR: 1844 switch (Width) { 1845 default: 1846 return 0; 1847 case 2: 1848 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1849 case 3: 1850 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1851 case 4: 1852 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1853 } 1854 case FLAT_LOAD: 1855 switch (Width) { 1856 default: 1857 return 0; 1858 case 2: 1859 return AMDGPU::FLAT_LOAD_DWORDX2; 1860 case 3: 1861 return AMDGPU::FLAT_LOAD_DWORDX3; 1862 case 4: 1863 return AMDGPU::FLAT_LOAD_DWORDX4; 1864 } 1865 case FLAT_STORE: 1866 switch (Width) { 1867 default: 1868 return 0; 1869 case 2: 1870 return AMDGPU::FLAT_STORE_DWORDX2; 1871 case 3: 1872 return AMDGPU::FLAT_STORE_DWORDX3; 1873 case 4: 1874 return AMDGPU::FLAT_STORE_DWORDX4; 1875 } 1876 case MIMG: 1877 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1878 "No overlaps"); 1879 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1880 } 1881 } 1882 1883 std::pair<unsigned, unsigned> 1884 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1885 const CombineInfo &Paired) { 1886 assert((CI.InstClass != MIMG || 1887 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1888 CI.Width + Paired.Width)) && 1889 "No overlaps"); 1890 1891 unsigned Idx0; 1892 unsigned Idx1; 1893 1894 static const unsigned Idxs[5][4] = { 1895 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1896 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1897 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1898 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1899 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1900 }; 1901 1902 assert(CI.Width >= 1 && CI.Width <= 4); 1903 assert(Paired.Width >= 1 && Paired.Width <= 4); 1904 1905 if (Paired < CI) { 1906 Idx1 = Idxs[0][Paired.Width - 1]; 1907 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1908 } else { 1909 Idx0 = Idxs[0][CI.Width - 1]; 1910 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1911 } 1912 1913 return {Idx0, Idx1}; 1914 } 1915 1916 const TargetRegisterClass * 1917 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1918 const CombineInfo &Paired) const { 1919 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1920 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1921 switch (CI.Width + Paired.Width) { 1922 default: 1923 return nullptr; 1924 case 2: 1925 return &AMDGPU::SReg_64_XEXECRegClass; 1926 case 3: 1927 return &AMDGPU::SGPR_96RegClass; 1928 case 4: 1929 return &AMDGPU::SGPR_128RegClass; 1930 case 8: 1931 return &AMDGPU::SGPR_256RegClass; 1932 case 
16: 1933 return &AMDGPU::SGPR_512RegClass; 1934 } 1935 } 1936 1937 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1938 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1939 ? TRI->getAGPRClassForBitWidth(BitWidth) 1940 : TRI->getVGPRClassForBitWidth(BitWidth); 1941 } 1942 1943 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1944 CombineInfo &CI, CombineInfo &Paired, 1945 MachineBasicBlock::iterator InsertBefore) { 1946 MachineBasicBlock *MBB = CI.I->getParent(); 1947 DebugLoc DL = CI.I->getDebugLoc(); 1948 1949 const unsigned Opcode = getNewOpcode(CI, Paired); 1950 1951 Register SrcReg = 1952 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1953 1954 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1955 .addReg(SrcReg, RegState::Kill); 1956 1957 AddressRegs Regs = getRegs(Opcode, *TII); 1958 1959 if (Regs.VAddr) 1960 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1961 1962 1963 // It shouldn't be possible to get this far if the two instructions 1964 // don't have a single memoperand, because MachineInstr::mayAlias() 1965 // will return true if this is the case. 1966 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1967 1968 MachineInstr *New = 1969 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1970 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1971 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1972 .addImm(CI.CPol) // cpol 1973 .addImm(0) // swz 1974 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1975 1976 CI.I->eraseFromParent(); 1977 Paired.I->eraseFromParent(); 1978 return New; 1979 } 1980 1981 MachineOperand 1982 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1983 APInt V(32, Val, true); 1984 if (TII->isInlineConstant(V)) 1985 return MachineOperand::CreateImm(Val); 1986 1987 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1988 MachineInstr *Mov = 1989 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1990 TII->get(AMDGPU::S_MOV_B32), Reg) 1991 .addImm(Val); 1992 (void)Mov; 1993 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1994 return MachineOperand::CreateReg(Reg, false); 1995 } 1996 1997 // Compute base address using Addr and return the final register. 
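// Illustrative sketch of the MIR this emits (virtual register names are
// invented; the carry register class is the wave mask class, e.g.
// sreg_64_xexec in wave64 mode):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry, 0
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// where OffsetLo/OffsetHi are either inline immediates or S_MOV_B32 results
// produced by createRegOrImm above.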
1998 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1999 const MemAddress &Addr) const { 2000 MachineBasicBlock *MBB = MI.getParent(); 2001 MachineBasicBlock::iterator MBBI = MI.getIterator(); 2002 DebugLoc DL = MI.getDebugLoc(); 2003 2004 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 2005 Addr.Base.LoSubReg) && 2006 "Expected 32-bit Base-Register-Low!!"); 2007 2008 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 2009 Addr.Base.HiSubReg) && 2010 "Expected 32-bit Base-Register-Hi!!"); 2011 2012 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 2013 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 2014 MachineOperand OffsetHi = 2015 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 2016 2017 const auto *CarryRC = TRI->getWaveMaskRegClass(); 2018 Register CarryReg = MRI->createVirtualRegister(CarryRC); 2019 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 2020 2021 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2022 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2023 MachineInstr *LoHalf = 2024 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 2025 .addReg(CarryReg, RegState::Define) 2026 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 2027 .add(OffsetLo) 2028 .addImm(0); // clamp bit 2029 (void)LoHalf; 2030 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 2031 2032 MachineInstr *HiHalf = 2033 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 2034 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 2035 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 2036 .add(OffsetHi) 2037 .addReg(CarryReg, RegState::Kill) 2038 .addImm(0); // clamp bit 2039 (void)HiHalf; 2040 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 2041 2042 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 2043 MachineInstr *FullBase = 2044 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2045 .addReg(DestSub0) 2046 .addImm(AMDGPU::sub0) 2047 .addReg(DestSub1) 2048 .addImm(AMDGPU::sub1); 2049 (void)FullBase; 2050 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 2051 2052 return FullDestReg; 2053 } 2054 2055 // Update base and offset with the NewBase and NewOffset in MI. 
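// For example (purely illustrative operands), a load such as
//   %d:vreg_64 = GLOBAL_LOAD_DWORDX2 %oldbase, 0, 0
// whose chosen anchor address is 2048 bytes higher would be rewritten in
// place to
//   %d:vreg_64 = GLOBAL_LOAD_DWORDX2 %newbase, -2048, 0
// where %newbase is the register returned by computeBase.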
2056 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2057 Register NewBase,
2058 int32_t NewOffset) const {
2059 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2060 Base->setReg(NewBase);
2061 Base->setIsKill(false);
2062 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2063 }
2064
2065 std::optional<int32_t>
2066 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2067 if (Op.isImm())
2068 return Op.getImm();
2069
2070 if (!Op.isReg())
2071 return std::nullopt;
2072
2073 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2074 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2075 !Def->getOperand(1).isImm())
2076 return std::nullopt;
2077
2078 return Def->getOperand(1).getImm();
2079 }
2080
2081 // Analyzes Base and extracts:
2082 // - 32-bit base registers and subregisters
2083 // - a 64-bit constant offset
2084 // It expects the base computation to look like:
2085 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
2086 // %LO:vgpr_32, %c:sreg_64_xexec =
2087 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
2088 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2089 // %Base:vreg_64 =
2090 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2091 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2092 MemAddress &Addr) const {
2093 if (!Base.isReg())
2094 return;
2095
2096 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2097 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2098 || Def->getNumOperands() != 5)
2099 return;
2100
2101 MachineOperand BaseLo = Def->getOperand(1);
2102 MachineOperand BaseHi = Def->getOperand(3);
2103 if (!BaseLo.isReg() || !BaseHi.isReg())
2104 return;
2105
2106 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2107 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2108
2109 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2110 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2111 return;
2112
2113 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2114 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2115
2116 auto Offset0P = extractConstOffset(*Src0);
2117 if (Offset0P)
2118 BaseLo = *Src1;
2119 else {
2120 if (!(Offset0P = extractConstOffset(*Src1)))
2121 return;
2122 BaseLo = *Src0;
2123 }
2124
2125 if (!BaseLo.isReg())
2126 return;
2127
2128 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2129 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2130
2131 if (Src0->isImm())
2132 std::swap(Src0, Src1);
2133
2134 if (!Src1->isImm() || Src0->isImm())
2135 return;
2136
2137 uint64_t Offset1 = Src1->getImm();
2138 BaseHi = *Src0;
2139
2140 if (!BaseHi.isReg())
2141 return;
2142
2143 Addr.Base.LoReg = BaseLo.getReg();
2144 Addr.Base.HiReg = BaseHi.getReg();
2145 Addr.Base.LoSubReg = BaseLo.getSubReg();
2146 Addr.Base.HiSubReg = BaseHi.getSubReg();
2147 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2148 }
2149
2150 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2151 MachineInstr &MI,
2152 MemInfoMap &Visited,
2153 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2154
2155 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2156 return false;
2157
2158 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2159 if (SIInstrInfo::isFLATScratch(MI))
2160 return false;
2161
2162 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ?
AMDGPUAS::GLOBAL_ADDRESS
2163 : AMDGPUAS::FLAT_ADDRESS;
2164
2165 if (AnchorList.count(&MI))
2166 return false;
2167
2168 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2169
2170 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2171 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2172 return false;
2173 }
2174
2175 // Step 1: Find the base registers and a 64-bit constant offset.
2176 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2177 MemAddress MAddr;
2178 if (!Visited.contains(&MI)) {
2179 processBaseWithConstOffset(Base, MAddr);
2180 Visited[&MI] = MAddr;
2181 } else
2182 MAddr = Visited[&MI];
2183
2184 if (MAddr.Offset == 0) {
2185 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2186 " constant offsets that can be promoted.\n";);
2187 return false;
2188 }
2189
2190 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2191 << printReg(MAddr.Base.LoReg, TRI)
2192 << "} Offset: " << MAddr.Offset << "\n\n";);
2193
2194 // Step 2: Traverse MI's basic block and find an anchor (an instruction with
2195 // the same base registers) at the highest 13-bit distance from MI's offset.
2196 // E.g. (64-bit loads)
2197 // bb:
2198 // addr1 = &a + 4096; load1 = load(addr1, 0)
2199 // addr2 = &a + 6144; load2 = load(addr2, 0)
2200 // addr3 = &a + 8192; load3 = load(addr3, 0)
2201 // addr4 = &a + 10240; load4 = load(addr4, 0)
2202 // addr5 = &a + 12288; load5 = load(addr5, 0)
2203 //
2204 // Starting from the first load, the optimization tries to find a new base
2205 // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2206 // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2207 // &a + 8192 as the new base (anchor) because the larger distance presumably
2208 // lets more intermediate addresses be reached from the same base.
2209 //
2210 // Step 3: Move (&a + 8192) above load1, then compute and promote offsets
2211 // relative to (&a + 8192) for load1, load2 and load4.
2212 // addr = &a + 8192
2213 // load1 = load(addr, -4096)
2214 // load2 = load(addr, -2048)
2215 // load3 = load(addr, 0)
2216 // load4 = load(addr, 2048)
2217 // addr5 = &a + 12288; load5 = load(addr5, 0)
2218 //
2219 MachineInstr *AnchorInst = nullptr;
2220 MemAddress AnchorAddr;
2221 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2222 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2223
2224 MachineBasicBlock *MBB = MI.getParent();
2225 MachineBasicBlock::iterator E = MBB->end();
2226 MachineBasicBlock::iterator MBBI = MI.getIterator();
2227 ++MBBI;
2228 const SITargetLowering *TLI =
2229 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2230
2231 for ( ; MBBI != E; ++MBBI) {
2232 MachineInstr &MINext = *MBBI;
2233 // TODO: Support finding an anchor (with the same base) from store addresses
2234 // or any other load addresses where the opcodes are different.
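// For now only instructions with exactly the same opcode as MI, and whose
// offset operand is still zero, are considered as anchor candidates; e.g.
// (illustrative limitation) a GLOBAL_LOAD_DWORDX2 is never anchored on a
// GLOBAL_STORE_DWORDX2 even when both share the same base registers.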
2235 if (MINext.getOpcode() != MI.getOpcode() || 2236 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2237 continue; 2238 2239 const MachineOperand &BaseNext = 2240 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2241 MemAddress MAddrNext; 2242 if (!Visited.contains(&MINext)) { 2243 processBaseWithConstOffset(BaseNext, MAddrNext); 2244 Visited[&MINext] = MAddrNext; 2245 } else 2246 MAddrNext = Visited[&MINext]; 2247 2248 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2249 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2250 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2251 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2252 continue; 2253 2254 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset); 2255 2256 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2257 TargetLoweringBase::AddrMode AM; 2258 AM.HasBaseReg = true; 2259 AM.BaseOffs = Dist; 2260 if (TLI->isLegalFlatAddressingMode(AM, AS) && 2261 (uint32_t)std::abs(Dist) > MaxDist) { 2262 MaxDist = std::abs(Dist); 2263 2264 AnchorAddr = MAddrNext; 2265 AnchorInst = &MINext; 2266 } 2267 } 2268 2269 if (AnchorInst) { 2270 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2271 AnchorInst->dump()); 2272 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2273 << AnchorAddr.Offset << "\n\n"); 2274 2275 // Instead of moving up, just re-compute anchor-instruction's base address. 2276 Register Base = computeBase(MI, AnchorAddr); 2277 2278 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2279 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2280 2281 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { 2282 TargetLoweringBase::AddrMode AM; 2283 AM.HasBaseReg = true; 2284 AM.BaseOffs = OtherOffset - AnchorAddr.Offset; 2285 2286 if (TLI->isLegalFlatAddressingMode(AM, AS)) { 2287 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; 2288 OtherMI->dump()); 2289 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); 2290 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); 2291 } 2292 } 2293 AnchorList.insert(AnchorInst); 2294 return true; 2295 } 2296 2297 return false; 2298 } 2299 2300 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2301 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2302 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2303 if (AddrList.front().InstClass == CI.InstClass && 2304 AddrList.front().IsAGPR == CI.IsAGPR && 2305 AddrList.front().hasSameBaseAddress(CI)) { 2306 AddrList.emplace_back(CI); 2307 return; 2308 } 2309 } 2310 2311 // Base address not found, so add a new list. 2312 MergeableInsts.emplace_back(1, CI); 2313 } 2314 2315 std::pair<MachineBasicBlock::iterator, bool> 2316 SILoadStoreOptimizer::collectMergeableInsts( 2317 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2318 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2319 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2320 bool Modified = false; 2321 2322 // Sort potential mergeable instructions into lists. One list per base address. 2323 unsigned Order = 0; 2324 MachineBasicBlock::iterator BlockI = Begin; 2325 for (; BlockI != End; ++BlockI) { 2326 MachineInstr &MI = *BlockI; 2327 2328 // We run this before checking if an address is mergeable, because it can produce 2329 // better code even if the instructions aren't mergeable. 
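// Visited caches the base/offset analysis per instruction so
// processBaseWithConstOffset runs at most once per address, and AnchorList
// remembers instructions already chosen as anchors so they are not themselves
// rewritten by a later promotion.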
2330 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2331 Modified = true;
2332
2333 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2334 // barriers. Merges past such a barrier go into a separate merge list.
2335 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2336 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2337
2338 // Search will resume after this instruction in a separate merge list.
2339 ++BlockI;
2340 break;
2341 }
2342
2343 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2344 if (InstClass == UNKNOWN)
2345 continue;
2346
2347 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2348 int Swizzled =
2349 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2350 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2351 continue;
2352
2353 CombineInfo CI;
2354 CI.setMI(MI, *this);
2355 CI.Order = Order++;
2356
2357 if (!CI.hasMergeableAddress(*MRI))
2358 continue;
2359
2360 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2361 // FIXME: Nothing is actually illegal in a ds_write2 opcode with two AGPR
2362 // data operands. However, we report that ds_write2 takes only VGPR data
2363 // so that machine copy propagation does not create an instruction that
2364 // mixes VGPR and AGPR sources.
2365 // Consequently, if we created such an instruction, the verifier
2366 // would complain.
2367 continue;
2368 }
2369
2370 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2371
2372 addInstToMergeableList(CI, MergeableInsts);
2373 }
2374
2375 // At this point we have lists of mergeable instructions.
2376 //
2377 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2378 // list try to find an instruction that can be merged with I. If an instruction
2379 // is found, it is stored in the Paired field. If no instructions are found, then
2380 // the CombineInfo object is deleted from the list.
2381
2382 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2383 E = MergeableInsts.end(); I != E;) {
2384
2385 std::list<CombineInfo> &MergeList = *I;
2386 if (MergeList.size() <= 1) {
2387 // This means we have found only one instruction with a given address
2388 // that can be merged, and we need at least 2 instructions to do a merge,
2389 // so this list can be discarded.
2390 I = MergeableInsts.erase(I);
2391 continue;
2392 }
2393
2394 // Sort the lists by offset; this way mergeable instructions will be
2395 // adjacent to each other in the list, which will make it easier to find
2396 // matches.
2397 MergeList.sort(
2398 [] (const CombineInfo &A, const CombineInfo &B) {
2399 return A.Offset < B.Offset;
2400 });
2401 ++I;
2402 }
2403
2404 return {BlockI, Modified};
2405 }
2406
2407 // Scan through looking for adjacent LDS operations with constant offsets from
2408 // the same base register. We rely on the scheduler to do the hard work of
2409 // clustering nearby loads, and assume these are all adjacent.
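// Illustrative example of the re-optimization loop (offsets made up): four
// mergeable S_LOAD_DWORD_IMM at offsets 0, 4, 8 and 12 are first combined
// into two S_LOAD_DWORDX2_IMM at offsets 0 and 8; since the combined width is
// still below the maximum, OptimizeListAgain/OptimizeAgain are set and a
// second round merges those into a single S_LOAD_DWORDX4_IMM at offset 0.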
2410 bool SILoadStoreOptimizer::optimizeBlock( 2411 std::list<std::list<CombineInfo> > &MergeableInsts) { 2412 bool Modified = false; 2413 2414 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2415 E = MergeableInsts.end(); I != E;) { 2416 std::list<CombineInfo> &MergeList = *I; 2417 2418 bool OptimizeListAgain = false; 2419 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2420 // We weren't able to make any changes, so delete the list so we don't 2421 // process the same instructions the next time we try to optimize this 2422 // block. 2423 I = MergeableInsts.erase(I); 2424 continue; 2425 } 2426 2427 Modified = true; 2428 2429 // We made changes, but also determined that there were no more optimization 2430 // opportunities, so we don't need to reprocess the list 2431 if (!OptimizeListAgain) { 2432 I = MergeableInsts.erase(I); 2433 continue; 2434 } 2435 OptimizeAgain = true; 2436 } 2437 return Modified; 2438 } 2439 2440 bool 2441 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2442 std::list<CombineInfo> &MergeList, 2443 bool &OptimizeListAgain) { 2444 if (MergeList.empty()) 2445 return false; 2446 2447 bool Modified = false; 2448 2449 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2450 Next = std::next(I)) { 2451 2452 auto First = I; 2453 auto Second = Next; 2454 2455 if ((*First).Order > (*Second).Order) 2456 std::swap(First, Second); 2457 CombineInfo &CI = *First; 2458 CombineInfo &Paired = *Second; 2459 2460 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2461 if (!Where) { 2462 ++I; 2463 continue; 2464 } 2465 2466 Modified = true; 2467 2468 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2469 2470 MachineBasicBlock::iterator NewMI; 2471 switch (CI.InstClass) { 2472 default: 2473 llvm_unreachable("unknown InstClass"); 2474 break; 2475 case DS_READ: 2476 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2477 break; 2478 case DS_WRITE: 2479 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2480 break; 2481 case S_BUFFER_LOAD_IMM: 2482 case S_BUFFER_LOAD_SGPR_IMM: 2483 case S_LOAD_IMM: 2484 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2485 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2486 break; 2487 case BUFFER_LOAD: 2488 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2489 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2490 break; 2491 case BUFFER_STORE: 2492 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2493 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2494 break; 2495 case MIMG: 2496 NewMI = mergeImagePair(CI, Paired, Where->I); 2497 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2498 break; 2499 case TBUFFER_LOAD: 2500 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2501 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2502 break; 2503 case TBUFFER_STORE: 2504 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2505 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2506 break; 2507 case FLAT_LOAD: 2508 case GLOBAL_LOAD: 2509 case GLOBAL_LOAD_SADDR: 2510 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2511 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2512 break; 2513 case FLAT_STORE: 2514 case GLOBAL_STORE: 2515 case GLOBAL_STORE_SADDR: 2516 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2517 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2518 break; 2519 } 2520 CI.setMI(NewMI, *this); 2521 CI.Order = Where->Order; 2522 if (I == Second) 2523 I = Next; 2524 2525 MergeList.erase(Second); 2526 } 2527 2528 return Modified; 2529 } 2530 
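// Pass entry points: the legacy pass below obtains alias analysis through
// AAResultsWrapperPass, while the new pass manager version further down uses
// the FunctionAnalysisManagerMachineFunctionProxy; both simply delegate to
// SILoadStoreOptimizer::run().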
2531 bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2532 if (skipFunction(MF.getFunction()))
2533 return false;
2534 return SILoadStoreOptimizer(
2535 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2536 .run(MF);
2537 }
2538
2539 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2540 STM = &MF.getSubtarget<GCNSubtarget>();
2541 if (!STM->loadStoreOptEnabled())
2542 return false;
2543
2544 TII = STM->getInstrInfo();
2545 TRI = &TII->getRegisterInfo();
2546
2547 MRI = &MF.getRegInfo();
2548
2549 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2550
2551 bool Modified = false;
2552
2553 // Contains the list of instructions for which constant offsets are being
2554 // promoted to the immediate. This is tracked for an entire block at a time.
2555 SmallPtrSet<MachineInstr *, 4> AnchorList;
2556 MemInfoMap Visited;
2557
2558 for (MachineBasicBlock &MBB : MF) {
2559 MachineBasicBlock::iterator SectionEnd;
2560 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2561 I = SectionEnd) {
2562 bool CollectModified;
2563 std::list<std::list<CombineInfo>> MergeableInsts;
2564
2565 // First pass: Collect a list of all instructions we know how to merge in a
2566 // subset of the block.
2567 std::tie(SectionEnd, CollectModified) =
2568 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2569
2570 Modified |= CollectModified;
2571
2572 do {
2573 OptimizeAgain = false;
2574 Modified |= optimizeBlock(MergeableInsts);
2575 } while (OptimizeAgain);
2576 }
2577
2578 Visited.clear();
2579 AnchorList.clear();
2580 }
2581
2582 return Modified;
2583 }
2584
2585 PreservedAnalyses
2586 SILoadStoreOptimizerPass::run(MachineFunction &MF,
2587 MachineFunctionAnalysisManager &MFAM) {
2588 MFPropsModifier _(*this, MF);
2589
2590 if (MF.getFunction().hasOptNone())
2591 return PreservedAnalyses::all();
2592
2593 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2594 .getManager();
2595 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2596
2597 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2598 if (!Changed)
2599 return PreservedAnalyses::all();
2600
2601 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2602 PA.preserveSet<CFGAnalyses>();
2603 return PA;
2604 }
2605