//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset into the immediate field.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to each other to fit in 8 bits, we can add
//   to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };
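
  // Illustrative example of the ordering above (example values only): two
  // MIMG accesses with DMask 0x1 and 0x2 sort with the 0x1 access first,
  // while for every other class the access with the smaller immediate offset
  // is treated as the leading one (e.g. a ds_read_b32 at offset:16 leads one
  // at offset:32).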

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non segment specific FLAT
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}
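
// Illustrative example of the classification above: merging FLAT_LOAD_DWORD
// with GLOBAL_LOAD_DWORD (both classified FLAT_LOAD) stays FLAT_LOAD, since
// only one access is segment-specific and the merged access must remain flat.
// Merging GLOBAL_LOAD_DWORD with GLOBAL_LOAD_DWORDX2 (both FLAT global) is
// reported as GLOBAL_LOAD, which getNewOpcode uses to select
// GLOBAL_LOAD_DWORDX3.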

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
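
// For example, reading the switch above: GLOBAL_LOAD_DWORD_SADDR reports
// SAddr + VAddr, S_BUFFER_LOAD_DWORD_SGPR_IMM reports SOffset + SBase, and
// DS_READ_B32 reports only Addr. These flags determine which operands setMI
// records as addresses for hasSameBaseAddress()/hasMergeableAddress().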

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
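
// As an illustration: for a gfx9 `ds_read_b64 v[0:1], v2 offset:64`, setMI
// records InstClass = DS_READ, EltSize = 8, Width = 2, Offset = 64 (masked to
// 16 bits), and a single address operand (addr = v2).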

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}
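
// A worked example of the DMask check above (illustrative values): for
// DMask = 0b0011 and 0b0100, MaxMask = 0b0100 has two trailing zeros, so the
// smaller mask may only use bits below bit 2; MinMask = 0b0011 < (1 << 2), so
// the masks do not overlap and the pair is combinable. DMask = 0b0011 and
// 0b0110 would be rejected because 0b0011 >= (1 << 1).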

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}
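
// A worked example (illustrative values): for Lo = 5 and Hi = 9,
// (Lo - 1) ^ Hi = 4 ^ 9 = 0b1101 has 28 leading zeros, so the mask keeps the
// top 29 bits and clears the low 3; Hi & mask = 8, the value in [5, 9] that
// is aligned to the highest power of two.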

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}
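
// A worked DS example (illustrative byte offsets): two ds_read_b32 at byte
// offsets 0x3000 and 0x3100 have element offsets 3072 and 3136. Neither fits
// in 8 bits, but both are multiples of 64, so the stride-64 path above merges
// them into a ds_read2st64_b32 with offset0:48 offset1:49.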

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
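
// To illustrate the reordering check above on a hypothetical sequence
//   %a = load %p        ; CI
//   %x = add %y, 1      ; intervening instruction
//   %b = load %p + 4    ; Paired
// hoisting Paired up to CI is allowed because the add neither reads nor
// writes Paired's result, does not redefine Paired's address operands, and is
// not an aliasing store. For a store pair the direction is reversed: CI is
// sunk down to Paired.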

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
1587 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1588 1589 MachineInstr *New = 1590 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1591 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1592 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1593 .addImm(JoinedFormat) // format 1594 .addImm(CI.CPol) // cpol 1595 .addImm(0) // swz 1596 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1597 1598 CI.I->eraseFromParent(); 1599 Paired.I->eraseFromParent(); 1600 return New; 1601 } 1602 1603 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1604 CombineInfo &CI, CombineInfo &Paired, 1605 MachineBasicBlock::iterator InsertBefore) { 1606 MachineBasicBlock *MBB = CI.I->getParent(); 1607 DebugLoc DL = CI.I->getDebugLoc(); 1608 1609 const unsigned Opcode = getNewOpcode(CI, Paired); 1610 1611 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1612 Register DestReg = MRI->createVirtualRegister(SuperRC); 1613 1614 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1615 1616 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1617 MIB.add(*SAddr); 1618 1619 MachineInstr *New = 1620 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1621 .addImm(std::min(CI.Offset, Paired.Offset)) 1622 .addImm(CI.CPol) 1623 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1624 1625 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); 1626 1627 CI.I->eraseFromParent(); 1628 Paired.I->eraseFromParent(); 1629 return New; 1630 } 1631 1632 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1633 CombineInfo &CI, CombineInfo &Paired, 1634 MachineBasicBlock::iterator InsertBefore) { 1635 MachineBasicBlock *MBB = CI.I->getParent(); 1636 DebugLoc DL = CI.I->getDebugLoc(); 1637 1638 const unsigned Opcode = getNewOpcode(CI, Paired); 1639 1640 Register SrcReg = 1641 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1642 1643 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1644 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1645 .addReg(SrcReg, RegState::Kill); 1646 1647 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1648 MIB.add(*SAddr); 1649 1650 MachineInstr *New = 1651 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1652 .addImm(CI.CPol) 1653 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1654 1655 CI.I->eraseFromParent(); 1656 Paired.I->eraseFromParent(); 1657 return New; 1658 } 1659 1660 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1661 const CombineInfo &Paired) { 1662 const unsigned Width = CI.Width + Paired.Width; 1663 1664 switch (getCommonInstClass(CI, Paired)) { 1665 default: 1666 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1667 // FIXME: Handle d16 correctly 1668 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1669 Width); 1670 case TBUFFER_LOAD: 1671 case TBUFFER_STORE: 1672 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1673 Width); 1674 1675 case UNKNOWN: 1676 llvm_unreachable("Unknown instruction class"); 1677 case S_BUFFER_LOAD_IMM: 1678 switch (Width) { 1679 default: 1680 return 0; 1681 case 2: 1682 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1683 case 3: 1684 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1685 case 4: 1686 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1687 case 8: 1688 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1689 } 1690 case S_BUFFER_LOAD_SGPR_IMM: 1691 
switch (Width) { 1692 default: 1693 return 0; 1694 case 2: 1695 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1696 case 3: 1697 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1698 case 4: 1699 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1700 case 8: 1701 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1702 } 1703 case S_LOAD_IMM: 1704 switch (Width) { 1705 default: 1706 return 0; 1707 case 2: 1708 return AMDGPU::S_LOAD_DWORDX2_IMM; 1709 case 3: 1710 return AMDGPU::S_LOAD_DWORDX3_IMM; 1711 case 4: 1712 return AMDGPU::S_LOAD_DWORDX4_IMM; 1713 case 8: 1714 return AMDGPU::S_LOAD_DWORDX8_IMM; 1715 } 1716 case GLOBAL_LOAD: 1717 switch (Width) { 1718 default: 1719 return 0; 1720 case 2: 1721 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1722 case 3: 1723 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1724 case 4: 1725 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1726 } 1727 case GLOBAL_LOAD_SADDR: 1728 switch (Width) { 1729 default: 1730 return 0; 1731 case 2: 1732 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1733 case 3: 1734 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1735 case 4: 1736 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1737 } 1738 case GLOBAL_STORE: 1739 switch (Width) { 1740 default: 1741 return 0; 1742 case 2: 1743 return AMDGPU::GLOBAL_STORE_DWORDX2; 1744 case 3: 1745 return AMDGPU::GLOBAL_STORE_DWORDX3; 1746 case 4: 1747 return AMDGPU::GLOBAL_STORE_DWORDX4; 1748 } 1749 case GLOBAL_STORE_SADDR: 1750 switch (Width) { 1751 default: 1752 return 0; 1753 case 2: 1754 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1755 case 3: 1756 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1757 case 4: 1758 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1759 } 1760 case FLAT_LOAD: 1761 switch (Width) { 1762 default: 1763 return 0; 1764 case 2: 1765 return AMDGPU::FLAT_LOAD_DWORDX2; 1766 case 3: 1767 return AMDGPU::FLAT_LOAD_DWORDX3; 1768 case 4: 1769 return AMDGPU::FLAT_LOAD_DWORDX4; 1770 } 1771 case FLAT_STORE: 1772 switch (Width) { 1773 default: 1774 return 0; 1775 case 2: 1776 return AMDGPU::FLAT_STORE_DWORDX2; 1777 case 3: 1778 return AMDGPU::FLAT_STORE_DWORDX3; 1779 case 4: 1780 return AMDGPU::FLAT_STORE_DWORDX4; 1781 } 1782 case MIMG: 1783 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1784 "No overlaps"); 1785 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1786 } 1787 } 1788 1789 std::pair<unsigned, unsigned> 1790 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1791 const CombineInfo &Paired) { 1792 assert((CI.InstClass != MIMG || 1793 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1794 CI.Width + Paired.Width)) && 1795 "No overlaps"); 1796 1797 unsigned Idx0; 1798 unsigned Idx1; 1799 1800 static const unsigned Idxs[5][4] = { 1801 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1802 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1803 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1804 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1805 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1806 }; 1807 1808 assert(CI.Width >= 1 && CI.Width <= 4); 1809 assert(Paired.Width >= 1 && Paired.Width <= 4); 1810 1811 if (Paired < CI) { 1812 Idx1 = Idxs[0][Paired.Width - 1]; 1813 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1814 } else { 1815 Idx0 = Idxs[0][CI.Width - 1]; 1816 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1817 } 1818 1819 return {Idx0, Idx1}; 1820 } 1821 1822 const TargetRegisterClass * 1823 
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1824 const CombineInfo &Paired) const { 1825 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1826 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1827 switch (CI.Width + Paired.Width) { 1828 default: 1829 return nullptr; 1830 case 2: 1831 return &AMDGPU::SReg_64_XEXECRegClass; 1832 case 3: 1833 return &AMDGPU::SGPR_96RegClass; 1834 case 4: 1835 return &AMDGPU::SGPR_128RegClass; 1836 case 8: 1837 return &AMDGPU::SGPR_256RegClass; 1838 case 16: 1839 return &AMDGPU::SGPR_512RegClass; 1840 } 1841 } 1842 1843 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1844 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1845 ? TRI->getAGPRClassForBitWidth(BitWidth) 1846 : TRI->getVGPRClassForBitWidth(BitWidth); 1847 } 1848 1849 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1850 CombineInfo &CI, CombineInfo &Paired, 1851 MachineBasicBlock::iterator InsertBefore) { 1852 MachineBasicBlock *MBB = CI.I->getParent(); 1853 DebugLoc DL = CI.I->getDebugLoc(); 1854 1855 const unsigned Opcode = getNewOpcode(CI, Paired); 1856 1857 Register SrcReg = 1858 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1859 1860 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1861 .addReg(SrcReg, RegState::Kill); 1862 1863 AddressRegs Regs = getRegs(Opcode, *TII); 1864 1865 if (Regs.VAddr) 1866 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1867 1868 1869 // It shouldn't be possible to get this far if the two instructions 1870 // don't have a single memoperand, because MachineInstr::mayAlias() 1871 // will return true if this is the case. 1872 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1873 1874 MachineInstr *New = 1875 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1876 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1877 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1878 .addImm(CI.CPol) // cpol 1879 .addImm(0) // swz 1880 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1881 1882 CI.I->eraseFromParent(); 1883 Paired.I->eraseFromParent(); 1884 return New; 1885 } 1886 1887 MachineOperand 1888 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1889 APInt V(32, Val, true); 1890 if (TII->isInlineConstant(V)) 1891 return MachineOperand::CreateImm(Val); 1892 1893 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1894 MachineInstr *Mov = 1895 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1896 TII->get(AMDGPU::S_MOV_B32), Reg) 1897 .addImm(Val); 1898 (void)Mov; 1899 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1900 return MachineOperand::CreateReg(Reg, false); 1901 } 1902 1903 // Compute base address using Addr and return the final register. 
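// A rough sketch of the MIR this is expected to emit (register names and
// classes here are illustrative, not taken from a real test): the 64-bit add
// is split into a carry-producing low half and a carry-consuming high half,
// which are then recombined into a 64-bit VGPR pair:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, <offset_lo>, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64 %base_hi, <offset_hi>, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1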
1904 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1905 const MemAddress &Addr) const { 1906 MachineBasicBlock *MBB = MI.getParent(); 1907 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1908 DebugLoc DL = MI.getDebugLoc(); 1909 1910 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1911 Addr.Base.LoSubReg) && 1912 "Expected 32-bit Base-Register-Low!!"); 1913 1914 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1915 Addr.Base.HiSubReg) && 1916 "Expected 32-bit Base-Register-Hi!!"); 1917 1918 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1919 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1920 MachineOperand OffsetHi = 1921 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1922 1923 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1924 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1925 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1926 1927 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1928 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1929 MachineInstr *LoHalf = 1930 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1931 .addReg(CarryReg, RegState::Define) 1932 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1933 .add(OffsetLo) 1934 .addImm(0); // clamp bit 1935 (void)LoHalf; 1936 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1937 1938 MachineInstr *HiHalf = 1939 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1940 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1941 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1942 .add(OffsetHi) 1943 .addReg(CarryReg, RegState::Kill) 1944 .addImm(0); // clamp bit 1945 (void)HiHalf; 1946 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1947 1948 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1949 MachineInstr *FullBase = 1950 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1951 .addReg(DestSub0) 1952 .addImm(AMDGPU::sub0) 1953 .addReg(DestSub1) 1954 .addImm(AMDGPU::sub1); 1955 (void)FullBase; 1956 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1957 1958 return FullDestReg; 1959 } 1960 1961 // Update base and offset with the NewBase and NewOffset in MI. 
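// Only the vaddr register and the immediate offset operand are rewritten in
// place, and the kill flag on the base operand is cleared because the new base
// may have further uses. For example (operands purely illustrative), a global
// load rebased onto %newbase with a residual offset of -4096 becomes:
//   GLOBAL_LOAD_DWORDX2 %dst, %newbase, -4096, 0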
1962 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 1963 Register NewBase, 1964 int32_t NewOffset) const { 1965 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 1966 Base->setReg(NewBase); 1967 Base->setIsKill(false); 1968 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 1969 } 1970 1971 std::optional<int32_t> 1972 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 1973 if (Op.isImm()) 1974 return Op.getImm(); 1975 1976 if (!Op.isReg()) 1977 return std::nullopt; 1978 1979 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 1980 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 1981 !Def->getOperand(1).isImm()) 1982 return std::nullopt; 1983 1984 return Def->getOperand(1).getImm(); 1985 } 1986 1987 // Analyze Base and extracts: 1988 // - 32bit base registers, subregisters 1989 // - 64bit constant offset 1990 // Expecting base computation as: 1991 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 1992 // %LO:vgpr_32, %c:sreg_64_xexec = 1993 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 1994 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 1995 // %Base:vreg_64 = 1996 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 1997 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 1998 MemAddress &Addr) const { 1999 if (!Base.isReg()) 2000 return; 2001 2002 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2003 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2004 || Def->getNumOperands() != 5) 2005 return; 2006 2007 MachineOperand BaseLo = Def->getOperand(1); 2008 MachineOperand BaseHi = Def->getOperand(3); 2009 if (!BaseLo.isReg() || !BaseHi.isReg()) 2010 return; 2011 2012 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2013 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2014 2015 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2016 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2017 return; 2018 2019 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2020 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2021 2022 auto Offset0P = extractConstOffset(*Src0); 2023 if (Offset0P) 2024 BaseLo = *Src1; 2025 else { 2026 if (!(Offset0P = extractConstOffset(*Src1))) 2027 return; 2028 BaseLo = *Src0; 2029 } 2030 2031 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2032 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2033 2034 if (Src0->isImm()) 2035 std::swap(Src0, Src1); 2036 2037 if (!Src1->isImm() || Src0->isImm()) 2038 return; 2039 2040 uint64_t Offset1 = Src1->getImm(); 2041 BaseHi = *Src0; 2042 2043 Addr.Base.LoReg = BaseLo.getReg(); 2044 Addr.Base.HiReg = BaseHi.getReg(); 2045 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2046 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2047 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2048 } 2049 2050 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2051 MachineInstr &MI, 2052 MemInfoMap &Visited, 2053 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2054 2055 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) 2056 return false; 2057 2058 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers. 2059 if (SIInstrInfo::isFLATScratch(MI)) 2060 return false; 2061 2062 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? 
AMDGPUAS::GLOBAL_ADDRESS
2063                                               : AMDGPUAS::FLAT_ADDRESS;
2064
2065   if (AnchorList.count(&MI))
2066     return false;
2067
2068   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2069
2070   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2071     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2072     return false;
2073   }
2074
2075   // Step1: Find the base registers and a 64-bit constant offset.
2076   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2077   MemAddress MAddr;
2078   if (!Visited.contains(&MI)) {
2079     processBaseWithConstOffset(Base, MAddr);
2080     Visited[&MI] = MAddr;
2081   } else
2082     MAddr = Visited[&MI];
2083
2084   if (MAddr.Offset == 0) {
2085     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2086                          " constant offsets that can be promoted.\n";);
2087     return false;
2088   }
2089
2090   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2091                     << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2092
2093   // Step2: Traverse through MI's basic block and find an anchor (one that has
2094   // the same base registers) with the highest 13-bit distance from MI's offset.
2095   // E.g. (64-bit loads)
2096   // bb:
2097   //   addr1 = &a + 4096;   load1 = load(addr1, 0)
2098   //   addr2 = &a + 6144;   load2 = load(addr2, 0)
2099   //   addr3 = &a + 8192;   load3 = load(addr3, 0)
2100   //   addr4 = &a + 10240;  load4 = load(addr4, 0)
2101   //   addr5 = &a + 12288;  load5 = load(addr5, 0)
2102   //
2103   // Starting from the first load, the optimization will try to find a new base
2104   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2105   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
2106   // new base (anchor) because it has the maximum distance, which presumably
2107   // lets it cover more intermediate addresses.
2108   //
2109   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2110   // (&a + 8192) for load1, load2, load4.
2111   //   addr = &a + 8192
2112   //   load1 = load(addr, -4096)
2113   //   load2 = load(addr, -2048)
2114   //   load3 = load(addr, 0)
2115   //   load4 = load(addr, 2048)
2116   //   addr5 = &a + 12288;  load5 = load(addr5, 0)
2117   //
2118   MachineInstr *AnchorInst = nullptr;
2119   MemAddress AnchorAddr;
2120   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2121   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2122
2123   MachineBasicBlock *MBB = MI.getParent();
2124   MachineBasicBlock::iterator E = MBB->end();
2125   MachineBasicBlock::iterator MBBI = MI.getIterator();
2126   ++MBBI;
2127   const SITargetLowering *TLI =
2128       static_cast<const SITargetLowering *>(STM->getTargetLowering());
2129
2130   for ( ; MBBI != E; ++MBBI) {
2131     MachineInstr &MINext = *MBBI;
2132     // TODO: Support finding an anchor (with the same base) from store addresses
2133     // or any other load addresses where the opcodes are different.
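    // Continuing the example above (values illustrative): with MI at &a + 4096
    // and a candidate at &a + 8192, Dist = 4096 - 8192 = -4096; whether that
    // distance can be folded into the immediate is decided by the
    // isLegalFlatAddressingMode() check below, which depends on the subtarget's
    // flat/global offset range.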
2134 if (MINext.getOpcode() != MI.getOpcode() || 2135 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2136 continue; 2137 2138 const MachineOperand &BaseNext = 2139 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2140 MemAddress MAddrNext; 2141 if (!Visited.contains(&MINext)) { 2142 processBaseWithConstOffset(BaseNext, MAddrNext); 2143 Visited[&MINext] = MAddrNext; 2144 } else 2145 MAddrNext = Visited[&MINext]; 2146 2147 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2148 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2149 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2150 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2151 continue; 2152 2153 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset); 2154 2155 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2156 TargetLoweringBase::AddrMode AM; 2157 AM.HasBaseReg = true; 2158 AM.BaseOffs = Dist; 2159 if (TLI->isLegalFlatAddressingMode(AM, AS) && 2160 (uint32_t)std::abs(Dist) > MaxDist) { 2161 MaxDist = std::abs(Dist); 2162 2163 AnchorAddr = MAddrNext; 2164 AnchorInst = &MINext; 2165 } 2166 } 2167 2168 if (AnchorInst) { 2169 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2170 AnchorInst->dump()); 2171 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2172 << AnchorAddr.Offset << "\n\n"); 2173 2174 // Instead of moving up, just re-compute anchor-instruction's base address. 2175 Register Base = computeBase(MI, AnchorAddr); 2176 2177 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2178 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2179 2180 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { 2181 TargetLoweringBase::AddrMode AM; 2182 AM.HasBaseReg = true; 2183 AM.BaseOffs = OtherOffset - AnchorAddr.Offset; 2184 2185 if (TLI->isLegalFlatAddressingMode(AM, AS)) { 2186 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; 2187 OtherMI->dump()); 2188 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); 2189 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); 2190 } 2191 } 2192 AnchorList.insert(AnchorInst); 2193 return true; 2194 } 2195 2196 return false; 2197 } 2198 2199 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2200 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2201 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2202 if (AddrList.front().InstClass == CI.InstClass && 2203 AddrList.front().IsAGPR == CI.IsAGPR && 2204 AddrList.front().hasSameBaseAddress(CI)) { 2205 AddrList.emplace_back(CI); 2206 return; 2207 } 2208 } 2209 2210 // Base address not found, so add a new list. 2211 MergeableInsts.emplace_back(1, CI); 2212 } 2213 2214 std::pair<MachineBasicBlock::iterator, bool> 2215 SILoadStoreOptimizer::collectMergeableInsts( 2216 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2217 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2218 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2219 bool Modified = false; 2220 2221 // Sort potential mergeable instructions into lists. One list per base address. 2222 unsigned Order = 0; 2223 MachineBasicBlock::iterator BlockI = Begin; 2224 for (; BlockI != End; ++BlockI) { 2225 MachineInstr &MI = *BlockI; 2226 2227 // We run this before checking if an address is mergeable, because it can produce 2228 // better code even if the instructions aren't mergeable. 
2229     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2230       Modified = true;
2231
2232     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2233     // barriers. Merging may continue after the barrier, but never across it.
2234     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2235       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2236
2237       // Search will resume after this instruction in a separate merge list.
2238       ++BlockI;
2239       break;
2240     }
2241
2242     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2243     if (InstClass == UNKNOWN)
2244       continue;
2245
2246     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2247     int Swizzled =
2248         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2249     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2250       continue;
2251
2252     CombineInfo CI;
2253     CI.setMI(MI, *this);
2254     CI.Order = Order++;
2255
2256     if (!CI.hasMergeableAddress(*MRI))
2257       continue;
2258
2259     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2260       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2261       //        operands. However, we report that ds_write2 takes only VGPR
2262       //        data so that machine copy propagation does not create an
2263       //        illegal instruction with mixed VGPR and AGPR sources.
2264       //        Consequently, if we created such an instruction, the verifier
2265       //        would complain.
2266       continue;
2267     }
2268
2269     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2270
2271     addInstToMergeableList(CI, MergeableInsts);
2272   }
2273
2274   // At this point we have lists of mergeable instructions.
2275   //
2276   // Part 2: Sort each list by offset, so that instructions which can be merged
2277   // end up adjacent to one another. Lists that contain only a single
2278   // instruction cannot produce a merge and are discarded here; the actual
2279   // pairing and merging happens later, in optimizeInstsWithSameBaseAddr().
2280
2281   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2282        E = MergeableInsts.end(); I != E;) {
2283
2284     std::list<CombineInfo> &MergeList = *I;
2285     if (MergeList.size() <= 1) {
2286       // This means we have found only one instruction with a given address
2287       // that can be merged, and we need at least 2 instructions to do a merge,
2288       // so this list can be discarded.
2289       I = MergeableInsts.erase(I);
2290       continue;
2291     }
2292
2293     // Sort the lists by offset; this way mergeable instructions will be
2294     // adjacent to each other in the list, which will make it easier to find
2295     // matches.
2296     MergeList.sort(
2297         [] (const CombineInfo &A, const CombineInfo &B) {
2298           return A.Offset < B.Offset;
2299         });
2300     ++I;
2301   }
2302
2303   return {BlockI, Modified};
2304 }
2305
2306 // Scan through looking for adjacent LDS operations with constant offsets from
2307 // the same base register. We rely on the scheduler to do the hard work of
2308 // clustering nearby loads, and assume these are all adjacent.
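// Note that optimizeBlock() may be invoked repeatedly on the same lists (see
// the OptimizeAgain flag in runOnMachineFunction()): a merged access can
// itself be merged again on a later iteration, e.g. two dwordx2 results formed
// from dword pairs may subsequently combine into a single dwordx4.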
2309 bool SILoadStoreOptimizer::optimizeBlock( 2310 std::list<std::list<CombineInfo> > &MergeableInsts) { 2311 bool Modified = false; 2312 2313 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2314 E = MergeableInsts.end(); I != E;) { 2315 std::list<CombineInfo> &MergeList = *I; 2316 2317 bool OptimizeListAgain = false; 2318 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2319 // We weren't able to make any changes, so delete the list so we don't 2320 // process the same instructions the next time we try to optimize this 2321 // block. 2322 I = MergeableInsts.erase(I); 2323 continue; 2324 } 2325 2326 Modified = true; 2327 2328 // We made changes, but also determined that there were no more optimization 2329 // opportunities, so we don't need to reprocess the list 2330 if (!OptimizeListAgain) { 2331 I = MergeableInsts.erase(I); 2332 continue; 2333 } 2334 OptimizeAgain = true; 2335 } 2336 return Modified; 2337 } 2338 2339 bool 2340 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2341 std::list<CombineInfo> &MergeList, 2342 bool &OptimizeListAgain) { 2343 if (MergeList.empty()) 2344 return false; 2345 2346 bool Modified = false; 2347 2348 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2349 Next = std::next(I)) { 2350 2351 auto First = I; 2352 auto Second = Next; 2353 2354 if ((*First).Order > (*Second).Order) 2355 std::swap(First, Second); 2356 CombineInfo &CI = *First; 2357 CombineInfo &Paired = *Second; 2358 2359 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2360 if (!Where) { 2361 ++I; 2362 continue; 2363 } 2364 2365 Modified = true; 2366 2367 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2368 2369 MachineBasicBlock::iterator NewMI; 2370 switch (CI.InstClass) { 2371 default: 2372 llvm_unreachable("unknown InstClass"); 2373 break; 2374 case DS_READ: 2375 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2376 break; 2377 case DS_WRITE: 2378 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2379 break; 2380 case S_BUFFER_LOAD_IMM: 2381 case S_BUFFER_LOAD_SGPR_IMM: 2382 case S_LOAD_IMM: 2383 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2384 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2385 break; 2386 case BUFFER_LOAD: 2387 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2388 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2389 break; 2390 case BUFFER_STORE: 2391 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2392 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2393 break; 2394 case MIMG: 2395 NewMI = mergeImagePair(CI, Paired, Where->I); 2396 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2397 break; 2398 case TBUFFER_LOAD: 2399 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2400 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2401 break; 2402 case TBUFFER_STORE: 2403 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2404 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2405 break; 2406 case FLAT_LOAD: 2407 case GLOBAL_LOAD: 2408 case GLOBAL_LOAD_SADDR: 2409 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2410 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2411 break; 2412 case FLAT_STORE: 2413 case GLOBAL_STORE: 2414 case GLOBAL_STORE_SADDR: 2415 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2416 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2417 break; 2418 } 2419 CI.setMI(NewMI, *this); 2420 CI.Order = Where->Order; 2421 if (I == Second) 2422 I = Next; 2423 2424 MergeList.erase(Second); 2425 } 2426 2427 return Modified; 2428 } 2429 
2430 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2431   if (skipFunction(MF.getFunction()))
2432     return false;
2433
2434   STM = &MF.getSubtarget<GCNSubtarget>();
2435   if (!STM->loadStoreOptEnabled())
2436     return false;
2437
2438   TII = STM->getInstrInfo();
2439   TRI = &TII->getRegisterInfo();
2440
2441   MRI = &MF.getRegInfo();
2442   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2443
2444   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2445
2446   bool Modified = false;
2447
2448   // Contains the instructions that have already served as an anchor when
2449   // promoting constant offsets to the immediate (see
2450   // promoteConstantOffsetToImm). This is tracked for an entire block at a time.
2450   SmallPtrSet<MachineInstr *, 4> AnchorList;
2451   MemInfoMap Visited;
2452
2453   for (MachineBasicBlock &MBB : MF) {
2454     MachineBasicBlock::iterator SectionEnd;
2455     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2456          I = SectionEnd) {
2457       bool CollectModified;
2458       std::list<std::list<CombineInfo>> MergeableInsts;
2459
2460       // First pass: collect a list of all instructions we know how to merge in
2461       // a subset of the block.
2462       std::tie(SectionEnd, CollectModified) =
2463           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2464
2465       Modified |= CollectModified;
2466
2467       do {
2468         OptimizeAgain = false;
2469         Modified |= optimizeBlock(MergeableInsts);
2470       } while (OptimizeAgain);
2471     }
2472
2473     Visited.clear();
2474     AnchorList.clear();
2475   }
2476
2477   return Modified;
2478 }
2479