//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together, we can add to the base pointer and
//   use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ?
                                 DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand
                                      &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
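      // Only plain image loads are merged below: image stores, image atomics
      // (which also store), and gather4 (whose dmask selects a single
      // component to gather rather than a set of result components) are
      // rejected.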
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are segment-specific FLAT GLOBAL, adjust the class to
// GLOBAL_LOAD or GLOBAL_STORE. If either or both instructions are
// non-segment-specific FLAT, the resulting combined operation will be FLAT,
// potentially promoting one of the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
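// For example, two GLOBAL_LOAD_DWORDs (class FLAT_LOAD, both segment-specific)
// combine as GLOBAL_LOAD, while a GLOBAL_LOAD_DWORD paired with a
// FLAT_LOAD_DWORD keeps the FLAT_LOAD class for the combined operation.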
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
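  // All bits of the smaller dmask must lie strictly below the lowest set bit
  // of the larger dmask, i.e. one instruction's components must all come
  // before the other's. For example, dmasks 0x3 and 0xc can be combined
  // (merged dmask 0xf), while interleaved masks such as 0x3 and 0x6 cannot.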
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
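  // For the non-DS classes the two accesses must be exactly back-to-back in
  // element units: one access has to end where the other begins. For example,
  // a dword at element offset 1 followed by a dwordx2 at element offset 2 can
  // merge into a dwordx3 at element offset 1.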
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
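      // For example, if Min == 0x123 and Max == 0x1f0, BaseOff may be any
      // value in [Max - 0xff, Min] == [0xf1, 0x123]; mostAlignedValueInRange
      // picks 0x100, the most aligned value in that range.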
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.
  // This should return true, because this function should only be called on
  // CombineInfo objects that have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
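  // The original vdst registers are redefined as sub-register copies of the
  // merged result; these COPYs are normally expected to be folded away later
  // by the register coalescer.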
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
1570 .addReg(DestReg, 0, SubRegIdx0); 1571 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1572 .add(*Dest1) 1573 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1574 1575 CI.I->eraseFromParent(); 1576 Paired.I->eraseFromParent(); 1577 return New; 1578 } 1579 1580 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1581 CombineInfo &CI, CombineInfo &Paired, 1582 MachineBasicBlock::iterator InsertBefore) { 1583 MachineBasicBlock *MBB = CI.I->getParent(); 1584 DebugLoc DL = CI.I->getDebugLoc(); 1585 1586 const unsigned Opcode = getNewOpcode(CI, Paired); 1587 1588 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1589 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1590 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1591 1592 // Copy to the new source register. 1593 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1594 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1595 1596 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1597 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1598 1599 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1600 .add(*Src0) 1601 .addImm(SubRegIdx0) 1602 .add(*Src1) 1603 .addImm(SubRegIdx1); 1604 1605 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1606 .addReg(SrcReg, RegState::Kill); 1607 1608 AddressRegs Regs = getRegs(Opcode, *TII); 1609 1610 if (Regs.VAddr) 1611 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1612 1613 unsigned JoinedFormat = 1614 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1615 1616 // It shouldn't be possible to get this far if the two instructions 1617 // don't have a single memoperand, because MachineInstr::mayAlias() 1618 // will return true if this is the case. 1619 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1620 1621 MachineInstr *New = 1622 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1623 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1624 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1625 .addImm(JoinedFormat) // format 1626 .addImm(CI.CPol) // cpol 1627 .addImm(0) // swz 1628 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1629 1630 CI.I->eraseFromParent(); 1631 Paired.I->eraseFromParent(); 1632 return New; 1633 } 1634 1635 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1636 CombineInfo &CI, CombineInfo &Paired, 1637 MachineBasicBlock::iterator InsertBefore) { 1638 MachineBasicBlock *MBB = CI.I->getParent(); 1639 DebugLoc DL = CI.I->getDebugLoc(); 1640 1641 const unsigned Opcode = getNewOpcode(CI, Paired); 1642 1643 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1644 Register DestReg = MRI->createVirtualRegister(SuperRC); 1645 1646 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1647 1648 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1649 MIB.add(*SAddr); 1650 1651 MachineInstr *New = 1652 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1653 .addImm(std::min(CI.Offset, Paired.Offset)) 1654 .addImm(CI.CPol) 1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1656 1657 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1658 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1659 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1660 1661 // Copy to the old destination registers. 
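  // For illustration, merging two global_load_dword results into one
  // global_load_dwordx2 keeps the original destination vregs live via
  // subregister copies, roughly:
  //   %w:vreg_64 = GLOBAL_LOAD_DWORDX2 %vaddr, 0, 0
  //   %dst0:vgpr_32 = COPY %w.sub0
  //   %dst1:vgpr_32 = COPY %w.sub1
  // so users of the original destinations do not need to be rewritten.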
1662 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1663 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1664 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1665 1666 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1667 .add(*Dest0) // Copy to same destination including flags and sub reg. 1668 .addReg(DestReg, 0, SubRegIdx0); 1669 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1670 .add(*Dest1) 1671 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1672 1673 CI.I->eraseFromParent(); 1674 Paired.I->eraseFromParent(); 1675 return New; 1676 } 1677 1678 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1679 CombineInfo &CI, CombineInfo &Paired, 1680 MachineBasicBlock::iterator InsertBefore) { 1681 MachineBasicBlock *MBB = CI.I->getParent(); 1682 DebugLoc DL = CI.I->getDebugLoc(); 1683 1684 const unsigned Opcode = getNewOpcode(CI, Paired); 1685 1686 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1687 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1688 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1689 1690 // Copy to the new source register. 1691 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1692 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1693 1694 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1695 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1696 1697 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1698 .add(*Src0) 1699 .addImm(SubRegIdx0) 1700 .add(*Src1) 1701 .addImm(SubRegIdx1); 1702 1703 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1704 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1705 .addReg(SrcReg, RegState::Kill); 1706 1707 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1708 MIB.add(*SAddr); 1709 1710 MachineInstr *New = 1711 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1712 .addImm(CI.CPol) 1713 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1714 1715 CI.I->eraseFromParent(); 1716 Paired.I->eraseFromParent(); 1717 return New; 1718 } 1719 1720 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1721 const CombineInfo &Paired) { 1722 const unsigned Width = CI.Width + Paired.Width; 1723 1724 switch (getCommonInstClass(CI, Paired)) { 1725 default: 1726 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1727 // FIXME: Handle d16 correctly 1728 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1729 Width); 1730 case TBUFFER_LOAD: 1731 case TBUFFER_STORE: 1732 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1733 Width); 1734 1735 case UNKNOWN: 1736 llvm_unreachable("Unknown instruction class"); 1737 case S_BUFFER_LOAD_IMM: 1738 switch (Width) { 1739 default: 1740 return 0; 1741 case 2: 1742 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1743 case 3: 1744 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1745 case 4: 1746 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1747 case 8: 1748 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1749 } 1750 case S_BUFFER_LOAD_SGPR_IMM: 1751 switch (Width) { 1752 default: 1753 return 0; 1754 case 2: 1755 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1756 case 3: 1757 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1758 case 4: 1759 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1760 case 8: 1761 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1762 } 1763 case S_LOAD_IMM: 1764 switch (Width) { 1765 default: 1766 return 
0; 1767 case 2: 1768 return AMDGPU::S_LOAD_DWORDX2_IMM; 1769 case 3: 1770 return AMDGPU::S_LOAD_DWORDX3_IMM; 1771 case 4: 1772 return AMDGPU::S_LOAD_DWORDX4_IMM; 1773 case 8: 1774 return AMDGPU::S_LOAD_DWORDX8_IMM; 1775 } 1776 case GLOBAL_LOAD: 1777 switch (Width) { 1778 default: 1779 return 0; 1780 case 2: 1781 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1782 case 3: 1783 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1784 case 4: 1785 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1786 } 1787 case GLOBAL_LOAD_SADDR: 1788 switch (Width) { 1789 default: 1790 return 0; 1791 case 2: 1792 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1793 case 3: 1794 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1795 case 4: 1796 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1797 } 1798 case GLOBAL_STORE: 1799 switch (Width) { 1800 default: 1801 return 0; 1802 case 2: 1803 return AMDGPU::GLOBAL_STORE_DWORDX2; 1804 case 3: 1805 return AMDGPU::GLOBAL_STORE_DWORDX3; 1806 case 4: 1807 return AMDGPU::GLOBAL_STORE_DWORDX4; 1808 } 1809 case GLOBAL_STORE_SADDR: 1810 switch (Width) { 1811 default: 1812 return 0; 1813 case 2: 1814 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1815 case 3: 1816 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1817 case 4: 1818 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1819 } 1820 case FLAT_LOAD: 1821 switch (Width) { 1822 default: 1823 return 0; 1824 case 2: 1825 return AMDGPU::FLAT_LOAD_DWORDX2; 1826 case 3: 1827 return AMDGPU::FLAT_LOAD_DWORDX3; 1828 case 4: 1829 return AMDGPU::FLAT_LOAD_DWORDX4; 1830 } 1831 case FLAT_STORE: 1832 switch (Width) { 1833 default: 1834 return 0; 1835 case 2: 1836 return AMDGPU::FLAT_STORE_DWORDX2; 1837 case 3: 1838 return AMDGPU::FLAT_STORE_DWORDX3; 1839 case 4: 1840 return AMDGPU::FLAT_STORE_DWORDX4; 1841 } 1842 case MIMG: 1843 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1844 "No overlaps"); 1845 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1846 } 1847 } 1848 1849 std::pair<unsigned, unsigned> 1850 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1851 const CombineInfo &Paired) { 1852 assert((CI.InstClass != MIMG || 1853 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1854 CI.Width + Paired.Width)) && 1855 "No overlaps"); 1856 1857 unsigned Idx0; 1858 unsigned Idx1; 1859 1860 static const unsigned Idxs[5][4] = { 1861 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1862 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1863 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1864 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1865 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1866 }; 1867 1868 assert(CI.Width >= 1 && CI.Width <= 4); 1869 assert(Paired.Width >= 1 && Paired.Width <= 4); 1870 1871 if (Paired < CI) { 1872 Idx1 = Idxs[0][Paired.Width - 1]; 1873 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1874 } else { 1875 Idx0 = Idxs[0][CI.Width - 1]; 1876 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1877 } 1878 1879 return std::pair(Idx0, Idx1); 1880 } 1881 1882 const TargetRegisterClass * 1883 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1884 const CombineInfo &Paired) { 1885 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1886 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1887 switch (CI.Width + Paired.Width) { 1888 default: 1889 return nullptr; 1890 case 2: 1891 return &AMDGPU::SReg_64_XEXECRegClass; 1892 case 3: 1893 
return &AMDGPU::SGPR_96RegClass; 1894 case 4: 1895 return &AMDGPU::SGPR_128RegClass; 1896 case 8: 1897 return &AMDGPU::SGPR_256RegClass; 1898 case 16: 1899 return &AMDGPU::SGPR_512RegClass; 1900 } 1901 } 1902 1903 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1904 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1905 ? TRI->getAGPRClassForBitWidth(BitWidth) 1906 : TRI->getVGPRClassForBitWidth(BitWidth); 1907 } 1908 1909 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1910 CombineInfo &CI, CombineInfo &Paired, 1911 MachineBasicBlock::iterator InsertBefore) { 1912 MachineBasicBlock *MBB = CI.I->getParent(); 1913 DebugLoc DL = CI.I->getDebugLoc(); 1914 1915 const unsigned Opcode = getNewOpcode(CI, Paired); 1916 1917 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1918 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1919 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1920 1921 // Copy to the new source register. 1922 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1923 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1924 1925 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1926 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1927 1928 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1929 .add(*Src0) 1930 .addImm(SubRegIdx0) 1931 .add(*Src1) 1932 .addImm(SubRegIdx1); 1933 1934 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1935 .addReg(SrcReg, RegState::Kill); 1936 1937 AddressRegs Regs = getRegs(Opcode, *TII); 1938 1939 if (Regs.VAddr) 1940 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1941 1942 1943 // It shouldn't be possible to get this far if the two instructions 1944 // don't have a single memoperand, because MachineInstr::mayAlias() 1945 // will return true if this is the case. 1946 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1947 1948 MachineInstr *New = 1949 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1950 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1951 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1952 .addImm(CI.CPol) // cpol 1953 .addImm(0) // swz 1954 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1955 1956 CI.I->eraseFromParent(); 1957 Paired.I->eraseFromParent(); 1958 return New; 1959 } 1960 1961 MachineOperand 1962 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1963 APInt V(32, Val, true); 1964 if (TII->isInlineConstant(V)) 1965 return MachineOperand::CreateImm(Val); 1966 1967 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1968 MachineInstr *Mov = 1969 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1970 TII->get(AMDGPU::S_MOV_B32), Reg) 1971 .addImm(Val); 1972 (void)Mov; 1973 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1974 return MachineOperand::CreateReg(Reg, false); 1975 } 1976 1977 // Compute base address using Addr and return the final register. 
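// A sketch of the sequence this emits (a 64-bit add of Addr.Offset to the
// split base, re-packed into a 64-bit vreg), roughly:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, %offset_lo, 0
//   %hi:vgpr_32 = V_ADDC_U32_e64 %base_hi, %offset_hi, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1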
1978 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1979 const MemAddress &Addr) const { 1980 MachineBasicBlock *MBB = MI.getParent(); 1981 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1982 DebugLoc DL = MI.getDebugLoc(); 1983 1984 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1985 Addr.Base.LoSubReg) && 1986 "Expected 32-bit Base-Register-Low!!"); 1987 1988 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1989 Addr.Base.HiSubReg) && 1990 "Expected 32-bit Base-Register-Hi!!"); 1991 1992 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1993 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1994 MachineOperand OffsetHi = 1995 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1996 1997 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1998 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1999 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 2000 2001 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2002 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2003 MachineInstr *LoHalf = 2004 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 2005 .addReg(CarryReg, RegState::Define) 2006 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 2007 .add(OffsetLo) 2008 .addImm(0); // clamp bit 2009 (void)LoHalf; 2010 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 2011 2012 MachineInstr *HiHalf = 2013 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 2014 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 2015 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 2016 .add(OffsetHi) 2017 .addReg(CarryReg, RegState::Kill) 2018 .addImm(0); // clamp bit 2019 (void)HiHalf; 2020 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 2021 2022 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 2023 MachineInstr *FullBase = 2024 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2025 .addReg(DestSub0) 2026 .addImm(AMDGPU::sub0) 2027 .addReg(DestSub1) 2028 .addImm(AMDGPU::sub1); 2029 (void)FullBase; 2030 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 2031 2032 return FullDestReg; 2033 } 2034 2035 // Update base and offset with the NewBase and NewOffset in MI. 
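// Only the vaddr register operand and the immediate offset operand are
// rewritten; the data/result operands of MI are left untouched.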
2036 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 2037 Register NewBase, 2038 int32_t NewOffset) const { 2039 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 2040 Base->setReg(NewBase); 2041 Base->setIsKill(false); 2042 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 2043 } 2044 2045 std::optional<int32_t> 2046 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 2047 if (Op.isImm()) 2048 return Op.getImm(); 2049 2050 if (!Op.isReg()) 2051 return std::nullopt; 2052 2053 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 2054 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 2055 !Def->getOperand(1).isImm()) 2056 return std::nullopt; 2057 2058 return Def->getOperand(1).getImm(); 2059 } 2060 2061 // Analyze Base and extracts: 2062 // - 32bit base registers, subregisters 2063 // - 64bit constant offset 2064 // Expecting base computation as: 2065 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 2066 // %LO:vgpr_32, %c:sreg_64_xexec = 2067 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 2068 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 2069 // %Base:vreg_64 = 2070 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 2071 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 2072 MemAddress &Addr) const { 2073 if (!Base.isReg()) 2074 return; 2075 2076 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2077 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2078 || Def->getNumOperands() != 5) 2079 return; 2080 2081 MachineOperand BaseLo = Def->getOperand(1); 2082 MachineOperand BaseHi = Def->getOperand(3); 2083 if (!BaseLo.isReg() || !BaseHi.isReg()) 2084 return; 2085 2086 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2087 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2088 2089 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2090 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2091 return; 2092 2093 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2094 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2095 2096 auto Offset0P = extractConstOffset(*Src0); 2097 if (Offset0P) 2098 BaseLo = *Src1; 2099 else { 2100 if (!(Offset0P = extractConstOffset(*Src1))) 2101 return; 2102 BaseLo = *Src0; 2103 } 2104 2105 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2106 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2107 2108 if (Src0->isImm()) 2109 std::swap(Src0, Src1); 2110 2111 if (!Src1->isImm()) 2112 return; 2113 2114 uint64_t Offset1 = Src1->getImm(); 2115 BaseHi = *Src0; 2116 2117 Addr.Base.LoReg = BaseLo.getReg(); 2118 Addr.Base.HiReg = BaseHi.getReg(); 2119 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2120 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2121 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2122 } 2123 2124 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2125 MachineInstr &MI, 2126 MemInfoMap &Visited, 2127 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2128 2129 if (!(MI.mayLoad() ^ MI.mayStore())) 2130 return false; 2131 2132 // TODO: Support flat and scratch. 
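  // Only global VMEM accesses that have a SADDR form are considered;
  // getGlobalSaddrOp returns a negative value for any other opcode.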
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset is the farthest from MI's offset
  // while still being within 13-bit reach.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) is within 13-bit reach. Both &a + 6144 and
  // &a + 8192 have a 13-bit distance from &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because it is the farthest away and
  // can therefore presumably cover more of the intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
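    // Anchor candidates are limited to instructions with the same opcode as
    // MI and a zero immediate offset, i.e. whose whole displacement is still
    // in the base computation.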
2209 if (MINext.getOpcode() != MI.getOpcode() || 2210 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2211 continue; 2212 2213 const MachineOperand &BaseNext = 2214 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2215 MemAddress MAddrNext; 2216 if (!Visited.contains(&MINext)) { 2217 processBaseWithConstOffset(BaseNext, MAddrNext); 2218 Visited[&MINext] = MAddrNext; 2219 } else 2220 MAddrNext = Visited[&MINext]; 2221 2222 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2223 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2224 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2225 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2226 continue; 2227 2228 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset)); 2229 2230 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2231 TargetLoweringBase::AddrMode AM; 2232 AM.HasBaseReg = true; 2233 AM.BaseOffs = Dist; 2234 if (TLI->isLegalGlobalAddressingMode(AM) && 2235 (uint32_t)std::abs(Dist) > MaxDist) { 2236 MaxDist = std::abs(Dist); 2237 2238 AnchorAddr = MAddrNext; 2239 AnchorInst = &MINext; 2240 } 2241 } 2242 2243 if (AnchorInst) { 2244 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2245 AnchorInst->dump()); 2246 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2247 << AnchorAddr.Offset << "\n\n"); 2248 2249 // Instead of moving up, just re-compute anchor-instruction's base address. 2250 Register Base = computeBase(MI, AnchorAddr); 2251 2252 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2253 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2254 2255 for (auto P : InstsWCommonBase) { 2256 TargetLoweringBase::AddrMode AM; 2257 AM.HasBaseReg = true; 2258 AM.BaseOffs = P.second - AnchorAddr.Offset; 2259 2260 if (TLI->isLegalGlobalAddressingMode(AM)) { 2261 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; 2262 dbgs() << ")"; P.first->dump()); 2263 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); 2264 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); 2265 } 2266 } 2267 AnchorList.insert(AnchorInst); 2268 return true; 2269 } 2270 2271 return false; 2272 } 2273 2274 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2275 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2276 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2277 if (AddrList.front().InstClass == CI.InstClass && 2278 AddrList.front().IsAGPR == CI.IsAGPR && 2279 AddrList.front().hasSameBaseAddress(CI)) { 2280 AddrList.emplace_back(CI); 2281 return; 2282 } 2283 } 2284 2285 // Base address not found, so add a new list. 2286 MergeableInsts.emplace_back(1, CI); 2287 } 2288 2289 std::pair<MachineBasicBlock::iterator, bool> 2290 SILoadStoreOptimizer::collectMergeableInsts( 2291 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2292 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2293 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2294 bool Modified = false; 2295 2296 // Sort potential mergeable instructions into lists. One list per base address. 2297 unsigned Order = 0; 2298 MachineBasicBlock::iterator BlockI = Begin; 2299 for (; BlockI != End; ++BlockI) { 2300 MachineInstr &MI = *BlockI; 2301 2302 // We run this before checking if an address is mergeable, because it can produce 2303 // better code even if the instructions aren't mergeable. 
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Merging can continue past such a barrier, but only in a
    // separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: Nothing is actually illegal about a ds_write2 with two AGPR
      // data operands. However, we declare that ds_write2 takes only VGPR
      // data so that machine copy propagation does not create an instruction
      // that mixes a VGPR source with an AGPR source. Consequently, if we
      // created such an instruction here, the verifier would complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions, one per base
  // address.
  //
  // Part 2: Sort each list by offset so that merge candidates end up adjacent
  // to each other, and discard lists with fewer than two entries, since a
  // merge needs at least two instructions. The actual pairing happens later,
  // in optimizeInstsWithSameBaseAddr().

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // We have found only one instruction with a given address that can be
      // merged, and we need at least 2 instructions to do a merge, so this
      // list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the list by offset; this way mergeable instructions end up
    // adjacent to each other in the list, which makes it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
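// Each merge list is processed repeatedly: a successful merge may create a
// wider access that can itself be merged again on a later round (see
// OptimizeAgain / OptimizeListAgain), e.g., schematically,
//   4 x buffer_load_dword -> 2 x buffer_load_dwordx2 -> 1 x buffer_load_dwordx4,
// until the list is exhausted or the class-specific width limit is reached.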
2384 bool SILoadStoreOptimizer::optimizeBlock( 2385 std::list<std::list<CombineInfo> > &MergeableInsts) { 2386 bool Modified = false; 2387 2388 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2389 E = MergeableInsts.end(); I != E;) { 2390 std::list<CombineInfo> &MergeList = *I; 2391 2392 bool OptimizeListAgain = false; 2393 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2394 // We weren't able to make any changes, so delete the list so we don't 2395 // process the same instructions the next time we try to optimize this 2396 // block. 2397 I = MergeableInsts.erase(I); 2398 continue; 2399 } 2400 2401 Modified = true; 2402 2403 // We made changes, but also determined that there were no more optimization 2404 // opportunities, so we don't need to reprocess the list 2405 if (!OptimizeListAgain) { 2406 I = MergeableInsts.erase(I); 2407 continue; 2408 } 2409 OptimizeAgain = true; 2410 } 2411 return Modified; 2412 } 2413 2414 bool 2415 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2416 std::list<CombineInfo> &MergeList, 2417 bool &OptimizeListAgain) { 2418 if (MergeList.empty()) 2419 return false; 2420 2421 bool Modified = false; 2422 2423 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2424 Next = std::next(I)) { 2425 2426 auto First = I; 2427 auto Second = Next; 2428 2429 if ((*First).Order > (*Second).Order) 2430 std::swap(First, Second); 2431 CombineInfo &CI = *First; 2432 CombineInfo &Paired = *Second; 2433 2434 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2435 if (!Where) { 2436 ++I; 2437 continue; 2438 } 2439 2440 Modified = true; 2441 2442 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2443 2444 MachineBasicBlock::iterator NewMI; 2445 switch (CI.InstClass) { 2446 default: 2447 llvm_unreachable("unknown InstClass"); 2448 break; 2449 case DS_READ: 2450 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2451 break; 2452 case DS_WRITE: 2453 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2454 break; 2455 case S_BUFFER_LOAD_IMM: 2456 case S_BUFFER_LOAD_SGPR_IMM: 2457 case S_LOAD_IMM: 2458 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2459 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2460 break; 2461 case BUFFER_LOAD: 2462 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2463 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2464 break; 2465 case BUFFER_STORE: 2466 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2467 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2468 break; 2469 case MIMG: 2470 NewMI = mergeImagePair(CI, Paired, Where->I); 2471 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2472 break; 2473 case TBUFFER_LOAD: 2474 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2475 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2476 break; 2477 case TBUFFER_STORE: 2478 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2479 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2480 break; 2481 case FLAT_LOAD: 2482 case GLOBAL_LOAD: 2483 case GLOBAL_LOAD_SADDR: 2484 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2485 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2486 break; 2487 case FLAT_STORE: 2488 case GLOBAL_STORE: 2489 case GLOBAL_STORE_SADDR: 2490 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2491 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2492 break; 2493 } 2494 CI.setMI(NewMI, *this); 2495 CI.Order = Where->Order; 2496 if (I == Second) 2497 I = Next; 2498 2499 MergeList.erase(Second); 2500 } 2501 2502 return Modified; 2503 } 2504 
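// Top-level driver: for each basic block, instructions up to the next barrier
// are gathered into per-address merge lists, the lists are then optimized
// until no further merges are found, and the scan resumes after the barrier.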
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect a list of all instructions we know how to merge
      // in a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}
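// A minimal way to exercise this pass in isolation on hand-written MIR,
// assuming an AMDGPU-enabled build of llc (the pass name follows DEBUG_TYPE
// above):
//   llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt \
//       -verify-machineinstrs -o - input.mir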