//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in the 8 bits, we can add
//   to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD,  // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE  // any CombineInfo, they are only ever returned by
                // getCommonInstClass.
91 }; 92 93 struct AddressRegs { 94 unsigned char NumVAddrs = 0; 95 bool SBase = false; 96 bool SRsrc = false; 97 bool SOffset = false; 98 bool SAddr = false; 99 bool VAddr = false; 100 bool Addr = false; 101 bool SSamp = false; 102 }; 103 104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 105 const unsigned MaxAddressRegs = 12 + 1 + 1; 106 107 class SILoadStoreOptimizer : public MachineFunctionPass { 108 struct CombineInfo { 109 MachineBasicBlock::iterator I; 110 unsigned EltSize; 111 unsigned Offset; 112 unsigned Width; 113 unsigned Format; 114 unsigned BaseOff; 115 unsigned DMask; 116 InstClassEnum InstClass; 117 unsigned CPol = 0; 118 bool IsAGPR; 119 bool UseST64; 120 int AddrIdx[MaxAddressRegs]; 121 const MachineOperand *AddrReg[MaxAddressRegs]; 122 unsigned NumAddresses; 123 unsigned Order; 124 125 bool hasSameBaseAddress(const CombineInfo &CI) { 126 if (NumAddresses != CI.NumAddresses) 127 return false; 128 129 const MachineInstr &MI = *CI.I; 130 for (unsigned i = 0; i < NumAddresses; i++) { 131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 132 133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 135 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 136 return false; 137 } 138 continue; 139 } 140 141 // Check same base pointer. Be careful of subregisters, which can occur 142 // with vectors of pointers. 143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 145 return false; 146 } 147 } 148 return true; 149 } 150 151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 152 for (unsigned i = 0; i < NumAddresses; ++i) { 153 const MachineOperand *AddrOp = AddrReg[i]; 154 // Immediates are always OK. 155 if (AddrOp->isImm()) 156 continue; 157 158 // Don't try to merge addresses that aren't either immediates or registers. 159 // TODO: Should be possible to merge FrameIndexes and maybe some other 160 // non-register 161 if (!AddrOp->isReg()) 162 return false; 163 164 // TODO: We should be able to merge physical reg addresses. 165 if (AddrOp->getReg().isPhysical()) 166 return false; 167 168 // If an address has only one use then there will be no other 169 // instructions with the same address, so we can't merge this one. 170 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 171 return false; 172 } 173 return true; 174 } 175 176 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 177 178 // Compare by pointer order. 179 bool operator<(const CombineInfo& Other) const { 180 return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; 181 } 182 }; 183 184 struct BaseRegisters { 185 Register LoReg; 186 Register HiReg; 187 188 unsigned LoSubReg = 0; 189 unsigned HiSubReg = 0; 190 }; 191 192 struct MemAddress { 193 BaseRegisters Base; 194 int64_t Offset = 0; 195 }; 196 197 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 198 199 private: 200 const GCNSubtarget *STM = nullptr; 201 const SIInstrInfo *TII = nullptr; 202 const SIRegisterInfo *TRI = nullptr; 203 MachineRegisterInfo *MRI = nullptr; 204 AliasAnalysis *AA = nullptr; 205 bool OptimizeAgain; 206 207 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 208 const DenseSet<Register> &ARegUses, 209 const MachineInstr &A, const MachineInstr &B) const; 210 static bool dmasksCanBeCombined(const CombineInfo &CI, 211 const SIInstrInfo &TII, 212 const CombineInfo &Paired); 213 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 214 CombineInfo &Paired, bool Modify = false); 215 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 216 const CombineInfo &Paired); 217 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 218 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 219 const CombineInfo &Paired); 220 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 223 224 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 225 226 unsigned read2Opcode(unsigned EltSize) const; 227 unsigned read2ST64Opcode(unsigned EltSize) const; 228 MachineBasicBlock::iterator 229 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 230 MachineBasicBlock::iterator InsertBefore); 231 232 unsigned write2Opcode(unsigned EltSize) const; 233 unsigned write2ST64Opcode(unsigned EltSize) const; 234 MachineBasicBlock::iterator 235 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 236 MachineBasicBlock::iterator InsertBefore); 237 MachineBasicBlock::iterator 238 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 239 MachineBasicBlock::iterator InsertBefore); 240 MachineBasicBlock::iterator 241 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 242 MachineBasicBlock::iterator InsertBefore); 243 MachineBasicBlock::iterator 244 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 245 MachineBasicBlock::iterator InsertBefore); 246 MachineBasicBlock::iterator 247 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 248 MachineBasicBlock::iterator InsertBefore); 249 MachineBasicBlock::iterator 250 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 251 MachineBasicBlock::iterator InsertBefore); 252 MachineBasicBlock::iterator 253 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 254 MachineBasicBlock::iterator InsertBefore); 255 MachineBasicBlock::iterator 256 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 257 MachineBasicBlock::iterator InsertBefore); 258 MachineBasicBlock::iterator 259 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 260 MachineBasicBlock::iterator InsertBefore); 261 262 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 263 int32_t NewOffset) const; 264 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 265 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 266 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 267 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 268 /// Promotes constant offset to the immediate by adjusting the base. It 269 /// tries to use a base from the nearby instructions that allows it to have 270 /// a 13bit constant offset which gets promoted to the immediate. 271 bool promoteConstantOffsetToImm(MachineInstr &CI, 272 MemInfoMap &Visited, 273 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 274 void addInstToMergeableList(const CombineInfo &CI, 275 std::list<std::list<CombineInfo> > &MergeableInsts) const; 276 277 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 278 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 279 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 280 std::list<std::list<CombineInfo>> &MergeableInsts) const; 281 282 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 283 const CombineInfo &Paired); 284 285 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 286 const CombineInfo &Paired); 287 288 public: 289 static char ID; 290 291 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 292 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 293 } 294 295 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 296 bool &OptimizeListAgain); 297 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 298 299 bool runOnMachineFunction(MachineFunction &MF) override; 300 301 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 302 303 void getAnalysisUsage(AnalysisUsage &AU) const override { 304 AU.setPreservesCFG(); 305 AU.addRequired<AAResultsWrapperPass>(); 306 307 MachineFunctionPass::getAnalysisUsage(AU); 308 } 309 310 MachineFunctionProperties getRequiredProperties() const override { 311 return MachineFunctionProperties() 312 .set(MachineFunctionProperties::Property::IsSSA); 313 } 314 }; 315 316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 317 const unsigned Opc = MI.getOpcode(); 318 319 if (TII.isMUBUF(Opc)) { 320 // FIXME: Handle d16 correctly 321 return AMDGPU::getMUBUFElements(Opc); 322 } 323 if (TII.isMIMG(MI)) { 324 uint64_t DMaskImm = 325 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 326 return llvm::popcount(DMaskImm); 327 } 328 if (TII.isMTBUF(Opc)) { 329 return AMDGPU::getMTBUFElements(Opc); 330 } 331 332 switch (Opc) { 333 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 334 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: 335 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 336 case AMDGPU::S_LOAD_DWORD_IMM: 337 case AMDGPU::GLOBAL_LOAD_DWORD: 338 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 339 case AMDGPU::GLOBAL_STORE_DWORD: 340 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 341 case AMDGPU::FLAT_LOAD_DWORD: 342 case AMDGPU::FLAT_STORE_DWORD: 343 return 1; 344 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: 346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 347 case AMDGPU::S_LOAD_DWORDX2_IMM: 348 case AMDGPU::GLOBAL_LOAD_DWORDX2: 349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 350 case AMDGPU::GLOBAL_STORE_DWORDX2: 351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 352 case AMDGPU::FLAT_LOAD_DWORDX2: 353 case AMDGPU::FLAT_STORE_DWORDX2: 354 return 2; 355 case AMDGPU::GLOBAL_LOAD_DWORDX3: 356 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 357 case AMDGPU::GLOBAL_STORE_DWORDX3: 358 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 359 case AMDGPU::FLAT_LOAD_DWORDX3: 360 case AMDGPU::FLAT_STORE_DWORDX3: 361 return 3; 362 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 363 case 
AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: 364 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 365 case AMDGPU::S_LOAD_DWORDX4_IMM: 366 case AMDGPU::GLOBAL_LOAD_DWORDX4: 367 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 368 case AMDGPU::GLOBAL_STORE_DWORDX4: 369 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 370 case AMDGPU::FLAT_LOAD_DWORDX4: 371 case AMDGPU::FLAT_STORE_DWORDX4: 372 return 4; 373 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 374 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: 375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 376 case AMDGPU::S_LOAD_DWORDX8_IMM: 377 return 8; 378 case AMDGPU::DS_READ_B32: [[fallthrough]]; 379 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]]; 380 case AMDGPU::DS_WRITE_B32: [[fallthrough]]; 381 case AMDGPU::DS_WRITE_B32_gfx9: 382 return 1; 383 case AMDGPU::DS_READ_B64: [[fallthrough]]; 384 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]]; 385 case AMDGPU::DS_WRITE_B64: [[fallthrough]]; 386 case AMDGPU::DS_WRITE_B64_gfx9: 387 return 2; 388 default: 389 return 0; 390 } 391 } 392 393 /// Maps instruction opcode to enum InstClassEnum. 394 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 395 switch (Opc) { 396 default: 397 if (TII.isMUBUF(Opc)) { 398 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 399 default: 400 return UNKNOWN; 401 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 402 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 403 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 404 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 405 return BUFFER_LOAD; 406 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 407 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 408 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 409 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 410 return BUFFER_STORE; 411 } 412 } 413 if (TII.isMIMG(Opc)) { 414 // Ignore instructions encoded without vaddr. 415 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 416 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 417 return UNKNOWN; 418 // Ignore BVH instructions 419 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 420 return UNKNOWN; 421 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 422 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 423 TII.isGather4(Opc)) 424 return UNKNOWN; 425 return MIMG; 426 } 427 if (TII.isMTBUF(Opc)) { 428 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 429 default: 430 return UNKNOWN; 431 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 432 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 433 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 434 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 435 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 436 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 437 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 438 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 439 return TBUFFER_LOAD; 440 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 441 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 442 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 443 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 444 return TBUFFER_STORE; 445 } 446 } 447 return UNKNOWN; 448 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 449 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 450 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 451 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 452 return S_BUFFER_LOAD_IMM; 453 // For the purposes of this optimization SGPR variants of buffer loads 454 // are considered to be zero-offsetted SGPR_IMM loads. 
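  // For example, S_BUFFER_LOAD_DWORD_SGPR carries no immediate offset
  // operand, so it is treated here as an S_BUFFER_LOAD_SGPR_IMM access at
  // offset 0; this lets it pair with an SGPR+immediate load from the same
  // SGPR base.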
455 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: 456 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: 457 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: 458 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: 459 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 460 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 461 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 462 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 463 return S_BUFFER_LOAD_SGPR_IMM; 464 case AMDGPU::S_LOAD_DWORD_IMM: 465 case AMDGPU::S_LOAD_DWORDX2_IMM: 466 case AMDGPU::S_LOAD_DWORDX4_IMM: 467 case AMDGPU::S_LOAD_DWORDX8_IMM: 468 return S_LOAD_IMM; 469 case AMDGPU::DS_READ_B32: 470 case AMDGPU::DS_READ_B32_gfx9: 471 case AMDGPU::DS_READ_B64: 472 case AMDGPU::DS_READ_B64_gfx9: 473 return DS_READ; 474 case AMDGPU::DS_WRITE_B32: 475 case AMDGPU::DS_WRITE_B32_gfx9: 476 case AMDGPU::DS_WRITE_B64: 477 case AMDGPU::DS_WRITE_B64_gfx9: 478 return DS_WRITE; 479 case AMDGPU::GLOBAL_LOAD_DWORD: 480 case AMDGPU::GLOBAL_LOAD_DWORDX2: 481 case AMDGPU::GLOBAL_LOAD_DWORDX3: 482 case AMDGPU::GLOBAL_LOAD_DWORDX4: 483 case AMDGPU::FLAT_LOAD_DWORD: 484 case AMDGPU::FLAT_LOAD_DWORDX2: 485 case AMDGPU::FLAT_LOAD_DWORDX3: 486 case AMDGPU::FLAT_LOAD_DWORDX4: 487 return FLAT_LOAD; 488 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 489 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 490 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 491 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 492 return GLOBAL_LOAD_SADDR; 493 case AMDGPU::GLOBAL_STORE_DWORD: 494 case AMDGPU::GLOBAL_STORE_DWORDX2: 495 case AMDGPU::GLOBAL_STORE_DWORDX3: 496 case AMDGPU::GLOBAL_STORE_DWORDX4: 497 case AMDGPU::FLAT_STORE_DWORD: 498 case AMDGPU::FLAT_STORE_DWORDX2: 499 case AMDGPU::FLAT_STORE_DWORDX3: 500 case AMDGPU::FLAT_STORE_DWORDX4: 501 return FLAT_STORE; 502 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 503 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 504 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 505 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 506 return GLOBAL_STORE_SADDR; 507 } 508 } 509 510 /// Determines instruction subclass from opcode. Only instructions 511 /// of the same subclass can be merged together. The merged instruction may have 512 /// a different subclass but must have the same class. 513 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 514 switch (Opc) { 515 default: 516 if (TII.isMUBUF(Opc)) 517 return AMDGPU::getMUBUFBaseOpcode(Opc); 518 if (TII.isMIMG(Opc)) { 519 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 520 assert(Info); 521 return Info->BaseOpcode; 522 } 523 if (TII.isMTBUF(Opc)) 524 return AMDGPU::getMTBUFBaseOpcode(Opc); 525 return -1; 526 case AMDGPU::DS_READ_B32: 527 case AMDGPU::DS_READ_B32_gfx9: 528 case AMDGPU::DS_READ_B64: 529 case AMDGPU::DS_READ_B64_gfx9: 530 case AMDGPU::DS_WRITE_B32: 531 case AMDGPU::DS_WRITE_B32_gfx9: 532 case AMDGPU::DS_WRITE_B64: 533 case AMDGPU::DS_WRITE_B64_gfx9: 534 return Opc; 535 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 536 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 537 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 538 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 539 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 540 // For the purposes of this optimization SGPR variants of buffer loads 541 // are considered to be zero-offsetted SGPR_IMM loads. 
542 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: 543 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: 544 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: 545 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: 546 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 547 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 548 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 549 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 550 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 551 case AMDGPU::S_LOAD_DWORD_IMM: 552 case AMDGPU::S_LOAD_DWORDX2_IMM: 553 case AMDGPU::S_LOAD_DWORDX4_IMM: 554 case AMDGPU::S_LOAD_DWORDX8_IMM: 555 return AMDGPU::S_LOAD_DWORD_IMM; 556 case AMDGPU::GLOBAL_LOAD_DWORD: 557 case AMDGPU::GLOBAL_LOAD_DWORDX2: 558 case AMDGPU::GLOBAL_LOAD_DWORDX3: 559 case AMDGPU::GLOBAL_LOAD_DWORDX4: 560 case AMDGPU::FLAT_LOAD_DWORD: 561 case AMDGPU::FLAT_LOAD_DWORDX2: 562 case AMDGPU::FLAT_LOAD_DWORDX3: 563 case AMDGPU::FLAT_LOAD_DWORDX4: 564 return AMDGPU::FLAT_LOAD_DWORD; 565 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 566 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 567 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 568 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 569 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 570 case AMDGPU::GLOBAL_STORE_DWORD: 571 case AMDGPU::GLOBAL_STORE_DWORDX2: 572 case AMDGPU::GLOBAL_STORE_DWORDX3: 573 case AMDGPU::GLOBAL_STORE_DWORDX4: 574 case AMDGPU::FLAT_STORE_DWORD: 575 case AMDGPU::FLAT_STORE_DWORDX2: 576 case AMDGPU::FLAT_STORE_DWORDX3: 577 case AMDGPU::FLAT_STORE_DWORDX4: 578 return AMDGPU::FLAT_STORE_DWORD; 579 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 580 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 581 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 582 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 583 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 584 } 585 } 586 587 // GLOBAL loads and stores are classified as FLAT initially. If both combined 588 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 589 // If either or both instructions are non segment specific FLAT the resulting 590 // combined operation will be FLAT, potentially promoting one of the GLOBAL 591 // operations to FLAT. 592 // For other instructions return the original unmodified class. 593 InstClassEnum 594 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 595 const CombineInfo &Paired) { 596 assert(CI.InstClass == Paired.InstClass); 597 598 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 599 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 600 return (CI.InstClass == FLAT_STORE) ? 
GLOBAL_STORE : GLOBAL_LOAD; 601 602 return CI.InstClass; 603 } 604 605 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 606 AddressRegs Result; 607 608 if (TII.isMUBUF(Opc)) { 609 if (AMDGPU::getMUBUFHasVAddr(Opc)) 610 Result.VAddr = true; 611 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 612 Result.SRsrc = true; 613 if (AMDGPU::getMUBUFHasSoffset(Opc)) 614 Result.SOffset = true; 615 616 return Result; 617 } 618 619 if (TII.isMIMG(Opc)) { 620 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 621 if (VAddr0Idx >= 0) { 622 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 623 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 624 } else { 625 Result.VAddr = true; 626 } 627 Result.SRsrc = true; 628 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 629 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 630 Result.SSamp = true; 631 632 return Result; 633 } 634 if (TII.isMTBUF(Opc)) { 635 if (AMDGPU::getMTBUFHasVAddr(Opc)) 636 Result.VAddr = true; 637 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 638 Result.SRsrc = true; 639 if (AMDGPU::getMTBUFHasSoffset(Opc)) 640 Result.SOffset = true; 641 642 return Result; 643 } 644 645 switch (Opc) { 646 default: 647 return Result; 648 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: 649 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: 650 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: 651 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: 652 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 653 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 654 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 655 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 656 Result.SOffset = true; 657 [[fallthrough]]; 658 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 659 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 660 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 661 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 662 case AMDGPU::S_LOAD_DWORD_IMM: 663 case AMDGPU::S_LOAD_DWORDX2_IMM: 664 case AMDGPU::S_LOAD_DWORDX4_IMM: 665 case AMDGPU::S_LOAD_DWORDX8_IMM: 666 Result.SBase = true; 667 return Result; 668 case AMDGPU::DS_READ_B32: 669 case AMDGPU::DS_READ_B64: 670 case AMDGPU::DS_READ_B32_gfx9: 671 case AMDGPU::DS_READ_B64_gfx9: 672 case AMDGPU::DS_WRITE_B32: 673 case AMDGPU::DS_WRITE_B64: 674 case AMDGPU::DS_WRITE_B32_gfx9: 675 case AMDGPU::DS_WRITE_B64_gfx9: 676 Result.Addr = true; 677 return Result; 678 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 679 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 680 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 681 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 682 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 683 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 684 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 685 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 686 Result.SAddr = true; 687 [[fallthrough]]; 688 case AMDGPU::GLOBAL_LOAD_DWORD: 689 case AMDGPU::GLOBAL_LOAD_DWORDX2: 690 case AMDGPU::GLOBAL_LOAD_DWORDX3: 691 case AMDGPU::GLOBAL_LOAD_DWORDX4: 692 case AMDGPU::GLOBAL_STORE_DWORD: 693 case AMDGPU::GLOBAL_STORE_DWORDX2: 694 case AMDGPU::GLOBAL_STORE_DWORDX3: 695 case AMDGPU::GLOBAL_STORE_DWORDX4: 696 case AMDGPU::FLAT_LOAD_DWORD: 697 case AMDGPU::FLAT_LOAD_DWORDX2: 698 case AMDGPU::FLAT_LOAD_DWORDX3: 699 case AMDGPU::FLAT_LOAD_DWORDX4: 700 case AMDGPU::FLAT_STORE_DWORD: 701 case AMDGPU::FLAT_STORE_DWORDX2: 702 case AMDGPU::FLAT_STORE_DWORDX3: 703 case AMDGPU::FLAT_STORE_DWORDX4: 704 Result.VAddr = true; 705 return Result; 706 } 707 } 708 709 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 710 const SILoadStoreOptimizer &LSO) { 711 I = MI; 712 unsigned Opc = MI->getOpcode(); 713 
InstClass = getInstClass(Opc, *LSO.TII); 714 715 if (InstClass == UNKNOWN) 716 return; 717 718 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 719 720 switch (InstClass) { 721 case DS_READ: 722 EltSize = 723 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 724 : 4; 725 break; 726 case DS_WRITE: 727 EltSize = 728 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 729 : 4; 730 break; 731 case S_BUFFER_LOAD_IMM: 732 case S_BUFFER_LOAD_SGPR_IMM: 733 case S_LOAD_IMM: 734 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 735 break; 736 default: 737 EltSize = 4; 738 break; 739 } 740 741 if (InstClass == MIMG) { 742 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 743 // Offset is not considered for MIMG instructions. 744 Offset = 0; 745 } else { 746 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 747 Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm(); 748 } 749 750 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 751 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 752 753 Width = getOpcodeWidth(*I, *LSO.TII); 754 755 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 756 Offset &= 0xffff; 757 } else if (InstClass != MIMG) { 758 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 759 } 760 761 AddressRegs Regs = getRegs(Opc, *LSO.TII); 762 763 NumAddresses = 0; 764 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 765 AddrIdx[NumAddresses++] = 766 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 767 if (Regs.Addr) 768 AddrIdx[NumAddresses++] = 769 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 770 if (Regs.SBase) 771 AddrIdx[NumAddresses++] = 772 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 773 if (Regs.SRsrc) 774 AddrIdx[NumAddresses++] = 775 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 776 if (Regs.SOffset) 777 AddrIdx[NumAddresses++] = 778 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 779 if (Regs.SAddr) 780 AddrIdx[NumAddresses++] = 781 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 782 if (Regs.VAddr) 783 AddrIdx[NumAddresses++] = 784 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 785 if (Regs.SSamp) 786 AddrIdx[NumAddresses++] = 787 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 788 assert(NumAddresses <= MaxAddressRegs); 789 790 for (unsigned J = 0; J < NumAddresses; J++) 791 AddrReg[J] = &I->getOperand(AddrIdx[J]); 792 } 793 794 } // end anonymous namespace. 
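// Illustrative example of the bookkeeping in CombineInfo::setMI above: for a
// DS load such as
//   ds_read_b64 v[0:1], v2 offset:136
// it records InstClass = DS_READ, EltSize = 8, Offset = 136, Width = 2, and a
// single address operand (the 'addr' register v2); this is the information
// the pairing logic below keys on.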

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
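  // Worked example of the overlap check below: dmasks 0b0011 and 0b1100 can
  // be combined (MaxMask = 0b1100 has its lowest set bit at position 2, and
  // MinMask = 0b0011 < (1 << 2)), giving a merged dmask of 0b1111.  Dmasks
  // 0b0011 and 0b0110 are rejected because they overlap in component 1, and
  // even disjoint but interleaved masks such as 0b0101 and 0b1010 are
  // rejected: all components of the smaller mask must lie below the lowest
  // component of the larger one.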
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
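  // Worked example: with EltSize = 4, byte offsets 0x2000 and 0x3000 give
  // element offsets 2048 and 3072.  Neither fits in 8 bits, but both are
  // multiples of 64, so they can be encoded as read2st64/write2st64 offsets
  // of 32 and 48.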
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
1089 SILoadStoreOptimizer::CombineInfo * 1090 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1091 CombineInfo &Paired) { 1092 // If another instruction has already been merged into CI, it may now be a 1093 // type that we can't do any further merging into. 1094 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1095 return nullptr; 1096 assert(CI.InstClass == Paired.InstClass); 1097 1098 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1099 getInstSubclass(Paired.I->getOpcode(), *TII)) 1100 return nullptr; 1101 1102 // Check both offsets (or masks for MIMG) can be combined and fit in the 1103 // reduced range. 1104 if (CI.InstClass == MIMG) { 1105 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1106 return nullptr; 1107 } else { 1108 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1109 return nullptr; 1110 } 1111 1112 DenseSet<Register> RegDefs; 1113 DenseSet<Register> RegUses; 1114 CombineInfo *Where; 1115 if (CI.I->mayLoad()) { 1116 // Try to hoist Paired up to CI. 1117 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1118 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1119 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1120 return nullptr; 1121 } 1122 Where = &CI; 1123 } else { 1124 // Try to sink CI down to Paired. 1125 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1126 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1127 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1128 return nullptr; 1129 } 1130 Where = &Paired; 1131 } 1132 1133 // Call offsetsCanBeCombined with modify = true so that the offsets are 1134 // correct for the new instruction. This should return true, because 1135 // this function should only be called on CombineInfo objects that 1136 // have already been confirmed to be mergeable. 1137 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1138 offsetsCanBeCombined(CI, *STM, Paired, true); 1139 return Where; 1140 } 1141 1142 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1143 if (STM->ldsRequiresM0Init()) 1144 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1145 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1146 } 1147 1148 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1149 if (STM->ldsRequiresM0Init()) 1150 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1151 1152 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1153 : AMDGPU::DS_READ2ST64_B64_gfx9; 1154 } 1155 1156 MachineBasicBlock::iterator 1157 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1158 MachineBasicBlock::iterator InsertBefore) { 1159 MachineBasicBlock *MBB = CI.I->getParent(); 1160 1161 // Be careful, since the addresses could be subregisters themselves in weird 1162 // cases, like vectors of pointers. 1163 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1164 1165 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1166 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1167 1168 unsigned NewOffset0 = CI.Offset; 1169 unsigned NewOffset1 = Paired.Offset; 1170 unsigned Opc = 1171 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1172 1173 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1174 unsigned SubRegIdx1 = (CI.EltSize == 4) ? 
AMDGPU::sub1 : AMDGPU::sub2_sub3; 1175 1176 if (NewOffset0 > NewOffset1) { 1177 // Canonicalize the merged instruction so the smaller offset comes first. 1178 std::swap(NewOffset0, NewOffset1); 1179 std::swap(SubRegIdx0, SubRegIdx1); 1180 } 1181 1182 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1183 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1184 1185 const MCInstrDesc &Read2Desc = TII->get(Opc); 1186 1187 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1188 Register DestReg = MRI->createVirtualRegister(SuperRC); 1189 1190 DebugLoc DL = CI.I->getDebugLoc(); 1191 1192 Register BaseReg = AddrReg->getReg(); 1193 unsigned BaseSubReg = AddrReg->getSubReg(); 1194 unsigned BaseRegFlags = 0; 1195 if (CI.BaseOff) { 1196 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1197 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1198 .addImm(CI.BaseOff); 1199 1200 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1201 BaseRegFlags = RegState::Kill; 1202 1203 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1204 .addReg(ImmReg) 1205 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1206 .addImm(0); // clamp bit 1207 BaseSubReg = 0; 1208 } 1209 1210 MachineInstrBuilder Read2 = 1211 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1212 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1213 .addImm(NewOffset0) // offset0 1214 .addImm(NewOffset1) // offset1 1215 .addImm(0) // gds 1216 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1217 1218 (void)Read2; 1219 1220 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1221 1222 // Copy to the old destination registers. 1223 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1224 .add(*Dest0) // Copy to same destination including flags and sub reg. 1225 .addReg(DestReg, 0, SubRegIdx0); 1226 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1227 .add(*Dest1) 1228 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1229 1230 CI.I->eraseFromParent(); 1231 Paired.I->eraseFromParent(); 1232 1233 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1234 return Read2; 1235 } 1236 1237 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1238 if (STM->ldsRequiresM0Init()) 1239 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1240 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1241 : AMDGPU::DS_WRITE2_B64_gfx9; 1242 } 1243 1244 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1245 if (STM->ldsRequiresM0Init()) 1246 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1247 : AMDGPU::DS_WRITE2ST64_B64; 1248 1249 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1250 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1251 } 1252 1253 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1254 CombineInfo &CI, CombineInfo &Paired, 1255 MachineBasicBlock::iterator InsertBefore) { 1256 MachineBasicBlock *MBB = CI.I->getParent(); 1257 1258 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1259 // sure we preserve the subregister index and any register flags set on them. 1260 const MachineOperand *AddrReg = 1261 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1262 const MachineOperand *Data0 = 1263 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1264 const MachineOperand *Data1 = 1265 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1266 1267 unsigned NewOffset0 = CI.Offset; 1268 unsigned NewOffset1 = Paired.Offset; 1269 unsigned Opc = 1270 CI.UseST64 ? 
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1271 1272 if (NewOffset0 > NewOffset1) { 1273 // Canonicalize the merged instruction so the smaller offset comes first. 1274 std::swap(NewOffset0, NewOffset1); 1275 std::swap(Data0, Data1); 1276 } 1277 1278 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1279 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1280 1281 const MCInstrDesc &Write2Desc = TII->get(Opc); 1282 DebugLoc DL = CI.I->getDebugLoc(); 1283 1284 Register BaseReg = AddrReg->getReg(); 1285 unsigned BaseSubReg = AddrReg->getSubReg(); 1286 unsigned BaseRegFlags = 0; 1287 if (CI.BaseOff) { 1288 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1289 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1290 .addImm(CI.BaseOff); 1291 1292 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1293 BaseRegFlags = RegState::Kill; 1294 1295 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1296 .addReg(ImmReg) 1297 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1298 .addImm(0); // clamp bit 1299 BaseSubReg = 0; 1300 } 1301 1302 MachineInstrBuilder Write2 = 1303 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1304 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1305 .add(*Data0) // data0 1306 .add(*Data1) // data1 1307 .addImm(NewOffset0) // offset0 1308 .addImm(NewOffset1) // offset1 1309 .addImm(0) // gds 1310 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1311 1312 CI.I->eraseFromParent(); 1313 Paired.I->eraseFromParent(); 1314 1315 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1316 return Write2; 1317 } 1318 1319 MachineBasicBlock::iterator 1320 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1321 MachineBasicBlock::iterator InsertBefore) { 1322 MachineBasicBlock *MBB = CI.I->getParent(); 1323 DebugLoc DL = CI.I->getDebugLoc(); 1324 const unsigned Opcode = getNewOpcode(CI, Paired); 1325 1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1327 1328 Register DestReg = MRI->createVirtualRegister(SuperRC); 1329 unsigned MergedDMask = CI.DMask | Paired.DMask; 1330 unsigned DMaskIdx = 1331 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1332 1333 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1334 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1335 if (I == DMaskIdx) 1336 MIB.addImm(MergedDMask); 1337 else 1338 MIB.add((*CI.I).getOperand(I)); 1339 } 1340 1341 // It shouldn't be possible to get this far if the two instructions 1342 // don't have a single memoperand, because MachineInstr::mayAlias() 1343 // will return true if this is the case. 1344 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1345 1346 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1347 1348 unsigned SubRegIdx0, SubRegIdx1; 1349 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1350 1351 // Copy to the old destination registers. 1352 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1353 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1354 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1355 1356 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1357 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1358 .addReg(DestReg, 0, SubRegIdx0); 1359 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1360 .add(*Dest1) 1361 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1362 1363 CI.I->eraseFromParent(); 1364 Paired.I->eraseFromParent(); 1365 return New; 1366 } 1367 1368 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1369 CombineInfo &CI, CombineInfo &Paired, 1370 MachineBasicBlock::iterator InsertBefore) { 1371 MachineBasicBlock *MBB = CI.I->getParent(); 1372 DebugLoc DL = CI.I->getDebugLoc(); 1373 const unsigned Opcode = getNewOpcode(CI, Paired); 1374 1375 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1376 1377 Register DestReg = MRI->createVirtualRegister(SuperRC); 1378 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1379 1380 // It shouldn't be possible to get this far if the two instructions 1381 // don't have a single memoperand, because MachineInstr::mayAlias() 1382 // will return true if this is the case. 1383 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1384 1385 MachineInstrBuilder New = 1386 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1387 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1388 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1389 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1390 // For convenience, when SGPR_IMM buffer loads are merged into a 1391 // zero-offset load, we generate its SGPR variant. 1392 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset)) 1393 New.addImm(MergedOffset); 1394 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1395 1396 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1397 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1398 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1399 1400 // Copy to the old destination registers. 1401 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1402 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1403 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1404 1405 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1406 .add(*Dest0) // Copy to same destination including flags and sub reg. 1407 .addReg(DestReg, 0, SubRegIdx0); 1408 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1409 .add(*Dest1) 1410 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1411 1412 CI.I->eraseFromParent(); 1413 Paired.I->eraseFromParent(); 1414 return New; 1415 } 1416 1417 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1418 CombineInfo &CI, CombineInfo &Paired, 1419 MachineBasicBlock::iterator InsertBefore) { 1420 MachineBasicBlock *MBB = CI.I->getParent(); 1421 DebugLoc DL = CI.I->getDebugLoc(); 1422 1423 const unsigned Opcode = getNewOpcode(CI, Paired); 1424 1425 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1426 1427 // Copy to the new source register. 1428 Register DestReg = MRI->createVirtualRegister(SuperRC); 1429 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1430 1431 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1432 1433 AddressRegs Regs = getRegs(Opcode, *TII); 1434 1435 if (Regs.VAddr) 1436 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1437 1438 // It shouldn't be possible to get this far if the two instructions 1439 // don't have a single memoperand, because MachineInstr::mayAlias() 1440 // will return true if this is the case. 
1441 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1442 1443 MachineInstr *New = 1444 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1445 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1446 .addImm(MergedOffset) // offset 1447 .addImm(CI.CPol) // cpol 1448 .addImm(0) // swz 1449 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1450 1451 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1452 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1453 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1454 1455 // Copy to the old destination registers. 1456 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1457 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1458 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1459 1460 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1461 .add(*Dest0) // Copy to same destination including flags and sub reg. 1462 .addReg(DestReg, 0, SubRegIdx0); 1463 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1464 .add(*Dest1) 1465 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1466 1467 CI.I->eraseFromParent(); 1468 Paired.I->eraseFromParent(); 1469 return New; 1470 } 1471 1472 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1473 CombineInfo &CI, CombineInfo &Paired, 1474 MachineBasicBlock::iterator InsertBefore) { 1475 MachineBasicBlock *MBB = CI.I->getParent(); 1476 DebugLoc DL = CI.I->getDebugLoc(); 1477 1478 const unsigned Opcode = getNewOpcode(CI, Paired); 1479 1480 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1481 1482 // Copy to the new source register. 1483 Register DestReg = MRI->createVirtualRegister(SuperRC); 1484 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1485 1486 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1487 1488 AddressRegs Regs = getRegs(Opcode, *TII); 1489 1490 if (Regs.VAddr) 1491 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1492 1493 unsigned JoinedFormat = 1494 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1495 1496 // It shouldn't be possible to get this far if the two instructions 1497 // don't have a single memoperand, because MachineInstr::mayAlias() 1498 // will return true if this is the case. 1499 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1500 1501 MachineInstr *New = 1502 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1503 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1504 .addImm(MergedOffset) // offset 1505 .addImm(JoinedFormat) // format 1506 .addImm(CI.CPol) // cpol 1507 .addImm(0) // swz 1508 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1509 1510 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1511 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1512 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1513 1514 // Copy to the old destination registers. 1515 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1516 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1517 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1518 1519 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1520 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1521 .addReg(DestReg, 0, SubRegIdx0); 1522 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1523 .add(*Dest1) 1524 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1525 1526 CI.I->eraseFromParent(); 1527 Paired.I->eraseFromParent(); 1528 return New; 1529 } 1530 1531 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1532 CombineInfo &CI, CombineInfo &Paired, 1533 MachineBasicBlock::iterator InsertBefore) { 1534 MachineBasicBlock *MBB = CI.I->getParent(); 1535 DebugLoc DL = CI.I->getDebugLoc(); 1536 1537 const unsigned Opcode = getNewOpcode(CI, Paired); 1538 1539 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1540 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1541 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1542 1543 // Copy to the new source register. 1544 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1545 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1546 1547 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1548 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1549 1550 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1551 .add(*Src0) 1552 .addImm(SubRegIdx0) 1553 .add(*Src1) 1554 .addImm(SubRegIdx1); 1555 1556 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1557 .addReg(SrcReg, RegState::Kill); 1558 1559 AddressRegs Regs = getRegs(Opcode, *TII); 1560 1561 if (Regs.VAddr) 1562 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1563 1564 unsigned JoinedFormat = 1565 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1566 1567 // It shouldn't be possible to get this far if the two instructions 1568 // don't have a single memoperand, because MachineInstr::mayAlias() 1569 // will return true if this is the case. 1570 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1571 1572 MachineInstr *New = 1573 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1574 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1575 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1576 .addImm(JoinedFormat) // format 1577 .addImm(CI.CPol) // cpol 1578 .addImm(0) // swz 1579 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1580 1581 CI.I->eraseFromParent(); 1582 Paired.I->eraseFromParent(); 1583 return New; 1584 } 1585 1586 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1587 CombineInfo &CI, CombineInfo &Paired, 1588 MachineBasicBlock::iterator InsertBefore) { 1589 MachineBasicBlock *MBB = CI.I->getParent(); 1590 DebugLoc DL = CI.I->getDebugLoc(); 1591 1592 const unsigned Opcode = getNewOpcode(CI, Paired); 1593 1594 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1595 Register DestReg = MRI->createVirtualRegister(SuperRC); 1596 1597 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1598 1599 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1600 MIB.add(*SAddr); 1601 1602 MachineInstr *New = 1603 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1604 .addImm(std::min(CI.Offset, Paired.Offset)) 1605 .addImm(CI.CPol) 1606 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1607 1608 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1609 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1610 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1611 1612 // Copy to the old destination registers. 
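  // For example, when two flat_load_dword results are merged into a
  // flat_load_dwordx2, the original destinations are recovered here as COPYs
  // of the sub0 and sub1 halves of the merged result, using the subregister
  // indices computed by getSubRegIdxs.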
1613 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1614 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1615 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1616 1617 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1618 .add(*Dest0) // Copy to same destination including flags and sub reg. 1619 .addReg(DestReg, 0, SubRegIdx0); 1620 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1621 .add(*Dest1) 1622 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1623 1624 CI.I->eraseFromParent(); 1625 Paired.I->eraseFromParent(); 1626 return New; 1627 } 1628 1629 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1630 CombineInfo &CI, CombineInfo &Paired, 1631 MachineBasicBlock::iterator InsertBefore) { 1632 MachineBasicBlock *MBB = CI.I->getParent(); 1633 DebugLoc DL = CI.I->getDebugLoc(); 1634 1635 const unsigned Opcode = getNewOpcode(CI, Paired); 1636 1637 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1638 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1639 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1640 1641 // Copy to the new source register. 1642 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1643 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1644 1645 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1646 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1647 1648 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1649 .add(*Src0) 1650 .addImm(SubRegIdx0) 1651 .add(*Src1) 1652 .addImm(SubRegIdx1); 1653 1654 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1655 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1656 .addReg(SrcReg, RegState::Kill); 1657 1658 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1659 MIB.add(*SAddr); 1660 1661 MachineInstr *New = 1662 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1663 .addImm(CI.CPol) 1664 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1665 1666 CI.I->eraseFromParent(); 1667 Paired.I->eraseFromParent(); 1668 return New; 1669 } 1670 1671 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1672 const CombineInfo &Paired) { 1673 const unsigned Width = CI.Width + Paired.Width; 1674 1675 switch (getCommonInstClass(CI, Paired)) { 1676 default: 1677 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1678 // FIXME: Handle d16 correctly 1679 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1680 Width); 1681 case TBUFFER_LOAD: 1682 case TBUFFER_STORE: 1683 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1684 Width); 1685 1686 case UNKNOWN: 1687 llvm_unreachable("Unknown instruction class"); 1688 case S_BUFFER_LOAD_IMM: 1689 switch (Width) { 1690 default: 1691 return 0; 1692 case 2: 1693 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1694 case 4: 1695 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1696 case 8: 1697 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1698 } 1699 case S_BUFFER_LOAD_SGPR_IMM: 1700 switch (Width) { 1701 default: 1702 return 0; 1703 case 2: 1704 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR 1705 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1706 case 4: 1707 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR 1708 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1709 case 8: 1710 return CI.Offset == 0 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR 1711 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1712 } 1713 case S_LOAD_IMM: 1714 switch (Width) { 1715 default: 1716 return 0; 1717 case 2: 1718 return AMDGPU::S_LOAD_DWORDX2_IMM; 1719 case 4: 1720 return AMDGPU::S_LOAD_DWORDX4_IMM; 1721 case 8: 1722 return AMDGPU::S_LOAD_DWORDX8_IMM; 1723 } 1724 case GLOBAL_LOAD: 1725 switch (Width) { 1726 default: 1727 return 0; 1728 case 2: 1729 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1730 case 3: 1731 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1732 case 4: 1733 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1734 } 1735 case GLOBAL_LOAD_SADDR: 1736 switch (Width) { 1737 default: 1738 return 0; 1739 case 2: 1740 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1741 case 3: 1742 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1743 case 4: 1744 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1745 } 1746 case GLOBAL_STORE: 1747 switch (Width) { 1748 default: 1749 return 0; 1750 case 2: 1751 return AMDGPU::GLOBAL_STORE_DWORDX2; 1752 case 3: 1753 return AMDGPU::GLOBAL_STORE_DWORDX3; 1754 case 4: 1755 return AMDGPU::GLOBAL_STORE_DWORDX4; 1756 } 1757 case GLOBAL_STORE_SADDR: 1758 switch (Width) { 1759 default: 1760 return 0; 1761 case 2: 1762 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1763 case 3: 1764 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1765 case 4: 1766 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1767 } 1768 case FLAT_LOAD: 1769 switch (Width) { 1770 default: 1771 return 0; 1772 case 2: 1773 return AMDGPU::FLAT_LOAD_DWORDX2; 1774 case 3: 1775 return AMDGPU::FLAT_LOAD_DWORDX3; 1776 case 4: 1777 return AMDGPU::FLAT_LOAD_DWORDX4; 1778 } 1779 case FLAT_STORE: 1780 switch (Width) { 1781 default: 1782 return 0; 1783 case 2: 1784 return AMDGPU::FLAT_STORE_DWORDX2; 1785 case 3: 1786 return AMDGPU::FLAT_STORE_DWORDX3; 1787 case 4: 1788 return AMDGPU::FLAT_STORE_DWORDX4; 1789 } 1790 case MIMG: 1791 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1792 "No overlaps"); 1793 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1794 } 1795 } 1796 1797 std::pair<unsigned, unsigned> 1798 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1799 const CombineInfo &Paired) { 1800 assert((CI.InstClass != MIMG || 1801 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1802 CI.Width + Paired.Width)) && 1803 "No overlaps"); 1804 1805 unsigned Idx0; 1806 unsigned Idx1; 1807 1808 static const unsigned Idxs[5][4] = { 1809 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1810 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1811 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1812 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1813 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1814 }; 1815 1816 assert(CI.Width >= 1 && CI.Width <= 4); 1817 assert(Paired.Width >= 1 && Paired.Width <= 4); 1818 1819 if (Paired < CI) { 1820 Idx1 = Idxs[0][Paired.Width - 1]; 1821 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1822 } else { 1823 Idx0 = Idxs[0][CI.Width - 1]; 1824 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1825 } 1826 1827 return std::pair(Idx0, Idx1); 1828 } 1829 1830 const TargetRegisterClass * 1831 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1832 const CombineInfo &Paired) { 1833 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1834 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1835 switch (CI.Width + Paired.Width) { 1836 
default: 1837 return nullptr; 1838 case 2: 1839 return &AMDGPU::SReg_64_XEXECRegClass; 1840 case 4: 1841 return &AMDGPU::SGPR_128RegClass; 1842 case 8: 1843 return &AMDGPU::SGPR_256RegClass; 1844 case 16: 1845 return &AMDGPU::SGPR_512RegClass; 1846 } 1847 } 1848 1849 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1850 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1851 ? TRI->getAGPRClassForBitWidth(BitWidth) 1852 : TRI->getVGPRClassForBitWidth(BitWidth); 1853 } 1854 1855 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1856 CombineInfo &CI, CombineInfo &Paired, 1857 MachineBasicBlock::iterator InsertBefore) { 1858 MachineBasicBlock *MBB = CI.I->getParent(); 1859 DebugLoc DL = CI.I->getDebugLoc(); 1860 1861 const unsigned Opcode = getNewOpcode(CI, Paired); 1862 1863 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1864 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1865 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1866 1867 // Copy to the new source register. 1868 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1869 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1870 1871 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1872 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1873 1874 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1875 .add(*Src0) 1876 .addImm(SubRegIdx0) 1877 .add(*Src1) 1878 .addImm(SubRegIdx1); 1879 1880 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1881 .addReg(SrcReg, RegState::Kill); 1882 1883 AddressRegs Regs = getRegs(Opcode, *TII); 1884 1885 if (Regs.VAddr) 1886 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1887 1888 1889 // It shouldn't be possible to get this far if the two instructions 1890 // don't have a single memoperand, because MachineInstr::mayAlias() 1891 // will return true if this is the case. 1892 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1893 1894 MachineInstr *New = 1895 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1896 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1897 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1898 .addImm(CI.CPol) // cpol 1899 .addImm(0) // swz 1900 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1901 1902 CI.I->eraseFromParent(); 1903 Paired.I->eraseFromParent(); 1904 return New; 1905 } 1906 1907 MachineOperand 1908 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1909 APInt V(32, Val, true); 1910 if (TII->isInlineConstant(V)) 1911 return MachineOperand::CreateImm(Val); 1912 1913 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1914 MachineInstr *Mov = 1915 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1916 TII->get(AMDGPU::S_MOV_B32), Reg) 1917 .addImm(Val); 1918 (void)Mov; 1919 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1920 return MachineOperand::CreateReg(Reg, false); 1921 } 1922 1923 // Compute base address using Addr and return the final register. 
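// A minimal sketch of the expansion, assuming a small inline-constant offset
// such as 16 (larger offsets are first materialized into an SGPR via
// createRegOrImm); the register names below are illustrative placeholders:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Base.Lo, 16, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64 %Base.Hi, 0, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1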
1924 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1925 const MemAddress &Addr) const { 1926 MachineBasicBlock *MBB = MI.getParent(); 1927 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1928 DebugLoc DL = MI.getDebugLoc(); 1929 1930 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1931 Addr.Base.LoSubReg) && 1932 "Expected 32-bit Base-Register-Low!!"); 1933 1934 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1935 Addr.Base.HiSubReg) && 1936 "Expected 32-bit Base-Register-Hi!!"); 1937 1938 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1939 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1940 MachineOperand OffsetHi = 1941 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1942 1943 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1944 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1945 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1946 1947 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1948 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1949 MachineInstr *LoHalf = 1950 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1951 .addReg(CarryReg, RegState::Define) 1952 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1953 .add(OffsetLo) 1954 .addImm(0); // clamp bit 1955 (void)LoHalf; 1956 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1957 1958 MachineInstr *HiHalf = 1959 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1960 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1961 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1962 .add(OffsetHi) 1963 .addReg(CarryReg, RegState::Kill) 1964 .addImm(0); // clamp bit 1965 (void)HiHalf; 1966 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1967 1968 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1969 MachineInstr *FullBase = 1970 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1971 .addReg(DestSub0) 1972 .addImm(AMDGPU::sub0) 1973 .addReg(DestSub1) 1974 .addImm(AMDGPU::sub1); 1975 (void)FullBase; 1976 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1977 1978 return FullDestReg; 1979 } 1980 1981 // Update base and offset with the NewBase and NewOffset in MI. 
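// Note: the vaddr and offset operands are assumed to be present; this helper
// is only used for the global memory instructions handled by
// promoteConstantOffsetToImm.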
1982 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 1983 Register NewBase, 1984 int32_t NewOffset) const { 1985 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 1986 Base->setReg(NewBase); 1987 Base->setIsKill(false); 1988 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 1989 } 1990 1991 std::optional<int32_t> 1992 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 1993 if (Op.isImm()) 1994 return Op.getImm(); 1995 1996 if (!Op.isReg()) 1997 return std::nullopt; 1998 1999 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 2000 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 2001 !Def->getOperand(1).isImm()) 2002 return std::nullopt; 2003 2004 return Def->getOperand(1).getImm(); 2005 } 2006 2007 // Analyze Base and extracts: 2008 // - 32bit base registers, subregisters 2009 // - 64bit constant offset 2010 // Expecting base computation as: 2011 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 2012 // %LO:vgpr_32, %c:sreg_64_xexec = 2013 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 2014 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 2015 // %Base:vreg_64 = 2016 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 2017 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 2018 MemAddress &Addr) const { 2019 if (!Base.isReg()) 2020 return; 2021 2022 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2023 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2024 || Def->getNumOperands() != 5) 2025 return; 2026 2027 MachineOperand BaseLo = Def->getOperand(1); 2028 MachineOperand BaseHi = Def->getOperand(3); 2029 if (!BaseLo.isReg() || !BaseHi.isReg()) 2030 return; 2031 2032 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2033 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2034 2035 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2036 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2037 return; 2038 2039 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2040 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2041 2042 auto Offset0P = extractConstOffset(*Src0); 2043 if (Offset0P) 2044 BaseLo = *Src1; 2045 else { 2046 if (!(Offset0P = extractConstOffset(*Src1))) 2047 return; 2048 BaseLo = *Src0; 2049 } 2050 2051 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2052 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2053 2054 if (Src0->isImm()) 2055 std::swap(Src0, Src1); 2056 2057 if (!Src1->isImm()) 2058 return; 2059 2060 uint64_t Offset1 = Src1->getImm(); 2061 BaseHi = *Src0; 2062 2063 Addr.Base.LoReg = BaseLo.getReg(); 2064 Addr.Base.HiReg = BaseHi.getReg(); 2065 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2066 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2067 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2068 } 2069 2070 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2071 MachineInstr &MI, 2072 MemInfoMap &Visited, 2073 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2074 2075 if (!(MI.mayLoad() ^ MI.mayStore())) 2076 return false; 2077 2078 // TODO: Support flat and scratch. 
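  // Only global accesses with a SADDR variant are considered for now;
  // getGlobalSaddrOp returns a negative value for any other opcode.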
2079   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2080     return false;
2081
2082   if (MI.mayLoad() &&
2083       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2084     return false;
2085
2086   if (AnchorList.count(&MI))
2087     return false;
2088
2089   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2090
2091   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2092     LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2093     return false;
2094   }
2095
2096   // Step1: Find the base registers and a 64-bit constant offset.
2097   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2098   MemAddress MAddr;
2099   if (!Visited.contains(&MI)) {
2100     processBaseWithConstOffset(Base, MAddr);
2101     Visited[&MI] = MAddr;
2102   } else
2103     MAddr = Visited[&MI];
2104
2105   if (MAddr.Offset == 0) {
2106     LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2107                          " constant offsets that can be promoted.\n";);
2108     return false;
2109   }
2110
2111   LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2112              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2113
2114   // Step2: Traverse MI's basic block and find an anchor (an instruction with
2115   // the same base registers) whose offset has the largest legal 13-bit
2116   // distance from MI's offset. E.g. (64-bit loads):
2117   // bb:
2118   // addr1 = &a + 4096;   load1 = load(addr1, 0)
2119   // addr2 = &a + 6144;   load2 = load(addr2, 0)
2120   // addr3 = &a + 8192;   load3 = load(addr3, 0)
2121   // addr4 = &a + 10240;  load4 = load(addr4, 0)
2122   // addr5 = &a + 12288;  load5 = load(addr5, 0)
2123   //
2124   // Starting from the first load, the optimization tries to find a new base
2125   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2126   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
2127   // the new base (anchor) because the larger distance can presumably
2128   // accommodate more intermediate bases.
2129   //
2130   // Step3: Move (&a + 8192) above load1. Compute and promote offsets from
2131   // (&a + 8192) for load1, load2, load4.
2132   // addr = &a + 8192
2133   // load1 = load(addr, -4096)
2134   // load2 = load(addr, -2048)
2135   // load3 = load(addr, 0)
2136   // load4 = load(addr, 2048)
2137   // addr5 = &a + 12288;  load5 = load(addr5, 0)
2138   //
2139   MachineInstr *AnchorInst = nullptr;
2140   MemAddress AnchorAddr;
2141   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2142   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2143
2144   MachineBasicBlock *MBB = MI.getParent();
2145   MachineBasicBlock::iterator E = MBB->end();
2146   MachineBasicBlock::iterator MBBI = MI.getIterator();
2147   ++MBBI;
2148   const SITargetLowering *TLI =
2149       static_cast<const SITargetLowering *>(STM->getTargetLowering());
2150
2151   for ( ; MBBI != E; ++MBBI) {
2152     MachineInstr &MINext = *MBBI;
2153     // TODO: Support finding an anchor (with the same base) from store
2154     // addresses or any other load addresses where the opcodes are different.
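    // For now only instructions with the same opcode as MI and a zero
    // immediate offset are considered as anchor candidates.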
2155 if (MINext.getOpcode() != MI.getOpcode() || 2156 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2157 continue; 2158 2159 const MachineOperand &BaseNext = 2160 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2161 MemAddress MAddrNext; 2162 if (!Visited.contains(&MINext)) { 2163 processBaseWithConstOffset(BaseNext, MAddrNext); 2164 Visited[&MINext] = MAddrNext; 2165 } else 2166 MAddrNext = Visited[&MINext]; 2167 2168 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2169 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2170 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2171 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2172 continue; 2173 2174 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset)); 2175 2176 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2177 TargetLoweringBase::AddrMode AM; 2178 AM.HasBaseReg = true; 2179 AM.BaseOffs = Dist; 2180 if (TLI->isLegalGlobalAddressingMode(AM) && 2181 (uint32_t)std::abs(Dist) > MaxDist) { 2182 MaxDist = std::abs(Dist); 2183 2184 AnchorAddr = MAddrNext; 2185 AnchorInst = &MINext; 2186 } 2187 } 2188 2189 if (AnchorInst) { 2190 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2191 AnchorInst->dump()); 2192 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2193 << AnchorAddr.Offset << "\n\n"); 2194 2195 // Instead of moving up, just re-compute anchor-instruction's base address. 2196 Register Base = computeBase(MI, AnchorAddr); 2197 2198 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2199 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2200 2201 for (auto P : InstsWCommonBase) { 2202 TargetLoweringBase::AddrMode AM; 2203 AM.HasBaseReg = true; 2204 AM.BaseOffs = P.second - AnchorAddr.Offset; 2205 2206 if (TLI->isLegalGlobalAddressingMode(AM)) { 2207 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; 2208 dbgs() << ")"; P.first->dump()); 2209 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); 2210 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); 2211 } 2212 } 2213 AnchorList.insert(AnchorInst); 2214 return true; 2215 } 2216 2217 return false; 2218 } 2219 2220 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2221 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2222 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2223 if (AddrList.front().InstClass == CI.InstClass && 2224 AddrList.front().IsAGPR == CI.IsAGPR && 2225 AddrList.front().hasSameBaseAddress(CI)) { 2226 AddrList.emplace_back(CI); 2227 return; 2228 } 2229 } 2230 2231 // Base address not found, so add a new list. 2232 MergeableInsts.emplace_back(1, CI); 2233 } 2234 2235 std::pair<MachineBasicBlock::iterator, bool> 2236 SILoadStoreOptimizer::collectMergeableInsts( 2237 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2238 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2239 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2240 bool Modified = false; 2241 2242 // Sort potential mergeable instructions into lists. One list per base address. 2243 unsigned Order = 0; 2244 MachineBasicBlock::iterator BlockI = Begin; 2245 for (; BlockI != End; ++BlockI) { 2246 MachineInstr &MI = *BlockI; 2247 2248 // We run this before checking if an address is mergeable, because it can produce 2249 // better code even if the instructions aren't mergeable. 
2250     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2251       Modified = true;
2252
2253     // Treat volatile accesses, ordered accesses, and unmodeled side effects
2254     // as barriers. Merging can resume after such a barrier, in a separate list.
2255     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2256       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2257
2258       // Search will resume after this instruction in a separate merge list.
2259       ++BlockI;
2260       break;
2261     }
2262
2263     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2264     if (InstClass == UNKNOWN)
2265       continue;
2266
2267     // Do not merge VMEM buffer instructions with the "swizzled" bit set.
2268     int Swizzled =
2269         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2270     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2271       continue;
2272
2273     CombineInfo CI;
2274     CI.setMI(MI, *this);
2275     CI.Order = Order++;
2276
2277     if (!CI.hasMergeableAddress(*MRI))
2278       continue;
2279
2280     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2281       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2282       //        operands. However, we report that ds_write2 takes only VGPR
2283       //        data so that machine copy propagation does not create an
2284       //        illegal instruction with VGPR and AGPR sources. Consequently,
2285       //        if we created such an instruction, the verifier would
2286       //        complain.
2287       continue;
2288     }
2289
2290     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2291
2292     addInstToMergeableList(CI, MergeableInsts);
2293   }
2294
2295   // At this point we have lists of mergeable instructions.
2296   //
2297   // Part 2: Sort each list by offset, and then for each CombineInfo object in
2298   // the list try to find an instruction that can be merged with it. If one is
2299   // found, it is stored in the Paired field. If none is found, the CombineInfo
2300   // object is deleted from the list.
2301
2302   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2303        E = MergeableInsts.end(); I != E;) {
2304
2305     std::list<CombineInfo> &MergeList = *I;
2306     if (MergeList.size() <= 1) {
2307       // This means we have found only one instruction with a given address
2308       // that can be merged, and we need at least 2 instructions to do a merge,
2309       // so this list can be discarded.
2310       I = MergeableInsts.erase(I);
2311       continue;
2312     }
2313
2314     // Sort the lists by offsets; this way mergeable instructions will be
2315     // adjacent to each other in the list, which will make it easier to find
2316     // matches.
2317     MergeList.sort(
2318         [] (const CombineInfo &A, const CombineInfo &B) {
2319           return A.Offset < B.Offset;
2320         });
2321     ++I;
2322   }
2323
2324   return std::pair(BlockI, Modified);
2325 }
2326
2327 // Scan through looking for adjacent LDS operations with constant offsets from
2328 // the same base register. We rely on the scheduler to do the hard work of
2329 // clustering nearby loads, and assume these are all adjacent.
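// Each list produced by collectMergeableInsts holds candidates that share a
// base address. optimizeBlock repeatedly tries to pair and merge entries
// within a list and drops the list once no further merges are possible.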
2330 bool SILoadStoreOptimizer::optimizeBlock( 2331 std::list<std::list<CombineInfo> > &MergeableInsts) { 2332 bool Modified = false; 2333 2334 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2335 E = MergeableInsts.end(); I != E;) { 2336 std::list<CombineInfo> &MergeList = *I; 2337 2338 bool OptimizeListAgain = false; 2339 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2340 // We weren't able to make any changes, so delete the list so we don't 2341 // process the same instructions the next time we try to optimize this 2342 // block. 2343 I = MergeableInsts.erase(I); 2344 continue; 2345 } 2346 2347 Modified = true; 2348 2349 // We made changes, but also determined that there were no more optimization 2350 // opportunities, so we don't need to reprocess the list 2351 if (!OptimizeListAgain) { 2352 I = MergeableInsts.erase(I); 2353 continue; 2354 } 2355 OptimizeAgain = true; 2356 } 2357 return Modified; 2358 } 2359 2360 bool 2361 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2362 std::list<CombineInfo> &MergeList, 2363 bool &OptimizeListAgain) { 2364 if (MergeList.empty()) 2365 return false; 2366 2367 bool Modified = false; 2368 2369 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2370 Next = std::next(I)) { 2371 2372 auto First = I; 2373 auto Second = Next; 2374 2375 if ((*First).Order > (*Second).Order) 2376 std::swap(First, Second); 2377 CombineInfo &CI = *First; 2378 CombineInfo &Paired = *Second; 2379 2380 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2381 if (!Where) { 2382 ++I; 2383 continue; 2384 } 2385 2386 Modified = true; 2387 2388 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2389 2390 MachineBasicBlock::iterator NewMI; 2391 switch (CI.InstClass) { 2392 default: 2393 llvm_unreachable("unknown InstClass"); 2394 break; 2395 case DS_READ: 2396 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2397 break; 2398 case DS_WRITE: 2399 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2400 break; 2401 case S_BUFFER_LOAD_IMM: 2402 case S_BUFFER_LOAD_SGPR_IMM: 2403 case S_LOAD_IMM: 2404 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2405 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2406 break; 2407 case BUFFER_LOAD: 2408 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2409 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2410 break; 2411 case BUFFER_STORE: 2412 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2413 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2414 break; 2415 case MIMG: 2416 NewMI = mergeImagePair(CI, Paired, Where->I); 2417 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2418 break; 2419 case TBUFFER_LOAD: 2420 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2421 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2422 break; 2423 case TBUFFER_STORE: 2424 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2425 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2426 break; 2427 case FLAT_LOAD: 2428 case GLOBAL_LOAD: 2429 case GLOBAL_LOAD_SADDR: 2430 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2431 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2432 break; 2433 case FLAT_STORE: 2434 case GLOBAL_STORE: 2435 case GLOBAL_STORE_SADDR: 2436 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2437 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2438 break; 2439 } 2440 CI.setMI(NewMI, *this); 2441 CI.Order = Where->Order; 2442 if (I == Second) 2443 I = Next; 2444 2445 MergeList.erase(Second); 2446 } 2447 2448 return Modified; 2449 } 2450 
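// Illustrative sketch (not taken from a real test case) of how the
// re-optimization loop below widens accesses over multiple rounds:
//   buffer_load_dword v0, ... offset:0
//   buffer_load_dword v1, ... offset:4
//   buffer_load_dword v2, ... offset:8
//   buffer_load_dword v3, ... offset:12
// Round 1 merges the two adjacent pairs into two buffer_load_dwordx2 loads.
// Since the merged width (2) is still below the limit, OptimizeAgain is set
// and a second round merges them into a single buffer_load_dwordx4.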
2451 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2452   if (skipFunction(MF.getFunction()))
2453     return false;
2454
2455   STM = &MF.getSubtarget<GCNSubtarget>();
2456   if (!STM->loadStoreOptEnabled())
2457     return false;
2458
2459   TII = STM->getInstrInfo();
2460   TRI = &TII->getRegisterInfo();
2461
2462   MRI = &MF.getRegInfo();
2463   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2464
2465   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2466
2467   bool Modified = false;
2468
2469   // Contains the list of instructions for which constant offsets are being
2470   // promoted to the immediate. This is tracked for an entire block at a time.
2471   SmallPtrSet<MachineInstr *, 4> AnchorList;
2472   MemInfoMap Visited;
2473
2474   for (MachineBasicBlock &MBB : MF) {
2475     MachineBasicBlock::iterator SectionEnd;
2476     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2477          I = SectionEnd) {
2478       bool CollectModified;
2479       std::list<std::list<CombineInfo>> MergeableInsts;
2480
2481       // First pass: Collect a list of all the instructions we know how to
2482       // merge in a subset of the block.
2483       std::tie(SectionEnd, CollectModified) =
2484           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2485
2486       Modified |= CollectModified;
2487
2488       do {
2489         OptimizeAgain = false;
2490         Modified |= optimizeBlock(MergeableInsts);
2491       } while (OptimizeAgain);
2492     }
2493
2494     Visited.clear();
2495     AnchorList.clear();
2496   }
2497
2498   return Modified;
2499 }
2500