//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13bit constant offset and then promotes the 13bit offset
// to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
570b57cec5SDimitry Andric // 580b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 590b57cec5SDimitry Andric 600b57cec5SDimitry Andric #include "AMDGPU.h" 61e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 620b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 630b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h" 640b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 65480093f4SDimitry Andric #include "llvm/InitializePasses.h" 660b57cec5SDimitry Andric 670b57cec5SDimitry Andric using namespace llvm; 680b57cec5SDimitry Andric 690b57cec5SDimitry Andric #define DEBUG_TYPE "si-load-store-opt" 700b57cec5SDimitry Andric 710b57cec5SDimitry Andric namespace { 720b57cec5SDimitry Andric enum InstClassEnum { 730b57cec5SDimitry Andric UNKNOWN, 740b57cec5SDimitry Andric DS_READ, 750b57cec5SDimitry Andric DS_WRITE, 760b57cec5SDimitry Andric S_BUFFER_LOAD_IMM, 77bdd1243dSDimitry Andric S_BUFFER_LOAD_SGPR_IMM, 78bdd1243dSDimitry Andric S_LOAD_IMM, 798bcb0991SDimitry Andric BUFFER_LOAD, 808bcb0991SDimitry Andric BUFFER_STORE, 818bcb0991SDimitry Andric MIMG, 82480093f4SDimitry Andric TBUFFER_LOAD, 83480093f4SDimitry Andric TBUFFER_STORE, 8481ad6265SDimitry Andric GLOBAL_LOAD_SADDR, 8581ad6265SDimitry Andric GLOBAL_STORE_SADDR, 8681ad6265SDimitry Andric FLAT_LOAD, 8781ad6265SDimitry Andric FLAT_STORE, 8881ad6265SDimitry Andric GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of 8981ad6265SDimitry Andric GLOBAL_STORE // any CombineInfo, they are only ever returned by 9081ad6265SDimitry Andric // getCommonInstClass. 
910b57cec5SDimitry Andric }; 920b57cec5SDimitry Andric 935ffd83dbSDimitry Andric struct AddressRegs { 945ffd83dbSDimitry Andric unsigned char NumVAddrs = 0; 955ffd83dbSDimitry Andric bool SBase = false; 965ffd83dbSDimitry Andric bool SRsrc = false; 975ffd83dbSDimitry Andric bool SOffset = false; 9881ad6265SDimitry Andric bool SAddr = false; 995ffd83dbSDimitry Andric bool VAddr = false; 1005ffd83dbSDimitry Andric bool Addr = false; 1015ffd83dbSDimitry Andric bool SSamp = false; 1020b57cec5SDimitry Andric }; 1030b57cec5SDimitry Andric 1045ffd83dbSDimitry Andric // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 1055ffd83dbSDimitry Andric const unsigned MaxAddressRegs = 12 + 1 + 1; 1065ffd83dbSDimitry Andric 1070b57cec5SDimitry Andric class SILoadStoreOptimizer : public MachineFunctionPass { 1080b57cec5SDimitry Andric struct CombineInfo { 1090b57cec5SDimitry Andric MachineBasicBlock::iterator I; 1100b57cec5SDimitry Andric unsigned EltSize; 111480093f4SDimitry Andric unsigned Offset; 112480093f4SDimitry Andric unsigned Width; 113480093f4SDimitry Andric unsigned Format; 1140b57cec5SDimitry Andric unsigned BaseOff; 115480093f4SDimitry Andric unsigned DMask; 1160b57cec5SDimitry Andric InstClassEnum InstClass; 117fe6060f1SDimitry Andric unsigned CPol = 0; 11804eeddc0SDimitry Andric bool IsAGPR; 1190b57cec5SDimitry Andric bool UseST64; 1205ffd83dbSDimitry Andric int AddrIdx[MaxAddressRegs]; 1215ffd83dbSDimitry Andric const MachineOperand *AddrReg[MaxAddressRegs]; 1228bcb0991SDimitry Andric unsigned NumAddresses; 1235ffd83dbSDimitry Andric unsigned Order; 1248bcb0991SDimitry Andric 125bdd1243dSDimitry Andric bool hasSameBaseAddress(const CombineInfo &CI) { 126bdd1243dSDimitry Andric if (NumAddresses != CI.NumAddresses) 127bdd1243dSDimitry Andric return false; 128bdd1243dSDimitry Andric 129bdd1243dSDimitry Andric const MachineInstr &MI = *CI.I; 1308bcb0991SDimitry Andric for (unsigned i = 0; i < NumAddresses; i++) { 1318bcb0991SDimitry Andric const 
MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 1328bcb0991SDimitry Andric 1338bcb0991SDimitry Andric if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 1348bcb0991SDimitry Andric if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 1358bcb0991SDimitry Andric AddrReg[i]->getImm() != AddrRegNext.getImm()) { 1368bcb0991SDimitry Andric return false; 1378bcb0991SDimitry Andric } 1388bcb0991SDimitry Andric continue; 1398bcb0991SDimitry Andric } 1408bcb0991SDimitry Andric 1418bcb0991SDimitry Andric // Check same base pointer. Be careful of subregisters, which can occur 1428bcb0991SDimitry Andric // with vectors of pointers. 1438bcb0991SDimitry Andric if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 1448bcb0991SDimitry Andric AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 1458bcb0991SDimitry Andric return false; 1468bcb0991SDimitry Andric } 1478bcb0991SDimitry Andric } 1488bcb0991SDimitry Andric return true; 1498bcb0991SDimitry Andric } 1508bcb0991SDimitry Andric 1518bcb0991SDimitry Andric bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 1528bcb0991SDimitry Andric for (unsigned i = 0; i < NumAddresses; ++i) { 1538bcb0991SDimitry Andric const MachineOperand *AddrOp = AddrReg[i]; 1548bcb0991SDimitry Andric // Immediates are always OK. 1558bcb0991SDimitry Andric if (AddrOp->isImm()) 1568bcb0991SDimitry Andric continue; 1578bcb0991SDimitry Andric 1588bcb0991SDimitry Andric // Don't try to merge addresses that aren't either immediates or registers. 1598bcb0991SDimitry Andric // TODO: Should be possible to merge FrameIndexes and maybe some other 1608bcb0991SDimitry Andric // non-register 1618bcb0991SDimitry Andric if (!AddrOp->isReg()) 1628bcb0991SDimitry Andric return false; 1638bcb0991SDimitry Andric 1645f757f3fSDimitry Andric // TODO: We should be able to merge instructions with other physical reg 1655f757f3fSDimitry Andric // addresses too. 
1665f757f3fSDimitry Andric if (AddrOp->getReg().isPhysical() && 1675f757f3fSDimitry Andric AddrOp->getReg() != AMDGPU::SGPR_NULL) 1688bcb0991SDimitry Andric return false; 1698bcb0991SDimitry Andric 170bdd1243dSDimitry Andric // If an address has only one use then there will be no other 1718bcb0991SDimitry Andric // instructions with the same address, so we can't merge this one. 1728bcb0991SDimitry Andric if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 1738bcb0991SDimitry Andric return false; 1748bcb0991SDimitry Andric } 1758bcb0991SDimitry Andric return true; 1768bcb0991SDimitry Andric } 1778bcb0991SDimitry Andric 17804eeddc0SDimitry Andric void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 17981ad6265SDimitry Andric 18081ad6265SDimitry Andric // Compare by pointer order. 18181ad6265SDimitry Andric bool operator<(const CombineInfo& Other) const { 18281ad6265SDimitry Andric return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset; 18381ad6265SDimitry Andric } 1840b57cec5SDimitry Andric }; 1850b57cec5SDimitry Andric 1860b57cec5SDimitry Andric struct BaseRegisters { 1875ffd83dbSDimitry Andric Register LoReg; 1885ffd83dbSDimitry Andric Register HiReg; 1890b57cec5SDimitry Andric 1900b57cec5SDimitry Andric unsigned LoSubReg = 0; 1910b57cec5SDimitry Andric unsigned HiSubReg = 0; 1920b57cec5SDimitry Andric }; 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric struct MemAddress { 1950b57cec5SDimitry Andric BaseRegisters Base; 1960b57cec5SDimitry Andric int64_t Offset = 0; 1970b57cec5SDimitry Andric }; 1980b57cec5SDimitry Andric 1990b57cec5SDimitry Andric using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 2000b57cec5SDimitry Andric 2010b57cec5SDimitry Andric private: 2020b57cec5SDimitry Andric const GCNSubtarget *STM = nullptr; 2030b57cec5SDimitry Andric const SIInstrInfo *TII = nullptr; 2040b57cec5SDimitry Andric const SIRegisterInfo *TRI = nullptr; 2050b57cec5SDimitry Andric MachineRegisterInfo *MRI = nullptr; 
2060b57cec5SDimitry Andric AliasAnalysis *AA = nullptr; 2070b57cec5SDimitry Andric bool OptimizeAgain; 2080b57cec5SDimitry Andric 20981ad6265SDimitry Andric bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 21081ad6265SDimitry Andric const DenseSet<Register> &ARegUses, 21181ad6265SDimitry Andric const MachineInstr &A, const MachineInstr &B) const; 212480093f4SDimitry Andric static bool dmasksCanBeCombined(const CombineInfo &CI, 213480093f4SDimitry Andric const SIInstrInfo &TII, 214480093f4SDimitry Andric const CombineInfo &Paired); 2155ffd83dbSDimitry Andric static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 2165ffd83dbSDimitry Andric CombineInfo &Paired, bool Modify = false); 2175ffd83dbSDimitry Andric static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218480093f4SDimitry Andric const CombineInfo &Paired); 219*0fca6ea1SDimitry Andric unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220480093f4SDimitry Andric static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221480093f4SDimitry Andric const CombineInfo &Paired); 222*0fca6ea1SDimitry Andric const TargetRegisterClass * 223*0fca6ea1SDimitry Andric getTargetRegisterClass(const CombineInfo &CI, 224*0fca6ea1SDimitry Andric const CombineInfo &Paired) const; 225fe6060f1SDimitry Andric const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 2260b57cec5SDimitry Andric 22781ad6265SDimitry Andric CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 2280b57cec5SDimitry Andric 229*0fca6ea1SDimitry Andric void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, 230*0fca6ea1SDimitry Andric MachineBasicBlock::iterator InsertBefore, int OpName, 231*0fca6ea1SDimitry Andric Register DestReg) const; 232*0fca6ea1SDimitry Andric Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, 233*0fca6ea1SDimitry Andric MachineBasicBlock::iterator InsertBefore, 234*0fca6ea1SDimitry Andric int OpName) 
const; 235*0fca6ea1SDimitry Andric 2360b57cec5SDimitry Andric unsigned read2Opcode(unsigned EltSize) const; 2370b57cec5SDimitry Andric unsigned read2ST64Opcode(unsigned EltSize) const; 23881ad6265SDimitry Andric MachineBasicBlock::iterator 23981ad6265SDimitry Andric mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 24081ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2410b57cec5SDimitry Andric 2420b57cec5SDimitry Andric unsigned write2Opcode(unsigned EltSize) const; 2430b57cec5SDimitry Andric unsigned write2ST64Opcode(unsigned EltSize) const; 2445ffd83dbSDimitry Andric MachineBasicBlock::iterator 2455ffd83dbSDimitry Andric mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 24681ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2475ffd83dbSDimitry Andric MachineBasicBlock::iterator 2485ffd83dbSDimitry Andric mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 24981ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2505ffd83dbSDimitry Andric MachineBasicBlock::iterator 251bdd1243dSDimitry Andric mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 25281ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2535ffd83dbSDimitry Andric MachineBasicBlock::iterator 2545ffd83dbSDimitry Andric mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 25581ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2565ffd83dbSDimitry Andric MachineBasicBlock::iterator 2575ffd83dbSDimitry Andric mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 25881ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2595ffd83dbSDimitry Andric MachineBasicBlock::iterator 2605ffd83dbSDimitry Andric mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 26181ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2625ffd83dbSDimitry Andric MachineBasicBlock::iterator 2635ffd83dbSDimitry Andric mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 26481ad6265SDimitry Andric 
MachineBasicBlock::iterator InsertBefore); 26581ad6265SDimitry Andric MachineBasicBlock::iterator 26681ad6265SDimitry Andric mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 26781ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 26881ad6265SDimitry Andric MachineBasicBlock::iterator 26981ad6265SDimitry Andric mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 27081ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore); 2710b57cec5SDimitry Andric 2725ffd83dbSDimitry Andric void updateBaseAndOffset(MachineInstr &I, Register NewBase, 2738bcb0991SDimitry Andric int32_t NewOffset) const; 2745ffd83dbSDimitry Andric Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 2758bcb0991SDimitry Andric MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 276bdd1243dSDimitry Andric std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 2778bcb0991SDimitry Andric void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 2780b57cec5SDimitry Andric /// Promotes constant offset to the immediate by adjusting the base. It 2790b57cec5SDimitry Andric /// tries to use a base from the nearby instructions that allows it to have 2800b57cec5SDimitry Andric /// a 13bit constant offset which gets promoted to the immediate. 
2810b57cec5SDimitry Andric bool promoteConstantOffsetToImm(MachineInstr &CI, 2820b57cec5SDimitry Andric MemInfoMap &Visited, 2838bcb0991SDimitry Andric SmallPtrSet<MachineInstr *, 4> &Promoted) const; 2848bcb0991SDimitry Andric void addInstToMergeableList(const CombineInfo &CI, 2858bcb0991SDimitry Andric std::list<std::list<CombineInfo> > &MergeableInsts) const; 2865ffd83dbSDimitry Andric 2875ffd83dbSDimitry Andric std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 2885ffd83dbSDimitry Andric MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2895ffd83dbSDimitry Andric MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2908bcb0991SDimitry Andric std::list<std::list<CombineInfo>> &MergeableInsts) const; 2910b57cec5SDimitry Andric 29281ad6265SDimitry Andric static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 29381ad6265SDimitry Andric const CombineInfo &Paired); 29481ad6265SDimitry Andric 29581ad6265SDimitry Andric static InstClassEnum getCommonInstClass(const CombineInfo &CI, 29681ad6265SDimitry Andric const CombineInfo &Paired); 29781ad6265SDimitry Andric 2980b57cec5SDimitry Andric public: 2990b57cec5SDimitry Andric static char ID; 3000b57cec5SDimitry Andric 3010b57cec5SDimitry Andric SILoadStoreOptimizer() : MachineFunctionPass(ID) { 3020b57cec5SDimitry Andric initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 3030b57cec5SDimitry Andric } 3040b57cec5SDimitry Andric 3058bcb0991SDimitry Andric bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 3068bcb0991SDimitry Andric bool &OptimizeListAgain); 3078bcb0991SDimitry Andric bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 3080b57cec5SDimitry Andric 3090b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 3100b57cec5SDimitry Andric 3110b57cec5SDimitry Andric StringRef getPassName() const override { return "SI Load Store Optimizer"; } 3120b57cec5SDimitry Andric 
3130b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 3140b57cec5SDimitry Andric AU.setPreservesCFG(); 3150b57cec5SDimitry Andric AU.addRequired<AAResultsWrapperPass>(); 3160b57cec5SDimitry Andric 3170b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 3180b57cec5SDimitry Andric } 3195ffd83dbSDimitry Andric 3205ffd83dbSDimitry Andric MachineFunctionProperties getRequiredProperties() const override { 3215ffd83dbSDimitry Andric return MachineFunctionProperties() 3225ffd83dbSDimitry Andric .set(MachineFunctionProperties::Property::IsSSA); 3235ffd83dbSDimitry Andric } 3240b57cec5SDimitry Andric }; 3250b57cec5SDimitry Andric 3268bcb0991SDimitry Andric static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 3278bcb0991SDimitry Andric const unsigned Opc = MI.getOpcode(); 3288bcb0991SDimitry Andric 3298bcb0991SDimitry Andric if (TII.isMUBUF(Opc)) { 3308bcb0991SDimitry Andric // FIXME: Handle d16 correctly 3318bcb0991SDimitry Andric return AMDGPU::getMUBUFElements(Opc); 3328bcb0991SDimitry Andric } 3335f757f3fSDimitry Andric if (TII.isImage(MI)) { 3348bcb0991SDimitry Andric uint64_t DMaskImm = 3358bcb0991SDimitry Andric TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 336bdd1243dSDimitry Andric return llvm::popcount(DMaskImm); 3378bcb0991SDimitry Andric } 338480093f4SDimitry Andric if (TII.isMTBUF(Opc)) { 339480093f4SDimitry Andric return AMDGPU::getMTBUFElements(Opc); 340480093f4SDimitry Andric } 3418bcb0991SDimitry Andric 3428bcb0991SDimitry Andric switch (Opc) { 3438bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 344bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 345bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORD_IMM: 34681ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD: 34781ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 34881ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD: 34981ad6265SDimitry Andric case 
AMDGPU::GLOBAL_STORE_DWORD_SADDR: 35081ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORD: 35181ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORD: 3528bcb0991SDimitry Andric return 1; 3538bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 354bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 355bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM: 356*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 35781ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2: 35881ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 35981ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2: 36081ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 36181ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX2: 36281ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX2: 3638bcb0991SDimitry Andric return 2; 3645f757f3fSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 3655f757f3fSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 3665f757f3fSDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM: 367*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 36881ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3: 36981ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 37081ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3: 37181ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 37281ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX3: 37381ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX3: 37481ad6265SDimitry Andric return 3; 3758bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 376bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 377bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM: 378*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 37981ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4: 38081ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 38181ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4: 
38281ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 38381ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX4: 38481ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX4: 3858bcb0991SDimitry Andric return 4; 386349cc55cSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 387bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 388bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM: 389*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 390349cc55cSDimitry Andric return 8; 391*0fca6ea1SDimitry Andric case AMDGPU::DS_READ_B32: 392*0fca6ea1SDimitry Andric case AMDGPU::DS_READ_B32_gfx9: 393*0fca6ea1SDimitry Andric case AMDGPU::DS_WRITE_B32: 394fe6060f1SDimitry Andric case AMDGPU::DS_WRITE_B32_gfx9: 395fe6060f1SDimitry Andric return 1; 396*0fca6ea1SDimitry Andric case AMDGPU::DS_READ_B64: 397*0fca6ea1SDimitry Andric case AMDGPU::DS_READ_B64_gfx9: 398*0fca6ea1SDimitry Andric case AMDGPU::DS_WRITE_B64: 399fe6060f1SDimitry Andric case AMDGPU::DS_WRITE_B64_gfx9: 400fe6060f1SDimitry Andric return 2; 4018bcb0991SDimitry Andric default: 4028bcb0991SDimitry Andric return 0; 4038bcb0991SDimitry Andric } 4048bcb0991SDimitry Andric } 4058bcb0991SDimitry Andric 4068bcb0991SDimitry Andric /// Maps instruction opcode to enum InstClassEnum. 
4078bcb0991SDimitry Andric static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 4088bcb0991SDimitry Andric switch (Opc) { 4098bcb0991SDimitry Andric default: 4108bcb0991SDimitry Andric if (TII.isMUBUF(Opc)) { 4118bcb0991SDimitry Andric switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 4128bcb0991SDimitry Andric default: 4138bcb0991SDimitry Andric return UNKNOWN; 414*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: 415*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: 416*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: 417*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: 4188bcb0991SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 4198bcb0991SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 4208bcb0991SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 4218bcb0991SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 422*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: 423*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: 424*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: 425*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: 4265f757f3fSDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: 4275f757f3fSDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: 4285f757f3fSDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: 4295f757f3fSDimitry Andric case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: 4308bcb0991SDimitry Andric return BUFFER_LOAD; 431*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: 432*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: 433*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_IDXEN: 434*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: 4358bcb0991SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 4368bcb0991SDimitry Andric case 
AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 4378bcb0991SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 4388bcb0991SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 439*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: 440*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: 441*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: 442*0fca6ea1SDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: 4435f757f3fSDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: 4445f757f3fSDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: 4455f757f3fSDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: 4465f757f3fSDimitry Andric case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: 4478bcb0991SDimitry Andric return BUFFER_STORE; 4488bcb0991SDimitry Andric } 4498bcb0991SDimitry Andric } 4505f757f3fSDimitry Andric if (TII.isImage(Opc)) { 4518bcb0991SDimitry Andric // Ignore instructions encoded without vaddr. 452bdd1243dSDimitry Andric if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 453bdd1243dSDimitry Andric !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 4548bcb0991SDimitry Andric return UNKNOWN; 455349cc55cSDimitry Andric // Ignore BVH instructions 456349cc55cSDimitry Andric if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 457349cc55cSDimitry Andric return UNKNOWN; 4588bcb0991SDimitry Andric // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
459480093f4SDimitry Andric if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 460480093f4SDimitry Andric TII.isGather4(Opc)) 4618bcb0991SDimitry Andric return UNKNOWN; 4628bcb0991SDimitry Andric return MIMG; 4638bcb0991SDimitry Andric } 464480093f4SDimitry Andric if (TII.isMTBUF(Opc)) { 465480093f4SDimitry Andric switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 466480093f4SDimitry Andric default: 467480093f4SDimitry Andric return UNKNOWN; 4685f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 4695f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 4705f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 4715f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 472480093f4SDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 473480093f4SDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 474480093f4SDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 475480093f4SDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 4765f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: 4775f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: 4785f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: 4795f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: 4805f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: 4815f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: 4825f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: 4835f757f3fSDimitry Andric case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: 484480093f4SDimitry Andric return TBUFFER_LOAD; 485480093f4SDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 486480093f4SDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 487480093f4SDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 488480093f4SDimitry Andric case 
AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 4895f757f3fSDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: 4905f757f3fSDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: 4915f757f3fSDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: 4925f757f3fSDimitry Andric case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: 493480093f4SDimitry Andric return TBUFFER_STORE; 494480093f4SDimitry Andric } 495480093f4SDimitry Andric } 4968bcb0991SDimitry Andric return UNKNOWN; 4978bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 4988bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 4995f757f3fSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 5008bcb0991SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 501349cc55cSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 5028bcb0991SDimitry Andric return S_BUFFER_LOAD_IMM; 503bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 504bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 5055f757f3fSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 506bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 507bdd1243dSDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 508bdd1243dSDimitry Andric return S_BUFFER_LOAD_SGPR_IMM; 509bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORD_IMM: 510bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM: 5115f757f3fSDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM: 512bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM: 513bdd1243dSDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM: 514*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 515*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 516*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 517*0fca6ea1SDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 518bdd1243dSDimitry Andric return S_LOAD_IMM; 5198bcb0991SDimitry Andric case AMDGPU::DS_READ_B32: 5208bcb0991SDimitry 
Andric case AMDGPU::DS_READ_B32_gfx9: 5218bcb0991SDimitry Andric case AMDGPU::DS_READ_B64: 5228bcb0991SDimitry Andric case AMDGPU::DS_READ_B64_gfx9: 5238bcb0991SDimitry Andric return DS_READ; 5248bcb0991SDimitry Andric case AMDGPU::DS_WRITE_B32: 5258bcb0991SDimitry Andric case AMDGPU::DS_WRITE_B32_gfx9: 5268bcb0991SDimitry Andric case AMDGPU::DS_WRITE_B64: 5278bcb0991SDimitry Andric case AMDGPU::DS_WRITE_B64_gfx9: 5288bcb0991SDimitry Andric return DS_WRITE; 52981ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD: 53081ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2: 53181ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3: 53281ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4: 53381ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORD: 53481ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX2: 53581ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX3: 53681ad6265SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX4: 53781ad6265SDimitry Andric return FLAT_LOAD; 53881ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 53981ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 54081ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 54181ad6265SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 54281ad6265SDimitry Andric return GLOBAL_LOAD_SADDR; 54381ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD: 54481ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2: 54581ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3: 54681ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4: 54781ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORD: 54881ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX2: 54981ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX3: 55081ad6265SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX4: 55181ad6265SDimitry Andric return FLAT_STORE; 55281ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 55381ad6265SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 55481ad6265SDimitry Andric case 
AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // Buffer and image instructions are grouped by their shared base opcode.
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  // Each DS opcode is its own subclass: only identical DS opcodes merge.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  // All widths of a scalar/flat/global family share the single-dword opcode
  // as their subclass so that different widths can be merged with each other.
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
// If either or both instructions are non segment specific FLAT the resulting
// combined operation will be FLAT, potentially promoting one of the GLOBAL
// operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  // Only when *both* operands are segment-specific GLOBAL accesses can the
  // merged operation be narrowed from FLAT to GLOBAL.
  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

/// Determine which address operands (vaddr/srsrc/soffset/sbase/saddr/...)
/// opcode \p Opc carries. The returned flags/counts drive how CombineInfo
/// collects its address operand indices in setMI().
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // With NSA encoding, vaddr0..rsrc-1 are all address operands; count them
      // from the operand index distance.
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

/// Populate this CombineInfo from instruction \p MI: classify it, record its
/// element size, offset, width, cache policy and address operands.
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  // Unmergeable instruction; leave the remaining fields untouched.
  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ?
8
                                                                          : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // SMRD offsets are expressed in subtarget-dependent units.
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    // Keep only the low 16 bits of the DS offset.
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  // Collect the operand indices of every address component this opcode has,
  // in a fixed canonical order, then cache pointers to those operands.
  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

/// Record every register defined or read by \p MI into \p RegDefs / \p RegUses.
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs,
const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  // Memory dependence: if either is a store and they may alias, they cannot
  // be reordered past each other.
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  // Register dependences: B must not read or clobber anything A defines, and
  // must not clobber anything A reads.
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

/// Check whether the dmask (and other optional MIMG operands) of two image
/// loads allow them to be merged: all matched modifiers must be equal and the
/// two dmasks must not overlap.
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  // The smaller mask must fit entirely below the lowest set bit of the larger
  // mask, i.e. the two masks are disjoint and ordered.
  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

/// Look up the buffer format that matches \p OldFormat's bits-per-component
/// and numeric format but carries \p ComponentCount components. Returns 0 if
/// no such format exists on this subtarget.
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK?
Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    // There must be a wider format with the combined component count.
    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    // Non-DS accesses merge only when they are exactly contiguous.
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  // First try the ST64 form: both adjusted offsets must be multiples of 64
  // that fit in 8 bits.
  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

// Return true if a single merged instruction of CI's class can cover the
// combined operand width (CI.Width + Paired.Width, in dword-sized units --
// see the DWORDX<n> opcode selection in getNewOpcode). Scalar loads only
// exist as x2/x4/x8 variants, with x3 gated on scalar-dwordx3 support; all
// other classes are limited to at most 4, with x3 gated on
// hasDwordx3LoadStores().
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

// Return the register class of MI's data operand, trying each of the named
// operands a load/store variant may carry (vector dst/data, DS data0, scalar
// dst/data) in turn. Returns nullptr if MI has none of them.
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Only instructions of the same subclass (exact opcode family) can merge.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  // Verify every instruction strictly between CI and Paired can be safely
  // reordered past the one we are going to move: loads merge at CI's
  // position (Paired is hoisted up), everything else merges at Paired's
  // position (CI is sunk down).
  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  // Build the wide source value by packing both originals into one
  // REG_SEQUENCE.
  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

// Select the ds_read2 opcode for the element size. Subtargets that require
// M0 initialization for LDS access use the base encodings; otherwise the
// _gfx9 encodings are used.
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

// Same as read2Opcode, but for the stride-64 (ST64) variants used when the
// element offsets are multiples of 64.
unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

// Merge the DS-read pair CI/Paired into a single ds_read2 (or ds_read2st64)
// inserted at InsertBefore, copy the two halves of the wide result back to
// the original destination registers, and erase the originals. Returns the
// new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // offsetsCanBeCombined chose a non-zero base adjustment; materialize it
    // and add it to the address so the 8-bit offsets are relative to it.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

// Select the ds_write2 opcode for the element size; encoding choice mirrors
// read2Opcode.
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

// Stride-64 variant selection for ds_write2; mirrors read2ST64Opcode.
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
             AMDGPU::DS_WRITE2ST64_B32_gfx9
           : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

// Merge the DS-write pair CI/Paired into a single ds_write2 (or
// ds_write2st64) inserted at InsertBefore and erase the originals. Returns
// the new instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // Same base-offset materialization as mergeRead2Pair: fold CI.BaseOff
    // into the address so both 8-bit offsets fit.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

// Merge a MIMG pair: build the wide-dmask instruction by cloning CI's
// operands (substituting the OR of both dmasks), then split the result back
// into the original destination registers.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Operand 0 is the destination, which we replace above; every other
  // operand is copied from CI except the merged dmask.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of scalar-memory loads (s_load / s_buffer_load, IMM or
// SGPR_IMM forms) into one wider load at the smaller offset, then split the
// result back into the original sdst registers.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR_IMM form carries an soffset register operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of MUBUF buffer loads into one wider load at the smaller
// offset, then split the result back into the original vdata registers.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Not every MUBUF addressing mode has a vaddr operand.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of MTBUF (typed buffer) loads. Like mergeBufferLoadPair, but
// also emits the joined buffer format computed for the combined width.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of MTBUF stores: pack both sources into one register via
// copyFromSrcRegs, then emit a single wider store with the joined format.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of FLAT/GLOBAL/SCRATCH loads into one wider load at the
// smaller offset, then split the result back into the original vdst regs.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  // saddr is only present on the saddr-form variants.
  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Merge a pair of FLAT/GLOBAL/SCRATCH stores: pack both sources into one
// register, then emit a single wider store at the smaller offset.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

// Pick the opcode of the merged instruction for the combined width, based on
// the common instruction class of CI and Paired. Returns 0 when no opcode
// exists for that width.
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
17020b57cec5SDimitry Andric return 0; 17030b57cec5SDimitry Andric case 2: 17040b57cec5SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 17055f757f3fSDimitry Andric case 3: 17065f757f3fSDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 17070b57cec5SDimitry Andric case 4: 17080b57cec5SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1709349cc55cSDimitry Andric case 8: 1710349cc55cSDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 17110b57cec5SDimitry Andric } 1712bdd1243dSDimitry Andric case S_BUFFER_LOAD_SGPR_IMM: 1713bdd1243dSDimitry Andric switch (Width) { 1714bdd1243dSDimitry Andric default: 1715bdd1243dSDimitry Andric return 0; 1716bdd1243dSDimitry Andric case 2: 171706c3fb27SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 17185f757f3fSDimitry Andric case 3: 17195f757f3fSDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1720bdd1243dSDimitry Andric case 4: 172106c3fb27SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1722bdd1243dSDimitry Andric case 8: 172306c3fb27SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1724bdd1243dSDimitry Andric } 1725*0fca6ea1SDimitry Andric case S_LOAD_IMM: { 1726*0fca6ea1SDimitry Andric // If XNACK is enabled, use the constrained opcodes when the first load is 1727*0fca6ea1SDimitry Andric // under-aligned. 1728*0fca6ea1SDimitry Andric const MachineMemOperand *MMO = *CI.I->memoperands_begin(); 1729*0fca6ea1SDimitry Andric bool NeedsConstrainedOpc = 1730*0fca6ea1SDimitry Andric STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; 1731bdd1243dSDimitry Andric switch (Width) { 1732bdd1243dSDimitry Andric default: 1733bdd1243dSDimitry Andric return 0; 1734bdd1243dSDimitry Andric case 2: 1735*0fca6ea1SDimitry Andric return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec 1736*0fca6ea1SDimitry Andric : AMDGPU::S_LOAD_DWORDX2_IMM; 17375f757f3fSDimitry Andric case 3: 1738*0fca6ea1SDimitry Andric return NeedsConstrainedOpc ? 
AMDGPU::S_LOAD_DWORDX3_IMM_ec 1739*0fca6ea1SDimitry Andric : AMDGPU::S_LOAD_DWORDX3_IMM; 1740bdd1243dSDimitry Andric case 4: 1741*0fca6ea1SDimitry Andric return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec 1742*0fca6ea1SDimitry Andric : AMDGPU::S_LOAD_DWORDX4_IMM; 1743bdd1243dSDimitry Andric case 8: 1744*0fca6ea1SDimitry Andric return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec 1745*0fca6ea1SDimitry Andric : AMDGPU::S_LOAD_DWORDX8_IMM; 1746*0fca6ea1SDimitry Andric } 1747bdd1243dSDimitry Andric } 174881ad6265SDimitry Andric case GLOBAL_LOAD: 174981ad6265SDimitry Andric switch (Width) { 175081ad6265SDimitry Andric default: 175181ad6265SDimitry Andric return 0; 175281ad6265SDimitry Andric case 2: 175381ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX2; 175481ad6265SDimitry Andric case 3: 175581ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX3; 175681ad6265SDimitry Andric case 4: 175781ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX4; 175881ad6265SDimitry Andric } 175981ad6265SDimitry Andric case GLOBAL_LOAD_SADDR: 176081ad6265SDimitry Andric switch (Width) { 176181ad6265SDimitry Andric default: 176281ad6265SDimitry Andric return 0; 176381ad6265SDimitry Andric case 2: 176481ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 176581ad6265SDimitry Andric case 3: 176681ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 176781ad6265SDimitry Andric case 4: 176881ad6265SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 176981ad6265SDimitry Andric } 177081ad6265SDimitry Andric case GLOBAL_STORE: 177181ad6265SDimitry Andric switch (Width) { 177281ad6265SDimitry Andric default: 177381ad6265SDimitry Andric return 0; 177481ad6265SDimitry Andric case 2: 177581ad6265SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORDX2; 177681ad6265SDimitry Andric case 3: 177781ad6265SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORDX3; 177881ad6265SDimitry Andric case 4: 177981ad6265SDimitry Andric return 
AMDGPU::GLOBAL_STORE_DWORDX4; 178081ad6265SDimitry Andric } 178181ad6265SDimitry Andric case GLOBAL_STORE_SADDR: 178281ad6265SDimitry Andric switch (Width) { 178381ad6265SDimitry Andric default: 178481ad6265SDimitry Andric return 0; 178581ad6265SDimitry Andric case 2: 178681ad6265SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 178781ad6265SDimitry Andric case 3: 178881ad6265SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 178981ad6265SDimitry Andric case 4: 179081ad6265SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 179181ad6265SDimitry Andric } 179281ad6265SDimitry Andric case FLAT_LOAD: 179381ad6265SDimitry Andric switch (Width) { 179481ad6265SDimitry Andric default: 179581ad6265SDimitry Andric return 0; 179681ad6265SDimitry Andric case 2: 179781ad6265SDimitry Andric return AMDGPU::FLAT_LOAD_DWORDX2; 179881ad6265SDimitry Andric case 3: 179981ad6265SDimitry Andric return AMDGPU::FLAT_LOAD_DWORDX3; 180081ad6265SDimitry Andric case 4: 180181ad6265SDimitry Andric return AMDGPU::FLAT_LOAD_DWORDX4; 180281ad6265SDimitry Andric } 180381ad6265SDimitry Andric case FLAT_STORE: 180481ad6265SDimitry Andric switch (Width) { 180581ad6265SDimitry Andric default: 180681ad6265SDimitry Andric return 0; 180781ad6265SDimitry Andric case 2: 180881ad6265SDimitry Andric return AMDGPU::FLAT_STORE_DWORDX2; 180981ad6265SDimitry Andric case 3: 181081ad6265SDimitry Andric return AMDGPU::FLAT_STORE_DWORDX3; 181181ad6265SDimitry Andric case 4: 181281ad6265SDimitry Andric return AMDGPU::FLAT_STORE_DWORDX4; 181381ad6265SDimitry Andric } 18148bcb0991SDimitry Andric case MIMG: 1815bdd1243dSDimitry Andric assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1816349cc55cSDimitry Andric "No overlaps"); 18178bcb0991SDimitry Andric return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 18180b57cec5SDimitry Andric } 18190b57cec5SDimitry Andric } 18200b57cec5SDimitry Andric 18210b57cec5SDimitry Andric std::pair<unsigned, unsigned> 1822349cc55cSDimitry 
Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1823349cc55cSDimitry Andric const CombineInfo &Paired) { 1824bdd1243dSDimitry Andric assert((CI.InstClass != MIMG || 1825bdd1243dSDimitry Andric ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 182681ad6265SDimitry Andric CI.Width + Paired.Width)) && 18278bcb0991SDimitry Andric "No overlaps"); 18288bcb0991SDimitry Andric 1829349cc55cSDimitry Andric unsigned Idx0; 1830349cc55cSDimitry Andric unsigned Idx1; 1831349cc55cSDimitry Andric 183204eeddc0SDimitry Andric static const unsigned Idxs[5][4] = { 18338bcb0991SDimitry Andric {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 183404eeddc0SDimitry Andric {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 183504eeddc0SDimitry Andric {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 183604eeddc0SDimitry Andric {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 183704eeddc0SDimitry Andric {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 18388bcb0991SDimitry Andric }; 18398bcb0991SDimitry Andric 184004eeddc0SDimitry Andric assert(CI.Width >= 1 && CI.Width <= 4); 184104eeddc0SDimitry Andric assert(Paired.Width >= 1 && Paired.Width <= 4); 18428bcb0991SDimitry Andric 184381ad6265SDimitry Andric if (Paired < CI) { 1844480093f4SDimitry Andric Idx1 = Idxs[0][Paired.Width - 1]; 1845480093f4SDimitry Andric Idx0 = Idxs[Paired.Width][CI.Width - 1]; 18460b57cec5SDimitry Andric } else { 1847480093f4SDimitry Andric Idx0 = Idxs[0][CI.Width - 1]; 1848480093f4SDimitry Andric Idx1 = Idxs[CI.Width][Paired.Width - 1]; 18490b57cec5SDimitry Andric } 18508bcb0991SDimitry Andric 1851*0fca6ea1SDimitry Andric return {Idx0, Idx1}; 18520b57cec5SDimitry Andric } 18530b57cec5SDimitry Andric 18540b57cec5SDimitry Andric const TargetRegisterClass * 1855480093f4SDimitry Andric 
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1856*0fca6ea1SDimitry Andric const CombineInfo &Paired) const { 1857bdd1243dSDimitry Andric if (CI.InstClass == S_BUFFER_LOAD_IMM || 1858bdd1243dSDimitry Andric CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1859480093f4SDimitry Andric switch (CI.Width + Paired.Width) { 18600b57cec5SDimitry Andric default: 18610b57cec5SDimitry Andric return nullptr; 18620b57cec5SDimitry Andric case 2: 18630b57cec5SDimitry Andric return &AMDGPU::SReg_64_XEXECRegClass; 18645f757f3fSDimitry Andric case 3: 18655f757f3fSDimitry Andric return &AMDGPU::SGPR_96RegClass; 18660b57cec5SDimitry Andric case 4: 18678bcb0991SDimitry Andric return &AMDGPU::SGPR_128RegClass; 18680b57cec5SDimitry Andric case 8: 18695ffd83dbSDimitry Andric return &AMDGPU::SGPR_256RegClass; 18700b57cec5SDimitry Andric case 16: 18715ffd83dbSDimitry Andric return &AMDGPU::SGPR_512RegClass; 18720b57cec5SDimitry Andric } 18730b57cec5SDimitry Andric } 1874fe6060f1SDimitry Andric 1875fe6060f1SDimitry Andric unsigned BitWidth = 32 * (CI.Width + Paired.Width); 18764824e7fdSDimitry Andric return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1877fe6060f1SDimitry Andric ? 
TRI->getAGPRClassForBitWidth(BitWidth) 1878fe6060f1SDimitry Andric : TRI->getVGPRClassForBitWidth(BitWidth); 18790b57cec5SDimitry Andric } 18800b57cec5SDimitry Andric 18815ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 18825ffd83dbSDimitry Andric CombineInfo &CI, CombineInfo &Paired, 188381ad6265SDimitry Andric MachineBasicBlock::iterator InsertBefore) { 18840b57cec5SDimitry Andric MachineBasicBlock *MBB = CI.I->getParent(); 18850b57cec5SDimitry Andric DebugLoc DL = CI.I->getDebugLoc(); 18860b57cec5SDimitry Andric 1887480093f4SDimitry Andric const unsigned Opcode = getNewOpcode(CI, Paired); 18880b57cec5SDimitry Andric 1889*0fca6ea1SDimitry Andric Register SrcReg = 1890*0fca6ea1SDimitry Andric copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 18910b57cec5SDimitry Andric 189281ad6265SDimitry Andric auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 18930b57cec5SDimitry Andric .addReg(SrcReg, RegState::Kill); 18940b57cec5SDimitry Andric 18955ffd83dbSDimitry Andric AddressRegs Regs = getRegs(Opcode, *TII); 18960b57cec5SDimitry Andric 18975ffd83dbSDimitry Andric if (Regs.VAddr) 18980b57cec5SDimitry Andric MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 18990b57cec5SDimitry Andric 19008bcb0991SDimitry Andric 19018bcb0991SDimitry Andric // It shouldn't be possible to get this far if the two instructions 19028bcb0991SDimitry Andric // don't have a single memoperand, because MachineInstr::mayAlias() 19038bcb0991SDimitry Andric // will return true if this is the case. 
1904480093f4SDimitry Andric assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 19058bcb0991SDimitry Andric 19068bcb0991SDimitry Andric MachineInstr *New = 19070b57cec5SDimitry Andric MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 19080b57cec5SDimitry Andric .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1909480093f4SDimitry Andric .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1910fe6060f1SDimitry Andric .addImm(CI.CPol) // cpol 19118bcb0991SDimitry Andric .addImm(0) // swz 191281ad6265SDimitry Andric .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 19130b57cec5SDimitry Andric 19140b57cec5SDimitry Andric CI.I->eraseFromParent(); 1915480093f4SDimitry Andric Paired.I->eraseFromParent(); 19168bcb0991SDimitry Andric return New; 19170b57cec5SDimitry Andric } 19180b57cec5SDimitry Andric 19190b57cec5SDimitry Andric MachineOperand 19208bcb0991SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 19210b57cec5SDimitry Andric APInt V(32, Val, true); 19220b57cec5SDimitry Andric if (TII->isInlineConstant(V)) 19230b57cec5SDimitry Andric return MachineOperand::CreateImm(Val); 19240b57cec5SDimitry Andric 19258bcb0991SDimitry Andric Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 19260b57cec5SDimitry Andric MachineInstr *Mov = 19270b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 19280b57cec5SDimitry Andric TII->get(AMDGPU::S_MOV_B32), Reg) 19290b57cec5SDimitry Andric .addImm(Val); 19300b57cec5SDimitry Andric (void)Mov; 19310b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; Mov->dump()); 19320b57cec5SDimitry Andric return MachineOperand::CreateReg(Reg, false); 19330b57cec5SDimitry Andric } 19340b57cec5SDimitry Andric 19350b57cec5SDimitry Andric // Compute base address using Addr and return the final register. 
19365ffd83dbSDimitry Andric Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 19378bcb0991SDimitry Andric const MemAddress &Addr) const { 19380b57cec5SDimitry Andric MachineBasicBlock *MBB = MI.getParent(); 19390b57cec5SDimitry Andric MachineBasicBlock::iterator MBBI = MI.getIterator(); 19400b57cec5SDimitry Andric DebugLoc DL = MI.getDebugLoc(); 19410b57cec5SDimitry Andric 19420b57cec5SDimitry Andric assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 19430b57cec5SDimitry Andric Addr.Base.LoSubReg) && 19440b57cec5SDimitry Andric "Expected 32-bit Base-Register-Low!!"); 19450b57cec5SDimitry Andric 19460b57cec5SDimitry Andric assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 19470b57cec5SDimitry Andric Addr.Base.HiSubReg) && 19480b57cec5SDimitry Andric "Expected 32-bit Base-Register-Hi!!"); 19490b57cec5SDimitry Andric 19500b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 19510b57cec5SDimitry Andric MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 19520b57cec5SDimitry Andric MachineOperand OffsetHi = 19530b57cec5SDimitry Andric createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 19540b57cec5SDimitry Andric 19550b57cec5SDimitry Andric const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 19568bcb0991SDimitry Andric Register CarryReg = MRI->createVirtualRegister(CarryRC); 19578bcb0991SDimitry Andric Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 19580b57cec5SDimitry Andric 19598bcb0991SDimitry Andric Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 19608bcb0991SDimitry Andric Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 19610b57cec5SDimitry Andric MachineInstr *LoHalf = 1962e8d8bef9SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 19630b57cec5SDimitry Andric .addReg(CarryReg, RegState::Define) 19640b57cec5SDimitry Andric .addReg(Addr.Base.LoReg, 0, 
Addr.Base.LoSubReg) 19650b57cec5SDimitry Andric .add(OffsetLo) 19660b57cec5SDimitry Andric .addImm(0); // clamp bit 19670b57cec5SDimitry Andric (void)LoHalf; 19680b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 19690b57cec5SDimitry Andric 19700b57cec5SDimitry Andric MachineInstr *HiHalf = 19710b57cec5SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 19720b57cec5SDimitry Andric .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 19730b57cec5SDimitry Andric .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 19740b57cec5SDimitry Andric .add(OffsetHi) 19750b57cec5SDimitry Andric .addReg(CarryReg, RegState::Kill) 19760b57cec5SDimitry Andric .addImm(0); // clamp bit 19770b57cec5SDimitry Andric (void)HiHalf; 19780b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 19790b57cec5SDimitry Andric 1980fe6060f1SDimitry Andric Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 19810b57cec5SDimitry Andric MachineInstr *FullBase = 19820b57cec5SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 19830b57cec5SDimitry Andric .addReg(DestSub0) 19840b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 19850b57cec5SDimitry Andric .addReg(DestSub1) 19860b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 19870b57cec5SDimitry Andric (void)FullBase; 19880b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 19890b57cec5SDimitry Andric 19900b57cec5SDimitry Andric return FullDestReg; 19910b57cec5SDimitry Andric } 19920b57cec5SDimitry Andric 19930b57cec5SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI. 
// Rewrite MI's vaddr operand to NewBase (clearing any kill flag) and its
// offset operand to NewOffset.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

// Return Op's constant value: either an immediate operand directly, or the
// immediate moved by the S_MOV_B32 that uniquely defines Op's register.
// Returns std::nullopt when no constant can be extracted.
std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
// On success the registers/subregisters and the combined 64-bit offset are
// written into Addr; on any mismatch Addr is left untouched.
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  // The base must come from exactly the REG_SEQUENCE shape shown above
  // (two register/subreg pairs => 5 operands including the def).
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  // The low add must have exactly one constant source; the other source is
  // the low half of the base register.
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  // Normalize so that Src1 is the immediate half of the high add; require
  // exactly one immediate source.
  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

// Try to fold MI's constant address component into its immediate offset field
// by re-basing it (and other loads sharing the same base registers) on an
// "anchor" instruction's address. Returns true if MI was rewritten. Visited
// caches per-instruction base/offset analysis; AnchorList records anchors so
// they are not themselves re-based later.
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
    return false;

  // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
  if (SIInstrInfo::isFLATScratch(MI))
    return false;

  unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                              : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor(that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new-base(anchor) because of the maximum distance which can
  // accommodate more intermediate bases presumably.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor(with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    // Only instructions computed from the exact same base register pair are
    // candidates for sharing a re-computed base.
    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    // Prefer the farthest legal anchor (see heuristic rationale above).
    if (TLI->isLegalFlatAddressingMode(AM, AS) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    // Re-base every other candidate whose delta from the anchor is a legal
    // immediate offset.
    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalFlatAddressingMode(AM, AS)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
                   OtherMI->dump());
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Append CI to the mergeable-instruction list that shares its instruction
// class, AGPR-ness and base address, creating a new list if none matches.
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if
(AddrList.front().InstClass == CI.InstClass && 223504eeddc0SDimitry Andric AddrList.front().IsAGPR == CI.IsAGPR && 2236bdd1243dSDimitry Andric AddrList.front().hasSameBaseAddress(CI)) { 22378bcb0991SDimitry Andric AddrList.emplace_back(CI); 22388bcb0991SDimitry Andric return; 22398bcb0991SDimitry Andric } 22408bcb0991SDimitry Andric } 22410b57cec5SDimitry Andric 22428bcb0991SDimitry Andric // Base address not found, so add a new list. 22438bcb0991SDimitry Andric MergeableInsts.emplace_back(1, CI); 22448bcb0991SDimitry Andric } 22458bcb0991SDimitry Andric 22465ffd83dbSDimitry Andric std::pair<MachineBasicBlock::iterator, bool> 22475ffd83dbSDimitry Andric SILoadStoreOptimizer::collectMergeableInsts( 22485ffd83dbSDimitry Andric MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 22495ffd83dbSDimitry Andric MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 22508bcb0991SDimitry Andric std::list<std::list<CombineInfo>> &MergeableInsts) const { 22518bcb0991SDimitry Andric bool Modified = false; 22520b57cec5SDimitry Andric 22538bcb0991SDimitry Andric // Sort potential mergeable instructions into lists. One list per base address. 22545ffd83dbSDimitry Andric unsigned Order = 0; 22555ffd83dbSDimitry Andric MachineBasicBlock::iterator BlockI = Begin; 22565ffd83dbSDimitry Andric for (; BlockI != End; ++BlockI) { 22575ffd83dbSDimitry Andric MachineInstr &MI = *BlockI; 22585ffd83dbSDimitry Andric 22598bcb0991SDimitry Andric // We run this before checking if an address is mergeable, because it can produce 22608bcb0991SDimitry Andric // better code even if the instructions aren't mergeable. 22610b57cec5SDimitry Andric if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) 22620b57cec5SDimitry Andric Modified = true; 22630b57cec5SDimitry Andric 22641fd87a68SDimitry Andric // Treat volatile accesses, ordered accesses and unmodeled side effects as 22651fd87a68SDimitry Andric // barriers. We can look after this barrier for separate merges. 
22661fd87a68SDimitry Andric if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { 22671fd87a68SDimitry Andric LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); 22685ffd83dbSDimitry Andric 22695ffd83dbSDimitry Andric // Search will resume after this instruction in a separate merge list. 22705ffd83dbSDimitry Andric ++BlockI; 22715ffd83dbSDimitry Andric break; 22725ffd83dbSDimitry Andric } 22735ffd83dbSDimitry Andric 22748bcb0991SDimitry Andric const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); 22758bcb0991SDimitry Andric if (InstClass == UNKNOWN) 22768bcb0991SDimitry Andric continue; 22778bcb0991SDimitry Andric 227804eeddc0SDimitry Andric // Do not merge VMEM buffer instructions with "swizzled" bit set. 227904eeddc0SDimitry Andric int Swizzled = 228004eeddc0SDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); 228104eeddc0SDimitry Andric if (Swizzled != -1 && MI.getOperand(Swizzled).getImm()) 228204eeddc0SDimitry Andric continue; 228304eeddc0SDimitry Andric 22848bcb0991SDimitry Andric CombineInfo CI; 228504eeddc0SDimitry Andric CI.setMI(MI, *this); 22865ffd83dbSDimitry Andric CI.Order = Order++; 22878bcb0991SDimitry Andric 22888bcb0991SDimitry Andric if (!CI.hasMergeableAddress(*MRI)) 22898bcb0991SDimitry Andric continue; 22908bcb0991SDimitry Andric 229104eeddc0SDimitry Andric if (CI.InstClass == DS_WRITE && CI.IsAGPR) { 229204eeddc0SDimitry Andric // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data 229304eeddc0SDimitry Andric // operands. However we are reporting that ds_write2 shall have 229404eeddc0SDimitry Andric // only VGPR data so that machine copy propagation does not 229504eeddc0SDimitry Andric // create an illegal instruction with a VGPR and AGPR sources. 229604eeddc0SDimitry Andric // Consequenctially if we create such instruction the verifier 229704eeddc0SDimitry Andric // will complain. 
229804eeddc0SDimitry Andric continue; 229904eeddc0SDimitry Andric } 230004eeddc0SDimitry Andric 23015ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Mergeable: " << MI); 23025ffd83dbSDimitry Andric 23038bcb0991SDimitry Andric addInstToMergeableList(CI, MergeableInsts); 23048bcb0991SDimitry Andric } 23055ffd83dbSDimitry Andric 23065ffd83dbSDimitry Andric // At this point we have lists of Mergeable instructions. 23075ffd83dbSDimitry Andric // 23085ffd83dbSDimitry Andric // Part 2: Sort lists by offset and then for each CombineInfo object in the 23095ffd83dbSDimitry Andric // list try to find an instruction that can be merged with I. If an instruction 23105ffd83dbSDimitry Andric // is found, it is stored in the Paired field. If no instructions are found, then 23115ffd83dbSDimitry Andric // the CombineInfo object is deleted from the list. 23125ffd83dbSDimitry Andric 23135ffd83dbSDimitry Andric for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 23145ffd83dbSDimitry Andric E = MergeableInsts.end(); I != E;) { 23155ffd83dbSDimitry Andric 23165ffd83dbSDimitry Andric std::list<CombineInfo> &MergeList = *I; 23175ffd83dbSDimitry Andric if (MergeList.size() <= 1) { 23185ffd83dbSDimitry Andric // This means we have found only one instruction with a given address 23195ffd83dbSDimitry Andric // that can be merged, and we need at least 2 instructions to do a merge, 23205ffd83dbSDimitry Andric // so this list can be discarded. 23215ffd83dbSDimitry Andric I = MergeableInsts.erase(I); 23225ffd83dbSDimitry Andric continue; 23235ffd83dbSDimitry Andric } 23245ffd83dbSDimitry Andric 23255ffd83dbSDimitry Andric // Sort the lists by offsets, this way mergeable instructions will be 23265ffd83dbSDimitry Andric // adjacent to each other in the list, which will make it easier to find 23275ffd83dbSDimitry Andric // matches. 
23285ffd83dbSDimitry Andric MergeList.sort( 2329349cc55cSDimitry Andric [] (const CombineInfo &A, const CombineInfo &B) { 23305ffd83dbSDimitry Andric return A.Offset < B.Offset; 23315ffd83dbSDimitry Andric }); 23325ffd83dbSDimitry Andric ++I; 23335ffd83dbSDimitry Andric } 23345ffd83dbSDimitry Andric 2335*0fca6ea1SDimitry Andric return {BlockI, Modified}; 23368bcb0991SDimitry Andric } 23378bcb0991SDimitry Andric 23388bcb0991SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from 23398bcb0991SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of 23408bcb0991SDimitry Andric // clustering nearby loads, and assume these are all adjacent. 23418bcb0991SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock( 23428bcb0991SDimitry Andric std::list<std::list<CombineInfo> > &MergeableInsts) { 23438bcb0991SDimitry Andric bool Modified = false; 23448bcb0991SDimitry Andric 23455ffd83dbSDimitry Andric for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 23465ffd83dbSDimitry Andric E = MergeableInsts.end(); I != E;) { 23475ffd83dbSDimitry Andric std::list<CombineInfo> &MergeList = *I; 23488bcb0991SDimitry Andric 23498bcb0991SDimitry Andric bool OptimizeListAgain = false; 23508bcb0991SDimitry Andric if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 23515ffd83dbSDimitry Andric // We weren't able to make any changes, so delete the list so we don't 23528bcb0991SDimitry Andric // process the same instructions the next time we try to optimize this 23538bcb0991SDimitry Andric // block. 
23545ffd83dbSDimitry Andric I = MergeableInsts.erase(I); 23550b57cec5SDimitry Andric continue; 23560b57cec5SDimitry Andric } 23570b57cec5SDimitry Andric 23585ffd83dbSDimitry Andric Modified = true; 23595ffd83dbSDimitry Andric 23608bcb0991SDimitry Andric // We made changes, but also determined that there were no more optimization 23618bcb0991SDimitry Andric // opportunities, so we don't need to reprocess the list 23625ffd83dbSDimitry Andric if (!OptimizeListAgain) { 23635ffd83dbSDimitry Andric I = MergeableInsts.erase(I); 23645ffd83dbSDimitry Andric continue; 23655ffd83dbSDimitry Andric } 23665ffd83dbSDimitry Andric OptimizeAgain = true; 23678bcb0991SDimitry Andric } 23688bcb0991SDimitry Andric return Modified; 23698bcb0991SDimitry Andric } 23708bcb0991SDimitry Andric 23718bcb0991SDimitry Andric bool 23728bcb0991SDimitry Andric SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 23738bcb0991SDimitry Andric std::list<CombineInfo> &MergeList, 23748bcb0991SDimitry Andric bool &OptimizeListAgain) { 23755ffd83dbSDimitry Andric if (MergeList.empty()) 23765ffd83dbSDimitry Andric return false; 23775ffd83dbSDimitry Andric 23788bcb0991SDimitry Andric bool Modified = false; 2379480093f4SDimitry Andric 23805ffd83dbSDimitry Andric for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 23815ffd83dbSDimitry Andric Next = std::next(I)) { 23825ffd83dbSDimitry Andric 23835ffd83dbSDimitry Andric auto First = I; 23845ffd83dbSDimitry Andric auto Second = Next; 23855ffd83dbSDimitry Andric 23865ffd83dbSDimitry Andric if ((*First).Order > (*Second).Order) 23875ffd83dbSDimitry Andric std::swap(First, Second); 23885ffd83dbSDimitry Andric CombineInfo &CI = *First; 23895ffd83dbSDimitry Andric CombineInfo &Paired = *Second; 23905ffd83dbSDimitry Andric 239181ad6265SDimitry Andric CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 239281ad6265SDimitry Andric if (!Where) { 23935ffd83dbSDimitry Andric ++I; 2394480093f4SDimitry Andric continue; 23955ffd83dbSDimitry 
Andric } 2396480093f4SDimitry Andric 2397480093f4SDimitry Andric Modified = true; 23985ffd83dbSDimitry Andric 23995ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 24000b57cec5SDimitry Andric 240181ad6265SDimitry Andric MachineBasicBlock::iterator NewMI; 24020b57cec5SDimitry Andric switch (CI.InstClass) { 24030b57cec5SDimitry Andric default: 2404480093f4SDimitry Andric llvm_unreachable("unknown InstClass"); 24050b57cec5SDimitry Andric break; 240681ad6265SDimitry Andric case DS_READ: 240781ad6265SDimitry Andric NewMI = mergeRead2Pair(CI, Paired, Where->I); 240881ad6265SDimitry Andric break; 240981ad6265SDimitry Andric case DS_WRITE: 241081ad6265SDimitry Andric NewMI = mergeWrite2Pair(CI, Paired, Where->I); 241181ad6265SDimitry Andric break; 241281ad6265SDimitry Andric case S_BUFFER_LOAD_IMM: 2413bdd1243dSDimitry Andric case S_BUFFER_LOAD_SGPR_IMM: 2414bdd1243dSDimitry Andric case S_LOAD_IMM: 2415bdd1243dSDimitry Andric NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 241681ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 8; 241781ad6265SDimitry Andric break; 241881ad6265SDimitry Andric case BUFFER_LOAD: 241981ad6265SDimitry Andric NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 242081ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 242181ad6265SDimitry Andric break; 242281ad6265SDimitry Andric case BUFFER_STORE: 242381ad6265SDimitry Andric NewMI = mergeBufferStorePair(CI, Paired, Where->I); 242481ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 242581ad6265SDimitry Andric break; 242681ad6265SDimitry Andric case MIMG: 242781ad6265SDimitry Andric NewMI = mergeImagePair(CI, Paired, Where->I); 242881ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 242981ad6265SDimitry Andric break; 243081ad6265SDimitry Andric case TBUFFER_LOAD: 243181ad6265SDimitry Andric NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 243281ad6265SDimitry Andric 
OptimizeListAgain |= CI.Width + Paired.Width < 4; 243381ad6265SDimitry Andric break; 243481ad6265SDimitry Andric case TBUFFER_STORE: 243581ad6265SDimitry Andric NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 243681ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 243781ad6265SDimitry Andric break; 243881ad6265SDimitry Andric case FLAT_LOAD: 243981ad6265SDimitry Andric case GLOBAL_LOAD: 244081ad6265SDimitry Andric case GLOBAL_LOAD_SADDR: 244181ad6265SDimitry Andric NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 244281ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 244381ad6265SDimitry Andric break; 244481ad6265SDimitry Andric case FLAT_STORE: 244581ad6265SDimitry Andric case GLOBAL_STORE: 244681ad6265SDimitry Andric case GLOBAL_STORE_SADDR: 244781ad6265SDimitry Andric NewMI = mergeFlatStorePair(CI, Paired, Where->I); 244881ad6265SDimitry Andric OptimizeListAgain |= CI.Width + Paired.Width < 4; 24498bcb0991SDimitry Andric break; 2450480093f4SDimitry Andric } 245104eeddc0SDimitry Andric CI.setMI(NewMI, *this); 245281ad6265SDimitry Andric CI.Order = Where->Order; 24535ffd83dbSDimitry Andric if (I == Second) 24545ffd83dbSDimitry Andric I = Next; 2455480093f4SDimitry Andric 24565ffd83dbSDimitry Andric MergeList.erase(Second); 24570b57cec5SDimitry Andric } 24580b57cec5SDimitry Andric 24590b57cec5SDimitry Andric return Modified; 24600b57cec5SDimitry Andric } 24610b57cec5SDimitry Andric 24620b57cec5SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { 24630b57cec5SDimitry Andric if (skipFunction(MF.getFunction())) 24640b57cec5SDimitry Andric return false; 24650b57cec5SDimitry Andric 24660b57cec5SDimitry Andric STM = &MF.getSubtarget<GCNSubtarget>(); 24670b57cec5SDimitry Andric if (!STM->loadStoreOptEnabled()) 24680b57cec5SDimitry Andric return false; 24690b57cec5SDimitry Andric 24700b57cec5SDimitry Andric TII = STM->getInstrInfo(); 24710b57cec5SDimitry Andric TRI = 
&TII->getRegisterInfo(); 24720b57cec5SDimitry Andric 24730b57cec5SDimitry Andric MRI = &MF.getRegInfo(); 24740b57cec5SDimitry Andric AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 24750b57cec5SDimitry Andric 24760b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); 24770b57cec5SDimitry Andric 24780b57cec5SDimitry Andric bool Modified = false; 24790b57cec5SDimitry Andric 24805ffd83dbSDimitry Andric // Contains the list of instructions for which constant offsets are being 24815ffd83dbSDimitry Andric // promoted to the IMM. This is tracked for an entire block at time. 24825ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 4> AnchorList; 24835ffd83dbSDimitry Andric MemInfoMap Visited; 24848bcb0991SDimitry Andric 24850b57cec5SDimitry Andric for (MachineBasicBlock &MBB : MF) { 24865ffd83dbSDimitry Andric MachineBasicBlock::iterator SectionEnd; 24875ffd83dbSDimitry Andric for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; 24885ffd83dbSDimitry Andric I = SectionEnd) { 24895ffd83dbSDimitry Andric bool CollectModified; 24908bcb0991SDimitry Andric std::list<std::list<CombineInfo>> MergeableInsts; 24915ffd83dbSDimitry Andric 24925ffd83dbSDimitry Andric // First pass: Collect list of all instructions we know how to merge in a 24935ffd83dbSDimitry Andric // subset of the block. 
24945ffd83dbSDimitry Andric std::tie(SectionEnd, CollectModified) = 24955ffd83dbSDimitry Andric collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); 24965ffd83dbSDimitry Andric 24975ffd83dbSDimitry Andric Modified |= CollectModified; 24985ffd83dbSDimitry Andric 24990b57cec5SDimitry Andric do { 25000b57cec5SDimitry Andric OptimizeAgain = false; 25018bcb0991SDimitry Andric Modified |= optimizeBlock(MergeableInsts); 25020b57cec5SDimitry Andric } while (OptimizeAgain); 25030b57cec5SDimitry Andric } 25040b57cec5SDimitry Andric 25055ffd83dbSDimitry Andric Visited.clear(); 25065ffd83dbSDimitry Andric AnchorList.clear(); 25075ffd83dbSDimitry Andric } 25085ffd83dbSDimitry Andric 25090b57cec5SDimitry Andric return Modified; 25100b57cec5SDimitry Andric } 2511