Lines Matching +full:abs +full:- +full:flat
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
45 // - This is currently missing stores of constants because loading
49 // - Live interval recomputing seems inefficient. This currently only matches
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
58 //===----------------------------------------------------------------------===//
69 #define DEBUG_TYPE "si-load-store-opt"
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
155 if (AddrOp->isImm())
160 // non-register
161 if (!AddrOp->isReg())
166 if (AddrOp->getReg().isPhysical() &&
167 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
335 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
456 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
571 return Info->BaseOpcode;
575 return -1;
638 // GLOBAL loads and stores are classified as FLAT initially. If both combined
639 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
640 // If either or both instructions are non-segment-specific FLAT, the resulting
641 // combined operation will be FLAT, potentially promoting one of the GLOBAL
642 // operations to FLAT.
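// Editorial sketch (not part of SILoadStoreOptimizer.cpp): a minimal,
// self-contained restatement of the classification rule described above.
// The enum and helper are illustrative stand-ins for the pass's real types.
enum class MergedClass { GlobalLoad, GlobalStore, FlatLoad, FlatStore };

static MergedClass commonClass(bool BothGlobal, bool IsLoad) {
  if (BothGlobal) // both accesses are FLAT GLOBAL: keep the narrower GLOBAL class
    return IsLoad ? MergedClass::GlobalLoad : MergedClass::GlobalStore;
  // At least one access is non-segment-specific FLAT: the merged operation
  // stays FLAT, effectively promoting the GLOBAL side to FLAT.
  return IsLoad ? MergedClass::FlatLoad : MergedClass::FlatStore;
}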
676 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
682 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
768 unsigned Opc = MI->getOpcode();
774 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
798 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
803 Offset = I->getOperand(OffsetIdx).getImm();
807 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
814 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
818 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
848 AddrReg[J] = &I->getOperand(AddrIdx[J]);
902 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
903 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
905 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
912 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
913 // If merging FLAT and GLOBAL, set the address space to FLAT.
914 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
917 MachineFunction *MF = CI.I->getMF();
918 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
930 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
939 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
942 if (Idx != -1 &&
943 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
973 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
975 OldFormatInfo->NumFormat, STI);
980 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
983 return NewFormatInfo->Format;
989 // - if Lo == Hi, return that value
990 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
991 // - if Lo > Hi, return 0 (as if the range wrapped around)
993 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1002 // XXX - Would the same offset be OK? Is there any reason this would happen or
1022 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1023 Info0->NumFormat != Info1->NumFormat)
1027 // are not dword-aligned, the merged load might not be valid.
1028 if (Info0->BitsPerComp != 32)
1040 // Handle all non-DS instructions.
1050 // dword + dwordx2 -> dwordx3
1051 // dword + dwordx3 -> dwordx4
1061 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1073 // Check if the new offsets fit in the reduced 8-bit range.
1087 if (((Max - Min) & ~Mask) == 0) {
1092 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1097 CI.Offset = (EltOffset0 - BaseOff) / 64;
1098 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1104 if (isUInt<8>(Max - Min)) {
1109 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1111 CI.Offset = EltOffset0 - BaseOff;
1112 Paired.Offset = EltOffset1 - BaseOff;
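// Editorial worked example (values are illustrative): how the re-basing above
// brings out-of-range ds_read2/ds_write2 element offsets back into 8 bits.
//   EltOffset0 = 300, EltOffset1 = 304   // neither fits an 8-bit offset field
//   Max - Min  = 4                       // fits, so re-basing is possible
//   BaseOff    = mostAlignedValueInRange(304 - 0xff, 300) = 256
//   CI.Offset     = 300 - 256 = 44       // fits the 8-bit offset0 field
//   Paired.Offset = 304 - 256 = 48       // fits the 8-bit offset1 field
// BaseOff (scaled back to bytes by the element size) is then added to the base
// VGPR once, via the S_MOV_B32 / getAddNoCarry sequences further down.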
1145 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1148 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149 return TRI->getRegClassForReg(*MRI, Src->getReg());
1151 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152 return TRI->getRegClassForReg(*MRI, Src->getReg());
1154 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1157 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158 return TRI->getRegClassForReg(*MRI, Src->getReg());
1174 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1175 getInstSubclass(Paired.I->getOpcode(), *TII))
1191 if (CI.I->mayLoad()) {
1194 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1224 MachineBasicBlock *MBB = CI.I->getParent();
1225 DebugLoc DL = CI.I->getDebugLoc();
1230 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1231 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1235 // `early-clobber` flag in the dst operand. Remove the flag before using the
1237 Dest0->setIsEarlyClobber(false);
1238 Dest1->setIsEarlyClobber(false);
1254 MachineBasicBlock *MBB = CI.I->getParent();
1255 DebugLoc DL = CI.I->getDebugLoc();
1261 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1263 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1266 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1276 if (STM->ldsRequiresM0Init())
1282 if (STM->ldsRequiresM0Init())
1292 MachineBasicBlock *MBB = CI.I->getParent();
1296 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1306 const MCInstrDesc &Read2Desc = TII->get(Opc);
1309 Register DestReg = MRI->createVirtualRegister(SuperRC);
1311 DebugLoc DL = CI.I->getDebugLoc();
1313 Register BaseReg = AddrReg->getReg();
1314 unsigned BaseSubReg = AddrReg->getSubReg();
1317 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1318 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1321 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1324 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1326 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1341 CI.I->eraseFromParent();
1342 Paired.I->eraseFromParent();
1349 if (STM->ldsRequiresM0Init())
1356 if (STM->ldsRequiresM0Init())
1367 MachineBasicBlock *MBB = CI.I->getParent();
1372 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1374 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1376 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1392 const MCInstrDesc &Write2Desc = TII->get(Opc);
1393 DebugLoc DL = CI.I->getDebugLoc();
1395 Register BaseReg = AddrReg->getReg();
1396 unsigned BaseSubReg = AddrReg->getSubReg();
1399 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1400 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1403 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1406 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1408 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1423 CI.I->eraseFromParent();
1424 Paired.I->eraseFromParent();
1433 MachineBasicBlock *MBB = CI.I->getParent();
1434 DebugLoc DL = CI.I->getDebugLoc();
1439 Register DestReg = MRI->createVirtualRegister(SuperRC);
1442 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1444 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1455 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1461 CI.I->eraseFromParent();
1462 Paired.I->eraseFromParent();
1469 MachineBasicBlock *MBB = CI.I->getParent();
1470 DebugLoc DL = CI.I->getDebugLoc();
1475 Register DestReg = MRI->createVirtualRegister(SuperRC);
1481 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1484 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1485 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1487 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1501 MachineBasicBlock *MBB = CI.I->getParent();
1502 DebugLoc DL = CI.I->getDebugLoc();
1509 Register DestReg = MRI->createVirtualRegister(SuperRC);
1512 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1517 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1522 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1525 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1526 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1534 CI.I->eraseFromParent();
1535 Paired.I->eraseFromParent();
1542 MachineBasicBlock *MBB = CI.I->getParent();
1543 DebugLoc DL = CI.I->getDebugLoc();
1550 Register DestReg = MRI->createVirtualRegister(SuperRC);
1553 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1558 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1566 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1569 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1587 MachineBasicBlock *MBB = CI.I->getParent();
1588 DebugLoc DL = CI.I->getDebugLoc();
1595 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1609 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1612 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1620 CI.I->eraseFromParent();
1621 Paired.I->eraseFromParent();
1628 MachineBasicBlock *MBB = CI.I->getParent();
1629 DebugLoc DL = CI.I->getDebugLoc();
1634 Register DestReg = MRI->createVirtualRegister(SuperRC);
1636 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1638 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1642 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1649 CI.I->eraseFromParent();
1650 Paired.I->eraseFromParent();
1657 MachineBasicBlock *MBB = CI.I->getParent();
1658 DebugLoc DL = CI.I->getDebugLoc();
1665 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1666 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1669 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1677 CI.I->eraseFromParent();
1678 Paired.I->eraseFromParent();
1690 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1694 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1727 // under-aligned.
1728 const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1730 STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1817 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1844 Idx1 = Idxs[0][Paired.Width - 1];
1845 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1847 Idx0 = Idxs[0][CI.Width - 1];
1848 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1876 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877 ? TRI->getAGPRClassForBitWidth(BitWidth)
1878 : TRI->getVGPRClassForBitWidth(BitWidth);
1884 MachineBasicBlock *MBB = CI.I->getParent();
1885 DebugLoc DL = CI.I->getDebugLoc();
1892 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1898 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1904 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1907 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1908 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1914 CI.I->eraseFromParent();
1915 Paired.I->eraseFromParent();
1922 if (TII->isInlineConstant(V))
1925 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1928 TII->get(AMDGPU::S_MOV_B32), Reg)
1931 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1942 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1944 "Expected 32-bit Base-Register-Low!!");
1946 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1948 "Expected 32-bit Base-Register-Hi!!");
1950 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1955 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1956 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1957 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1959 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1960 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1962 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1968 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1971 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1978 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1980 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1982 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1988 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1997 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998 Base->setReg(NewBase);
1999 Base->setIsKill(false);
2000 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2011 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2012 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013 !Def->getOperand(1).isImm())
2016 return Def->getOperand(1).getImm();
2020 // - 32bit base registers, subregisters
2021 // - 64bit constant offset
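// Editorial sketch of the address computation being matched here, written as
// rough MIR (virtual register names and classes are assumptions; the opcodes
// appear in the checks below). Each constant addend may be an immediate
// operand or the result of an S_MOV_B32:
//   %lo:vgpr_32, %carry:sreg_64 = V_ADD_CO_U32_e64 %base_lo, <imm_lo>
//   %hi:vgpr_32, %dead:sreg_64  = V_ADDC_U32_e64   %base_hi, <imm_hi>, %carry
//   %vaddr:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The two immediates are recombined into the 64-bit constant offset and the
// remaining %base_lo / %base_hi become the 32-bit base registers.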
2034 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2035 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2036 || Def->getNumOperands() != 5)
2039 MachineOperand BaseLo = Def->getOperand(1);
2040 MachineOperand BaseHi = Def->getOperand(3);
2044 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2045 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2047 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2048 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2051 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2052 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2063 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2064 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2066 if (Src0->isImm())
2069 if (!Src1->isImm() || Src0->isImm())
2072 uint64_t Offset1 = Src1->getImm();
2087 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2090 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2102 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2103 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2107 // Step1: Find the base-registers and a 64bit constant offset.
2108 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2117 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2126 // same base-registers) with the highest 13bit distance from MI's offset.
2138 // as the new-base(anchor) because of the maximum distance which can
2144 // load1 = load(addr, -4096)
2145 // load2 = load(addr, -2048)
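// Editorial sketch (the original file carries a fuller example around these
// fragments; addresses and offsets below are made up):
//  Before: every load materializes its own 64-bit address
//    addr1 = base + 4096;  load1 = flat_load(addr1, 0)
//    addr2 = base + 8192;  load2 = flat_load(addr2, 0)
//  After: one anchor address is kept and the deltas become immediate offsets,
//  provided each delta is a legal FLAT offset (isLegalFlatAddressingMode below):
//    addr  = base + 8192
//    load1 = flat_load(addr, -4096)
//    load2 = flat_load(addr, 0)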
2156 MachineBasicBlock::iterator E = MBB->end();
2160 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2167 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2171 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2187 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2191 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2192 (uint32_t)std::abs(Dist) > MaxDist) {
2193 MaxDist = std::abs(Dist);
2201 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2202 AnchorInst->dump());
2203 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2206 // Instead of moving up, just re-compute anchor-instruction's base address.
2209 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2215 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2217 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2219 OtherMI->dump());
2220 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2281 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2407 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2410 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2415 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2419 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2423 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2427 NewMI = mergeImagePair(CI, Paired, Where->I);
2431 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2435 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2441 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2447 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2452 CI.Order = Where->Order;
2467 if (!STM->loadStoreOptEnabled())
2470 TII = STM->getInstrInfo();
2471 TRI = &TII->getRegisterInfo();