Lines Matching +full:abs +full:- +full:flat
1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
38 #define DEBUG_TYPE "si-instr-info"
54 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 "amdgpu-fix-16-bit-physreg-copies",
69 //===----------------------------------------------------------------------===//
71 //===----------------------------------------------------------------------===//
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
89 if (Op0Idx == -1 && Op1Idx == -1)
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
101 --Op0Idx;
102 --Op1Idx;
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
117 return MMO->isLoad() && MMO->isInvariant();
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
214 FromCycle->getExitingBlocks(ExitingBlocks);
222 FromCycle = FromCycle->getParentCycle();
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
254 if (Load0->getOperand(0) != Load1->getOperand(0))
262 if (Offset0Idx == -1 || Offset1Idx == -1)
265 // XXX - be careful of dataless loads
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
287 if (Load0->getOperand(0) != Load1->getOperand(0))
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
320 if (OffIdx0 == -1 || OffIdx1 == -1)
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
379 Offset = OffsetOp->getImm();
382 if (DataOpIdx == -1)
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
418 if (DataOpIdx == -1) {
436 if (BaseOp && !BaseOp->isFI())
440 Offset = OffsetImm->getImm();
444 if (SOffset->isReg())
447 Offset += SOffset->getImm();
451 if (DataOpIdx == -1)
453 if (DataOpIdx == -1) // LDS DMA
475 if (DataOpIdx == -1)
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
490 if (DataOpIdx == -1)
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
507 if (DataOpIdx == -1)
509 if (DataOpIdx == -1) // LDS DMA
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
536 auto Base1 = MO1->getValue();
537 auto Base2 = MO2->getValue();
558 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
603 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
612 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
613 LLVMContext &C = MF->getFunction().getContext();
616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
647 // an accvgpr_write used for this same copy due to implicit-defs
650 --Def;
652 if (!Def->modifiesRegister(SrcReg, &RI))
655 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
656 Def->getOperand(0).getReg() != SrcReg)
659 MachineOperand &DefOp = Def->getOperand(1);
667 if (I->modifiesRegister(DefOp.getReg(), &RI))
700 // use register number to pick one of three round-robin temps.
701 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
704 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
709 while (RegNo--) {
759 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
760 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
780 I--;
787 FirstMI->addOperand(
791 LastMI->addRegisterKilled(SrcReg, &RI);
804 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
808 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
906 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
1024 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1077 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1090 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1098 // If there is an overlap, we can't kill the super-register on the last
1108 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1114 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1152 if (NewOpc != -1)
1154 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1158 if (NewOpc != -1)
1159 // Check if the original (non-REV) opcode exists on the target.
1160 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1193 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1232 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1689 // Currently, only 32-bit WWM register spills are needed.
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1731 unsigned SpillSize = TRI->getSpillSize(*RC);
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 MFI->setHasSpilledSGPRs();
1754 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1763 MFI->setHasSpilledVGPRs();
1768 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1915 // Currently, only 32-bit WWM register spills are needed.
1949 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1950 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 unsigned SpillSize = TRI->getSpillSize(*RC);
1957 MachineMemOperand *MMO = MF->getMachineMemOperand(
1962 MFI->setHasSpilledSGPRs();
1968 // lowered to non-memory instructions.
1971 MachineRegisterInfo &MRI = MF->getRegInfo();
1980 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1989 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
2005 Quantity -= Arg;
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
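
For context, the two lines above sit in a loop that emits the requested number of no-ops in groups, with each S_NOP immediate holding the group size minus one. Below is a standalone model of just that chunking arithmetic, assuming a maximum group size of 8; chunkNops is an invented helper name, not the backend code.

#include <algorithm>
#include <cassert>
#include <vector>

// Return one entry per emitted S_NOP, holding the immediate (count - 1).
static std::vector<unsigned> chunkNops(unsigned Quantity) {
  std::vector<unsigned> Imms;
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u); // assumed chunk size of 8
    Quantity -= Arg;
    Imms.push_back(Arg - 1);
  }
  return Imms;
}

int main() {
  auto Imms = chunkNops(10); // expect one full group of 8, then a group of 2
  assert(Imms.size() == 2 && Imms[0] == 7 && Imms[1] == 1);
  return 0;
}
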
2012 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2014 assert(Info->isEntryFunction());
2019 if (Info->returnsVoid()) {
2038 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2042 TrapBB = MF->CreateMachineBasicBlock();
2044 MF->push_back(TrapBB);
2050 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2053 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2056 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2075 TrapBB->addSuccessor(HaltLoopBB);
2077 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2080 MF->push_back(HaltLoopBB);
2081 HaltLoopBB->addSuccessor(HaltLoopBB);
2190 // FIXME: Will this work for 64-bit floating point immediates?
2224 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2283 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2299 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2364 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2389 SetOn->getOperand(3).setIsUndef();
2403 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2407 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2434 SetOn->getOperand(3).setIsUndef();
2443 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2472 // Fix up hardware that does not sign-extend the 48-bit PC value by
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2526 // the function and missing live-ins. We are fine in practice because callee
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2578 if (I->isBundled())
2584 for (auto &CandMO : I->operands()) {
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2603 unsigned NewOpcode = -1;
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2633 MI->setMemRefs(*MF, NewMMOs);
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2701 .addReg(Split[0]->getOperand(0).getReg())
2703 .addReg(Split[1]->getOperand(0).getReg())
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2775 if (CommutedOpcode == -1)
2814 CommutedMI->setDesc(get(CommutedOpcode));
2837 if (Src0Idx == -1)
2841 if (Src1Idx == -1)
2858 BrOffset -= 1;
2869 for (const MachineInstr &MI : MBB->terminators()) {
2891 MachineRegisterInfo &MRI = MF->getRegInfo();
2892 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2904 auto &MCCtx = MF->getContext();
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2962 RS->enterBasicBlock(MBB);
2965 RS->enterBasicBlockEnd(MBB);
2966 Scav = RS->scavengeRegisterBackwards(
2971 RS->setRegUsed(Scav);
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3042 TBB = I->getOperand(0).getMBB();
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3056 CondBB = I->getOperand(0).getMBB();
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3063 // Conditional branch followed by fall-through.
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3070 FBB = I->getOperand(0).getMBB();
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3110 llvm_unreachable("unexpected non-branch terminator inst");
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3214 Cond[0].setImm(-Cond[0].getImm());
3229 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3244 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3269 Pred = static_cast<BranchPredicate>(-Pred);
3273 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3340 I = MIB->getIterator();
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3407 if (!MRI->hasOneNonDBGUse(Reg))
3426 if (!ImmOp->isImm())
3429 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3430 int64_t Imm = ImmOp->getImm();
3489 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3494 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3527 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3528 (Src1->isReg() && Src1->getReg() == Reg)) {
3530 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3531 if (!RegSrc->isReg())
3533 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3537 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3548 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3549 if (Def && Def->isMoveImmediate() &&
3550 !isInlineConstant(Def->getOperand(1)))
3558 if (pseudoToMCOpcode(NewOpc) == -1)
3572 Register SrcReg = RegSrc->getReg();
3573 unsigned SrcSubReg = RegSrc->getSubReg();
3574 Src0->setReg(SrcReg);
3575 Src0->setSubReg(SrcSubReg);
3576 Src0->setIsKill(RegSrc->isKill());
3584 Src1->ChangeToImmediate(Imm);
3589 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3597 if (Src2->isReg() && Src2->getReg() == Reg) {
3602 if (Src0->isReg()) {
3606 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3607 if (Def && Def->isMoveImmediate() &&
3608 isInlineConstant(Def->getOperand(1)) &&
3609 MRI->hasOneUse(Src0->getReg())) {
3610 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 RI.isSGPRReg(*MRI, Src0->getReg())) {
3616 // VGPR is okay as Src0 - fallthrough
3619 if (Src1->isReg() && !Src0Inlined) {
3620 // We have one slot for inlinable constant so far - try to fill it
3621 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3622 if (Def && Def->isMoveImmediate() &&
3623 isInlineConstant(Def->getOperand(1)) &&
3624 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3625 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3626 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3628 // VGPR is okay as Src1 - fallthrough
3637 if (pseudoToMCOpcode(NewOpc) == -1)
3656 Src2->ChangeToImmediate(getImmFor(*Src2));
3666 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3683 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3717 LocationSize Width0 = MIa.memoperands().front()->getSize();
3718 LocationSize Width1 = MIb.memoperands().front()->getSize();
3732 // XXX - Can we relax this between address spaces?
3791 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3792 Imm = Def->getOperand(1).getImm();
3802 if (!MO->isReg())
3804 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3805 const MachineRegisterInfo &MRI = MF->getRegInfo();
3806 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3816 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3829 if (NewMFMAOpc != -1) {
3836 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3837 // SlotIndex of defs needs to be updated when converting to early-clobber
3838 MachineOperand &Def = MIB->getOperand(0);
3840 LIS->hasInterval(Def.getReg())) {
3841 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3842 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3843 auto &LI = LIS->getInterval(Def.getReg());
3846 if (S != LR.end() && S->start == OldIndex) {
3847 assert(S->valno && S->valno->def == OldIndex);
3848 S->start = NewIndex;
3849 S->valno->def = NewIndex;
3865 MIB->addOperand(MI.getOperand(I));
3869 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3876 "pre-RA");
3917 if (!Src0->isReg() && !Src0->isImm())
3920 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3945 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3946 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3948 const auto killDef = [&]() -> void {
3949 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3951 Register DefReg = DefMI->getOperand(0).getReg();
3955 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3956 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3957 DefMI->removeOperand(I);
3959 LV->getVarInfo(DefReg).AliveBlocks.clear();
3969 if (pseudoToMCOpcode(NewOpc) != -1) {
3978 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3989 if (pseudoToMCOpcode(NewOpc) != -1) {
3998 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4005 Imm = Src0->getImm();
4008 if (pseudoToMCOpcode(NewOpc) != -1 &&
4020 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4041 if (pseudoToMCOpcode(NewOpc) == -1)
4046 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4048 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4050 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4052 .addImm(Clamp ? Clamp->getImm() : 0)
4053 .addImm(Omod ? Omod->getImm() : 0)
4056 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4059 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4065 // XXX - Why isn't hasSideEffects sufficient for these?
4098 // Target-independent instructions do not have an implicit-use of EXEC, even
4170 // This won't read exec if this is an SGPR->SGPR copy.
4236 // records a 64-bit value. We need to know the size to determine if a 32-bit
4238 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4268 // distinction. However, in the case of 16-bit integer operations, the
4269 // "floating point" values appear to not work. It seems to read the low 16-bits
4270 // of 32-bit immediates, which happens to always work for the integer
4275 // TODO: Theoretically we could use op-sel to use the high bits of the
4276 // 32-bit FP values.
4295 // A few special case instructions have 16-bit operands on subtargets
4296 // where 16-bit instructions are not legal.
4297 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4391 if (Op32 == -1)
4394 return pseudoToMCOpcode(Op32) != -1;
4407 return Mods && Mods->getImm();
4428 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4441 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4452 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4495 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4499 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4512 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4618 const MachineFunction *MF = MI.getParent()->getParent();
4619 const MachineRegisterInfo &MRI = MF->getRegInfo();
4624 int Src3Idx = -1;
4625 if (Src0Idx == -1) {
4654 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4720 ErrInfo = "Expected immediate, but got non-immediate";
4755 if (RegClass != -1) {
4760 if (!RC->contains(Reg)) {
4777 if (OpIdx == -1)
4801 (!OMod->isImm() || OMod->getImm() != 0)) {
4813 unsigned Mods = Src0ModsMO->getImm();
4814 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4816 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4823 if (!ST.hasSDWASdst() && DstIdx != -1) {
4833 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4840 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4848 if (DstUnused && DstUnused->isImm() &&
4849 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4876 uint64_t DMaskImm = DMask->getImm();
4884 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4888 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4913 if (ImmIdx != -1) {
4923 // bus, and we don't want to check pseudo-operands like the source modifier
4926 if (OpIdx == -1)
4962 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4975 // Special case for writelane - this can break the multiple constant bus rule,
4982 if (OpIdx == -1)
5014 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5015 SISrcMods::ABS) ||
5016 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5017 SISrcMods::ABS) ||
5018 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5019 SISrcMods::ABS)) {
5020 ErrInfo = "ABS not allowed in VOP3B instructions";
5041 if (!Op->isMBB()) {
5046 uint64_t Imm = Op->getImm();
5073 // RA scheduler where the main implicit operand is killed and implicit-defs
5074 // are added for sub-registers that remain live after this instruction.
5082 if (!Dst->isUse()) {
5097 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5120 if (Soff && Soff->getReg() != AMDGPU::M0) {
5129 if (Offset->getImm() != 0) {
5130 ErrInfo = "subtarget does not support offsets in flat instructions";
5137 if (GDSOp && GDSOp->getImm() != 0) {
5153 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5155 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5165 IsA16 = R128A16->getImm() != 0;
5168 IsA16 = A16->getImm() != 0;
5171 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5178 VAddrWords = RsrcIdx - VAddr0Idx;
5181 unsigned LastVAddrIdx = RsrcIdx - 1;
5182 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5203 unsigned DC = DppCt->getImm();
5257 if (Data && !Data->isReg())
5262 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5268 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5274 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5275 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5276 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5285 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5289 Register Reg = Op->getReg();
5294 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5320 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5339 // clang-format off
5353 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5500 // clang-format on
5517 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5519 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5521 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5522 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5528 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5529 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5531 Indexes->insertMachineInstrInMaps(*SaveExec);
5544 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5591 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5597 // The check is limited to FLAT and DS because atomics in non-flat encoding
5604 if (DataIdx != -1) {
5605 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5615 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5618 Desc.operands()[OpNo].RegClass == -1) {
5634 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5646 DebugLoc DL = MBB->findDebugLoc(I);
5655 MachineBasicBlock *MBB = MI->getParent();
5656 DebugLoc DL = MI->getDebugLoc();
5701 return DRC->contains(Reg);
5706 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5715 return RC->hasSuperClassEq(DRC);
5724 // Handle non-register types that are treated like immediates.
5731 const MachineFunction &MF = *MI.getParent()->getParent();
5736 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5743 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5747 if (MO->isReg())
5748 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5759 if (--ConstantBusLimit <= 0)
5765 if (!LiteralLimit--)
5767 if (--ConstantBusLimit <= 0)
5771 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5777 if (MO->isReg()) {
5782 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5794 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5799 if (VDstIdx != -1 &&
5805 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5811 RI.isSGPRReg(MRI, MO->getReg()))
5816 if (MO->isImm()) {
5817 uint64_t Imm = MO->getImm();
5828 // FIXME: We can use sign extended 64-bit literals, but only for signed
5837 // Handle non-register types that are treated like immediates.
5838 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5867 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5931 // TODO: Other immediate-like operand kinds could be commuted if there was a
5940 if (CommutedOpc == -1) {
6003 --ConstantBusLimit;
6007 if (Idx == -1)
6016 --LiteralLimit;
6017 --ConstantBusLimit;
6021 --LiteralLimit;
6022 --ConstantBusLimit;
6042 --ConstantBusLimit;
6107 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6108 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6109 SBase->setReg(SGPR);
6112 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6113 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6114 SOff->setReg(SGPR);
6132 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6148 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6149 !VAddrDef->getOperand(1).isImm() ||
6150 VAddrDef->getOperand(1).getImm() != 0)
6178 if (NewVDstIn != -1) {
6185 if (NewVDstIn != -1) {
6192 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6193 VAddrDef->eraseFromParent();
6207 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6213 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6214 SAddr->setReg(ToSGPR);
6244 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6247 bool ImpDef = Def->isImplicitDef();
6248 while (!ImpDef && Def && Def->isCopy()) {
6249 if (Def->getOperand(1).getReg().isPhysical())
6251 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6252 ImpDef = Def && Def->isImplicitDef();
6254 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6276 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6284 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6286 Register VScalarOp = ScalarOp->getReg();
6312 ScalarOp->setReg(CurReg);
6313 ScalarOp->setIsKill();
6315 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6323 // Read the next variant <- also loop target.
6325 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6327 // Read the next variant <- also loop target.
6330 TRI->getSubRegFromChannel(Idx + 1));
6335 // Comparison is to be done as 64-bit.
6351 TRI->getSubRegFromChannel(Idx, 2));
6366 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6374 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6378 ScalarOp->setReg(SScalarOp);
6379 ScalarOp->setIsKill();
6424 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6448 // incorrect due to the added control-flow.
6452 for (auto &MO : I->all_uses())
6468 LoopBB->addSuccessor(BodyBB);
6469 BodyBB->addSuccessor(LoopBB);
6470 BodyBB->addSuccessor(RemainderBB);
6474 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6475 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6476 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6485 MDT->addNewBlock(LoopBB, &MBB);
6486 MDT->addNewBlock(BodyBB, LoopBB);
6487 MDT->addNewBlock(RemainderBB, BodyBB);
6488 for (auto &Succ : RemainderBB->successors()) {
6489 if (MDT->properlyDominates(&MBB, Succ)) {
6490 MDT->changeImmediateDominator(Succ, RemainderBB);
6497 MachineBasicBlock::iterator First = RemainderBB->begin();
6510 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6533 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6537 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6556 MachineFunction &MF = *MI.getParent()->getParent();
6578 // Legalize FLAT
6602 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6631 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6633 // Avoid creating no-op copies with the same src and dst reg class. These
6714 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6719 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6728 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6738 while (Start->getOpcode() != FrameSetupOpcode)
6739 --Start;
6741 while (End->getOpcode() != FrameDestroyOpcode)
6745 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6746 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6770 if (SoffsetIdx != -1) {
6772 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6773 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6781 if (RsrcIdx != -1) {
6783 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6797 // a zero-value SRsrc.
6803 // Otherwise we are on non-ADDR64 hardware, and/or we have
6810 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6829 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6836 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6847 VAddr->setReg(NewVAddr);
6848 Rsrc->setReg(NewSRsrc);
6853 "FIXME: Need to emit flat atomics here");
6881 MIB.addImm(CPol->getImm());
6886 MIB.addImm(TFE->getImm());
6909 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6941 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6942 if (RsrcIdx != -1) {
6968 "Deferred MachineInstr are not supposed to re-populate worklist");
6979 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7006 // Split s_mul_u64 in 32-bit vector multiplications.
7243 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7366 // Handle converting generic instructions like COPY-to-SGPR into
7367 // COPY-to-VGPR.
7396 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7397 // these are deleted later, but at -O0 it would leave a suspicious
7399 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7416 NewInstr->addOperand(Inst.getOperand(0));
7426 NewInstr->addOperand(Src);
7456 NewInstr->addOperand(Inst.getOperand(2));
7461 NewInstr->addOperand(Inst.getOperand(3));
7472 NewInstr->addOperand(Op);
7480 // Only propagate through live-def of SCC.
7489 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7490 Register DstReg = NewInstr->getOperand(0).getReg();
7515 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7546 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7562 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7579 Inst.getParent()->rend())) {
7581 -1) {
7598 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7599 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7628 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7655 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7721 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7750 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7779 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7835 // split the s_mul_u64 in 32-bit vector multiplications.
7840 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7863 // First, we extract the low 32-bit and high 32-bit values from each of the
7878 // --------------------
7881 // -----------------------------------------
7884 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7886 // The low 32-bit value is Op1L*Op0L.
7887 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
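
The decomposition in the comment above is plain 64-bit integer arithmetic, so it can be checked on the host. A minimal standalone sketch follows; mulViaHalves is a name made up for the illustration and is not the backend lowering itself.

#include <cassert>
#include <cstdint>

// Reproduce the low 64 bits of a product from 32-bit halves, exactly as the
// comment above describes.
static uint64_t mulViaHalves(uint64_t Op0, uint64_t Op1) {
  uint64_t Op0L = Op0 & 0xffffffffu, Op0H = Op0 >> 32;
  uint64_t Op1L = Op1 & 0xffffffffu, Op1H = Op1 >> 32;
  uint64_t LoProd = Op1L * Op0L;             // 32x32 -> 64-bit partial product
  uint32_t Lo = static_cast<uint32_t>(LoProd);
  // High half: carry out of Op1L*Op0L plus the two cross terms. Op1H*Op0H
  // only affects bits 64 and up, so it is dropped.
  uint32_t Hi =
      static_cast<uint32_t>((LoProd >> 32) + Op1H * Op0L + Op1L * Op0H);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t A = 0x123456789abcdef0ULL, B = 0x0fedcba987654321ULL;
  assert(mulViaHalves(A, B) == A * B); // low 64 bits agree
  return 0;
}
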
7943 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
7949 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7972 // First, we extract the low 32-bit and high 32-bit values from each of the
8012 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8079 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8120 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8158 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8219 // (S_FLBIT_I32_B64 hi:lo) ->
8220 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8221 // (S_FF1_I32_B64 hi:lo) ->
8222 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
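
The S_FLBIT_I32_B64 pattern above can be sanity-checked with a rough scalar model, assuming the 32-bit find-first-bit returns 0xffffffff for a zero input and that the add saturates at 0xffffffff, so the all-zero case still yields -1. The helper names below are invented for the illustration and are not the backend code.

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t ffbh32(uint32_t V) {   // leading-zero count, 0xffffffff if V == 0
  if (V == 0)
    return 0xffffffffu;
  uint32_t N = 0;
  while (!(V & 0x80000000u)) { V <<= 1; ++N; }
  return N;
}

static uint32_t uaddsat(uint32_t A, uint32_t B) {
  uint64_t S = uint64_t(A) + B;
  return S > 0xffffffffu ? 0xffffffffu : uint32_t(S);
}

static uint32_t flbit64(uint32_t Hi, uint32_t Lo) {
  // (S_FLBIT_I32_B64 hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
  return std::min(ffbh32(Hi), uaddsat(ffbh32(Lo), 32));
}

int main() {
  assert(flbit64(0x00010000u, 0u) == 15);   // bit found in the high half
  assert(flbit64(0u, 0x00000001u) == 63);   // bit found in the low half
  assert(flbit64(0u, 0u) == 0xffffffffu);   // no bit set anywhere
  return 0;
}
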
8225 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8276 MachineInstr &UseMI = *I->getParent();
8300 } while (I != E && I->getParent() == &UseMI);
8393 SCCDefInst.getParent()->end())) {
8396 if (SCCIdx != -1) {
8398 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8412 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8416 Copy->eraseFromParent();
8432 SCCUseInst->getParent()->rend())) {
8510 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8514 if (Idx == -1)
8543 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8544 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8546 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8565 if (Idx == -1)
8603 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8634 if (!Addr || !Addr->isFI())
8638 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8640 FrameIndex = Addr->getIndex();
8641 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8647 assert(Addr && Addr->isFI());
8648 FrameIndex = Addr->getIndex();
8649 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8683 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8684 while (++I != E && I->isInsideBundle()) {
8685 assert(!I->isBundle() && "No nested bundle!");
8710 // Instructions may have a 32-bit literal encoded after them. Check
8734 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8742 const MachineFunction *MF = MI.getParent()->getParent();
8744 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8761 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8773 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
8774 assert(TI != IfEntry->end());
8777 MachineFunction *MF = IfEntry->getParent();
8778 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
8780 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8783 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8784 .add(Branch->getOperand(0))
8785 .add(Branch->getOperand(1));
8787 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8790 IfEntry->erase(TI);
8791 IfEntry->insert(IfEntry->end(), SIIF);
8792 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8798 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
8800 assert(TI != LoopEnd->end());
8803 MachineFunction *MF = LoopEnd->getParent();
8804 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
8806 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8811 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8812 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8817 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8824 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8827 .add(Branch->getOperand(0));
8829 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8833 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8834 LoopEnd->erase(TI);
8835 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8836 LoopEnd->insert(LoopEnd->end(), SILOOP);
8843 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8844 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8845 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8846 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8847 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8851 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
8852 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8856 return new GCNHazardRecognizer(DAG->MF);
8859 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8867 // - pre-RA scheduling and post-RA scheduling
8873 // post-RA scheduling; we can tell that we're post-RA because we don't
8875 if (!DAG->hasVRegLiveness())
8876 return new GCNHazardRecognizer(DAG->MF);
8888 { MO_GOTPCREL, "amdgpu-gotprel" },
8889 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8890 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8891 { MO_REL32_LO, "amdgpu-rel32-lo" },
8892 { MO_REL32_HI, "amdgpu-rel32-hi" },
8893 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8894 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8904 {MONoClobber, "amdgpu-noclobber"},
8905 {MOLastUse, "amdgpu-last-use"},
8915 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8931 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8936 // FIXME: Copies inserted in the block prolog for live-range split should also
8951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9008 // GFX12 field is non-negative 24-bit signed byte offset.
9011 return (1 << OffsetBits) - 1;
9033 if (Idx == -1) // e.g. s_memtime
9037 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9056 Overflow = Imm - MaxImm;
9060 // the corresponding register contents can be re-used.
9062 // Load values with all low-bits (except for alignment bits) set into
9071 Overflow = High - Alignment.value();
9094 // Pre-GFX12, flat instruction offsets can only be non-negative, global and
9099 // On gfx10.1, flat instructions that go into the global address space cannot
9105 // +----------------------------+------+------+
9106 // | Address-Mode | SGPR | VGPR |
9107 // +----------------------------+------+------+
9109 // | negative, 4-aligned offset | x | ok |
9111 // +----------------------------+------+------+
9113 // | negative, 4-aligned offset | ok | ok |
9115 // +----------------------------+------+------+
9117 // | negative, 4-aligned offset | ok | ok |
9119 // +----------------------------+------+------+
9129 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9153 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9159 ImmField = COffsetVal - RemainderOffset;
9166 ImmField -= ImmField % 4;
9170 RemainderOffset = COffsetVal - ImmField;
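
Taken together, the lines above split a constant byte offset into an immediate field plus a remainder that goes into the address register. Below is a hypothetical standalone sketch of that arithmetic; splitOffsetSketch, NumBits, and Require4ByteImm are invented names, and the truncation toward zero plus the optional 4-byte alignment step only mirror the visible lines, not the full backend logic.

#include <cassert>
#include <cstdint>
#include <utility>

// Returns {ImmField, RemainderOffset} with ImmField + RemainderOffset == COffsetVal.
static std::pair<int64_t, int64_t>
splitOffsetSketch(int64_t COffsetVal, unsigned NumBits, bool Require4ByteImm) {
  // Signed division by 2^NumBits truncates toward zero, so the part kept in
  // the immediate field has the same sign as the original offset.
  int64_t D = int64_t(1) << NumBits;
  int64_t RemainderOffset = (COffsetVal / D) * D;
  int64_t ImmField = COffsetVal - RemainderOffset;
  if (Require4ByteImm && (ImmField % 4) != 0) {
    ImmField -= ImmField % 4;
    RemainderOffset = COffsetVal - ImmField;
  }
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}

int main() {
  auto [Imm, Rem] = splitOffsetSketch(/*COffsetVal=*/5000, /*NumBits=*/12, false);
  assert(Imm == 5000 % 4096 && Rem == 4096);
  return 0;
}
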
9183 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9257 if (MFMAOp != -1)
9263 // -1 means that Opcode is already a native instruction.
9264 if (MCOp == -1)
9268 uint16_t NMCOp = (uint16_t)-1;
9271 if (NMCOp == (uint16_t)-1)
9273 if (NMCOp == (uint16_t)-1)
9275 if (NMCOp != (uint16_t)-1)
9279 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9281 if (MCOp == (uint16_t)-1)
9282 return -1;
9285 return -1;
9300 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9308 // Try to find the definition of reg:subreg in subreg-manipulation pseudos
9345 switch (MI->getOpcode()) {
9348 auto &Op1 = MI->getOperand(1);
9390 if (I->isDebugInstr())
9396 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9433 assert(I != DefBB->end());
9435 if (I->isDebugInstr())
9441 for (const MachineOperand &Op : I->operands()) {
9451 if (Reg == VReg && --NumUse == 0)
9453 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9465 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9478 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9479 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9480 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9481 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9520 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9524 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9539 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9541 for (++I; I != E && I->isBundledWithPred(); ++I) {
9545 return Lat + Count - 1;
9555 auto IID = GI->getIntrinsicID();
9571 // Loads from the private and flat address spaces are divergent, because
9582 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9583 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9585 // At least one MMO in a non-global address space.
9635 // Loads from the private and flat address spaces are divergent, because
9643 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9644 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9646 // At least one MMO in a non-global address space.
9653 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9670 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9671 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9679 // currently turned into no-op COPYs by SelectionDAG ISel and are
9778 bool IsReversible, bool IsSigned) -> bool {
9802 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9803 if (!Def || Def->getParent() != CmpInstr.getParent())
9806 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9807 Def->getOpcode() != AMDGPU::S_AND_B64)
9811 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9812 if (MO->isImm())
9813 Mask = MO->getImm();
9820 MachineOperand *SrcOp = &Def->getOperand(1);
9822 SrcOp = &Def->getOperand(2);
9823 else if (isMask(&Def->getOperand(2)))
9824 SrcOp = &Def->getOperand(1);
9829 if (IsSigned && BitNo == SrcSize - 1)
9843 Register DefReg = Def->getOperand(0).getReg();
9844 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9847 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9849 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9850 I->killsRegister(AMDGPU::SCC, &RI))
9855 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9856 SccDef->setIsDead(false);
9859 if (!MRI->use_nodbg_empty(DefReg)) {
9865 MachineBasicBlock *MBB = Def->getParent();
9872 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9875 Def->eraseFromParent();
9926 // Add implicit aligned super-reg to force alignment on the data operand.
9929 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();