Lines from AMDGPUInstructionSelector.cpp matching the search query +full:abs +full:- +full:flat

1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
32 #define DEBUG_TYPE "amdgpu-isel"
66 Subtarget->checkSubtargetFeatures(MF.getFunction());
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
96 return RB->getID() == AMDGPU::VCCRegBankID;
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
207 const LLT DefTy = MRI->getType(DefReg);
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
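The comment at file lines 209-212 explains why 1-bit phis never reach this selector. A minimal sketch of the corresponding guard, assuming the surrounding selectPHI-style context where DefTy is the destination register's type, might look like:

    if (DefTy == LLT::scalar(1)) {
      // Divergent s1 phis become lane-mask merges in
      // AMDGPUGlobalISelDivergenceLowering, and uniform s1 phis are widened
      // to s32 G_PHIs in AMDGPURegBankSelect, so nothing valid should reach
      // this point.
      return false;
    }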
219 MRI->getRegClassOrRegBank(DefReg);
248 MachineBasicBlock *BB = MO.getParent()->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
303 // Dead implicit-def of scc
313 MachineFunction *MF = BB->getParent();
316 LLT Ty = MRI->getType(DstReg);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
417 MachineFunction *MF = BB->getParent();
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
477 MachineFunction *MF = BB->getParent();
481 if (Subtarget->hasMADIntraFwdBug())
492 // TODO: We should probably legalize these to only using 32-bit results.
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
579 const int NumDst = MI.getNumOperands() - 1;
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
628 LLT SrcTy = MRI->getType(Src0);
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
688 // (build_vector $src0, undef) -> copy $src0
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
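A hedged sketch of the fold named in the comment above (file line 752), using the generic-MIR pattern matchers; Src0, Dst, BB, DL, and MI are assumed from the surrounding build-vector selection code rather than reproduced from it:

    Register ShiftSrc0;
    if (mi_match(Src0, *MRI, m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16)))) {
      // The packed result's low half is the high half of ShiftSrc0 and its
      // high half is zero, so a single scalar shift right by 16 produces the
      // whole 32-bit value.
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16);
      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }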
777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
792 LLT Src1Ty = MRI->getType(Src1Reg);
794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854 "64-bit vector BFX instructions are expanded in regbankselect");
888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
915 // count as using the constant bus twice - but in this case it is allowed since
939 MIB.addImm(ConstSelect->Value.getSExtValue() &
947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
949 MIB.addImm(ConstVal->Value.getSExtValue());
977 LLT Ty = MRI->getType(Dst0);
1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1087 return -1;
1090 return -1;
1190 return -1;
1198 return -1;
1259 return -1;
1295 return -1;
1311 if (Opcode == -1)
1329 if (Opcode == -1)
1336 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1348 LLT DstTy = MRI->getType(Dst);
1369 if (Opcode == -1)
1405 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1435 const int64_t Value = Arg->Value.getSExtValue();
1439 } else if (Value == -1) // all ones
1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1459 Module *M = MF->getFunction().getParent();
1461 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1463 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489 MIB.addImm(MFI->getLDSSize());
1491 Module *M = MF->getFunction().getParent();
1503 MachineFunction &MF = *MBB->getParent();
1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1529 // Get the return address reg and mark it as an implicit live-in
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1557 MachineFunction *MF = MBB->getParent();
1591 Offset1 |= (CountDw - 1) << 6;
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1665 BaseOffset = OffsetDef->getOperand(1).getReg();
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1688 Readfirstlane->getOperand(1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(0).getReg();
1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1731 LLT PtrTy = MRI->getType(PtrBase);
1754 .addImm(IsGDS ? -1 : 0)
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1818 int NumVDataDwords = -1;
1823 if (!BaseOpcode->Sampler)
1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1846 if (BaseOpcode->Atomic) {
1849 LLT Ty = MRI->getType(VDataIn);
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1856 if (BaseOpcode->AtomicX2) {
1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1869 if (BaseOpcode->Store) {
1871 VDataTy = MRI->getType(VDataIn);
1873 } else if (BaseOpcode->NoReturn) {
1877 VDataTy = MRI->getType(VDataOut);
1886 if (Subtarget->hasG16() && IsG16) {
1888 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1896 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1897 if (BaseOpcode->Atomic)
1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1909 continue; // XXX - Break?
1916 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1934 int Opcode = -1;
1949 if (Subtarget->hasGFX90AInsts()) {
1952 if (Opcode == -1) {
1959 if (Opcode == -1 &&
1963 if (Opcode == -1)
1967 if (Opcode == -1)
1974 if (BaseOpcode->AtomicX2) {
1975 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1977 Register TmpReg = MRI->createVirtualRegister(
1982 if (!MRI->use_empty(VDataOut)) {
1996 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2004 if (BaseOpcode->Sampler)
2005 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2010 MIB.addImm(DimInfo->Encoding);
2016 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2018 MIB.addImm(IsA16 ? -1 : 0);
2020 if (!Subtarget->hasGFX90AInsts()) {
2030 MIB.addImm(DimInfo->DA ? -1 : 0);
2031 if (BaseOpcode->HasD16)
2032 MIB.addImm(IsD16 ? -1 : 0);
2098 Function &F = I.getMF()->getFunction();
2142 if (!MRI->getRegClassOrNull(CCReg))
2143 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2188 return -1;
2196 const LLT DstTy = MRI->getType(DstReg);
2197 const LLT SrcTy = MRI->getType(SrcReg);
2212 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2234 Register LoReg = MRI->createVirtualRegister(DstRC);
2235 Register HiReg = MRI->createVirtualRegister(DstRC);
2242 // Write the low 16-bits of the high element into the high 16-bits of the
2253 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2255 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2256 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2257 Register ImmReg = MRI->createVirtualRegister(DstRC);
2297 if (SubRegIdx == -1)
2323 return SignedMask >= -16 && SignedMask <= 64;
2348 const LLT DstTy = MRI->getType(DstReg);
2349 const LLT SrcTy = MRI->getType(SrcReg);
2370 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2383 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2384 // 64-bit should have been split up in RegBankSelect
2407 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2422 // Using a single 32-bit SALU to calculate the high half is smaller than
2425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2451 // We need a 64-bit register source, but the high bits don't matter.
2452 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2453 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2502 if (!Subtarget->hasSALUFloatInsts())
2507 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2512 if (MRI->getType(Dst) == LLT::scalar(32) &&
2513 MRI->getType(Src) == LLT::scalar(16)) {
2530 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2535 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2539 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2545 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2548 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2583 Register LoReg = MRI->createVirtualRegister(RC);
2584 Register HiReg = MRI->createVirtualRegister(RC);
2603 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2623 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2624 MRI->getType(Dst) != LLT::scalar(64))
2630 Src = Fabs->getOperand(1).getReg();
2638 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2639 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2640 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2641 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670 MRI->getType(Dst) != LLT::scalar(64))
2676 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2678 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2679 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2721 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2727 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2734 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2738 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2749 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2757 const Value *Ptr = MMO->getValue();
2767 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2771 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2775 return I && I->getMetadata("amdgpu.uniform");
2787 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2795 .addImm(-1);
2821 return GI->is(Intrinsic::amdgcn_class);
2842 if (MRI->getType(CondReg) != LLT::scalar(32))
2849 // FIXME: Should scc->vcc copies and with exec?
2858 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2871 if (!MRI->getRegClassOrNull(CondReg))
2872 MRI->setRegClass(CondReg, ConstrainRC);
2887 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2900 LLT Ty = MRI->getType(DstReg);
2901 LLT MaskTy = MRI->getType(MaskReg);
2908 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2913 // the 64-bit pointer.
2914 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2959 Register HiReg = MRI->createVirtualRegister(&RegRC);
2960 Register LoReg = MRI->createVirtualRegister(&RegRC);
2975 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2976 MaskedLo = MRI->createVirtualRegister(&RegRC);
2989 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2990 MaskedHi = MRI->createVirtualRegister(&RegRC);
3041 LLT DstTy = MRI->getType(DstReg);
3042 LLT SrcTy = MRI->getType(SrcReg);
3050 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3072 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3087 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3119 LLT VecTy = MRI->getType(DstReg);
3120 LLT ValTy = MRI->getType(ValReg);
3132 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3146 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3153 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3164 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3202 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3235 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3257 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3263 auto F = LoadMMO->getFlags() &
3265 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3266 Size, LoadMMO->getBaseAlign());
3269 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3270 sizeof(int32_t), LoadMMO->getBaseAlign());
3278 /// Match a zero extend from a 32-bit value to 64-bits.
3286 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3289 assert(Def->getNumOperands() == 3 &&
3290 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3291 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3292 return Def->getOperand(1).getReg();
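For reference, the shape matched by this helper (file lines 3278-3292) -- a 64-bit G_MERGE_VALUES whose high half is a literal zero -- looks roughly like this in generic MIR; the register names are illustrative:

    //   %lo:_(s32)   = ...                     ; the value being zero-extended
    //   %zero:_(s32) = G_CONSTANT i32 0
    //   %val:_(s64)  = G_MERGE_VALUES %lo(s32), %zero(s32)
    // The helper returns %lo, the underlying 32-bit value.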
3327 if (isSGPR(AddrDef->Reg)) {
3328 Addr = AddrDef->Reg;
3329 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3331 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3333 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3345 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3361 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3366 auto F = LoadMMO->getFlags() &
3368 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3369 Size, LoadMMO->getBaseAlign());
3371 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3383 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3442 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3450 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3456 .addImm(Subtarget->getWavefrontSizeLog2())
3461 .addImm(Subtarget->getWavefrontSizeLog2())
3479 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3481 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3487 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3490 .addImm(Subtarget->getWavefrontSizeLog2())
3601 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3665 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3666 Src = MI->getOperand(1).getReg();
3669 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3670 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3673 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3674 if (LHS && LHS->isZero()) {
3676 Src = MI->getOperand(2).getReg();
3680 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3681 Src = MI->getOperand(1).getReg();
3682 Mods |= SISrcMods::ABS;
3695 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3700 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3701 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3811 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3824 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3829 Src = MI->getOperand(1).getReg();
3838 // Packed instructions do not have abs modifiers.
3847 = Root.getParent()->getParent()->getParent()->getRegInfo();
3862 = Root.getParent()->getParent()->getParent()->getRegInfo();
3878 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3879 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3882 if (Root.getImm() == -1)
3892 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3928 return MIB->getOperand(0).getReg();
3937 // Check if all elements also have abs modifier
3949 // Neg and Abs
3955 // Abs
3967 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3968 assert(BV->getNumSources() > 0);
3969 // Based on first element decide which mod we match, neg or abs
3970 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3971 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3974 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3975 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3976 if (ElF32->getOpcode() != ModOpcode)
3978 EltsF32.push_back(ElF32->getOperand(1).getReg());
3982 if (BV->getNumSources() == EltsF32.size()) {
3998 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3999 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4001 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4007 if (CV->getNumSources() == EltsV2F16.size()) {
4024 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4025 assert(CV->getNumSources() > 0);
4026 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4027 // Based on first element decide which mod we match, neg or abs
4028 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4032 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4033 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4034 if (ElV2F16->getOpcode() != ModOpcode)
4036 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4040 if (CV->getNumSources() == EltsV2F16.size()) {
4055 if (TII.isInlineConstant(FPValReg->Value)) {
4057 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4060 // Non-inlineable splat floats should not fall-through for integer immediate
4079 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4085 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4086 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4087 Key = ShiftAmt->Value.getZExtValue() / 8;
4101 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4107 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4108 ShiftAmt->Value.getZExtValue() == 16) {
4173 MachineBasicBlock *MBB = MI->getParent();
4175 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4176 // then we can select all ptr + 32-bit offsets.
4205 auto SKnown = KB->getKnownBits(*SOffset);
4227 // If we make it this far we have a load with an 32-bit immediate offset.
4232 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4233 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4322 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4331 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4359 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4378 if (isSGPR(PtrBaseDef->Reg)) {
4382 // saddr + large_offset -> saddr +
4391 MachineBasicBlock *MBB = MI->getParent();
4393 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4425 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4426 // Look through the SGPR->VGPR copy.
4428 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4431 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4451 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4452 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4455 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4456 // moves required to copy a 64-bit SGPR to VGPR.
4458 MachineBasicBlock *MBB = MI->getParent();
4459 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4461 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4465 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4491 int FI = AddrDef->MI->getOperand(1).getIndex();
4498 Register SAddr = AddrDef->Reg;
4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4501 Register LHS = AddrDef->MI->getOperand(1).getReg();
4502 Register RHS = AddrDef->MI->getOperand(2).getReg();
4506 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4507 isSGPR(RHSDef->Reg)) {
4508 int FI = LHSDef->MI->getOperand(1).getIndex();
4512 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4516 .addReg(RHSDef->Reg)
4530 // Check whether the flat scratch SVS swizzle bug affects this access.
4533 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4539 auto VKnown = KB->getKnownBits(VAddr);
4541 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4567 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4570 Register RHS = AddrDef->MI->getOperand(2).getReg();
4571 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4574 Register LHS = AddrDef->MI->getOperand(1).getReg();
4588 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4589 int FI = LHSDef->MI->getOperand(1).getIndex();
4610 MachineBasicBlock *MBB = MI->getParent();
4611 MachineFunction *MF = MBB->getParent();
4612 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4617 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4622 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4627 MIB.addReg(Info->getScratchRSrcReg());
4642 assert(Offset == 0 || Offset == -1);
4648 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4655 KB->signBitIsZero(PtrBase))) {
4656 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4657 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4658 FI = PtrBaseDef->getOperand(1).getIndex();
4663 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4664 FI = RootDef->getOperand(1).getIndex();
4669 MIB.addReg(Info->getScratchRSrcReg());
4697 return KB->signBitIsZero(Base);
4713 return KB->signBitIsZero(Base);
4718 return Addr->getOpcode() == TargetOpcode::G_OR ||
4719 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4720 Addr->getFlag(MachineInstr::NoUWrap));
4723 // Check that the base address of flat scratch load/store in the form of `base +
4737 Register LHS = AddrMI->getOperand(1).getReg();
4738 Register RHS = AddrMI->getOperand(2).getReg();
4740 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4747 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4748 RhsValReg->Value.getSExtValue() > -0x40000000)
4752 return KB->signBitIsZero(LHS);
4755 // Check address value in SGPR/VGPR are legal for flat scratch in the form
4768 Register LHS = AddrMI->getOperand(1).getReg();
4769 Register RHS = AddrMI->getOperand(2).getReg();
4770 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4773 // Check address value in SGPR/VGPR are legal for flat scratch in the form
4783 Register Base = AddrMI->getOperand(1).getReg();
4787 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4794 if (isNoUnsignedWrap(BaseDef->MI) &&
4796 (RHSOffset->Value.getSExtValue() < 0 &&
4797 RHSOffset->Value.getSExtValue() > -0x40000000)))
4800 Register LHS = BaseDef->MI->getOperand(1).getReg();
4801 Register RHS = BaseDef->MI->getOperand(2).getReg();
4802 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4814 if (RHS->countr_one() >= ShAmtBits)
4817 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4825 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4830 Reg = Def->Reg;
4832 if (Register WaveBase = getWaveAddress(Def->MI)) {
4835 MIB.addReg(Info->getScratchRSrcReg());
4860 MIB.addReg(Info->getScratchRSrcReg());
4875 MIB.addReg(Info->getScratchRSrcReg());
4886 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4902 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4951 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4969 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4988 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4991 MachineOperand &RHS = RootI->getOperand(2);
4996 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5003 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5021 // full 128-bit register. If we are building multiple resource descriptors,
5022 // this will allow CSEing of the 2-component register.
5063 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5082 Data.N2 = InputAdd->getOperand(1).getReg();
5083 Data.N3 = InputAdd->getOperand(2).getReg();
5085 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
5090 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5091 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5099 // (ptr_add N2, N3) -> addr64, or
5100 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5105 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5117 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5145 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5147 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5160 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5164 // N0 -> offset, or
5165 // (N0 + C1) -> offset
5187 // N0 -> offset, or
5188 // (N0 + C1) -> offset
5192 // TODO: Look through extensions for 32-bit soffset.
5275 /// Get an immediate that must be 32-bits, and treated as zero extended.
5317 // Match the (soffset + offset) pair as a 32-bit register base and
5331 assert(MRI->getType(SOffset) == LLT::scalar(32));
5339 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5344 // Figure out if this is really an extract of the high 16-bits of a dword,
5350 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5354 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5358 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5360 TruncOp->getOperand(2).getReg(), MRI);
5361 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5363 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5372 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5376 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5381 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
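For orientation, the simplest shape recognized here (file lines 5344-5381) is a truncate of a logical shift right by 16; an assumed, simplified rendering in generic MIR, written as comments:

    //   %x:_(s32)  = ...
    //   %sh:_(s32) = G_LSHR %x, 16
    //   %hi:_(s16) = G_TRUNC %sh
    // i.e. %hi is the high 16 bits of %x; the G_SHUFFLE_VECTOR form checked at
    // file line 5372 covers the equivalent <2 x s16> element-1 extract.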
5399 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400 MachineOperand *MO = &MI->getOperand(1);
5401 Src = MO->getReg();
5404 assert(MRI->getType(Src) == LLT::scalar(16));
5408 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409 MO = &MI->getOperand(1);
5410 Src = MO->getReg();
5415 // Be careful about folding modifiers if we already have an abs. fneg is
5417 if ((Mods & SISrcMods::ABS) == 0) {
5425 if ((ModsTmp & SISrcMods::ABS) != 0)
5426 Mods |= SISrcMods::ABS;
5442 MO = &MI->getOperand(0);
5443 Src = MO->getReg();
5554 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5556 // and S_OR can be constant-folded
5569 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5616 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5618 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5624 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5626 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5632 assert(OpIdx == -1);
5636 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5639 MIB.addImm(Op.getCImm()->getSExtValue());
5646 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5648 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5652 /// no-op here.
5703 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();