15ffd83dbSDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level, 105ffd83dbSDimitry Andric // after register banks are known. 115ffd83dbSDimitry Andric // 125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 135ffd83dbSDimitry Andric 14e8d8bef9SDimitry Andric #include "AMDGPU.h" 155ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 16fe6060f1SDimitry Andric #include "AMDGPURegisterBankInfo.h" 17e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 18fe6060f1SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 190eae32dcSDimitry Andric #include "SIMachineFunctionInfo.h" 205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h" 215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 225ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 2306c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 245ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 265ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h" 275ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h" 280eae32dcSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 29e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h" 3006c3fb27SDimitry Andric 3106c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS 3206c3fb27SDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 3306c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS 3406c3fb27SDimitry Andric 355ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-regbank-combiner" 365ffd83dbSDimitry Andric 375ffd83dbSDimitry Andric using namespace llvm; 385ffd83dbSDimitry Andric using namespace MIPatternMatch; 395ffd83dbSDimitry Andric 4006c3fb27SDimitry Andric namespace { 4106c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES 4206c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc" 4306c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES 4406c3fb27SDimitry Andric 455f757f3fSDimitry Andric class AMDGPURegBankCombinerImpl : public Combiner { 46fe6060f1SDimitry Andric protected: 4706c3fb27SDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig; 4806c3fb27SDimitry Andric const GCNSubtarget &STI; 49fe6060f1SDimitry Andric const RegisterBankInfo &RBI; 50fe6060f1SDimitry Andric const TargetRegisterInfo &TRI; 510eae32dcSDimitry Andric const SIInstrInfo &TII; 525f757f3fSDimitry Andric // TODO: Make CombinerHelper methods const. 535f757f3fSDimitry Andric mutable CombinerHelper Helper; 54fe6060f1SDimitry Andric 55fe6060f1SDimitry Andric public: 5606c3fb27SDimitry Andric AMDGPURegBankCombinerImpl( 575f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 585f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 5906c3fb27SDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, 605f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT, 615f757f3fSDimitry Andric const LegalizerInfo *LI); 62fe6060f1SDimitry Andric 6306c3fb27SDimitry Andric static const char *getName() { return "AMDGPURegBankCombinerImpl"; } 6406c3fb27SDimitry Andric 655f757f3fSDimitry Andric bool tryCombineAll(MachineInstr &I) const override; 6606c3fb27SDimitry Andric 6706c3fb27SDimitry Andric bool isVgprRegBank(Register Reg) const; 6806c3fb27SDimitry Andric Register getAsVgpr(Register Reg) const; 69fe6060f1SDimitry Andric 70fe6060f1SDimitry Andric struct MinMaxMedOpc { 71fe6060f1SDimitry Andric unsigned Min, Max, Med; 72fe6060f1SDimitry Andric }; 73fe6060f1SDimitry Andric 74fe6060f1SDimitry Andric struct Med3MatchInfo { 75fe6060f1SDimitry Andric unsigned Opc; 76fe6060f1SDimitry Andric Register Val0, Val1, Val2; 77fe6060f1SDimitry Andric }; 78fe6060f1SDimitry Andric 7906c3fb27SDimitry Andric MinMaxMedOpc getMinMaxPair(unsigned Opc) const; 80fe6060f1SDimitry Andric 81349cc55cSDimitry Andric template <class m_Cst, typename CstTy> 82fe6060f1SDimitry Andric bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, 8306c3fb27SDimitry Andric Register &Val, CstTy &K0, CstTy &K1) const; 84fe6060f1SDimitry Andric 8506c3fb27SDimitry Andric bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; 8606c3fb27SDimitry Andric bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; 8706c3fb27SDimitry Andric bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const; 8806c3fb27SDimitry Andric bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const; 8906c3fb27SDimitry Andric void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; 9006c3fb27SDimitry Andric void applyClamp(MachineInstr &MI, Register &Reg) const; 910eae32dcSDimitry Andric 920eae32dcSDimitry Andric private: 9306c3fb27SDimitry Andric SIModeRegisterDefaults getMode() const; 9406c3fb27SDimitry Andric bool getIEEE() const; 9506c3fb27SDimitry Andric bool getDX10Clamp() const; 9606c3fb27SDimitry Andric bool isFminnumIeee(const MachineInstr &MI) const; 9706c3fb27SDimitry Andric bool isFCst(MachineInstr *MI) const; 9806c3fb27SDimitry Andric bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const; 9906c3fb27SDimitry Andric 10006c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS 10106c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 10206c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc" 10306c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS 10406c3fb27SDimitry Andric #undef AMDGPUSubtarget 105fe6060f1SDimitry Andric }; 106fe6060f1SDimitry Andric 10706c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL 10806c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 10906c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc" 11006c3fb27SDimitry Andric #undef AMDGPUSubtarget 11106c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL 11206c3fb27SDimitry Andric 11306c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl( 1145f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 1155f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 1165f757f3fSDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, 1175f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) 1185f757f3fSDimitry Andric : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), 1195f757f3fSDimitry Andric RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()), 1205f757f3fSDimitry Andric TII(*STI.getInstrInfo()), 1215f757f3fSDimitry Andric Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI), 12206c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS 12306c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc" 12406c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS 12506c3fb27SDimitry Andric { 12606c3fb27SDimitry Andric } 12706c3fb27SDimitry Andric 12806c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const { 129fe6060f1SDimitry Andric return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; 130fe6060f1SDimitry Andric } 131fe6060f1SDimitry Andric 13206c3fb27SDimitry Andric Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const { 1334824e7fdSDimitry Andric if (isVgprRegBank(Reg)) 1344824e7fdSDimitry Andric return Reg; 1354824e7fdSDimitry Andric 1364824e7fdSDimitry Andric // Search for existing copy of Reg to vgpr. 1374824e7fdSDimitry Andric for (MachineInstr &Use : MRI.use_instructions(Reg)) { 1384824e7fdSDimitry Andric Register Def = Use.getOperand(0).getReg(); 1394824e7fdSDimitry Andric if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def)) 1404824e7fdSDimitry Andric return Def; 1414824e7fdSDimitry Andric } 1424824e7fdSDimitry Andric 1434824e7fdSDimitry Andric // Copy Reg to vgpr. 1444824e7fdSDimitry Andric Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 1454824e7fdSDimitry Andric MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID)); 1464824e7fdSDimitry Andric return VgprReg; 1474824e7fdSDimitry Andric } 1484824e7fdSDimitry Andric 14906c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::MinMaxMedOpc 15006c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const { 151fe6060f1SDimitry Andric switch (Opc) { 152fe6060f1SDimitry Andric default: 153fe6060f1SDimitry Andric llvm_unreachable("Unsupported opcode"); 154fe6060f1SDimitry Andric case AMDGPU::G_SMAX: 155fe6060f1SDimitry Andric case AMDGPU::G_SMIN: 156fe6060f1SDimitry Andric return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3}; 157fe6060f1SDimitry Andric case AMDGPU::G_UMAX: 158fe6060f1SDimitry Andric case AMDGPU::G_UMIN: 159fe6060f1SDimitry Andric return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; 1600eae32dcSDimitry Andric case AMDGPU::G_FMAXNUM: 1610eae32dcSDimitry Andric case AMDGPU::G_FMINNUM: 1620eae32dcSDimitry Andric return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; 1630eae32dcSDimitry Andric case AMDGPU::G_FMAXNUM_IEEE: 1640eae32dcSDimitry Andric case AMDGPU::G_FMINNUM_IEEE: 1650eae32dcSDimitry Andric return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, 1660eae32dcSDimitry Andric AMDGPU::G_AMDGPU_FMED3}; 167fe6060f1SDimitry Andric } 168fe6060f1SDimitry Andric } 169fe6060f1SDimitry Andric 170349cc55cSDimitry Andric template <class m_Cst, typename CstTy> 17106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI, 172fe6060f1SDimitry Andric MachineRegisterInfo &MRI, 173fe6060f1SDimitry Andric MinMaxMedOpc MMMOpc, Register &Val, 17406c3fb27SDimitry Andric CstTy &K0, CstTy &K1) const { 175fe6060f1SDimitry Andric // 4 operand commutes of: min(max(Val, K0), K1). 176fe6060f1SDimitry Andric // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). 177fe6060f1SDimitry Andric // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). 178fe6060f1SDimitry Andric // 4 operand commutes of: max(min(Val, K1), K0). 179fe6060f1SDimitry Andric // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)). 180fe6060f1SDimitry Andric // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1). 181fe6060f1SDimitry Andric return mi_match( 182fe6060f1SDimitry Andric MI, MRI, 183fe6060f1SDimitry Andric m_any_of( 184fe6060f1SDimitry Andric m_CommutativeBinOp( 185fe6060f1SDimitry Andric MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)), 186fe6060f1SDimitry Andric m_Cst(K1)), 187fe6060f1SDimitry Andric m_CommutativeBinOp( 188fe6060f1SDimitry Andric MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)), 189fe6060f1SDimitry Andric m_Cst(K0)))); 190fe6060f1SDimitry Andric } 191fe6060f1SDimitry Andric 19206c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3( 19306c3fb27SDimitry Andric MachineInstr &MI, Med3MatchInfo &MatchInfo) const { 194fe6060f1SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 195fe6060f1SDimitry Andric if (!isVgprRegBank(Dst)) 196fe6060f1SDimitry Andric return false; 197fe6060f1SDimitry Andric 198fcaf7f86SDimitry Andric // med3 for i16 is only available on gfx9+, and not available for v2i16. 199fcaf7f86SDimitry Andric LLT Ty = MRI.getType(Dst); 20006c3fb27SDimitry Andric if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32)) 201fe6060f1SDimitry Andric return false; 202fe6060f1SDimitry Andric 203fe6060f1SDimitry Andric MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); 204349cc55cSDimitry Andric Register Val; 205bdd1243dSDimitry Andric std::optional<ValueAndVReg> K0, K1; 206fe6060f1SDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. 207349cc55cSDimitry Andric if (!matchMed<GCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) 208fe6060f1SDimitry Andric return false; 209fe6060f1SDimitry Andric 210349cc55cSDimitry Andric if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(K1->Value)) 211fe6060f1SDimitry Andric return false; 212349cc55cSDimitry Andric if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(K1->Value)) 213fe6060f1SDimitry Andric return false; 214fe6060f1SDimitry Andric 215349cc55cSDimitry Andric MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; 216fe6060f1SDimitry Andric return true; 217fe6060f1SDimitry Andric } 218fe6060f1SDimitry Andric 2190eae32dcSDimitry Andric // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) 2200eae32dcSDimitry Andric // ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K 2210eae32dcSDimitry Andric // ieee = false : min/max(NaN, K) = K 2220eae32dcSDimitry Andric // clamp(NaN) = dx10_clamp ? 0.0 : NaN 2230eae32dcSDimitry Andric // Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. 2240eae32dcSDimitry Andric // Other operand commutes (see matchMed) give same result since min and max are 2250eae32dcSDimitry Andric // commutative. 2260eae32dcSDimitry Andric 2270eae32dcSDimitry Andric // Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 2280eae32dcSDimitry Andric // with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. 2290eae32dcSDimitry Andric // Val = SNaN only for ieee = true 2300eae32dcSDimitry Andric // fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 2310eae32dcSDimitry Andric // min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 2320eae32dcSDimitry Andric // max(min(SNaN, K1), K0) = max(K1, K0) = K1 2330eae32dcSDimitry Andric // Val = NaN,ieee = false or Val = QNaN,ieee = true 2340eae32dcSDimitry Andric // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 2350eae32dcSDimitry Andric // min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) 2360eae32dcSDimitry Andric // max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 23706c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3( 23806c3fb27SDimitry Andric MachineInstr &MI, Med3MatchInfo &MatchInfo) const { 2390eae32dcSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2400eae32dcSDimitry Andric LLT Ty = MRI.getType(Dst); 24104eeddc0SDimitry Andric 24204eeddc0SDimitry Andric // med3 for f16 is only available on gfx9+, and not available for v2f16. 24306c3fb27SDimitry Andric if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32)) 2440eae32dcSDimitry Andric return false; 2450eae32dcSDimitry Andric 2460eae32dcSDimitry Andric auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); 2470eae32dcSDimitry Andric 2480eae32dcSDimitry Andric Register Val; 249bdd1243dSDimitry Andric std::optional<FPValueAndVReg> K0, K1; 2500eae32dcSDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. 2510eae32dcSDimitry Andric if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) 2520eae32dcSDimitry Andric return false; 2530eae32dcSDimitry Andric 2540eae32dcSDimitry Andric if (K0->Value > K1->Value) 2550eae32dcSDimitry Andric return false; 2560eae32dcSDimitry Andric 2570eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are 2580eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag. 2590eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to 2600eae32dcSDimitry Andric // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner 2610eae32dcSDimitry Andric // nodes(max/min) have same behavior when one input is NaN and other isn't. 2620eae32dcSDimitry Andric // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, 2630eae32dcSDimitry Andric // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). 2640eae32dcSDimitry Andric if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { 2650eae32dcSDimitry Andric // Don't fold single use constant that can't be inlined. 2660eae32dcSDimitry Andric if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && 2670eae32dcSDimitry Andric (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { 2680eae32dcSDimitry Andric MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; 2690eae32dcSDimitry Andric return true; 2700eae32dcSDimitry Andric } 2710eae32dcSDimitry Andric } 2720eae32dcSDimitry Andric 2730eae32dcSDimitry Andric return false; 2740eae32dcSDimitry Andric } 2750eae32dcSDimitry Andric 27606c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI, 27706c3fb27SDimitry Andric Register &Reg) const { 2780eae32dcSDimitry Andric // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). 2790eae32dcSDimitry Andric auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); 2800eae32dcSDimitry Andric Register Val; 281bdd1243dSDimitry Andric std::optional<FPValueAndVReg> K0, K1; 2820eae32dcSDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). 2830eae32dcSDimitry Andric if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) 2840eae32dcSDimitry Andric return false; 2850eae32dcSDimitry Andric 2860eae32dcSDimitry Andric if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) 2870eae32dcSDimitry Andric return false; 2880eae32dcSDimitry Andric 2890eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are 2900eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag. 2910eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates 2920eae32dcSDimitry Andric // to 0.0 requires dx10_clamp = true. 2930eae32dcSDimitry Andric if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && 2940eae32dcSDimitry Andric isKnownNeverSNaN(Val, MRI)) || 2950eae32dcSDimitry Andric isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { 2960eae32dcSDimitry Andric Reg = Val; 2970eae32dcSDimitry Andric return true; 2980eae32dcSDimitry Andric } 2990eae32dcSDimitry Andric 3000eae32dcSDimitry Andric return false; 3010eae32dcSDimitry Andric } 3020eae32dcSDimitry Andric 3030eae32dcSDimitry Andric // Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. 3040eae32dcSDimitry Andric // Val = SNaN only for ieee = true. It is important which operand is NaN. 3050eae32dcSDimitry Andric // min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 3060eae32dcSDimitry Andric // min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 3070eae32dcSDimitry Andric // min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN 3080eae32dcSDimitry Andric // Val = NaN,ieee = false or Val = QNaN,ieee = true 3090eae32dcSDimitry Andric // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 3100eae32dcSDimitry Andric // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 3110eae32dcSDimitry Andric // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 31206c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, 31306c3fb27SDimitry Andric Register &Reg) const { 3140eae32dcSDimitry Andric // In llvm-ir, clamp is often represented as an intrinsic call to 3150eae32dcSDimitry Andric // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. 31606c3fb27SDimitry Andric MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); 31706c3fb27SDimitry Andric MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); 31806c3fb27SDimitry Andric MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); 3190eae32dcSDimitry Andric 3200eae32dcSDimitry Andric if (isFCst(Src0) && !isFCst(Src1)) 3210eae32dcSDimitry Andric std::swap(Src0, Src1); 3220eae32dcSDimitry Andric if (isFCst(Src1) && !isFCst(Src2)) 3230eae32dcSDimitry Andric std::swap(Src1, Src2); 3240eae32dcSDimitry Andric if (isFCst(Src0) && !isFCst(Src1)) 3250eae32dcSDimitry Andric std::swap(Src0, Src1); 3260eae32dcSDimitry Andric if (!isClampZeroToOne(Src1, Src2)) 3270eae32dcSDimitry Andric return false; 3280eae32dcSDimitry Andric 3290eae32dcSDimitry Andric Register Val = Src0->getOperand(0).getReg(); 3300eae32dcSDimitry Andric 3310eae32dcSDimitry Andric auto isOp3Zero = [&]() { 3320eae32dcSDimitry Andric MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); 3330eae32dcSDimitry Andric if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) 3340eae32dcSDimitry Andric return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); 3350eae32dcSDimitry Andric return false; 3360eae32dcSDimitry Andric }; 3370eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are 3380eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag. 3390eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold 3400eae32dcSDimitry Andric // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. 3410eae32dcSDimitry Andric if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || 3420eae32dcSDimitry Andric (getIEEE() && getDX10Clamp() && 3430eae32dcSDimitry Andric (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { 3440eae32dcSDimitry Andric Reg = Val; 3450eae32dcSDimitry Andric return true; 3460eae32dcSDimitry Andric } 3470eae32dcSDimitry Andric 3480eae32dcSDimitry Andric return false; 3490eae32dcSDimitry Andric } 3500eae32dcSDimitry Andric 35106c3fb27SDimitry Andric void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI, 35206c3fb27SDimitry Andric Register &Reg) const { 3530eae32dcSDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, 3540eae32dcSDimitry Andric MI.getFlags()); 3550eae32dcSDimitry Andric MI.eraseFromParent(); 3560eae32dcSDimitry Andric } 3570eae32dcSDimitry Andric 35806c3fb27SDimitry Andric void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI, 35906c3fb27SDimitry Andric Med3MatchInfo &MatchInfo) const { 360fe6060f1SDimitry Andric B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, 3614824e7fdSDimitry Andric {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1), 3624824e7fdSDimitry Andric getAsVgpr(MatchInfo.Val2)}, 3634824e7fdSDimitry Andric MI.getFlags()); 364fe6060f1SDimitry Andric MI.eraseFromParent(); 365fe6060f1SDimitry Andric } 366fe6060f1SDimitry Andric 36706c3fb27SDimitry Andric SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { 3680eae32dcSDimitry Andric return MF.getInfo<SIMachineFunctionInfo>()->getMode(); 3690eae32dcSDimitry Andric } 3700eae32dcSDimitry Andric 37106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; } 3720eae32dcSDimitry Andric 37306c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::getDX10Clamp() const { 37406c3fb27SDimitry Andric return getMode().DX10Clamp; 37506c3fb27SDimitry Andric } 3760eae32dcSDimitry Andric 37706c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const { 3780eae32dcSDimitry Andric return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; 3790eae32dcSDimitry Andric } 3800eae32dcSDimitry Andric 38106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const { 3820eae32dcSDimitry Andric return MI->getOpcode() == AMDGPU::G_FCONSTANT; 3830eae32dcSDimitry Andric } 3840eae32dcSDimitry Andric 38506c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0, 38606c3fb27SDimitry Andric MachineInstr *K1) const { 3870eae32dcSDimitry Andric if (isFCst(K0) && isFCst(K1)) { 3880eae32dcSDimitry Andric const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); 3890eae32dcSDimitry Andric const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); 3900eae32dcSDimitry Andric return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || 3910eae32dcSDimitry Andric (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); 3920eae32dcSDimitry Andric } 3930eae32dcSDimitry Andric return false; 3940eae32dcSDimitry Andric } 3950eae32dcSDimitry Andric 3965ffd83dbSDimitry Andric // Pass boilerplate 3975ffd83dbSDimitry Andric // ================ 3985ffd83dbSDimitry Andric 3995ffd83dbSDimitry Andric class AMDGPURegBankCombiner : public MachineFunctionPass { 4005ffd83dbSDimitry Andric public: 4015ffd83dbSDimitry Andric static char ID; 4025ffd83dbSDimitry Andric 4035ffd83dbSDimitry Andric AMDGPURegBankCombiner(bool IsOptNone = false); 4045ffd83dbSDimitry Andric 40506c3fb27SDimitry Andric StringRef getPassName() const override { return "AMDGPURegBankCombiner"; } 4065ffd83dbSDimitry Andric 4075ffd83dbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 4085ffd83dbSDimitry Andric 4095ffd83dbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override; 4105f757f3fSDimitry Andric 4115ffd83dbSDimitry Andric private: 4125ffd83dbSDimitry Andric bool IsOptNone; 4135f757f3fSDimitry Andric AMDGPURegBankCombinerImplRuleConfig RuleConfig; 4145ffd83dbSDimitry Andric }; 4155ffd83dbSDimitry Andric } // end anonymous namespace 4165ffd83dbSDimitry Andric 4175ffd83dbSDimitry Andric void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 4185ffd83dbSDimitry Andric AU.addRequired<TargetPassConfig>(); 4195ffd83dbSDimitry Andric AU.setPreservesCFG(); 4205ffd83dbSDimitry Andric getSelectionDAGFallbackAnalysisUsage(AU); 4215ffd83dbSDimitry Andric AU.addRequired<GISelKnownBitsAnalysis>(); 4225ffd83dbSDimitry Andric AU.addPreserved<GISelKnownBitsAnalysis>(); 4235ffd83dbSDimitry Andric if (!IsOptNone) { 424*0fca6ea1SDimitry Andric AU.addRequired<MachineDominatorTreeWrapperPass>(); 425*0fca6ea1SDimitry Andric AU.addPreserved<MachineDominatorTreeWrapperPass>(); 4265ffd83dbSDimitry Andric } 4275ffd83dbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 4285ffd83dbSDimitry Andric } 4295ffd83dbSDimitry Andric 4305ffd83dbSDimitry Andric AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) 4315ffd83dbSDimitry Andric : MachineFunctionPass(ID), IsOptNone(IsOptNone) { 4325ffd83dbSDimitry Andric initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); 4335f757f3fSDimitry Andric 4345f757f3fSDimitry Andric if (!RuleConfig.parseCommandLineOption()) 4355f757f3fSDimitry Andric report_fatal_error("Invalid rule identifier"); 4365ffd83dbSDimitry Andric } 4375ffd83dbSDimitry Andric 4385ffd83dbSDimitry Andric bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { 4395ffd83dbSDimitry Andric if (MF.getProperties().hasProperty( 4405ffd83dbSDimitry Andric MachineFunctionProperties::Property::FailedISel)) 4415ffd83dbSDimitry Andric return false; 4425ffd83dbSDimitry Andric auto *TPC = &getAnalysis<TargetPassConfig>(); 4435ffd83dbSDimitry Andric const Function &F = MF.getFunction(); 4445ffd83dbSDimitry Andric bool EnableOpt = 4455f757f3fSDimitry Andric MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); 4465ffd83dbSDimitry Andric 4475ffd83dbSDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4485ffd83dbSDimitry Andric GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 4495f757f3fSDimitry Andric 4505f757f3fSDimitry Andric const auto *LI = ST.getLegalizerInfo(); 4515ffd83dbSDimitry Andric MachineDominatorTree *MDT = 452*0fca6ea1SDimitry Andric IsOptNone ? nullptr 453*0fca6ea1SDimitry Andric : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 4545f757f3fSDimitry Andric 4555f757f3fSDimitry Andric CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, 4565f757f3fSDimitry Andric LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); 4575f757f3fSDimitry Andric AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, 4585f757f3fSDimitry Andric RuleConfig, ST, MDT, LI); 4595f757f3fSDimitry Andric return Impl.combineMachineInstrs(); 4605ffd83dbSDimitry Andric } 4615ffd83dbSDimitry Andric 4625ffd83dbSDimitry Andric char AMDGPURegBankCombiner::ID = 0; 4635ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE, 4645ffd83dbSDimitry Andric "Combine AMDGPU machine instrs after regbankselect", 4655ffd83dbSDimitry Andric false, false) 4665ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 4675ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 4685ffd83dbSDimitry Andric INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE, 4695ffd83dbSDimitry Andric "Combine AMDGPU machine instrs after regbankselect", false, 4705ffd83dbSDimitry Andric false) 4715ffd83dbSDimitry Andric 4725ffd83dbSDimitry Andric namespace llvm { 4735ffd83dbSDimitry Andric FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) { 4745ffd83dbSDimitry Andric return new AMDGPURegBankCombiner(IsOptNone); 4755ffd83dbSDimitry Andric } 4765ffd83dbSDimitry Andric } // end namespace llvm 477