//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
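  // Helper is mutable because the combiner hooks below are const while several
  // CombinerHelper entry points are not.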
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, MachineInstr &FCmp,
                           FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,
                                       const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine unsigned buffer load and signed extension instructions to generate
  // signed buffer load instructions.
  bool matchCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
  void applyCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

  // Find the s_mul_u64 instructions where the higher bits are either
  // zero-extended or sign-extended.
  // Replace the s_mul_u64 instructions with G_AMDGPU_S_MUL_I64_I32 if the
  // higher 33 bits are sign extended and with G_AMDGPU_S_MUL_U64_U32 if the
  // higher 32 bits are zero extended.
  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
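    // tryCombineShiftToUnmerge performs that split when the shift amount is a
    // known constant of at least 32, rewriting the shift in terms of
    // G_UNMERGE_VALUES and a 32-bit shift on the affected half.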
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {
  if (!MRI.hasOneNonDBGUse(FCmp.getOperand(0).getReg()))
    return false;

  Info.Pred =
      static_cast<CmpInst::Predicate>(FCmp.getOperand(1).getPredicate());
  Info.LHS = FCmp.getOperand(2).getReg();
  Info.RHS = FCmp.getOperand(3).getReg();
  Register True = MI.getOperand(2).getReg();
  Register False = MI.getOperand(3).getReg();

  // TODO: Handle case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if ((Info.LHS != True || Info.RHS != False) &&
      (Info.LHS != False || Info.RHS != True))
    return false;

  // Invert the predicate if necessary so that the apply function can assume
  // that the select operands are the same as the fcmp operands.
  // (select (fcmp P, L, R), R, L) -> (select (fcmp !P, L, R), L, R)
  if (Info.LHS != True)
    Info.Pred = CmpInst::getInversePredicate(Info.Pred);

  // Only match </<=/>=/> not ==/!= etc.
  return Info.Pred != CmpInst::getSwappedPredicate(Info.Pred);
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  unsigned Opc = (Info.Pred & CmpInst::FCMP_OGT) ? AMDGPU::G_AMDGPU_FMAX_LEGACY
                                                 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
  Register X = Info.LHS;
  Register Y = Info.RHS;
  if (Info.Pred == CmpInst::getUnorderedPredicate(Info.Pred)) {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    std::swap(X, Y);
  }

  B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
    MachineInstr &MI) const {
  Register Sqrt = MI.getOperand(2).getReg();
  return MRI.hasOneNonDBGUse(Sqrt);
}

void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
    MachineInstr &MI, const Register &X) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Y = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  uint32_t Flags = MI.getFlags();
  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
                     .addUse(X)
                     .setMIFlags(Flags)
                     .getReg(0);
  B.buildFMul(Dst, RSQ, Y, Flags);
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
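  // The result of this first match is overwritten below; the match is only
  // used to replace SrcReg with the zext source when one is present.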
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.

// Identify buffer_load_{u8, u16}.
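// MI here is the sign-extension instruction (G_SEXT_INREG); e.g.
// (G_SEXT_INREG (G_AMDGPU_BUFFER_LOAD_UBYTE ...), 8) becomes a single
// G_AMDGPU_BUFFER_LOAD_SBYTE.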
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  Register LoadReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(LoadReg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
  int64_t Width = MI.getOperand(2).getImm();
  switch (LoadMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
    return Width == 16;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
    return Width == 16;
  }
  return false;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  auto [LoadMI, NewOpcode] = MatchData;
  LoadMI->setDesc(TII.get(NewOpcode));
  // Update the destination register of the load with the destination register
  // of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI.getType(Src0) != LLT::scalar(64))
    return false;

  if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
      KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
    return true;
  }

  if (KB->computeNumSignBits(Src1) >= 33 &&
      KB->computeNumSignBits(Src0) >= 33) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
    return true;
  }
  return false;
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
report_fatal_error("Invalid rule identifier"); 4805ffd83dbSDimitry Andric } 4815ffd83dbSDimitry Andric 4825ffd83dbSDimitry Andric bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 4835ffd83dbSDimitry Andric if (MF.getProperties().hasProperty( 4845ffd83dbSDimitry Andric MachineFunctionProperties::Property::FailedISel)) 4855ffd83dbSDimitry Andric return false; 4865ffd83dbSDimitry Andric auto *TPC = &getAnalysis<TargetPassConfig>(); 4875ffd83dbSDimitry Andric const Function &F = MF.getFunction(); 4885ffd83dbSDimitry Andric bool EnableOpt = 4895f757f3fSDimitry Andric MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); 4905ffd83dbSDimitry Andric 4915ffd83dbSDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 49206c3fb27SDimitry Andric const AMDGPULegalizerInfo *LI = 49306c3fb27SDimitry Andric static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); 4945ffd83dbSDimitry Andric 4955ffd83dbSDimitry Andric GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 4965ffd83dbSDimitry Andric MachineDominatorTree *MDT = 497*0fca6ea1SDimitry Andric IsOptNone ? nullptr 498*0fca6ea1SDimitry Andric : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 4995f757f3fSDimitry Andric 5005f757f3fSDimitry Andric CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, 5015f757f3fSDimitry Andric LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); 5025f757f3fSDimitry Andric 5035f757f3fSDimitry Andric AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, 5045f757f3fSDimitry Andric RuleConfig, ST, MDT, LI); 5055f757f3fSDimitry Andric return Impl.combineMachineInstrs(); 5065ffd83dbSDimitry Andric } 5075ffd83dbSDimitry Andric 5085ffd83dbSDimitry Andric char AMDGPUPostLegalizerCombiner::ID = 0; 5095ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, 51006c3fb27SDimitry Andric "Combine AMDGPU machine instrs after legalization", false, 51106c3fb27SDimitry Andric false) 5125ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 5135ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 5145ffd83dbSDimitry Andric INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, 5155ffd83dbSDimitry Andric "Combine AMDGPU machine instrs after legalization", false, 5165ffd83dbSDimitry Andric false) 5175ffd83dbSDimitry Andric 5185ffd83dbSDimitry Andric namespace llvm { 5195ffd83dbSDimitry Andric FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) { 5205ffd83dbSDimitry Andric return new AMDGPUPostLegalizerCombiner(IsOptNone); 5215ffd83dbSDimitry Andric } 5225ffd83dbSDimitry Andric } // end namespace llvm 523