1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass does combining of machine instructions at the generic MI level, 10 // before the legalizer. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPUCombinerHelper.h" 16 #include "AMDGPULegalizerInfo.h" 17 #include "GCNSubtarget.h" 18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 19 #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 20 #include "llvm/CodeGen/GlobalISel/Combiner.h" 21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 24 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/MachineDominators.h" 27 #include "llvm/CodeGen/TargetPassConfig.h" 28 #include "llvm/Target/TargetMachine.h" 29 30 #define GET_GICOMBINER_DEPS 31 #include "AMDGPUGenPreLegalizeGICombiner.inc" 32 #undef GET_GICOMBINER_DEPS 33 34 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" 35 36 using namespace llvm; 37 using namespace MIPatternMatch; 38 namespace { 39 40 #define GET_GICOMBINER_TYPES 41 #include "AMDGPUGenPreLegalizeGICombiner.inc" 42 #undef GET_GICOMBINER_TYPES 43 44 class AMDGPUPreLegalizerCombinerImpl : public Combiner { 45 protected: 46 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; 47 const GCNSubtarget &STI; 48 const AMDGPUCombinerHelper Helper; 49 50 public: 51 AMDGPUPreLegalizerCombinerImpl( 52 MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 53 GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 54 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, 55 const GCNSubtarget &STI, MachineDominatorTree *MDT, 56 const LegalizerInfo *LI); 57 58 static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; } 59 60 bool tryCombineAllImpl(MachineInstr &MI) const; 61 bool tryCombineAll(MachineInstr &I) const override; 62 63 struct ClampI64ToI16MatchInfo { 64 int64_t Cmp1 = 0; 65 int64_t Cmp2 = 0; 66 Register Origin; 67 }; 68 69 bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI, 70 const MachineFunction &MF, 71 ClampI64ToI16MatchInfo &MatchInfo) const; 72 73 void applyClampI64ToI16(MachineInstr &MI, 74 const ClampI64ToI16MatchInfo &MatchInfo) const; 75 76 private: 77 #define GET_GICOMBINER_CLASS_MEMBERS 78 #define AMDGPUSubtarget GCNSubtarget 79 #include "AMDGPUGenPreLegalizeGICombiner.inc" 80 #undef GET_GICOMBINER_CLASS_MEMBERS 81 #undef AMDGPUSubtarget 82 }; 83 84 #define GET_GICOMBINER_IMPL 85 #define AMDGPUSubtarget GCNSubtarget 86 #include "AMDGPUGenPreLegalizeGICombiner.inc" 87 #undef AMDGPUSubtarget 88 #undef GET_GICOMBINER_IMPL 89 90 AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( 91 MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 92 GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 93 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, 94 const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) 95 : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), 96 Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI, STI), 97 #define GET_GICOMBINER_CONSTRUCTOR_INITS 98 #include "AMDGPUGenPreLegalizeGICombiner.inc" 99 #undef GET_GICOMBINER_CONSTRUCTOR_INITS 100 { 101 } 102 103 bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { 104 if (tryCombineAllImpl(MI)) 105 return true; 106 107 switch (MI.getOpcode()) { 108 case TargetOpcode::G_SHUFFLE_VECTOR: 109 return Helper.tryCombineShuffleVector(MI); 110 } 111 112 return false; 113 } 114 115 bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16( 116 MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF, 117 ClampI64ToI16MatchInfo &MatchInfo) const { 118 assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!"); 119 120 // Try to find a pattern where an i64 value should get clamped to short. 121 const LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); 122 if (SrcType != LLT::scalar(64)) 123 return false; 124 125 const LLT DstType = MRI.getType(MI.getOperand(0).getReg()); 126 if (DstType != LLT::scalar(16)) 127 return false; 128 129 Register Base; 130 131 auto IsApplicableForCombine = [&MatchInfo]() -> bool { 132 const auto Cmp1 = MatchInfo.Cmp1; 133 const auto Cmp2 = MatchInfo.Cmp2; 134 const auto Diff = std::abs(Cmp2 - Cmp1); 135 136 // If the difference between both comparison values is 0 or 1, there is no 137 // need to clamp. 138 if (Diff == 0 || Diff == 1) 139 return false; 140 141 const int64_t Min = std::numeric_limits<int16_t>::min(); 142 const int64_t Max = std::numeric_limits<int16_t>::max(); 143 144 // Check if the comparison values are between SHORT_MIN and SHORT_MAX. 145 return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || 146 (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); 147 }; 148 149 // Try to match a combination of min / max MIR opcodes. 150 if (mi_match(MI.getOperand(1).getReg(), MRI, 151 m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 152 if (mi_match(Base, MRI, 153 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 154 return IsApplicableForCombine(); 155 } 156 } 157 158 if (mi_match(MI.getOperand(1).getReg(), MRI, 159 m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 160 if (mi_match(Base, MRI, 161 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 162 return IsApplicableForCombine(); 163 } 164 } 165 166 return false; 167 } 168 169 // We want to find a combination of instructions that 170 // gets generated when an i64 gets clamped to i16. 171 // The corresponding pattern is: 172 // G_MAX / G_MAX for i16 <= G_TRUNC i64. 173 // This can be efficiently written as following: 174 // v_cvt_pk_i16_i32 v0, v0, v1 175 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max 176 void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( 177 MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const { 178 179 Register Src = MatchInfo.Origin; 180 assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == 181 LLT::scalar(64)); 182 const LLT S32 = LLT::scalar(32); 183 184 auto Unmerge = B.buildUnmerge(S32, Src); 185 186 assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); 187 188 const LLT V2S16 = LLT::fixed_vector(2, 16); 189 auto CvtPk = 190 B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16}, 191 {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags()); 192 193 auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2); 194 auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2); 195 auto MinBoundaryDst = B.buildConstant(S32, MinBoundary); 196 auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary); 197 198 auto Bitcast = B.buildBitcast({S32}, CvtPk); 199 200 auto Med3 = B.buildInstr( 201 AMDGPU::G_AMDGPU_SMED3, {S32}, 202 {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, 203 MI.getFlags()); 204 205 B.buildTrunc(MI.getOperand(0).getReg(), Med3); 206 207 MI.eraseFromParent(); 208 } 209 210 // Pass boilerplate 211 // ================ 212 213 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { 214 public: 215 static char ID; 216 217 AMDGPUPreLegalizerCombiner(bool IsOptNone = false); 218 219 StringRef getPassName() const override { 220 return "AMDGPUPreLegalizerCombiner"; 221 } 222 223 bool runOnMachineFunction(MachineFunction &MF) override; 224 225 void getAnalysisUsage(AnalysisUsage &AU) const override; 226 227 private: 228 bool IsOptNone; 229 AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; 230 }; 231 } // end anonymous namespace 232 233 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 234 AU.addRequired<TargetPassConfig>(); 235 AU.setPreservesCFG(); 236 getSelectionDAGFallbackAnalysisUsage(AU); 237 AU.addRequired<GISelKnownBitsAnalysis>(); 238 AU.addPreserved<GISelKnownBitsAnalysis>(); 239 if (!IsOptNone) { 240 AU.addRequired<MachineDominatorTreeWrapperPass>(); 241 AU.addPreserved<MachineDominatorTreeWrapperPass>(); 242 } 243 244 AU.addRequired<GISelCSEAnalysisWrapperPass>(); 245 AU.addPreserved<GISelCSEAnalysisWrapperPass>(); 246 MachineFunctionPass::getAnalysisUsage(AU); 247 } 248 249 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) 250 : MachineFunctionPass(ID), IsOptNone(IsOptNone) { 251 initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); 252 253 if (!RuleConfig.parseCommandLineOption()) 254 report_fatal_error("Invalid rule identifier"); 255 } 256 257 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 258 if (MF.getProperties().hasProperty( 259 MachineFunctionProperties::Property::FailedISel)) 260 return false; 261 auto *TPC = &getAnalysis<TargetPassConfig>(); 262 const Function &F = MF.getFunction(); 263 bool EnableOpt = 264 MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); 265 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 266 267 // Enable CSE. 268 GISelCSEAnalysisWrapper &Wrapper = 269 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); 270 auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); 271 272 const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); 273 MachineDominatorTree *MDT = 274 IsOptNone ? nullptr 275 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 276 CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, 277 nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); 278 // Disable fixed-point iteration to reduce compile-time 279 CInfo.MaxIterations = 1; 280 CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; 281 // This is the first Combiner, so the input IR might contain dead 282 // instructions. 283 CInfo.EnableFullDCE = true; 284 AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, 285 STI, MDT, STI.getLegalizerInfo()); 286 return Impl.combineMachineInstrs(); 287 } 288 289 char AMDGPUPreLegalizerCombiner::ID = 0; 290 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 291 "Combine AMDGPU machine instrs before legalization", 292 false, false) 293 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 294 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 295 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 296 "Combine AMDGPU machine instrs before legalization", false, 297 false) 298 299 namespace llvm { 300 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { 301 return new AMDGPUPreLegalizerCombiner(IsOptNone); 302 } 303 } // end namespace llvm 304