xref: /netbsd-src/external/apache2/llvm/dist/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "llvm/CodeGen/GlobalISel/Combiner.h"
19 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/Target/TargetMachine.h"
26 
27 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 class AMDGPUPreLegalizerCombinerHelper {
33 protected:
34   MachineIRBuilder &B;
35   MachineFunction &MF;
36   MachineRegisterInfo &MRI;
37   CombinerHelper &Helper;
38 
39 public:
AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder & B,CombinerHelper & Helper)40   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
41       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
42 
43   struct ClampI64ToI16MatchInfo {
44     int64_t Cmp1 = 0;
45     int64_t Cmp2 = 0;
46     Register Origin;
47   };
48 
49   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
50                           MachineFunction &MF,
51                           ClampI64ToI16MatchInfo &MatchInfo);
52 
53   void applyClampI64ToI16(MachineInstr &MI,
54                           const ClampI64ToI16MatchInfo &MatchInfo);
55 };
56 
matchClampI64ToI16(MachineInstr & MI,MachineRegisterInfo & MRI,MachineFunction & MF,ClampI64ToI16MatchInfo & MatchInfo)57 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
58     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
59     ClampI64ToI16MatchInfo &MatchInfo) {
60   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
61 
62   // Try to find a pattern where an i64 value should get clamped to short.
63   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
64   if (SrcType != LLT::scalar(64))
65     return false;
66 
67   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
68   if (DstType != LLT::scalar(16))
69     return false;
70 
71   Register Base;
72 
73   auto IsApplicableForCombine = [&MatchInfo]() -> bool {
74     const auto Cmp1 = MatchInfo.Cmp1;
75     const auto Cmp2 = MatchInfo.Cmp2;
76     const auto Diff = std::abs(Cmp2 - Cmp1);
77 
78     // If the difference between both comparison values is 0 or 1, there is no
79     // need to clamp.
80     if (Diff == 0 || Diff == 1)
81       return false;
82 
83     const int64_t Min = std::numeric_limits<int16_t>::min();
84     const int64_t Max = std::numeric_limits<int16_t>::max();
85 
86     // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
87     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
88             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
89   };
90 
91   // Try to match a combination of min / max MIR opcodes.
92   if (mi_match(MI.getOperand(1).getReg(), MRI,
93                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
94     if (mi_match(Base, MRI,
95                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
96       return IsApplicableForCombine();
97     }
98   }
99 
100   if (mi_match(MI.getOperand(1).getReg(), MRI,
101                m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
102     if (mi_match(Base, MRI,
103                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
104       return IsApplicableForCombine();
105     }
106   }
107 
108   return false;
109 }
110 
111 // We want to find a combination of instructions that
112 // gets generated when an i64 gets clamped to i16.
113 // The corresponding pattern is:
114 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
115 // This can be efficiently written as following:
116 // v_cvt_pk_i16_i32 v0, v0, v1
117 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
applyClampI64ToI16(MachineInstr & MI,const ClampI64ToI16MatchInfo & MatchInfo)118 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
119     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
120 
121   Register Src = MatchInfo.Origin;
122   assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
123          LLT::scalar(64));
124   const LLT S32 = LLT::scalar(32);
125 
126   B.setMBB(*MI.getParent());
127   B.setInstrAndDebugLoc(MI);
128 
129   auto Unmerge = B.buildUnmerge(S32, Src);
130 
131   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
132 
133   const LLT V2S16 = LLT::vector(2, 16);
134   auto CvtPk =
135       B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
136                    {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
137 
138   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
139   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
140   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
141   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
142 
143   auto Bitcast = B.buildBitcast({S32}, CvtPk);
144 
145   auto Med3 = B.buildInstr(
146       AMDGPU::G_AMDGPU_SMED3, {S32},
147       {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
148       MI.getFlags());
149 
150   B.buildTrunc(MI.getOperand(0).getReg(), Med3);
151 
152   MI.eraseFromParent();
153 }
154 
155 class AMDGPUPreLegalizerCombinerHelperState {
156 protected:
157   CombinerHelper &Helper;
158   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
159 
160 public:
AMDGPUPreLegalizerCombinerHelperState(CombinerHelper & Helper,AMDGPUPreLegalizerCombinerHelper & PreLegalizerHelper)161   AMDGPUPreLegalizerCombinerHelperState(
162       CombinerHelper &Helper,
163       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
164       : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
165 };
166 
167 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
168 #include "AMDGPUGenPreLegalizeGICombiner.inc"
169 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
170 
171 namespace {
172 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
173 #include "AMDGPUGenPreLegalizeGICombiner.inc"
174 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
175 
176 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
177   GISelKnownBits *KB;
178   MachineDominatorTree *MDT;
179 
180 public:
181   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
182 
AMDGPUPreLegalizerCombinerInfo(bool EnableOpt,bool OptSize,bool MinSize,GISelKnownBits * KB,MachineDominatorTree * MDT)183   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
184                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
185       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
186                      /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
187         KB(KB), MDT(MDT) {
188     if (!GeneratedRuleCfg.parseCommandLineOption())
189       report_fatal_error("Invalid rule identifier");
190   }
191 
192   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
193                        MachineIRBuilder &B) const override;
194 };
195 
combine(GISelChangeObserver & Observer,MachineInstr & MI,MachineIRBuilder & B) const196 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
197                                               MachineInstr &MI,
198                                               MachineIRBuilder &B) const {
199   CombinerHelper Helper(Observer, B, KB, MDT);
200   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
201   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
202                                                 PreLegalizerHelper);
203 
204   if (Generated.tryCombineAll(Observer, MI, B, Helper))
205     return true;
206 
207   switch (MI.getOpcode()) {
208   case TargetOpcode::G_CONCAT_VECTORS:
209     return Helper.tryCombineConcatVectors(MI);
210   case TargetOpcode::G_SHUFFLE_VECTOR:
211     return Helper.tryCombineShuffleVector(MI);
212   }
213 
214   return false;
215 }
216 
217 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
218 #include "AMDGPUGenPreLegalizeGICombiner.inc"
219 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
220 
221 // Pass boilerplate
222 // ================
223 
224 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
225 public:
226   static char ID;
227 
228   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
229 
getPassName() const230   StringRef getPassName() const override {
231     return "AMDGPUPreLegalizerCombiner";
232   }
233 
234   bool runOnMachineFunction(MachineFunction &MF) override;
235 
236   void getAnalysisUsage(AnalysisUsage &AU) const override;
237 private:
238   bool IsOptNone;
239 };
240 } // end anonymous namespace
241 
getAnalysisUsage(AnalysisUsage & AU) const242 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
243   AU.addRequired<TargetPassConfig>();
244   AU.setPreservesCFG();
245   getSelectionDAGFallbackAnalysisUsage(AU);
246   AU.addRequired<GISelKnownBitsAnalysis>();
247   AU.addPreserved<GISelKnownBitsAnalysis>();
248   if (!IsOptNone) {
249     AU.addRequired<MachineDominatorTree>();
250     AU.addPreserved<MachineDominatorTree>();
251   }
252 
253   AU.addRequired<GISelCSEAnalysisWrapperPass>();
254   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
255   MachineFunctionPass::getAnalysisUsage(AU);
256 }
257 
AMDGPUPreLegalizerCombiner(bool IsOptNone)258 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
259   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
260   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
261 }
262 
runOnMachineFunction(MachineFunction & MF)263 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
264   if (MF.getProperties().hasProperty(
265           MachineFunctionProperties::Property::FailedISel))
266     return false;
267   auto *TPC = &getAnalysis<TargetPassConfig>();
268   const Function &F = MF.getFunction();
269   bool EnableOpt =
270       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
271   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
272   MachineDominatorTree *MDT =
273       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
274   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
275                                         F.hasMinSize(), KB, MDT);
276   // Enable CSE.
277   GISelCSEAnalysisWrapper &Wrapper =
278       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
279   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
280 
281   Combiner C(PCInfo, TPC);
282   return C.combineMachineInstrs(MF, CSEInfo);
283 }
284 
285 char AMDGPUPreLegalizerCombiner::ID = 0;
286 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
287                       "Combine AMDGPU machine instrs before legalization",
288                       false, false)
289 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
290 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
291 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
292                     "Combine AMDGPU machine instrs before legalization", false,
293                     false)
294 
295 namespace llvm {
createAMDGPUPreLegalizeCombiner(bool IsOptNone)296 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
297   return new AMDGPUPreLegalizerCombiner(IsOptNone);
298 }
299 } // end namespace llvm
300