1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "AMDGPUCombinerHelper.h"
16 #include "AMDGPULegalizerInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
20 #include "llvm/CodeGen/GlobalISel/Combiner.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/TargetPassConfig.h"
27 #include "llvm/Target/TargetMachine.h"
28
29 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
30
31 using namespace llvm;
32 using namespace MIPatternMatch;
33
34 class AMDGPUPreLegalizerCombinerHelper {
35 protected:
36 MachineIRBuilder &B;
37 MachineFunction &MF;
38 MachineRegisterInfo &MRI;
39 AMDGPUCombinerHelper &Helper;
40
41 public:
AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder & B,AMDGPUCombinerHelper & Helper)42 AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B,
43 AMDGPUCombinerHelper &Helper)
44 : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
45
46 struct ClampI64ToI16MatchInfo {
47 int64_t Cmp1 = 0;
48 int64_t Cmp2 = 0;
49 Register Origin;
50 };
51
52 bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
53 MachineFunction &MF,
54 ClampI64ToI16MatchInfo &MatchInfo);
55
56 void applyClampI64ToI16(MachineInstr &MI,
57 const ClampI64ToI16MatchInfo &MatchInfo);
58 };
59
matchClampI64ToI16(MachineInstr & MI,MachineRegisterInfo & MRI,MachineFunction & MF,ClampI64ToI16MatchInfo & MatchInfo)60 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
61 MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
62 ClampI64ToI16MatchInfo &MatchInfo) {
63 assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
64
65 // Try to find a pattern where an i64 value should get clamped to short.
66 const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
67 if (SrcType != LLT::scalar(64))
68 return false;
69
70 const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
71 if (DstType != LLT::scalar(16))
72 return false;
73
74 Register Base;
75
76 auto IsApplicableForCombine = [&MatchInfo]() -> bool {
77 const auto Cmp1 = MatchInfo.Cmp1;
78 const auto Cmp2 = MatchInfo.Cmp2;
79 const auto Diff = std::abs(Cmp2 - Cmp1);
80
81 // If the difference between both comparison values is 0 or 1, there is no
82 // need to clamp.
83 if (Diff == 0 || Diff == 1)
84 return false;
85
86 const int64_t Min = std::numeric_limits<int16_t>::min();
87 const int64_t Max = std::numeric_limits<int16_t>::max();
88
89 // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
90 return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
91 (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
92 };
93
94 // Try to match a combination of min / max MIR opcodes.
95 if (mi_match(MI.getOperand(1).getReg(), MRI,
96 m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
97 if (mi_match(Base, MRI,
98 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
99 return IsApplicableForCombine();
100 }
101 }
102
103 if (mi_match(MI.getOperand(1).getReg(), MRI,
104 m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
105 if (mi_match(Base, MRI,
106 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
107 return IsApplicableForCombine();
108 }
109 }
110
111 return false;
112 }
113
// We want to find a combination of instructions that
// gets generated when an i64 gets clamped to i16.
//
// The matched pattern (see matchClampI64ToI16) is a G_SMIN/G_SMAX pair
// with in-range constant bounds feeding a G_TRUNC from s64 to s16.
//
// This can be efficiently written as following:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {

  Register Src = MatchInfo.Origin;
  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
         LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  B.setInstrAndDebugLoc(MI);

  // Split the s64 source into its two s32 halves for the pack instruction.
  auto Unmerge = B.buildUnmerge(S32, Src);

  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

  // Pack both halves into a v2s16 (selected as v_cvt_pk_i16_i32).
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto CvtPk =
      B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
                   {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());

  // Normalize the two matched constants so MinBoundary <= MaxBoundary,
  // regardless of the order in which they were matched.
  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);

  // Reinterpret the packed v2s16 as a single s32 so it can feed the
  // 32-bit signed med3 below.
  auto Bitcast = B.buildBitcast({S32}, CvtPk);

  // Clamp with one signed med3: med3(MinBoundary, x, MaxBoundary).
  auto Med3 = B.buildInstr(
      AMDGPU::G_AMDGPU_SMED3, {S32},
      {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
      MI.getFlags());

  // Reuse the original G_TRUNC's destination, then drop the old instruction.
  B.buildTrunc(MI.getOperand(0).getReg(), Med3);

  MI.eraseFromParent();
}
156
// Bundles the references the table-generated combiner (included below)
// expects to find on its state object: the generic AMDGPUCombinerHelper
// plus the hand-written pre-legalizer helper defined above.
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
168
169 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
170 #include "AMDGPUGenPreLegalizeGICombiner.inc"
171 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
172
173 namespace {
174 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
175 #include "AMDGPUGenPreLegalizeGICombiner.inc"
176 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
177
// CombinerInfo implementation that drives both the table-generated rules
// and the hand-written fallback combines for this pass.
class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;        // Known-bits analysis for the current function.
  MachineDominatorTree *MDT; // Null when the pass runs without optimization.

public:
  AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    // Fail hard if the rule-config command-line option names a rule that
    // does not exist in the generated rule set.
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};
197
combine(GISelChangeObserver & Observer,MachineInstr & MI,MachineIRBuilder & B) const198 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
199 MachineInstr &MI,
200 MachineIRBuilder &B) const {
201 const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo();
202 AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI);
203 AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
204 AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
205 PreLegalizerHelper);
206
207 if (Generated.tryCombineAll(Observer, MI, B))
208 return true;
209
210 switch (MI.getOpcode()) {
211 case TargetOpcode::G_CONCAT_VECTORS:
212 return Helper.tryCombineConcatVectors(MI);
213 case TargetOpcode::G_SHUFFLE_VECTOR:
214 return Helper.tryCombineShuffleVector(MI);
215 }
216
217 return false;
218 }
219
220 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
221 #include "AMDGPUGenPreLegalizeGICombiner.inc"
222 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
223
224 // Pass boilerplate
225 // ================
226
// MachineFunctionPass wrapper that runs the pre-legalizer combiner over
// each machine function.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid.

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  // True when built for -O0; disables dominator-tree-based analysis below.
  bool IsOptNone;
};
243 } // end anonymous namespace
244
// Declare the analyses this pass requires and preserves.
void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // The dominator tree is only requested when optimizing (see IsOptNone in
  // runOnMachineFunction, which passes null otherwise).
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }

  // CSE info enables common-subexpression elimination across combines.
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
260
// Construct the pass and register it with the global PassRegistry.
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
265
runOnMachineFunction(MachineFunction & MF)266 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
267 if (MF.getProperties().hasProperty(
268 MachineFunctionProperties::Property::FailedISel))
269 return false;
270 auto *TPC = &getAnalysis<TargetPassConfig>();
271 const Function &F = MF.getFunction();
272 bool EnableOpt =
273 MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
274 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
275 MachineDominatorTree *MDT =
276 IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
277 AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
278 F.hasMinSize(), KB, MDT);
279 // Enable CSE.
280 GISelCSEAnalysisWrapper &Wrapper =
281 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
282 auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
283
284 Combiner C(PCInfo, TPC);
285 return C.combineMachineInstrs(MF, CSEInfo);
286 }
287
char AMDGPUPreLegalizerCombiner::ID = 0;
// Register the pass and its analysis dependencies with the PassRegistry.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
297
namespace llvm {
// Factory used by the AMDGPU target's pass-pipeline setup.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
303