1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "llvm/CodeGen/GlobalISel/Combiner.h"
19 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/Target/TargetMachine.h"
26
27 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
28
29 using namespace llvm;
30 using namespace MIPatternMatch;
31
32 class AMDGPUPreLegalizerCombinerHelper {
33 protected:
34 MachineIRBuilder &B;
35 MachineFunction &MF;
36 MachineRegisterInfo &MRI;
37 CombinerHelper &Helper;
38
39 public:
AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder & B,CombinerHelper & Helper)40 AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
41 : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
42
43 struct ClampI64ToI16MatchInfo {
44 int64_t Cmp1 = 0;
45 int64_t Cmp2 = 0;
46 Register Origin;
47 };
48
49 bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
50 MachineFunction &MF,
51 ClampI64ToI16MatchInfo &MatchInfo);
52
53 void applyClampI64ToI16(MachineInstr &MI,
54 const ClampI64ToI16MatchInfo &MatchInfo);
55 };
56
matchClampI64ToI16(MachineInstr & MI,MachineRegisterInfo & MRI,MachineFunction & MF,ClampI64ToI16MatchInfo & MatchInfo)57 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
58 MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
59 ClampI64ToI16MatchInfo &MatchInfo) {
60 assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
61
62 // Try to find a pattern where an i64 value should get clamped to short.
63 const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
64 if (SrcType != LLT::scalar(64))
65 return false;
66
67 const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
68 if (DstType != LLT::scalar(16))
69 return false;
70
71 Register Base;
72
73 auto IsApplicableForCombine = [&MatchInfo]() -> bool {
74 const auto Cmp1 = MatchInfo.Cmp1;
75 const auto Cmp2 = MatchInfo.Cmp2;
76 const auto Diff = std::abs(Cmp2 - Cmp1);
77
78 // If the difference between both comparison values is 0 or 1, there is no
79 // need to clamp.
80 if (Diff == 0 || Diff == 1)
81 return false;
82
83 const int64_t Min = std::numeric_limits<int16_t>::min();
84 const int64_t Max = std::numeric_limits<int16_t>::max();
85
86 // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
87 return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
88 (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
89 };
90
91 // Try to match a combination of min / max MIR opcodes.
92 if (mi_match(MI.getOperand(1).getReg(), MRI,
93 m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
94 if (mi_match(Base, MRI,
95 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
96 return IsApplicableForCombine();
97 }
98 }
99
100 if (mi_match(MI.getOperand(1).getReg(), MRI,
101 m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
102 if (mi_match(Base, MRI,
103 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
104 return IsApplicableForCombine();
105 }
106 }
107
108 return false;
109 }
110
111 // We want to find a combination of instructions that
112 // gets generated when an i64 gets clamped to i16.
113 // The corresponding pattern is:
114 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
115 // This can be efficiently written as following:
116 // v_cvt_pk_i16_i32 v0, v0, v1
117 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
applyClampI64ToI16(MachineInstr & MI,const ClampI64ToI16MatchInfo & MatchInfo)118 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
119 MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
120
121 Register Src = MatchInfo.Origin;
122 assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
123 LLT::scalar(64));
124 const LLT S32 = LLT::scalar(32);
125
126 B.setMBB(*MI.getParent());
127 B.setInstrAndDebugLoc(MI);
128
129 auto Unmerge = B.buildUnmerge(S32, Src);
130
131 assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
132
133 const LLT V2S16 = LLT::vector(2, 16);
134 auto CvtPk =
135 B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
136 {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
137
138 auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
139 auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
140 auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
141 auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
142
143 auto Bitcast = B.buildBitcast({S32}, CvtPk);
144
145 auto Med3 = B.buildInstr(
146 AMDGPU::G_AMDGPU_SMED3, {S32},
147 {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
148 MI.getFlags());
149
150 B.buildTrunc(MI.getOperand(0).getReg(), Med3);
151
152 MI.eraseFromParent();
153 }
154
/// State object that bundles the generic CombinerHelper with the
/// AMDGPU-specific AMDGPUPreLegalizerCombinerHelper. Both are held by
/// reference, so the referenced objects must outlive this one.
// NOTE(review): presumably the base/state class for the tablegen-generated
// combiner included below -- confirm against the generated .inc file.
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  /// Stores references to \p Helper and \p PreLegalizerHelper.
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
166
167 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
168 #include "AMDGPUGenPreLegalizeGICombiner.inc"
169 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
170
171 namespace {
172 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
173 #include "AMDGPUGenPreLegalizeGICombiner.inc"
174 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
175
176 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
177 GISelKnownBits *KB;
178 MachineDominatorTree *MDT;
179
180 public:
181 AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
182
AMDGPUPreLegalizerCombinerInfo(bool EnableOpt,bool OptSize,bool MinSize,GISelKnownBits * KB,MachineDominatorTree * MDT)183 AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
184 GISelKnownBits *KB, MachineDominatorTree *MDT)
185 : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
186 /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
187 KB(KB), MDT(MDT) {
188 if (!GeneratedRuleCfg.parseCommandLineOption())
189 report_fatal_error("Invalid rule identifier");
190 }
191
192 virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
193 MachineIRBuilder &B) const override;
194 };
195
combine(GISelChangeObserver & Observer,MachineInstr & MI,MachineIRBuilder & B) const196 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
197 MachineInstr &MI,
198 MachineIRBuilder &B) const {
199 CombinerHelper Helper(Observer, B, KB, MDT);
200 AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
201 AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
202 PreLegalizerHelper);
203
204 if (Generated.tryCombineAll(Observer, MI, B, Helper))
205 return true;
206
207 switch (MI.getOpcode()) {
208 case TargetOpcode::G_CONCAT_VECTORS:
209 return Helper.tryCombineConcatVectors(MI);
210 case TargetOpcode::G_SHUFFLE_VECTOR:
211 return Helper.tryCombineShuffleVector(MI);
212 }
213
214 return false;
215 }
216
217 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
218 #include "AMDGPUGenPreLegalizeGICombiner.inc"
219 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
220
221 // Pass boilerplate
222 // ================
223
224 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
225 public:
226 static char ID;
227
228 AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
229
getPassName() const230 StringRef getPassName() const override {
231 return "AMDGPUPreLegalizerCombiner";
232 }
233
234 bool runOnMachineFunction(MachineFunction &MF) override;
235
236 void getAnalysisUsage(AnalysisUsage &AU) const override;
237 private:
238 bool IsOptNone;
239 };
240 } // end anonymous namespace
241
// Declares the analyses this pass requires and preserves. The CFG is
// preserved, and the known-bits and CSE analyses are both required and
// preserved.
void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // The dominator tree is only requested when the pass was not created with
  // IsOptNone; runOnMachineFunction passes a null tree in that case.
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }

  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
257
AMDGPUPreLegalizerCombiner(bool IsOptNone)258 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
259 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
260 initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
261 }
262
runOnMachineFunction(MachineFunction & MF)263 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
264 if (MF.getProperties().hasProperty(
265 MachineFunctionProperties::Property::FailedISel))
266 return false;
267 auto *TPC = &getAnalysis<TargetPassConfig>();
268 const Function &F = MF.getFunction();
269 bool EnableOpt =
270 MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
271 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
272 MachineDominatorTree *MDT =
273 IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
274 AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
275 F.hasMinSize(), KB, MDT);
276 // Enable CSE.
277 GISelCSEAnalysisWrapper &Wrapper =
278 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
279 auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
280
281 Combiner C(PCInfo, TPC);
282 return C.combineMachineInstrs(MF, CSEInfo);
283 }
284
// Static pass ID; it is handed to the MachineFunctionPass base class in the
// constructor.
char AMDGPUPreLegalizerCombiner::ID = 0;
// Pass registration under the DEBUG_TYPE name, listing TargetPassConfig and
// GISelKnownBitsAnalysis as dependencies.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
294
namespace llvm {
/// Creates a new AMDGPUPreLegalizerCombiner pass instance. \p IsOptNone is
/// forwarded to the pass constructor. The returned object is heap-allocated.
// NOTE(review): presumably ownership passes to the pass manager the result is
// added to -- confirm at the call sites.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
300