//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

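  // Operands of a G_SELECT whose condition is a one-use G_FCMP, recorded while
  // matching select(fcmp(Pred, LHS, RHS), True, False) where {True, False} is
  // {LHS, RHS} in some order, so the select can become fmin/fmax legacy.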
  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

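  // Match an integer-to-float conversion whose source is known to fit in an
  // unsigned byte, so it can use the hardware's dedicated byte conversion.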
  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

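  // Match a G_FCANONICALIZE whose source is already canonical, setting Reg to
  // the source so the canonicalize can be folded away.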
  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
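    // e.g. (select (fcmp olt x, y), x, y) -> fmin_legacy(x, y): if x is NaN,
    // both the select and fmin_legacy produce y, the second operand.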
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
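    // The byte conversion only reads the low 8 bits, so the combine is only
    // valid if every bit above the low byte is known zero.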
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

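// Fold a constant byte-aligned shift of the source into the byte index of a
// G_AMDGPU_CVT_F32_UBYTEn, e.g. cvt_f32_ubyte0 (lshr x, 16) -> cvt_f32_ubyte2 x.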
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

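// State made available to the tablegen-generated combiner rules; the emitted
// match/apply code accesses Helper and PostLegalizerHelper through this base.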
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

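// Called once per candidate instruction: try the generated rules first, then
// fall back to the manually written combines below.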
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
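    // tryCombineShiftToUnmerge rewrites a wide shift with a large enough
    // constant amount into an unmerge plus a shift on a 32-bit half.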
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
      = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm