xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (revision d59e6404559f9adc510b1d89bae7177468c6d8c9)
//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to an equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with an equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
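  ///
  /// For example (a sketch with illustrative value names; 'add' is not treated
  /// as a signed operation here, so its operands are zero extended):
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = add nuw nsw i32 %a.ext, %b.ext
  ///   %r     = trunc i32 %r.ext to i16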
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation.
  ///
  /// \details The base element bit width of \p I's operands must be greater
  /// than 1 and less than or equal to 16. Promotion is done by sign or zero
  /// extending the operands to 32 bits, and replacing \p I with a 32 bit
  /// 'icmp' operation.
  ///
  /// \returns True.
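  ///
  /// For example (a sketch with illustrative value names; a signed predicate
  /// selects sign extension, an unsigned or equality predicate selects zero
  /// extension):
  ///   %c = icmp slt i16 %a, %b
  /// becomes
  ///   %a.ext = sext i16 %a to i32
  ///   %b.ext = sext i16 %b to i32
  ///   %c     = icmp slt i32 %a.ext, %b.ext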
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
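  ///
  /// For example (a sketch with illustrative value names; the operands are
  /// sign extended when the condition comes from a signed 'icmp' and zero
  /// extended otherwise):
  ///   %r = select i1 %c, i16 %a, i16 %b
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = select i1 %c, i32 %a.ext, i32 %b.ext
  ///   %r     = trunc i32 %r.ext to i16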
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
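  ///
  /// For example (a sketch with illustrative value names), for an i16 operand
  /// the shift amount is 32 - 16 = 16:
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %r.ext = call i32 @llvm.bitreverse.i32(i32 %a.ext)
  ///   %r.srl = lshr i32 %r.ext, 16
  ///   %r     = trunc i32 %r.srl to i16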
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
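//
// A sketch of the rewrite (with illustrative value names), assuming f32
// denormals are disabled or unsafe math is allowed, the requested !fpmath
// accuracy is at least 2.5 ulp, and shouldKeepFDivF32 does not apply:
//   %d = fdiv float %x, %y, !fpmath !0      ; !0 = !{float 2.500000e+00}
// becomes
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)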
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                                      FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}