//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
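  // An illustrative sketch (hypothetical value names): a uniform i16 'add'
  // would be rewritten roughly as
  //   %a32 = zext i16 %a to i32     ; sext is used for ashr/sdiv/srem
  //   %b32 = zext i16 %b to i32
  //   %r32 = add nuw nsw i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16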

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
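  // An illustrative sketch (hypothetical value names): for a uniform i16
  // operand the steps above correspond roughly to
  //   %e = zext i16 %x to i32
  //   %r = call i32 @llvm.bitreverse.i32(i32 %e)
  //   %s = lshr i32 %r, 16          ; shift amount is 32 - 16
  //   %t = trunc i32 %s to i16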

  /// \returns True if the scalar load \p I can be widened.
  ///
  /// \details Uniform, simple loads of a small type (less than 32 bits) from
  /// constant memory with sufficient alignment are widened in visitLoadInst to
  /// a full 32 bits and then truncated, which allows a scalar load to be
  /// selected instead of a vector load.
  bool canWidenScalarExtLoad(LoadInst &I) const;
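  // A schematic sketch (hypothetical value names; the constant address space
  // number is elided) of the widening performed in visitLoadInst for a
  // qualifying load %v = load i16, i16 addrspace(C)* %p:
  //   %p32 = bitcast i16 addrspace(C)* %p to i32 addrspace(C)*
  //   %w   = load i32, i32 addrspace(C)* %p32
  //   %t   = trunc i32 %w to i16
  // For non-integer result types a final bitcast restores the original type.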

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
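
// For example, i8 and i16 need promotion, as does <2 x i16> on subtargets
// without packed (VOP3P) instructions, while i1 and i32 do not.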

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
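
// Informal rationale for the two helpers above: the promoted operands are
// zero-extended values of at most 16 bits, so (for well-defined shift
// amounts) an i32 add, sub, or shl of them cannot wrap the signed range and
// an i32 add, mul, or shl cannot wrap the unsigned range; an i32 mul can
// still exceed the signed range and an i32 sub can still drop below zero
// unless the original operation already carried nuw.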

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
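// An illustrative sketch (hypothetical value names and metadata numbering):
// an f32 division carrying !fpmath metadata that permits 2.5 ulp or more,
//   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.500000e+00}
// is rewritten, when neither FP32 denormals nor unsafe math apply, as
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)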
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}