//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, Instruction::BinaryOps Opc,
                        Value *Num, Value *Den) const;

  /// Check if a scalar load can be widened.
  ///
  /// \details Uniform, sub-dword loads from constant memory are widened to a
  /// full 32 bit load (and the result truncated afterwards) so that a scalar
  /// load can be selected instead of a vector load.
  ///
  /// \returns True if load \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
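// Note: the operands promoted below are zero extended from at most 16 bits
// (see isSigned()), so a 32 bit add, sub or shl cannot wrap in the signed
// sense and a 32 bit add, mul or shl cannot wrap in the unsigned sense. The
// remaining cases (signed mul, unsigned sub) only avoid wrapping when the
// original operation had nuw.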
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

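  // The load is widened to a full dword, so require at least dword alignment,
  // and require uniformity so the widened load can be selected as a scalar
  // load.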
  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

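  // Division and remainder are not promoted here; sub-32 bit div/rem is
  // instead expanded by expandDivRem32 from visitBinaryOperator.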
  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
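  // amdgcn.fdiv.fast only provides about 2.5 ulp of accuracy, so only use it
  // when the !fpmath metadata allows at least that much error.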
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

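// Split a 32 x 32 -> 64 bit multiply into the low and high 32 bit halves of
// the product.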
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
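  // Both operands must fit in 24 bits: at least 9 redundant sign bits on a
  // 32 bit value means it is representable in 32 - 9 + 1 = 24 bits.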
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      { FQNeg, FB, FA }, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            Instruction::BinaryOps Opc,
                                            Value *Num, Value *Den) const {
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

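    // Compute absolute values: (x + sign) ^ sign negates x when the sign mask
    // is all ones and is a no-op when it is zero.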
    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

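  // Unsigned expansion: estimate 2^32 / Den with a scaled float reciprocal,
  // refine it with the computed rounding error, take the high half of its
  // product with Num as the quotient, then apply a final one-step correction
  // to the quotient (or remainder).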
  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
        Value *NumEltI = Builder.CreateExtractElement(Num, I);
        Value *DenEltI = Builder.CreateExtractElement(Den, I);
        Value *NewElt = expandDivRem32(Builder, Opc, NumEltI, DenEltI);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltI, DenEltI);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, I);
      }
    } else {
      NewDiv = expandDivRem32(Builder, Opc, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

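    // Truncate the widened value back to the original width, then bitcast to
    // the original (possibly non-integer) load type.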
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);
  AMDGPUASI = TM.getAMDGPUAS();

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
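      // Compute the successor iterator up front; visit() may erase the
      // current instruction.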
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}