1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// This pass does misc. AMDGPU optimizations on IR before instruction 12 /// selection. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "AMDGPUTargetMachine.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/Analysis/AssumptionCache.h" 21 #include "llvm/Analysis/DivergenceAnalysis.h" 22 #include "llvm/Analysis/Loads.h" 23 #include "llvm/Analysis/ValueTracking.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/CodeGen/TargetPassConfig.h" 26 #include "llvm/IR/Attributes.h" 27 #include "llvm/IR/BasicBlock.h" 28 #include "llvm/IR/Constants.h" 29 #include "llvm/IR/DerivedTypes.h" 30 #include "llvm/IR/Function.h" 31 #include "llvm/IR/IRBuilder.h" 32 #include "llvm/IR/InstVisitor.h" 33 #include "llvm/IR/InstrTypes.h" 34 #include "llvm/IR/Instruction.h" 35 #include "llvm/IR/Instructions.h" 36 #include "llvm/IR/IntrinsicInst.h" 37 #include "llvm/IR/Intrinsics.h" 38 #include "llvm/IR/LLVMContext.h" 39 #include "llvm/IR/Operator.h" 40 #include "llvm/IR/Type.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/Casting.h" 44 #include <cassert> 45 #include <iterator> 46 47 #define DEBUG_TYPE "amdgpu-codegenprepare" 48 49 using namespace llvm; 50 51 namespace { 52 53 static cl::opt<bool> WidenLoads( 54 "amdgpu-codegenprepare-widen-constant-loads", 55 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), 56 cl::ReallyHidden, 57 cl::init(true)); 58 59 class AMDGPUCodeGenPrepare : public FunctionPass, 60 public InstVisitor<AMDGPUCodeGenPrepare, bool> { 61 const GCNSubtarget *ST = nullptr; 62 AssumptionCache *AC = nullptr; 63 DivergenceAnalysis *DA = nullptr; 64 Module *Mod = nullptr; 65 bool HasUnsafeFPMath = false; 66 AMDGPUAS AMDGPUASI; 67 68 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to 69 /// binary operation \p V. 70 /// 71 /// \returns Binary operation \p V. 72 /// \returns \p T's base element bit width. 73 unsigned getBaseElementBitWidth(const Type *T) const; 74 75 /// \returns Equivalent 32 bit integer type for given type \p T. For example, 76 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> 77 /// is returned. 78 Type *getI32Ty(IRBuilder<> &B, const Type *T) const; 79 80 /// \returns True if binary operation \p I is a signed binary operation, false 81 /// otherwise. 82 bool isSigned(const BinaryOperator &I) const; 83 84 /// \returns True if the condition of 'select' operation \p I comes from a 85 /// signed 'icmp' operation, false otherwise. 86 bool isSigned(const SelectInst &I) const; 87 88 /// \returns True if type \p T needs to be promoted to 32 bit integer type, 89 /// false otherwise. 90 bool needsPromotionToI32(const Type *T) const; 91 92 /// Promotes uniform binary operation \p I to equivalent 32 bit binary 93 /// operation. 94 /// 95 /// \details \p I's base element bit width must be greater than 1 and less 96 /// than or equal 16. Promotion is done by sign or zero extending operands to 97 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and 98 /// truncating the result of 32 bit binary operation back to \p I's original 99 /// type. Division operation is not promoted. 100 /// 101 /// \returns True if \p I is promoted to equivalent 32 bit binary operation, 102 /// false otherwise. 103 bool promoteUniformOpToI32(BinaryOperator &I) const; 104 105 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. 106 /// 107 /// \details \p I's base element bit width must be greater than 1 and less 108 /// than or equal 16. Promotion is done by sign or zero extending operands to 109 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. 110 /// 111 /// \returns True. 112 bool promoteUniformOpToI32(ICmpInst &I) const; 113 114 /// Promotes uniform 'select' operation \p I to 32 bit 'select' 115 /// operation. 116 /// 117 /// \details \p I's base element bit width must be greater than 1 and less 118 /// than or equal 16. Promotion is done by sign or zero extending operands to 119 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the 120 /// result of 32 bit 'select' operation back to \p I's original type. 121 /// 122 /// \returns True. 123 bool promoteUniformOpToI32(SelectInst &I) const; 124 125 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' 126 /// intrinsic. 127 /// 128 /// \details \p I's base element bit width must be greater than 1 and less 129 /// than or equal 16. Promotion is done by zero extending the operand to 32 130 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the 131 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the 132 /// shift amount is 32 minus \p I's base element bit width), and truncating 133 /// the result of the shift operation back to \p I's original type. 134 /// 135 /// \returns True. 136 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; 137 138 /// Expands 24 bit div or rem. 139 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, 140 Value *Num, Value *Den, 141 bool IsDiv, bool IsSigned) const; 142 143 /// Expands 32 bit div or rem. 144 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, 145 Value *Num, Value *Den) const; 146 147 /// Widen a scalar load. 148 /// 149 /// \details \p Widen scalar load for uniform, small type loads from constant 150 // memory / to a full 32-bits and then truncate the input to allow a scalar 151 // load instead of a vector load. 152 // 153 /// \returns True. 154 155 bool canWidenScalarExtLoad(LoadInst &I) const; 156 157 public: 158 static char ID; 159 160 AMDGPUCodeGenPrepare() : FunctionPass(ID) {} 161 162 bool visitFDiv(BinaryOperator &I); 163 164 bool visitInstruction(Instruction &I) { return false; } 165 bool visitBinaryOperator(BinaryOperator &I); 166 bool visitLoadInst(LoadInst &I); 167 bool visitICmpInst(ICmpInst &I); 168 bool visitSelectInst(SelectInst &I); 169 170 bool visitIntrinsicInst(IntrinsicInst &I); 171 bool visitBitreverseIntrinsicInst(IntrinsicInst &I); 172 173 bool doInitialization(Module &M) override; 174 bool runOnFunction(Function &F) override; 175 176 StringRef getPassName() const override { return "AMDGPU IR optimizations"; } 177 178 void getAnalysisUsage(AnalysisUsage &AU) const override { 179 AU.addRequired<AssumptionCacheTracker>(); 180 AU.addRequired<DivergenceAnalysis>(); 181 AU.setPreservesAll(); 182 } 183 }; 184 185 } // end anonymous namespace 186 187 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { 188 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 189 190 if (T->isIntegerTy()) 191 return T->getIntegerBitWidth(); 192 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); 193 } 194 195 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { 196 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 197 198 if (T->isIntegerTy()) 199 return B.getInt32Ty(); 200 return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); 201 } 202 203 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { 204 return I.getOpcode() == Instruction::AShr || 205 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; 206 } 207 208 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { 209 return isa<ICmpInst>(I.getOperand(0)) ? 210 cast<ICmpInst>(I.getOperand(0))->isSigned() : false; 211 } 212 213 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { 214 const IntegerType *IntTy = dyn_cast<IntegerType>(T); 215 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) 216 return true; 217 218 if (const VectorType *VT = dyn_cast<VectorType>(T)) { 219 // TODO: The set of packed operations is more limited, so may want to 220 // promote some anyway. 221 if (ST->hasVOP3PInsts()) 222 return false; 223 224 return needsPromotionToI32(VT->getElementType()); 225 } 226 227 return false; 228 } 229 230 // Return true if the op promoted to i32 should have nsw set. 231 static bool promotedOpIsNSW(const Instruction &I) { 232 switch (I.getOpcode()) { 233 case Instruction::Shl: 234 case Instruction::Add: 235 case Instruction::Sub: 236 return true; 237 case Instruction::Mul: 238 return I.hasNoUnsignedWrap(); 239 default: 240 return false; 241 } 242 } 243 244 // Return true if the op promoted to i32 should have nuw set. 245 static bool promotedOpIsNUW(const Instruction &I) { 246 switch (I.getOpcode()) { 247 case Instruction::Shl: 248 case Instruction::Add: 249 case Instruction::Mul: 250 return true; 251 case Instruction::Sub: 252 return I.hasNoUnsignedWrap(); 253 default: 254 return false; 255 } 256 } 257 258 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { 259 Type *Ty = I.getType(); 260 const DataLayout &DL = Mod->getDataLayout(); 261 int TySize = DL.getTypeSizeInBits(Ty); 262 unsigned Align = I.getAlignment() ? 263 I.getAlignment() : DL.getABITypeAlignment(Ty); 264 265 return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I); 266 } 267 268 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { 269 assert(needsPromotionToI32(I.getType()) && 270 "I does not need promotion to i32"); 271 272 if (I.getOpcode() == Instruction::SDiv || 273 I.getOpcode() == Instruction::UDiv || 274 I.getOpcode() == Instruction::SRem || 275 I.getOpcode() == Instruction::URem) 276 return false; 277 278 IRBuilder<> Builder(&I); 279 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 280 281 Type *I32Ty = getI32Ty(Builder, I.getType()); 282 Value *ExtOp0 = nullptr; 283 Value *ExtOp1 = nullptr; 284 Value *ExtRes = nullptr; 285 Value *TruncRes = nullptr; 286 287 if (isSigned(I)) { 288 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 289 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 290 } else { 291 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 292 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 293 } 294 295 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); 296 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { 297 if (promotedOpIsNSW(cast<Instruction>(I))) 298 Inst->setHasNoSignedWrap(); 299 300 if (promotedOpIsNUW(cast<Instruction>(I))) 301 Inst->setHasNoUnsignedWrap(); 302 303 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) 304 Inst->setIsExact(ExactOp->isExact()); 305 } 306 307 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 308 309 I.replaceAllUsesWith(TruncRes); 310 I.eraseFromParent(); 311 312 return true; 313 } 314 315 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const { 316 assert(needsPromotionToI32(I.getOperand(0)->getType()) && 317 "I does not need promotion to i32"); 318 319 IRBuilder<> Builder(&I); 320 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 321 322 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); 323 Value *ExtOp0 = nullptr; 324 Value *ExtOp1 = nullptr; 325 Value *NewICmp = nullptr; 326 327 if (I.isSigned()) { 328 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 329 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 330 } else { 331 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 332 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 333 } 334 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); 335 336 I.replaceAllUsesWith(NewICmp); 337 I.eraseFromParent(); 338 339 return true; 340 } 341 342 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const { 343 assert(needsPromotionToI32(I.getType()) && 344 "I does not need promotion to i32"); 345 346 IRBuilder<> Builder(&I); 347 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 348 349 Type *I32Ty = getI32Ty(Builder, I.getType()); 350 Value *ExtOp1 = nullptr; 351 Value *ExtOp2 = nullptr; 352 Value *ExtRes = nullptr; 353 Value *TruncRes = nullptr; 354 355 if (isSigned(I)) { 356 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 357 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); 358 } else { 359 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 360 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); 361 } 362 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); 363 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 364 365 I.replaceAllUsesWith(TruncRes); 366 I.eraseFromParent(); 367 368 return true; 369 } 370 371 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( 372 IntrinsicInst &I) const { 373 assert(I.getIntrinsicID() == Intrinsic::bitreverse && 374 "I must be bitreverse intrinsic"); 375 assert(needsPromotionToI32(I.getType()) && 376 "I does not need promotion to i32"); 377 378 IRBuilder<> Builder(&I); 379 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 380 381 Type *I32Ty = getI32Ty(Builder, I.getType()); 382 Function *I32 = 383 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); 384 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); 385 Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); 386 Value *LShrOp = 387 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); 388 Value *TruncRes = 389 Builder.CreateTrunc(LShrOp, I.getType()); 390 391 I.replaceAllUsesWith(TruncRes); 392 I.eraseFromParent(); 393 394 return true; 395 } 396 397 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { 398 const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); 399 if (!CNum) 400 return HasDenormals; 401 402 if (UnsafeDiv) 403 return true; 404 405 bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); 406 407 // Reciprocal f32 is handled separately without denormals. 408 return HasDenormals ^ IsOne; 409 } 410 411 // Insert an intrinsic for fast fdiv for safe math situations where we can 412 // reduce precision. Leave fdiv for situations where the generic node is 413 // expected to be optimized. 414 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { 415 Type *Ty = FDiv.getType(); 416 417 if (!Ty->getScalarType()->isFloatTy()) 418 return false; 419 420 MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); 421 if (!FPMath) 422 return false; 423 424 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); 425 float ULP = FPOp->getFPAccuracy(); 426 if (ULP < 2.5f) 427 return false; 428 429 FastMathFlags FMF = FPOp->getFastMathFlags(); 430 bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || 431 FMF.allowReciprocal(); 432 433 // With UnsafeDiv node will be optimized to just rcp and mul. 434 if (UnsafeDiv) 435 return false; 436 437 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); 438 Builder.setFastMathFlags(FMF); 439 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); 440 441 Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); 442 443 Value *Num = FDiv.getOperand(0); 444 Value *Den = FDiv.getOperand(1); 445 446 Value *NewFDiv = nullptr; 447 448 bool HasDenormals = ST->hasFP32Denormals(); 449 if (VectorType *VT = dyn_cast<VectorType>(Ty)) { 450 NewFDiv = UndefValue::get(VT); 451 452 // FIXME: Doesn't do the right thing for cases where the vector is partially 453 // constant. This works when the scalarizer pass is run first. 454 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { 455 Value *NumEltI = Builder.CreateExtractElement(Num, I); 456 Value *DenEltI = Builder.CreateExtractElement(Den, I); 457 Value *NewElt; 458 459 if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) { 460 NewElt = Builder.CreateFDiv(NumEltI, DenEltI); 461 } else { 462 NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); 463 } 464 465 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); 466 } 467 } else { 468 if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals)) 469 NewFDiv = Builder.CreateCall(Decl, { Num, Den }); 470 } 471 472 if (NewFDiv) { 473 FDiv.replaceAllUsesWith(NewFDiv); 474 NewFDiv->takeName(&FDiv); 475 FDiv.eraseFromParent(); 476 } 477 478 return !!NewFDiv; 479 } 480 481 static bool hasUnsafeFPMath(const Function &F) { 482 Attribute Attr = F.getFnAttribute("unsafe-fp-math"); 483 return Attr.getValueAsString() == "true"; 484 } 485 486 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, 487 Value *LHS, Value *RHS) { 488 Type *I32Ty = Builder.getInt32Ty(); 489 Type *I64Ty = Builder.getInt64Ty(); 490 491 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); 492 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); 493 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); 494 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); 495 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); 496 Hi = Builder.CreateTrunc(Hi, I32Ty); 497 return std::make_pair(Lo, Hi); 498 } 499 500 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { 501 return getMul64(Builder, LHS, RHS).second; 502 } 503 504 // The fractional part of a float is enough to accurately represent up to 505 // a 24-bit signed integer. 506 Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, 507 BinaryOperator &I, 508 Value *Num, Value *Den, 509 bool IsDiv, bool IsSigned) const { 510 assert(Num->getType()->isIntegerTy(32)); 511 512 const DataLayout &DL = Mod->getDataLayout(); 513 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); 514 if (LHSSignBits < 9) 515 return nullptr; 516 517 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); 518 if (RHSSignBits < 9) 519 return nullptr; 520 521 522 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 523 unsigned DivBits = 32 - SignBits; 524 if (IsSigned) 525 ++DivBits; 526 527 Type *Ty = Num->getType(); 528 Type *I32Ty = Builder.getInt32Ty(); 529 Type *F32Ty = Builder.getFloatTy(); 530 ConstantInt *One = Builder.getInt32(1); 531 Value *JQ = One; 532 533 if (IsSigned) { 534 // char|short jq = ia ^ ib; 535 JQ = Builder.CreateXor(Num, Den); 536 537 // jq = jq >> (bitsize - 2) 538 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); 539 540 // jq = jq | 0x1 541 JQ = Builder.CreateOr(JQ, One); 542 } 543 544 // int ia = (int)LHS; 545 Value *IA = Num; 546 547 // int ib, (int)RHS; 548 Value *IB = Den; 549 550 // float fa = (float)ia; 551 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty) 552 : Builder.CreateUIToFP(IA, F32Ty); 553 554 // float fb = (float)ib; 555 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) 556 : Builder.CreateUIToFP(IB,F32Ty); 557 558 Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB); 559 Value *FQM = Builder.CreateFMul(FA, RCP); 560 561 // fq = trunc(fqm); 562 CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM }); 563 FQ->copyFastMathFlags(Builder.getFastMathFlags()); 564 565 // float fqneg = -fq; 566 Value *FQNeg = Builder.CreateFNeg(FQ); 567 568 // float fr = mad(fqneg, fb, fa); 569 Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, 570 { FQNeg, FB, FA }, FQ); 571 572 // int iq = (int)fq; 573 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) 574 : Builder.CreateFPToUI(FQ, I32Ty); 575 576 // fr = fabs(fr); 577 FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ); 578 579 // fb = fabs(fb); 580 FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ); 581 582 // int cv = fr >= fb; 583 Value *CV = Builder.CreateFCmpOGE(FR, FB); 584 585 // jq = (cv ? jq : 0); 586 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); 587 588 // dst = iq + jq; 589 Value *Div = Builder.CreateAdd(IQ, JQ); 590 591 Value *Res = Div; 592 if (!IsDiv) { 593 // Rem needs compensation, it's easier to recompute it 594 Value *Rem = Builder.CreateMul(Div, Den); 595 Res = Builder.CreateSub(Num, Rem); 596 } 597 598 // Truncate to number of bits this divide really is. 599 if (IsSigned) { 600 Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits)); 601 Res = Builder.CreateSExt(Res, Ty); 602 } else { 603 ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); 604 Res = Builder.CreateAnd(Res, TruncMask); 605 } 606 607 return Res; 608 } 609 610 Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, 611 BinaryOperator &I, 612 Value *Num, Value *Den) const { 613 Instruction::BinaryOps Opc = I.getOpcode(); 614 assert(Opc == Instruction::URem || Opc == Instruction::UDiv || 615 Opc == Instruction::SRem || Opc == Instruction::SDiv); 616 617 FastMathFlags FMF; 618 FMF.setFast(); 619 Builder.setFastMathFlags(FMF); 620 621 if (isa<Constant>(Den)) 622 return nullptr; // Keep it for optimization 623 624 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; 625 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; 626 627 Type *Ty = Num->getType(); 628 Type *I32Ty = Builder.getInt32Ty(); 629 Type *F32Ty = Builder.getFloatTy(); 630 631 if (Ty->getScalarSizeInBits() < 32) { 632 if (IsSigned) { 633 Num = Builder.CreateSExt(Num, I32Ty); 634 Den = Builder.CreateSExt(Den, I32Ty); 635 } else { 636 Num = Builder.CreateZExt(Num, I32Ty); 637 Den = Builder.CreateZExt(Den, I32Ty); 638 } 639 } 640 641 if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { 642 Res = Builder.CreateTrunc(Res, Ty); 643 return Res; 644 } 645 646 ConstantInt *Zero = Builder.getInt32(0); 647 ConstantInt *One = Builder.getInt32(1); 648 ConstantInt *MinusOne = Builder.getInt32(~0); 649 650 Value *Sign = nullptr; 651 if (IsSigned) { 652 ConstantInt *K31 = Builder.getInt32(31); 653 Value *LHSign = Builder.CreateAShr(Num, K31); 654 Value *RHSign = Builder.CreateAShr(Den, K31); 655 // Remainder sign is the same as LHS 656 Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign; 657 658 Num = Builder.CreateAdd(Num, LHSign); 659 Den = Builder.CreateAdd(Den, RHSign); 660 661 Num = Builder.CreateXor(Num, LHSign); 662 Den = Builder.CreateXor(Den, RHSign); 663 } 664 665 // RCP = URECIP(Den) = 2^32 / Den + e 666 // e is rounding error. 667 Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty); 668 Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32); 669 Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000)); 670 Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1); 671 Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty); 672 673 // RCP_LO, RCP_HI = mul(RCP, Den) */ 674 Value *RCP_LO, *RCP_HI; 675 std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den); 676 677 // NEG_RCP_LO = -RCP_LO 678 Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO); 679 680 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 681 Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero); 682 Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO); 683 684 // Calculate the rounding error from the URECIP instruction 685 // E = mulhu(ABS_RCP_LO, RCP) 686 Value *E = getMulHu(Builder, ABS_RCP_LO, RCP); 687 688 // RCP_A_E = RCP + E 689 Value *RCP_A_E = Builder.CreateAdd(RCP, E); 690 691 // RCP_S_E = RCP - E 692 Value *RCP_S_E = Builder.CreateSub(RCP, E); 693 694 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 695 Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E); 696 697 // Quotient = mulhu(Tmp0, Num) 698 Value *Quotient = getMulHu(Builder, Tmp0, Num); 699 700 // Num_S_Remainder = Quotient * Den 701 Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den); 702 703 // Remainder = Num - Num_S_Remainder 704 Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder); 705 706 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 707 Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den); 708 Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero); 709 710 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 711 Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder); 712 Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, 713 MinusOne, Zero); 714 715 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 716 Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero); 717 Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero); 718 719 Value *Res; 720 if (IsDiv) { 721 // Quotient_A_One = Quotient + 1 722 Value *Quotient_A_One = Builder.CreateAdd(Quotient, One); 723 724 // Quotient_S_One = Quotient - 1 725 Value *Quotient_S_One = Builder.CreateSub(Quotient, One); 726 727 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 728 Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One); 729 730 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 731 Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One); 732 } else { 733 // Remainder_S_Den = Remainder - Den 734 Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den); 735 736 // Remainder_A_Den = Remainder + Den 737 Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den); 738 739 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 740 Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den); 741 742 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) 743 Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den); 744 } 745 746 if (IsSigned) { 747 Res = Builder.CreateXor(Res, Sign); 748 Res = Builder.CreateSub(Res, Sign); 749 } 750 751 Res = Builder.CreateTrunc(Res, Ty); 752 753 return Res; 754 } 755 756 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { 757 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 758 DA->isUniform(&I) && promoteUniformOpToI32(I)) 759 return true; 760 761 bool Changed = false; 762 Instruction::BinaryOps Opc = I.getOpcode(); 763 Type *Ty = I.getType(); 764 Value *NewDiv = nullptr; 765 if ((Opc == Instruction::URem || Opc == Instruction::UDiv || 766 Opc == Instruction::SRem || Opc == Instruction::SDiv) && 767 Ty->getScalarSizeInBits() <= 32) { 768 Value *Num = I.getOperand(0); 769 Value *Den = I.getOperand(1); 770 IRBuilder<> Builder(&I); 771 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 772 773 if (VectorType *VT = dyn_cast<VectorType>(Ty)) { 774 NewDiv = UndefValue::get(VT); 775 776 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { 777 Value *NumEltN = Builder.CreateExtractElement(Num, N); 778 Value *DenEltN = Builder.CreateExtractElement(Den, N); 779 Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); 780 if (!NewElt) 781 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 782 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); 783 } 784 } else { 785 NewDiv = expandDivRem32(Builder, I, Num, Den); 786 } 787 788 if (NewDiv) { 789 I.replaceAllUsesWith(NewDiv); 790 I.eraseFromParent(); 791 Changed = true; 792 } 793 } 794 795 return Changed; 796 } 797 798 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { 799 if (!WidenLoads) 800 return false; 801 802 if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || 803 I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && 804 canWidenScalarExtLoad(I)) { 805 IRBuilder<> Builder(&I); 806 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 807 808 Type *I32Ty = Builder.getInt32Ty(); 809 Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); 810 Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); 811 LoadInst *WidenLoad = Builder.CreateLoad(BitCast); 812 WidenLoad->copyMetadata(I); 813 814 // If we have range metadata, we need to convert the type, and not make 815 // assumptions about the high bits. 816 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { 817 ConstantInt *Lower = 818 mdconst::extract<ConstantInt>(Range->getOperand(0)); 819 820 if (Lower->getValue().isNullValue()) { 821 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); 822 } else { 823 Metadata *LowAndHigh[] = { 824 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), 825 // Don't make assumptions about the high bits. 826 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) 827 }; 828 829 WidenLoad->setMetadata(LLVMContext::MD_range, 830 MDNode::get(Mod->getContext(), LowAndHigh)); 831 } 832 } 833 834 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); 835 Type *IntNTy = Builder.getIntNTy(TySize); 836 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); 837 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); 838 I.replaceAllUsesWith(ValOrig); 839 I.eraseFromParent(); 840 return true; 841 } 842 843 return false; 844 } 845 846 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { 847 bool Changed = false; 848 849 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && 850 DA->isUniform(&I)) 851 Changed |= promoteUniformOpToI32(I); 852 853 return Changed; 854 } 855 856 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) { 857 bool Changed = false; 858 859 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 860 DA->isUniform(&I)) 861 Changed |= promoteUniformOpToI32(I); 862 863 return Changed; 864 } 865 866 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { 867 switch (I.getIntrinsicID()) { 868 case Intrinsic::bitreverse: 869 return visitBitreverseIntrinsicInst(I); 870 default: 871 return false; 872 } 873 } 874 875 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) { 876 bool Changed = false; 877 878 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 879 DA->isUniform(&I)) 880 Changed |= promoteUniformBitreverseToI32(I); 881 882 return Changed; 883 } 884 885 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { 886 Mod = &M; 887 return false; 888 } 889 890 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { 891 if (skipFunction(F)) 892 return false; 893 894 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 895 if (!TPC) 896 return false; 897 898 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); 899 ST = &TM.getSubtarget<GCNSubtarget>(F); 900 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 901 DA = &getAnalysis<DivergenceAnalysis>(); 902 HasUnsafeFPMath = hasUnsafeFPMath(F); 903 AMDGPUASI = TM.getAMDGPUAS(); 904 905 bool MadeChange = false; 906 907 for (BasicBlock &BB : F) { 908 BasicBlock::iterator Next; 909 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { 910 Next = std::next(I); 911 MadeChange |= visit(*I); 912 } 913 } 914 915 return MadeChange; 916 } 917 918 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, 919 "AMDGPU IR optimizations", false, false) 920 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 921 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) 922 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", 923 false, false) 924 925 char AMDGPUCodeGenPrepare::ID = 0; 926 927 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { 928 return new AMDGPUCodeGenPrepare(); 929 } 930