//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to the equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
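
  // For example, a uniform 'add i16 %a, %b' is rewritten as a zext of each
  // operand to i32, a 32 bit 'add' (with nsw/nuw added according to
  // promotedOpIsNSW/promotedOpIsNUW below), and a trunc of the result back to
  // i16.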

  /// Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Check if a scalar load can be widened.
  ///
  /// \details Uniform, sub-dword loads from constant memory can be widened to
  /// a full 32 bits and then truncated, allowing a scalar load instead of a
  /// vector load.
  ///
  /// \returns True if load \p I can be widened.

  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
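
// The promoted operands are zero or sign extended from at most 16 bits, so
// their ranges are small relative to i32: shl/add/sub of such values cannot
// wrap in the signed sense, and a mul of two zero-extended 16-bit values
// cannot wrap in the unsigned sense. The helpers below encode which wrap
// flags can therefore be added to the promoted operation.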
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");
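
  // For example, an i16 bitreverse is rewritten as a zext of the operand to
  // i32, a call to the i32 bitreverse, a logical shift right by 16 (32 minus
  // the original width), and a trunc back to i16.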
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                      FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
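    // Rewrite element-wise: keep a plain fdiv where shouldKeepFDivF32 requires
    // the default lowering, otherwise call llvm.amdgcn.fdiv.fast, whose
    // reduced accuracy the !fpmath metadata checked above (>= 2.5 ulp) allows.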
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);
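
  // Estimate the quotient in f32 as fq = trunc(fa * (1.0f / fb)). Both
  // operands were shown above to have at least 9 sign bits, i.e. to fit in 24
  // bits, so they are exactly representable in the f32 mantissa. Because fq is
  // truncated toward zero, the jq term (+/-1, with the sign of the exact
  // quotient) is added below when the remainder estimate |fr| >= |fb| shows
  // the truncated quotient fell one short in magnitude.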
  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }
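
  // Unsigned 32 bit long division using a single-precision reciprocal:
  // approximate RCP ~= 2^32 / Den in f32, bound the approximation error E with
  // 64 bit multiplies, take Quotient = mulhu(RCP +/- E, Num), and finally
  // adjust the quotient (or remainder) by at most one step using comparisons
  // against the recomputed remainder.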
  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }
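
  // For the signed case the operands were made non-negative above, so restore
  // the proper sign here: xor/sub with Sign (0 or -1) is a conditional two's
  // complement negation of the result.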
  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }
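
    // Truncate the widened dword back to the loaded width, then bitcast to the
    // original (possibly non-integer or vector) result type.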
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}