//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
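  ///
  /// For illustration, a sketch of the expected rewrite for a uniform i16 add
  /// (adds are treated as unsigned here, and cannot wrap once widened):
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = add nuw nsw i32 %a.ext, %b.ext
  ///   %r     = trunc i32 %r.ext to i16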
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Check whether a scalar load can be widened.
  ///
  /// \details A uniform, small-type load from constant memory can be widened
  /// to a full 32 bits, with the result truncated back to the original type,
  /// which allows a scalar load to be selected instead of a vector load.
  ///
  /// \returns True if load \p I is a candidate for this widening, false
  /// otherwise.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

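// For illustration, a sketch of the widening this check enables (the rewrite
// itself happens in visitLoadInst): a uniform 'load i8' from constant memory
// with an alignment of at least 4 is turned into a 'load i32' through a
// bitcast pointer, and the loaded value is truncated back to i8, so that
// instruction selection can use a scalar load for the access.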
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

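// For illustration, a sketch of the rewrite below for a uniform i16
// bitreverse:
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// becomes
//   %ext = zext i16 %x to i32
//   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shr = lshr i32 %rev, 16        ; 32 minus the original bit width
//   %r   = trunc i32 %shr to i16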
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
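  // Similarly, when FP32 denormals are enabled, keep the generic fdiv; the
  // faster 2.5 ULP expansion does not account for denormal values.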
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually rewritten.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

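// The visitors below only promote uniform, sub-32-bit operations, and only on
// subtargets with native 16-bit instructions. Divergent values can use the
// 16-bit VALU instructions directly, while uniform values are selected to the
// scalar unit, which operates only on 32-bit (and wider) types, so widening
// them up front lets them stay in scalar registers.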
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      // Visitors may erase the current instruction, so remember the next
      // iterator before visiting.
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}