//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to the equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with the equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to the equivalent 32 bit binary
  /// operation, false otherwise.
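  ///
  /// For illustration, a uniform 16 bit addition is rewritten roughly as
  /// follows (a sketch; the pass emits the equivalent through IRBuilder, and
  /// the value names are made up):
  ///
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a.32 = zext i16 %a to i32
  ///   %b.32 = zext i16 %b to i32
  ///   %r.32 = add nuw nsw i32 %a.32, %b.32
  ///   %r    = trunc i32 %r.32 to i16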
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
    cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
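
// For illustration, the 'select' promotion implemented below rewrites a
// uniform i16 select whose condition comes from an unsigned or equality
// 'icmp' roughly as follows (a sketch; the value names are made up):
//
//   %r = select i1 %c, i16 %x, i16 %y
// becomes
//   %x.32 = zext i16 %x to i32
//   %y.32 = zext i16 %y to i32
//   %r.32 = select i1 %c, i32 %x.32, i32 %y.32
//   %r    = trunc i32 %r.32 to i16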
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
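
// For illustration, promoteUniformBitreverseToI32 above rewrites a uniform
// i16 bitreverse roughly as follows (a sketch; the value names are made up):
//
//   %r = call i16 @llvm.bitreverse.i16(i16 %a)
// becomes
//   %a.32 = zext i16 %a to i32
//   %r.32 = call i32 @llvm.bitreverse.i32(i32 %a.32)
//   %s.32 = lshr i32 %r.32, 16     ; 32 minus the 16 bit element width
//   %r    = trunc i32 %s.32 to i16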

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually rewritten.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}