//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to the equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with the equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to the equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
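  //
  // For illustration, this promotion rewrites a uniform 16-bit addition such
  // as
  //   %r = add i16 %a, %b
  // roughly as
  //   %a.ext = zext i16 %a to i32
  //   %b.ext = zext i16 %b to i32
  //   %r.ext = add nuw nsw i32 %a.ext, %b.ext
  //   %r     = trunc i32 %r.ext to i16
  // Signed operations (e.g. 'ashr') sign extend their operands instead, and
  // the wrap flags on the promoted operation depend on the opcode (see
  // promotedOpIsNSW/promotedOpIsNUW below).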

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, and replacing \p I with the 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with the 32 bit 'select' operation,
  /// and truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
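
// For illustration, scalar types such as i8 and i16 need promotion, as do
// short vectors like <4 x i16> when the subtarget lacks packed (VOP3P)
// instructions; i1, i32, and wider types do not.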
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
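
// For illustration, the select promotion below rewrites a uniform 16-bit
// select such as
//   %r = select i1 %c, i16 %a, i16 %b
// roughly as
//   %a.ext = zext i16 %a to i32
//   %b.ext = zext i16 %b to i32
//   %r.ext = select i1 %c, i32 %a.ext, i32 %b.ext
//   %r     = trunc i32 %r.ext to i16
// (sext is used instead when the condition comes from a signed icmp).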
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
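
// For illustration, the bitreverse promotion above turns a uniform 16-bit
// bitreverse such as
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// roughly into
//   %x.ext = zext i16 %x to i32
//   %r.ext = call i32 @llvm.bitreverse.i32(i32 %x.ext)
//   %shr   = lshr i32 %r.ext, 16   ; 32 minus the original bit width of 16
//   %r     = trunc i32 %shr to i16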

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}