//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to a 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to an equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

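  // For illustration only (value names below are made up), a uniform 16 bit
  // add such as
  //   %r = add i16 %a, %b
  // would roughly become
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16
  // with sign extension used instead for opcodes isSigned() reports as signed
  // (e.g. ashr).
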
  /// \brief Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to a 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
  if (!BinOp) // Possibly constant expression.
    return V;

  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

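// For reference (a summary of the checks above, not additional behavior),
// needsPromotionToI32 returns true for scalar or vector integer types whose
// element width lies in (1, 16], e.g. i8, i16 or <4 x i16>, and false for i1,
// i32, i64, floating point types, and vectors of those.
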
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

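// For illustration only (value names below are made up), a uniform i16
// bitreverse
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// would roughly become
//   %x32 = zext i16 %x to i32
//   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
//   %s   = lshr i32 %r32, 16      ; shift amount is 32 minus the bit width
//   %r   = trunc i32 %s to i16
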
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

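// For illustration only (value names below are made up), when at least 2.5 ulp
// of error is allowed via !fpmath and f32 denormals are disabled, a scalar
//   %d = fdiv float %a, %b, !fpmath !0    ; !0 = !{float 2.500000e+00}
// would roughly become
//   %d = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
// unless shouldKeepFDivF32 decides the plain fdiv should be kept (e.g. a +1.0
// numerator, whose reciprocal is handled separately).
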
// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}