//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32Denormals = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform the same function as the equivalently named function in
  /// DAGCombiner. Since we expand some divisions here, we need to perform this
  /// before the expansion obscures the select-of-constants pattern.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, so that a scalar load can be used
  /// instead of a vector load.
  ///
  /// \returns True if \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;
  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  return ScalarSize >= 24 && // Types less than 24-bit should be treated
                             // as unsigned 24-bit values.
    numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}

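// Helpers for the mul24 lowering below: extractValues splits a (possibly
// vector) value into its scalar elements, and insertValues rebuilds a vector
// from the per-element results. Scalar values pass through unchanged.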
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  VectorType *VT = dyn_cast<VectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}

// Find a select instruction, which may have been cast. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}

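// For example, a one-use select of constants dividing a constant numerator:
//   %s = select i1 %c, i32 8, i32 16
//   %d = udiv i32 1024, %s
// folds to a select of the two per-arm results, and the udiv disappears:
//   %d = select i1 %c, i32 128, i32 64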
bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}

// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//        allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
                              bool RcpIsAccurate, IRBuilder<> Builder,
                              Module *Mod) {

  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

  Type *Ty = Den->getType();
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (AllowInaccurateRcp || RcpIsAccurate) {
      if (CLHS->isExactlyValue(1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation have a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateCall(Decl, { Den });
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // -1.0 / x -> rcp (fneg x)
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  if (AllowInaccurateRcp) {
    Function *Decl = Intrinsic::getDeclaration(
      Mod, Intrinsic::amdgcn_rcp, Ty);

    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip);
  }
  return nullptr;
}

// Optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
                                   bool HasDenormals, IRBuilder<> Builder,
                                   Module *Mod) {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  Type *Ty = Den->getType();
  if (!Ty->isFloatTy())
    return nullptr;

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv.fast does not support denormals, but it is always fine to use for
  // 1.0/x.
  if (HasDenormals && !NumIsOne)
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
  return Builder.CreateCall(Decl, { Num, Den });
}

// Optimization is performed based on fpmath, fast-math flags, and the denormal
// mode, lowering fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//          allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();

  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;
  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      // Try rcp first.
      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
                                      RcpIsAccurate, Builder, Mod);
      if (!NewElt) // Try fdiv.fast.
        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
                                      HasFP32Denormals, Builder, Mod);
      if (!NewElt) // Keep the original.
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else { // Scalar FDiv.
    // Try rcp first.
    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
                              Builder, Mod);
    if (!NewFDiv) { // Try fdiv.fast.
      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
                                     Builder, Mod);
    }
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
                                        Value *Num, Value *Den,
                                        unsigned AtLeast, bool IsSigned) const {
  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < AtLeast)
    return -1;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < AtLeast)
    return -1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  if (IsSigned)
    ++DivBits;
  return DivBits;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
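// The expansion computes the quotient in f32: fq = trunc(fa * rcp(fb)). Since
// rcp can be off by a ulp, the truncated quotient may come up one short, so
// the remainder fr = fmad(-fq, fb, fa) is compared against fb, and the integer
// quotient is bumped by jq (1, or +/-1 matching the sign of num^den in the
// signed case) when |fr| >= |fb|. The rem form recomputes num - quotient * den.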
Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}

Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
                                                BinaryOperator &I,
                                                Value *Num, Value *Den,
                                                unsigned DivBits,
                                                bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}

// Try to recognize special cases where the DAG will emit better expansions
// than the general expansion we do here.
// TODO: It would be better to just directly handle those optimizations here.
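// For example, a udiv/sdiv by an arbitrary constant of 32 bits or less is
// typically lowered during selection to a multiply by a precomputed "magic"
// constant plus shifts (which requires a wider mulhi), and a divide by a
// (shl C, y) denominator with C a power of two becomes a simple shift; both
// beat the expansion built here, so such divisions are left untouched.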
bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
  BinaryOperator &I, Value *Num, Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
                               0, AC, &I, DT)) {
      return true;
    }
  }

  return false;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
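  // The code below builds a 32-bit fixed-point estimate of 2^32/Den from
  // v_rcp_f32, refines it by measuring its error with a 64-bit multiply
  // against Den, and then takes mulhu(refined estimate, Num) as a trial
  // quotient. The trial quotient (and the remainder derived from it) can be
  // off by one in either direction, so both are corrected afterwards using
  // the Remainder >= Den and Num >= Quotient * Den comparisons.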
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);

  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP_F32 = Builder.CreateCall(RcpDecl, { DEN_F32 });
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  Value *Remainder_GE_Den = Builder.CreateICmpUGE(Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  Value *Remainder_GE_Zero = Builder.CreateICmpUGE(Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 ? Quotient_A_One : Quotient)
    Value *Div = Builder.CreateSelect(Tmp1, Quotient_A_One, Quotient);

    // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
    Res = Builder.CreateSelect(Remainder_GE_Zero, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    Value *Rem = Builder.CreateSelect(Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    Res = Builder.CreateSelect(Remainder_GE_Zero, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.
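
  // See how many bits the operands really need, requiring at least 32 known
  // sign/zero bits in each 64-bit value. If the division fits in 24 or 32
  // bits, emit the narrower expansion and extend the result back to the
  // original 64-bit type.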
  Instruction::BinaryOps Opc = I.getOpcode();

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
                      Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}

void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      Changed = true;
    }
  }

  return Changed;
}

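// Widen uniform, sub-dword loads from the constant address space to full
// 32-bit loads so that a scalar load can be selected instead of a vector
// load, then truncate and bitcast back to the original type. Existing !range
// metadata is dropped or rewritten, since nothing is known about the extra
// high bits of the widened value.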
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;
  HasUnsafeFPMath = hasUnsafeFPMath(F);
  HasFP32Denormals = ST->hasFP32Denormals(F);

  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}