1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains an optimization for div and rem on architectures that 10 // execute short instructions significantly faster than longer instructions. 11 // For example, on Intel Atom 32-bit divides are slow enough that during 12 // runtime it is profitable to check the value of the operands, and if they are 13 // positive and less than 256 use an unsigned 8-bit divide. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 18 #include "llvm/ADT/DenseMap.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallPtrSet.h" 21 #include "llvm/Analysis/ValueTracking.h" 22 #include "llvm/IR/BasicBlock.h" 23 #include "llvm/IR/Constants.h" 24 #include "llvm/IR/DerivedTypes.h" 25 #include "llvm/IR/Function.h" 26 #include "llvm/IR/IRBuilder.h" 27 #include "llvm/IR/Instruction.h" 28 #include "llvm/IR/Instructions.h" 29 #include "llvm/IR/Type.h" 30 #include "llvm/IR/Value.h" 31 #include "llvm/Support/Casting.h" 32 #include "llvm/Support/KnownBits.h" 33 #include "llvm/Transforms/Utils/Local.h" 34 #include <cassert> 35 #include <cstdint> 36 37 using namespace llvm; 38 39 #define DEBUG_TYPE "bypass-slow-division" 40 41 namespace { 42 43 struct QuotRemPair { 44 Value *Quotient; 45 Value *Remainder; 46 47 QuotRemPair(Value *InQuotient, Value *InRemainder) 48 : Quotient(InQuotient), Remainder(InRemainder) {} 49 }; 50 51 /// A quotient and remainder, plus a BB from which they logically "originate". 52 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 53 /// corresponding predecessor. 54 struct QuotRemWithBB { 55 BasicBlock *BB = nullptr; 56 Value *Quotient = nullptr; 57 Value *Remainder = nullptr; 58 }; 59 60 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; 61 using BypassWidthsTy = DenseMap<unsigned, unsigned>; 62 using VisitedSetTy = SmallPtrSet<Instruction *, 4>; 63 64 enum ValueRange { 65 /// Operand definitely fits into BypassType. No runtime checks are needed. 66 VALRNG_KNOWN_SHORT, 67 /// A runtime check is required, as value range is unknown. 68 VALRNG_UNKNOWN, 69 /// Operand is unlikely to fit into BypassType. The bypassing should be 70 /// disabled. 71 VALRNG_LIKELY_LONG 72 }; 73 74 class FastDivInsertionTask { 75 bool IsValidTask = false; 76 Instruction *SlowDivOrRem = nullptr; 77 IntegerType *BypassType = nullptr; 78 BasicBlock *MainBB = nullptr; 79 80 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 81 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 82 QuotRemWithBB createSlowBB(BasicBlock *Successor); 83 QuotRemWithBB createFastBB(BasicBlock *Successor); 84 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 85 BasicBlock *PhiBB); 86 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 87 std::optional<QuotRemPair> insertFastDivAndRem(); 88 89 bool isSignedOp() { 90 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 91 SlowDivOrRem->getOpcode() == Instruction::SRem; 92 } 93 94 bool isDivisionOp() { 95 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 96 SlowDivOrRem->getOpcode() == Instruction::UDiv; 97 } 98 99 Type *getSlowType() { return SlowDivOrRem->getType(); } 100 101 public: 102 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 103 104 Value *getReplacement(DivCacheTy &Cache); 105 }; 106 107 } // end anonymous namespace 108 109 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 110 const BypassWidthsTy &BypassWidths) { 111 switch (I->getOpcode()) { 112 case Instruction::UDiv: 113 case Instruction::SDiv: 114 case Instruction::URem: 115 case Instruction::SRem: 116 SlowDivOrRem = I; 117 break; 118 default: 119 // I is not a div/rem operation. 120 return; 121 } 122 123 // Skip division on vector types. Only optimize integer instructions. 124 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 125 if (!SlowType) 126 return; 127 128 // Skip if this bitwidth is not bypassed. 129 auto BI = BypassWidths.find(SlowType->getBitWidth()); 130 if (BI == BypassWidths.end()) 131 return; 132 133 // Get type for div/rem instruction with bypass bitwidth. 134 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 135 BypassType = BT; 136 137 // The original basic block. 138 MainBB = I->getParent(); 139 140 // The instruction is indeed a slow div or rem operation. 141 IsValidTask = true; 142 } 143 144 /// Reuses previously-computed dividend or remainder from the current BB if 145 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 146 /// perform the optimization and caches the resulting dividend and remainder. 147 /// If no replacement can be generated, nullptr is returned. 148 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 149 // First, make sure that the task is valid. 150 if (!IsValidTask) 151 return nullptr; 152 153 // Then, look for a value in Cache. 154 Value *Dividend = SlowDivOrRem->getOperand(0); 155 Value *Divisor = SlowDivOrRem->getOperand(1); 156 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 157 auto CacheI = Cache.find(Key); 158 159 if (CacheI == Cache.end()) { 160 // If previous instance does not exist, try to insert fast div. 161 std::optional<QuotRemPair> OptResult = insertFastDivAndRem(); 162 // Bail out if insertFastDivAndRem has failed. 163 if (!OptResult) 164 return nullptr; 165 CacheI = Cache.insert({Key, *OptResult}).first; 166 } 167 168 QuotRemPair &Value = CacheI->second; 169 return isDivisionOp() ? Value.Quotient : Value.Remainder; 170 } 171 172 /// Check if a value looks like a hash. 173 /// 174 /// The routine is expected to detect values computed using the most common hash 175 /// algorithms. Typically, hash computations end with one of the following 176 /// instructions: 177 /// 178 /// 1) MUL with a constant wider than BypassType 179 /// 2) XOR instruction 180 /// 181 /// And even if we are wrong and the value is not a hash, it is still quite 182 /// unlikely that such values will fit into BypassType. 183 /// 184 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 185 /// It is implemented as a depth-first search for values that look neither long 186 /// nor hash-like. 187 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 188 Instruction *I = dyn_cast<Instruction>(V); 189 if (!I) 190 return false; 191 192 switch (I->getOpcode()) { 193 case Instruction::Xor: 194 return true; 195 case Instruction::Mul: { 196 // After Constant Hoisting pass, long constants may be represented as 197 // bitcast instructions. As a result, some constants may look like an 198 // instruction at first, and an additional check is necessary to find out if 199 // an operand is actually a constant. 200 Value *Op1 = I->getOperand(1); 201 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 202 if (!C && isa<BitCastInst>(Op1)) 203 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 204 return C && C->getValue().getSignificantBits() > BypassType->getBitWidth(); 205 } 206 case Instruction::PHI: 207 // Stop IR traversal in case of a crazy input code. This limits recursion 208 // depth. 209 if (Visited.size() >= 16) 210 return false; 211 // Do not visit nodes that have been visited already. We return true because 212 // it means that we couldn't find any value that doesn't look hash-like. 213 if (!Visited.insert(I).second) 214 return true; 215 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 216 // Ignore undef values as they probably don't affect the division 217 // operands. 218 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 219 isa<UndefValue>(V); 220 }); 221 default: 222 return false; 223 } 224 } 225 226 /// Check if an integer value fits into our bypass type. 227 ValueRange FastDivInsertionTask::getValueRange(Value *V, 228 VisitedSetTy &Visited) { 229 unsigned ShortLen = BypassType->getBitWidth(); 230 unsigned LongLen = V->getType()->getIntegerBitWidth(); 231 232 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 233 unsigned HiBits = LongLen - ShortLen; 234 235 const DataLayout &DL = SlowDivOrRem->getDataLayout(); 236 KnownBits Known(LongLen); 237 238 computeKnownBits(V, Known, DL); 239 240 if (Known.countMinLeadingZeros() >= HiBits) 241 return VALRNG_KNOWN_SHORT; 242 243 if (Known.countMaxLeadingZeros() < HiBits) 244 return VALRNG_LIKELY_LONG; 245 246 // Long integer divisions are often used in hashtable implementations. It's 247 // not worth bypassing such divisions because hash values are extremely 248 // unlikely to have enough leading zeros. The call below tries to detect 249 // values that are unlikely to fit BypassType (including hashes). 250 if (isHashLikeValue(V, Visited)) 251 return VALRNG_LIKELY_LONG; 252 253 return VALRNG_UNKNOWN; 254 } 255 256 /// Add new basic block for slow div and rem operations and put it before 257 /// SuccessorBB. 258 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 259 QuotRemWithBB DivRemPair; 260 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 261 MainBB->getParent(), SuccessorBB); 262 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 263 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 264 265 Value *Dividend = SlowDivOrRem->getOperand(0); 266 Value *Divisor = SlowDivOrRem->getOperand(1); 267 268 if (isSignedOp()) { 269 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 270 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 271 } else { 272 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 273 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 274 } 275 276 Builder.CreateBr(SuccessorBB); 277 return DivRemPair; 278 } 279 280 /// Add new basic block for fast div and rem operations and put it before 281 /// SuccessorBB. 282 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 283 QuotRemWithBB DivRemPair; 284 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 285 MainBB->getParent(), SuccessorBB); 286 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 287 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 288 289 Value *Dividend = SlowDivOrRem->getOperand(0); 290 Value *Divisor = SlowDivOrRem->getOperand(1); 291 Value *ShortDivisorV = 292 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 293 Value *ShortDividendV = 294 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 295 296 // udiv/urem because this optimization only handles positive numbers. 297 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 298 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 299 DivRemPair.Quotient = 300 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 301 DivRemPair.Remainder = 302 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 303 Builder.CreateBr(SuccessorBB); 304 305 return DivRemPair; 306 } 307 308 /// Creates Phi nodes for result of Div and Rem. 309 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 310 QuotRemWithBB &RHS, 311 BasicBlock *PhiBB) { 312 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 313 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 314 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 315 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 316 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 317 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 318 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 319 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 320 return QuotRemPair(QuoPhi, RemPhi); 321 } 322 323 /// Creates a runtime check to test whether both the divisor and dividend fit 324 /// into BypassType. The check is inserted at the end of MainBB. True return 325 /// value means that the operands fit. Either of the operands may be NULL if it 326 /// doesn't need a runtime check. 327 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 328 assert((Op1 || Op2) && "Nothing to check"); 329 IRBuilder<> Builder(MainBB, MainBB->end()); 330 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 331 332 Value *OrV; 333 if (Op1 && Op2) 334 OrV = Builder.CreateOr(Op1, Op2); 335 else 336 OrV = Op1 ? Op1 : Op2; 337 338 // BitMask is inverted to check if the operands are 339 // larger than the bypass type 340 uint64_t BitMask = ~BypassType->getBitMask(); 341 Value *AndV = Builder.CreateAnd(OrV, BitMask); 342 343 // Compare operand values 344 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 345 return Builder.CreateICmpEQ(AndV, ZeroV); 346 } 347 348 /// Substitutes the div/rem instruction with code that checks the value of the 349 /// operands and uses a shorter-faster div/rem instruction when possible. 350 std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 351 Value *Dividend = SlowDivOrRem->getOperand(0); 352 Value *Divisor = SlowDivOrRem->getOperand(1); 353 354 VisitedSetTy SetL; 355 ValueRange DividendRange = getValueRange(Dividend, SetL); 356 if (DividendRange == VALRNG_LIKELY_LONG) 357 return std::nullopt; 358 359 VisitedSetTy SetR; 360 ValueRange DivisorRange = getValueRange(Divisor, SetR); 361 if (DivisorRange == VALRNG_LIKELY_LONG) 362 return std::nullopt; 363 364 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 365 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 366 367 if (DividendShort && DivisorShort) { 368 // If both operands are known to be short then just replace the long 369 // division with a short one in-place. Since we're not introducing control 370 // flow in this case, narrowing the division is always a win, even if the 371 // divisor is a constant (and will later get replaced by a multiplication). 372 373 IRBuilder<> Builder(SlowDivOrRem); 374 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 375 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 376 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 377 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 378 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 379 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 380 return QuotRemPair(ExtDiv, ExtRem); 381 } 382 383 if (isa<ConstantInt>(Divisor)) { 384 // If the divisor is not a constant, DAGCombiner will convert it to a 385 // multiplication by a magic constant. It isn't clear if it is worth 386 // introducing control flow to get a narrower multiply. 387 return std::nullopt; 388 } 389 390 // After Constant Hoisting pass, long constants may be represented as 391 // bitcast instructions. As a result, some constants may look like an 392 // instruction at first, and an additional check is necessary to find out if 393 // an operand is actually a constant. 394 if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) 395 if (BCI->getParent() == SlowDivOrRem->getParent() && 396 isa<ConstantInt>(BCI->getOperand(0))) 397 return std::nullopt; 398 399 IRBuilder<> Builder(MainBB, MainBB->end()); 400 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 401 402 if (DividendShort && !isSignedOp()) { 403 // If the division is unsigned and Dividend is known to be short, then 404 // either 405 // 1) Divisor is less or equal to Dividend, and the result can be computed 406 // with a short division. 407 // 2) Divisor is greater than Dividend. In this case, no division is needed 408 // at all: The quotient is 0 and the remainder is equal to Dividend. 409 // 410 // So instead of checking at runtime whether Divisor fits into BypassType, 411 // we emit a runtime check to differentiate between these two cases. This 412 // lets us entirely avoid a long div. 413 414 // Split the basic block before the div/rem. 415 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 416 // Remove the unconditional branch from MainBB to SuccessorBB. 417 MainBB->back().eraseFromParent(); 418 QuotRemWithBB Long; 419 Long.BB = MainBB; 420 Long.Quotient = ConstantInt::get(getSlowType(), 0); 421 Long.Remainder = Dividend; 422 QuotRemWithBB Fast = createFastBB(SuccessorBB); 423 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 424 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 425 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 426 return Result; 427 } else { 428 // General case. Create both slow and fast div/rem pairs and choose one of 429 // them at runtime. 430 431 // Split the basic block before the div/rem. 432 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 433 // Remove the unconditional branch from MainBB to SuccessorBB. 434 MainBB->back().eraseFromParent(); 435 QuotRemWithBB Fast = createFastBB(SuccessorBB); 436 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 437 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 438 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 439 DivisorShort ? nullptr : Divisor); 440 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 441 return Result; 442 } 443 } 444 445 /// This optimization identifies DIV/REM instructions in a BB that can be 446 /// profitably bypassed and carried out with a shorter, faster divide. 447 bool llvm::bypassSlowDivision(BasicBlock *BB, 448 const BypassWidthsTy &BypassWidths) { 449 DivCacheTy PerBBDivCache; 450 451 bool MadeChange = false; 452 Instruction *Next = &*BB->begin(); 453 while (Next != nullptr) { 454 // We may add instructions immediately after I, but we want to skip over 455 // them. 456 Instruction *I = Next; 457 Next = Next->getNextNode(); 458 459 // Ignore dead code to save time and avoid bugs. 460 if (I->hasNUses(0)) 461 continue; 462 463 FastDivInsertionTask Task(I, BypassWidths); 464 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 465 I->replaceAllUsesWith(Replacement); 466 I->eraseFromParent(); 467 MadeChange = true; 468 } 469 } 470 471 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 472 // create divrem machine instructions. Now erase any unused divs / rems so we 473 // don't leave extra instructions sitting around. 474 for (auto &KV : PerBBDivCache) 475 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 476 RecursivelyDeleteTriviallyDeadInstructions(V); 477 478 return MadeChange; 479 } 480