xref: /llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp (revision 013f4a46d1978e370f940df3cbd04fb0399a04fe)
1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains an optimization for div and rem on architectures that
10 // execute short instructions significantly faster than longer instructions.
11 // For example, on Intel Atom 32-bit divides are slow enough that during
12 // runtime it is profitable to check the value of the operands, and if they are
13 // positive and less than 256 use an unsigned 8-bit divide.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/BasicBlock.h"
23 #include "llvm/IR/Constants.h"
24 #include "llvm/IR/DerivedTypes.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/IRBuilder.h"
27 #include "llvm/IR/Instruction.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/IR/Value.h"
31 #include "llvm/Support/Casting.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Transforms/Utils/Local.h"
34 #include <cassert>
35 #include <cstdint>
36 
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "bypass-slow-division"
40 
41 namespace {
42 
43   struct QuotRemPair {
44     Value *Quotient;
45     Value *Remainder;
46 
47     QuotRemPair(Value *InQuotient, Value *InRemainder)
48         : Quotient(InQuotient), Remainder(InRemainder) {}
49   };
50 
51   /// A quotient and remainder, plus a BB from which they logically "originate".
52   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
53   /// corresponding predecessor.
54   struct QuotRemWithBB {
55     BasicBlock *BB = nullptr;
56     Value *Quotient = nullptr;
57     Value *Remainder = nullptr;
58   };
59 
60 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
61 using BypassWidthsTy = DenseMap<unsigned, unsigned>;
62 using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
63 
64 enum ValueRange {
65   /// Operand definitely fits into BypassType. No runtime checks are needed.
66   VALRNG_KNOWN_SHORT,
67   /// A runtime check is required, as value range is unknown.
68   VALRNG_UNKNOWN,
69   /// Operand is unlikely to fit into BypassType. The bypassing should be
70   /// disabled.
71   VALRNG_LIKELY_LONG
72 };
73 
74 class FastDivInsertionTask {
75   bool IsValidTask = false;
76   Instruction *SlowDivOrRem = nullptr;
77   IntegerType *BypassType = nullptr;
78   BasicBlock *MainBB = nullptr;
79 
80   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
81   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
82   QuotRemWithBB createSlowBB(BasicBlock *Successor);
83   QuotRemWithBB createFastBB(BasicBlock *Successor);
84   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
85                                    BasicBlock *PhiBB);
86   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
87   std::optional<QuotRemPair> insertFastDivAndRem();
88 
89   bool isSignedOp() {
90     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
91            SlowDivOrRem->getOpcode() == Instruction::SRem;
92   }
93 
94   bool isDivisionOp() {
95     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
96            SlowDivOrRem->getOpcode() == Instruction::UDiv;
97   }
98 
99   Type *getSlowType() { return SlowDivOrRem->getType(); }
100 
101 public:
102   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
103 
104   Value *getReplacement(DivCacheTy &Cache);
105 };
106 
107 } // end anonymous namespace
108 
109 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
110                                            const BypassWidthsTy &BypassWidths) {
111   switch (I->getOpcode()) {
112   case Instruction::UDiv:
113   case Instruction::SDiv:
114   case Instruction::URem:
115   case Instruction::SRem:
116     SlowDivOrRem = I;
117     break;
118   default:
119     // I is not a div/rem operation.
120     return;
121   }
122 
123   // Skip division on vector types. Only optimize integer instructions.
124   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
125   if (!SlowType)
126     return;
127 
128   // Skip if this bitwidth is not bypassed.
129   auto BI = BypassWidths.find(SlowType->getBitWidth());
130   if (BI == BypassWidths.end())
131     return;
132 
133   // Get type for div/rem instruction with bypass bitwidth.
134   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
135   BypassType = BT;
136 
137   // The original basic block.
138   MainBB = I->getParent();
139 
140   // The instruction is indeed a slow div or rem operation.
141   IsValidTask = true;
142 }
143 
144 /// Reuses previously-computed dividend or remainder from the current BB if
145 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
146 /// perform the optimization and caches the resulting dividend and remainder.
147 /// If no replacement can be generated, nullptr is returned.
148 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
149   // First, make sure that the task is valid.
150   if (!IsValidTask)
151     return nullptr;
152 
153   // Then, look for a value in Cache.
154   Value *Dividend = SlowDivOrRem->getOperand(0);
155   Value *Divisor = SlowDivOrRem->getOperand(1);
156   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
157   auto CacheI = Cache.find(Key);
158 
159   if (CacheI == Cache.end()) {
160     // If previous instance does not exist, try to insert fast div.
161     std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
162     // Bail out if insertFastDivAndRem has failed.
163     if (!OptResult)
164       return nullptr;
165     CacheI = Cache.insert({Key, *OptResult}).first;
166   }
167 
168   QuotRemPair &Value = CacheI->second;
169   return isDivisionOp() ? Value.Quotient : Value.Remainder;
170 }
171 
172 /// Check if a value looks like a hash.
173 ///
174 /// The routine is expected to detect values computed using the most common hash
175 /// algorithms. Typically, hash computations end with one of the following
176 /// instructions:
177 ///
178 /// 1) MUL with a constant wider than BypassType
179 /// 2) XOR instruction
180 ///
181 /// And even if we are wrong and the value is not a hash, it is still quite
182 /// unlikely that such values will fit into BypassType.
183 ///
184 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
185 /// It is implemented as a depth-first search for values that look neither long
186 /// nor hash-like.
187 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
188   Instruction *I = dyn_cast<Instruction>(V);
189   if (!I)
190     return false;
191 
192   switch (I->getOpcode()) {
193   case Instruction::Xor:
194     return true;
195   case Instruction::Mul: {
196     // After Constant Hoisting pass, long constants may be represented as
197     // bitcast instructions. As a result, some constants may look like an
198     // instruction at first, and an additional check is necessary to find out if
199     // an operand is actually a constant.
200     Value *Op1 = I->getOperand(1);
201     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
202     if (!C && isa<BitCastInst>(Op1))
203       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
204     return C && C->getValue().getSignificantBits() > BypassType->getBitWidth();
205   }
206   case Instruction::PHI:
207     // Stop IR traversal in case of a crazy input code. This limits recursion
208     // depth.
209     if (Visited.size() >= 16)
210       return false;
211     // Do not visit nodes that have been visited already. We return true because
212     // it means that we couldn't find any value that doesn't look hash-like.
213     if (!Visited.insert(I).second)
214       return true;
215     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
216       // Ignore undef values as they probably don't affect the division
217       // operands.
218       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
219              isa<UndefValue>(V);
220     });
221   default:
222     return false;
223   }
224 }
225 
226 /// Check if an integer value fits into our bypass type.
227 ValueRange FastDivInsertionTask::getValueRange(Value *V,
228                                                VisitedSetTy &Visited) {
229   unsigned ShortLen = BypassType->getBitWidth();
230   unsigned LongLen = V->getType()->getIntegerBitWidth();
231 
232   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
233   unsigned HiBits = LongLen - ShortLen;
234 
235   const DataLayout &DL = SlowDivOrRem->getDataLayout();
236   KnownBits Known(LongLen);
237 
238   computeKnownBits(V, Known, DL);
239 
240   if (Known.countMinLeadingZeros() >= HiBits)
241     return VALRNG_KNOWN_SHORT;
242 
243   if (Known.countMaxLeadingZeros() < HiBits)
244     return VALRNG_LIKELY_LONG;
245 
246   // Long integer divisions are often used in hashtable implementations. It's
247   // not worth bypassing such divisions because hash values are extremely
248   // unlikely to have enough leading zeros. The call below tries to detect
249   // values that are unlikely to fit BypassType (including hashes).
250   if (isHashLikeValue(V, Visited))
251     return VALRNG_LIKELY_LONG;
252 
253   return VALRNG_UNKNOWN;
254 }
255 
256 /// Add new basic block for slow div and rem operations and put it before
257 /// SuccessorBB.
258 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
259   QuotRemWithBB DivRemPair;
260   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
261                                      MainBB->getParent(), SuccessorBB);
262   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
263   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
264 
265   Value *Dividend = SlowDivOrRem->getOperand(0);
266   Value *Divisor = SlowDivOrRem->getOperand(1);
267 
268   if (isSignedOp()) {
269     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
270     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
271   } else {
272     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
273     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
274   }
275 
276   Builder.CreateBr(SuccessorBB);
277   return DivRemPair;
278 }
279 
280 /// Add new basic block for fast div and rem operations and put it before
281 /// SuccessorBB.
282 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
283   QuotRemWithBB DivRemPair;
284   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
285                                      MainBB->getParent(), SuccessorBB);
286   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
287   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
288 
289   Value *Dividend = SlowDivOrRem->getOperand(0);
290   Value *Divisor = SlowDivOrRem->getOperand(1);
291   Value *ShortDivisorV =
292       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
293   Value *ShortDividendV =
294       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
295 
296   // udiv/urem because this optimization only handles positive numbers.
297   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
298   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
299   DivRemPair.Quotient =
300       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
301   DivRemPair.Remainder =
302       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
303   Builder.CreateBr(SuccessorBB);
304 
305   return DivRemPair;
306 }
307 
308 /// Creates Phi nodes for result of Div and Rem.
309 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
310                                                        QuotRemWithBB &RHS,
311                                                        BasicBlock *PhiBB) {
312   IRBuilder<> Builder(PhiBB, PhiBB->begin());
313   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
314   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
315   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
316   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
317   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
318   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
319   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
320   return QuotRemPair(QuoPhi, RemPhi);
321 }
322 
323 /// Creates a runtime check to test whether both the divisor and dividend fit
324 /// into BypassType. The check is inserted at the end of MainBB. True return
325 /// value means that the operands fit. Either of the operands may be NULL if it
326 /// doesn't need a runtime check.
327 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
328   assert((Op1 || Op2) && "Nothing to check");
329   IRBuilder<> Builder(MainBB, MainBB->end());
330   Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
331 
332   Value *OrV;
333   if (Op1 && Op2)
334     OrV = Builder.CreateOr(Op1, Op2);
335   else
336     OrV = Op1 ? Op1 : Op2;
337 
338   // BitMask is inverted to check if the operands are
339   // larger than the bypass type
340   uint64_t BitMask = ~BypassType->getBitMask();
341   Value *AndV = Builder.CreateAnd(OrV, BitMask);
342 
343   // Compare operand values
344   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
345   return Builder.CreateICmpEQ(AndV, ZeroV);
346 }
347 
/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter-faster div/rem instruction when possible.
/// Returns the quotient/remainder pair to use in place of the original result,
/// or std::nullopt when bypassing is impossible or unprofitable here.
std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  // Give up early if either operand is unlikely to fit into BypassType;
  // a runtime check would then be a pure pessimization.
  VisitedSetTy SetL;
  ValueRange DividendRange = getValueRange(Dividend, SetL);
  if (DividendRange == VALRNG_LIKELY_LONG)
    return std::nullopt;

  VisitedSetTy SetR;
  ValueRange DivisorRange = getValueRange(Divisor, SetR);
  if (DivisorRange == VALRNG_LIKELY_LONG)
    return std::nullopt;

  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);

  if (DividendShort && DivisorShort) {
    // If both operands are known to be short then just replace the long
    // division with a short one in-place.  Since we're not introducing control
    // flow in this case, narrowing the division is always a win, even if the
    // divisor is a constant (and will later get replaced by a multiplication).

    IRBuilder<> Builder(SlowDivOrRem);
    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    return QuotRemPair(ExtDiv, ExtRem);
  }

  if (isa<ConstantInt>(Divisor)) {
    // If the divisor is a constant, DAGCombiner will convert the division into
    // a multiplication by a magic constant.  It isn't clear if it is worth
    // introducing control flow to get a narrower multiply.
    return std::nullopt;
  }

  // After Constant Hoisting pass, long constants may be represented as
  // bitcast instructions. As a result, some constants may look like an
  // instruction at first, and an additional check is necessary to find out if
  // an operand is actually a constant.
  if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    if (BCI->getParent() == SlowDivOrRem->getParent() &&
        isa<ConstantInt>(BCI->getOperand(0)))
      return std::nullopt;

  // Builder positioned at the end of MainBB; the conditional branch emitted
  // below goes here, after the runtime-check instructions.
  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  if (DividendShort && !isSignedOp()) {
    // If the division is unsigned and Dividend is known to be short, then
    // either
    // 1) Divisor is less or equal to Dividend, and the result can be computed
    //    with a short division.
    // 2) Divisor is greater than Dividend. In this case, no division is needed
    //    at all: The quotient is 0 and the remainder is equal to Dividend.
    //
    // So instead of checking at runtime whether Divisor fits into BypassType,
    // we emit a runtime check to differentiate between these two cases. This
    // lets us entirely avoid a long div.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->back().eraseFromParent();
    // The "long" path is MainBB itself: quotient 0, remainder = Dividend,
    // with no division emitted at all (case 2 above).
    QuotRemWithBB Long;
    Long.BB = MainBB;
    Long.Quotient = ConstantInt::get(getSlowType(), 0);
    Long.Remainder = Dividend;
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    // Take the fast path only when Dividend >= Divisor (case 1 above);
    // otherwise fall through directly to SuccessorBB.
    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    return Result;
  } else {
    // General case. Create both slow and fast div/rem pairs and choose one of
    // them at runtime.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->back().eraseFromParent();
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    // Only operands not already known to be short need the runtime test.
    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
                                            DivisorShort ? nullptr : Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    return Result;
  }
}
444 
445 /// This optimization identifies DIV/REM instructions in a BB that can be
446 /// profitably bypassed and carried out with a shorter, faster divide.
447 bool llvm::bypassSlowDivision(BasicBlock *BB,
448                               const BypassWidthsTy &BypassWidths) {
449   DivCacheTy PerBBDivCache;
450 
451   bool MadeChange = false;
452   Instruction *Next = &*BB->begin();
453   while (Next != nullptr) {
454     // We may add instructions immediately after I, but we want to skip over
455     // them.
456     Instruction *I = Next;
457     Next = Next->getNextNode();
458 
459     // Ignore dead code to save time and avoid bugs.
460     if (I->hasNUses(0))
461       continue;
462 
463     FastDivInsertionTask Task(I, BypassWidths);
464     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
465       I->replaceAllUsesWith(Replacement);
466       I->eraseFromParent();
467       MadeChange = true;
468     }
469   }
470 
471   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
472   // create divrem machine instructions.  Now erase any unused divs / rems so we
473   // don't leave extra instructions sitting around.
474   for (auto &KV : PerBBDivCache)
475     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
476       RecursivelyDeleteTriviallyDeadInstructions(V);
477 
478   return MadeChange;
479 }
480