//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom, 32-bit divides are slow enough that during
// runtime it is profitable to check the value of the operands, and if they are
// positive and less than 256 use an unsigned 8-bit divide.
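//
// For that case, the rewritten code has roughly the following shape (value and
// block names are illustrative only):
//
//     %or  = or i32 %lhs, %rhs
//     %and = and i32 %or, -256            ; any bits above the low 8 set?
//     %cmp = icmp eq i32 %and, 0
//     br i1 %cmp, label %fast, label %slow
//   fast:                                 ; narrow udiv/urem on i8, zext back
//     ...
//     br label %join
//   slow:                                 ; the original i32 udiv/urem
//     ...
//     br label %join
//   join:
//     %quotient = phi i32 [ %q.fast, %fast ], [ %q.slow, %slow ]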
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "bypass-slow-division"

namespace {

  struct QuotRemPair {
    Value *Quotient;
    Value *Remainder;

    QuotRemPair(Value *InQuotient, Value *InRemainder)
        : Quotient(InQuotient), Remainder(InRemainder) {}
  };

  /// A quotient and remainder, plus a BB from which they logically "originate".
  /// If you use Quotient or Remainder in a Phi node, you should use BB as its
  /// corresponding predecessor.
  struct QuotRemWithBB {
    BasicBlock *BB = nullptr;
    Value *Quotient = nullptr;
    Value *Remainder = nullptr;
  };

using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
using VisitedSetTy = SmallPtrSet<Instruction *, 4>;

enum ValueRange {
  /// Operand definitely fits into BypassType. No runtime checks are needed.
  VALRNG_KNOWN_SHORT,
  /// A runtime check is required, as value range is unknown.
  VALRNG_UNKNOWN,
  /// Operand is unlikely to fit into BypassType. The bypassing should be
  /// disabled.
  VALRNG_LIKELY_LONG
};

class FastDivInsertionTask {
  bool IsValidTask = false;
  Instruction *SlowDivOrRem = nullptr;
  IntegerType *BypassType = nullptr;
  BasicBlock *MainBB = nullptr;

  bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  QuotRemWithBB createSlowBB(BasicBlock *Successor);
  QuotRemWithBB createFastBB(BasicBlock *Successor);
  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                   BasicBlock *PhiBB);
  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  Optional<QuotRemPair> insertFastDivAndRem();

  bool isSignedOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::SRem;
  }

  bool isDivisionOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::UDiv;
  }

  Type *getSlowType() { return SlowDivOrRem->getType(); }

public:
  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);

  Value *getReplacement(DivCacheTy &Cache);
};

} // end anonymous namespace

FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
                                           const BypassWidthsTy &BypassWidths) {
  switch (I->getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    SlowDivOrRem = I;
    break;
  default:
    // I is not a div/rem operation.
    return;
  }

  // Skip division on vector types. Only optimize integer instructions.
  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  if (!SlowType)
    return;

  // Skip if this bitwidth is not bypassed.
  auto BI = BypassWidths.find(SlowType->getBitWidth());
  if (BI == BypassWidths.end())
    return;

  // Get type for div/rem instruction with bypass bitwidth.
  IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  BypassType = BT;

  // The original basic block.
  MainBB = I->getParent();

  // The instruction is indeed a slow div or rem operation.
  IsValidTask = true;
}

/// Reuses a previously-computed quotient or remainder from the current BB if
/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
/// perform the optimization and caches the resulting quotient and remainder.
/// If no replacement can be generated, nullptr is returned.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  // First, make sure that the task is valid.
  if (!IsValidTask)
    return nullptr;

  // Then, look for a value in Cache.
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
  auto CacheI = Cache.find(Key);

  if (CacheI == Cache.end()) {
    // If previous instance does not exist, try to insert fast div.
    Optional<QuotRemPair> OptResult = insertFastDivAndRem();
    // Bail out if insertFastDivAndRem has failed.
    if (!OptResult)
      return nullptr;
    CacheI = Cache.insert({Key, *OptResult}).first;
  }

  QuotRemPair &Value = CacheI->second;
  return isDivisionOp() ? Value.Quotient : Value.Remainder;
}

/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common hash
/// algorithms. Typically, hash computations end with one of the following
/// instructions:
///
/// 1) MUL with a constant wider than BypassType
/// 2) XOR instruction
///
/// And even if we are wrong and the value is not a hash, it is still quite
/// unlikely that such values will fit into BypassType.
///
/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
/// It is implemented as a depth-first search for values that look neither long
/// nor hash-like.
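///
/// For example, one round of 64-bit FNV-1a hashing,
/// h = (h ^ c) * 1099511628211, produces an xor feeding a multiply by a
/// constant wider than a 32-bit BypassType; the xor, the multiply, and the PHI
/// carrying h around the loop are all treated as hash-like here.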
bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::Xor:
    return true;
  case Instruction::Mul: {
    // After Constant Hoisting pass, long constants may be represented as
    // bitcast instructions. As a result, some constants may look like an
    // instruction at first, and an additional check is necessary to find out if
    // an operand is actually a constant.
    Value *Op1 = I->getOperand(1);
    ConstantInt *C = dyn_cast<ConstantInt>(Op1);
    if (!C && isa<BitCastInst>(Op1))
      C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
    return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
  }
  case Instruction::PHI:
    // Stop IR traversal in case of crazy input code. This limits recursion
    // depth.
    if (Visited.size() >= 16)
      return false;
    // Do not visit nodes that have been visited already. We return true because
    // it means that we couldn't find any value that doesn't look hash-like.
    if (!Visited.insert(I).second)
      return true;
    return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
      // Ignore undef values as they probably don't affect the division
      // operands.
      return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
             isa<UndefValue>(V);
    });
  default:
    return false;
  }
}

/// Estimate how likely an integer value is to fit into our bypass type.
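///
/// For example, with a 64-bit operand and a 32-bit BypassType, HiBits is 32:
/// a value that computeKnownBits proves to have at least 32 leading zero bits
/// is VALRNG_KNOWN_SHORT, a value with a bit in its upper 32 bits known to be
/// set is VALRNG_LIKELY_LONG, and anything else is VALRNG_UNKNOWN unless it
/// also looks hash-like.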
ValueRange FastDivInsertionTask::getValueRange(Value *V,
                                               VisitedSetTy &Visited) {
  unsigned ShortLen = BypassType->getBitWidth();
  unsigned LongLen = V->getType()->getIntegerBitWidth();

  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
  unsigned HiBits = LongLen - ShortLen;

  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
  KnownBits Known(LongLen);

  computeKnownBits(V, Known, DL);

  if (Known.countMinLeadingZeros() >= HiBits)
    return VALRNG_KNOWN_SHORT;

  if (Known.countMaxLeadingZeros() < HiBits)
    return VALRNG_LIKELY_LONG;

  // Long integer divisions are often used in hashtable implementations. It's
  // not worth bypassing such divisions because hash values are extremely
  // unlikely to have enough leading zeros. The call below tries to detect
  // values that are unlikely to fit BypassType (including hashes).
  if (isHashLikeValue(V, Visited))
    return VALRNG_LIKELY_LONG;

  return VALRNG_UNKNOWN;
}

/// Add new basic block for slow div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  if (isSignedOp()) {
    DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
  } else {
    DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
  }

  Builder.CreateBr(SuccessorBB);
  return DivRemPair;
}

/// Add new basic block for fast div and rem operations and put it before
/// SuccessorBB.
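///
/// For example, with a 64-bit operation bypassed to 32 bits the block contains
/// roughly (names are illustrative only):
///
///   %d32 = trunc i64 %divisor to i32
///   %n32 = trunc i64 %dividend to i32
///   %q32 = udiv i32 %n32, %d32
///   %r32 = urem i32 %n32, %d32
///   %q   = zext i32 %q32 to i64
///   %r   = zext i32 %r32 to i64
///   br label %SuccessorBB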
QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  Value *ShortDivisorV =
      Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
  Value *ShortDividendV =
      Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);

  // udiv/urem because this optimization only handles positive numbers.
  Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
  Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
  DivRemPair.Quotient =
      Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
  DivRemPair.Remainder =
      Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
  Builder.CreateBr(SuccessorBB);

  return DivRemPair;
}

/// Creates Phi nodes for result of Div and Rem.
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
                                                       QuotRemWithBB &RHS,
                                                       BasicBlock *PhiBB) {
  IRBuilder<> Builder(PhiBB, PhiBB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  return QuotRemPair(QuoPhi, RemPhi);
}

/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. True return
/// value means that the operands fit. Either of the operands may be NULL if it
/// doesn't need a runtime check.
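///
/// For example, with a 64-bit operation bypassed to 32 bits, the check ORs the
/// operands together, masks the result with ~getBitMask() =
/// 0xFFFFFFFF00000000, and compares against zero, so it is true exactly when
/// neither operand has any of its upper 32 bits set.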
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  assert((Op1 || Op2) && "Nothing to check");
  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *OrV;
  if (Op1 && Op2)
    OrV = Builder.CreateOr(Op1, Op2);
  else
    OrV = Op1 ? Op1 : Op2;

  // getBitMask() covers the low BypassType bits; inverting it selects the high
  // bits, which must all be zero for the operands to fit into the bypass type.
  uint64_t BitMask = ~BypassType->getBitMask();
  Value *AndV = Builder.CreateAnd(OrV, BitMask);

  // Compare operand values
  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  return Builder.CreateICmpEQ(AndV, ZeroV);
}

/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter, faster div/rem instruction when possible.
Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  VisitedSetTy SetL;
  ValueRange DividendRange = getValueRange(Dividend, SetL);
  if (DividendRange == VALRNG_LIKELY_LONG)
    return None;

  VisitedSetTy SetR;
  ValueRange DivisorRange = getValueRange(Divisor, SetR);
  if (DivisorRange == VALRNG_LIKELY_LONG)
    return None;

  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);

  if (DividendShort && DivisorShort) {
    // If both operands are known to be short then just replace the long
    // division with a short one in-place.  Since we're not introducing control
    // flow in this case, narrowing the division is always a win, even if the
    // divisor is a constant (and will later get replaced by a multiplication).

    IRBuilder<> Builder(SlowDivOrRem);
    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    return QuotRemPair(ExtDiv, ExtRem);
  }

  if (isa<ConstantInt>(Divisor)) {
    // If the divisor is a constant, DAGCombiner will convert it to a
    // multiplication by a magic constant.  It isn't clear if it is worth
    // introducing control flow to get a narrower multiply.
    return None;
  }

  // After Constant Hoisting pass, long constants may be represented as
  // bitcast instructions. As a result, some constants may look like an
  // instruction at first, and an additional check is necessary to find out if
  // an operand is actually a constant.
  if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    if (BCI->getParent() == SlowDivOrRem->getParent() &&
        isa<ConstantInt>(BCI->getOperand(0)))
      return None;

  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  if (DividendShort && !isSignedOp()) {
    // If the division is unsigned and Dividend is known to be short, then
    // either
    // 1) Divisor is less than or equal to Dividend, and the result can be
    //    computed with a short division.
    // 2) Divisor is greater than Dividend. In this case, no division is needed
    //    at all: The quotient is 0 and the remainder is equal to Dividend.
    //
    // So instead of checking at runtime whether Divisor fits into BypassType,
    // we emit a runtime check to differentiate between these two cases. This
    // lets us entirely avoid a long div.
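    //
    // The resulting CFG looks roughly like this (for a 64-bit udiv bypassed to
    // 32 bits; names are illustrative only):
    //
    //   MainBB:
    //     %cmp = icmp uge i64 %dividend, %divisor
    //     br i1 %cmp, label %FastBB, label %SuccessorBB
    //   FastBB:                        ; narrow udiv/urem, zext back to i64
    //     br label %SuccessorBB
    //   SuccessorBB:
    //     %quo = phi i64 [ %fast.q, %FastBB ], [ 0, %MainBB ]
    //     %rem = phi i64 [ %fast.r, %FastBB ], [ %dividend, %MainBB ]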

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    QuotRemWithBB Long;
    Long.BB = MainBB;
    Long.Quotient = ConstantInt::get(getSlowType(), 0);
    Long.Remainder = Dividend;
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    return Result;
  } else {
    // General case. Create both slow and fast div/rem pairs and choose one of
    // them at runtime.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
                                            DivisorShort ? nullptr : Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    return Result;
  }
}

/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
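///
/// Targets opt in through TargetLowering::addBypassSlowDiv (X86, for example,
/// maps 32-bit division to 8 bits and 64-bit division to 32 bits on CPUs with
/// slow dividers), and CodeGenPrepare is expected to call this function on
/// each block with the width map returned by getBypassSlowDivWidths().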
bool llvm::bypassSlowDivision(BasicBlock *BB,
                              const BypassWidthsTy &BypassWidths) {
  DivCacheTy PerBBDivCache;

  bool MadeChange = false;
  Instruction *Next = &*BB->begin();
  while (Next != nullptr) {
    // We may add instructions immediately after I, but we want to skip over
    // them.
    Instruction *I = Next;
    Next = Next->getNextNode();

    // Ignore dead code to save time and avoid bugs.
    if (I->hasNUses(0))
      continue;

    FastDivInsertionTask Task(I, BypassWidths);
    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
      I->replaceAllUsesWith(Replacement);
      I->eraseFromParent();
      MadeChange = true;
    }
  }

  // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  // create divrem machine instructions.  Now erase any unused divs / rems so we
  // don't leave extra instructions sitting around.
  for (auto &KV : PerBBDivCache)
    for (Value *V : {KV.second.Quotient, KV.second.Remainder})
      RecursivelyDeleteTriviallyDeadInstructions(V);

  return MadeChange;
}